In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))
import warnings
# Suppress every warning from here on (no category or module filter is given).
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries used below - consider narrowing the filter.
warnings.filterwarnings('ignore')

/kaggle/input/sms-spam-collection-dataset/spam.csv


### Data loading

In [36]:
# Read the raw SMS spam CSV, decoding it as latin1.
# NOTE(review): presumably the default UTF-8 decoding failed on this file,
# which is why the encoding is given explicitly - confirm.
df = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv', encoding="latin1")


### Data preprocessing

In [37]:
# Preview the first rows to see how the CSV columns were parsed.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives around here though",,,


In [38]:
# Column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [39]:
# Show full cell contents instead of truncating long strings.
pd.options.display.max_colwidth = None

In [40]:
# Rows where the first overflow column is populated, with all text columns shown.
print(df.loc[df['Unnamed: 2'].notna(), ['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']])

        v1  \
95    spam   
281    ham   
444    ham   
671   spam   
710    ham   
899   spam   
1038   ham   
1127   ham   
1266   ham   
1384   ham   
1428   ham   
1559   ham   
1637  spam   
1669   ham   
1675   ham   
1936   ham   
1937   ham   
1997   ham   
2087   ham   
2170   ham   
2235   ham   
2255   ham   
2406   ham   
2791   ham   
2970   ham   
3110   ham   
3145   ham   
3239   ham   
3506   ham   
3525   ham   
3547   ham   
3640  spam   
3758   ham   
3788   ham   
3793   ham   
3881   ham   
4130   ham   
4224   ham   
4454   ham   
4455   ham   
4570   ham   
4601   ham   
4668   ham   
4859   ham   
4992   ham   
5048   ham   
5079   ham   
5081   ham   
5264   ham   
5268   ham   

                                                                                                                                                                                                                                                                                          v2 

#### The `Unnamed` columns appear to contain message fragments, likely caused by encoding or formatting issues during file import. We therefore concatenate these columns with the main message column (`v2`) and name the result `message`.

In [41]:
# Rebuild each message: start from v2, append the three overflow columns
# separated by single spaces (missing pieces become empty strings), and trim
# the leading/trailing whitespace left behind by empty pieces.
df['message'] = (
    df['v2'].fillna('')
    .str.cat(df[['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']].fillna(''), sep=' ')
    .str.strip()
)

In [42]:
# Keep only the label column and the rebuilt message; v1 becomes `label`.
df = df[['v1', 'message']].rename(columns={'v1': 'label'})

In [43]:
# Check that only the `label` and `message` columns remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [44]:
# Keep only rows whose message is non-empty once surrounding whitespace is
# removed, then renumber the index from 0.
has_text = df['message'].str.strip().ne('')
df = df.loc[has_text].reset_index(drop=True)

In [45]:
# Re-check the row count after filtering out blank messages.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


### Encoding

In [18]:
# Encode the target as integers: ham -> 0, spam -> 1.
# Values that are already encoded pass through unchanged, so re-running this
# cell is safe; a bare .map(dict) would turn every label into NaN the second
# time it runs, because 0 and 1 are not keys of the mapping.
label_codes = {'ham': 0, 'spam': 1}
encoded = df['label'].map(lambda value: label_codes.get(value, value))

# Fail loudly on anything that is neither a known label nor an existing code,
# instead of letting it silently become NaN.
unexpected = encoded[~encoded.isin(list(label_codes.values()))]
if not unexpected.empty:
    raise ValueError(
        f"Unexpected label values: {sorted(map(str, unexpected.unique()))}"
    )

df['label'] = encoded.astype('int64')

In [19]:
# Number of messages per encoded class (0 = ham, 1 = spam).
df["label"].value_counts()

label
0    4825
1     747
Name: count, dtype: int64

#### The dataset is imbalanced: ~13% of messages are spam and ~87% are ham, so we use a stratified train/test split to keep these proportions in both sets.

In [20]:
from sklearn.model_selection import train_test_split

# Features are the message text; the target is the encoded label.
X, y = df['message'], df['label']

# Hold out 20% for testing. Stratifying on y keeps the class ratio the same
# in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### Vectorization

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Lower-case the text and drop English stop words before TF-IDF weighting.
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')

# Fit on the training messages only; the test messages are transformed with
# the already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

### Naive Bayes

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Multinomial Naive Bayes on the TF-IDF features.
# It is kept under its own names (`nb_model`, `nb_pred`) rather than the
# generic `model` / `y_pred`, because the later cells pickle `model` and call
# `model.predict_proba` in `predict_message`. Sharing the name meant that
# running cells out of order could save or serve this estimator by mistake.
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)

nb_pred = nb_model.predict(X_test_vec)

# Per-class precision/recall/F1, then the confusion matrix
# (rows = true class, columns = predicted class).
print(classification_report(y_test, nb_pred))
print(confusion_matrix(y_test, nb_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

[[966   0]
 [ 35 114]]


### Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression

# class_weight='balanced' reweights the classes to offset the ham/spam imbalance.
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train_vec, y_train)

# Second predict_proba column = probability of label 1 (spam).
y_probs = model.predict_proba(X_test_vec)[:, 1]

# Report spam/ham metrics at several decision thresholds.
# NOTE(review): the thresholds are compared on the test split itself, so the
# metrics of whichever threshold is picked are optimistic.
for t in (0.2, 0.3, 0.4, 0.5):
    y_pred = (y_probs >= t).astype(int)
    print(f"\nThreshold: {t}")
    print(classification_report(y_test, y_pred))



Threshold: 0.2
              precision    recall  f1-score   support

           0       1.00      0.87      0.93       966
           1       0.53      0.99      0.69       149

    accuracy                           0.88      1115
   macro avg       0.77      0.93      0.81      1115
weighted avg       0.94      0.88      0.90      1115


Threshold: 0.3
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       966
           1       0.77      0.95      0.85       149

    accuracy                           0.96      1115
   macro avg       0.88      0.95      0.91      1115
weighted avg       0.96      0.96      0.96      1115


Threshold: 0.4
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       966
           1       0.90      0.94      0.92       149

    accuracy                           0.98      1115
   macro avg       0.94      0.96      0.95      1115
weighted avg       0.98   

### The 0.4 threshold gives the best precision/recall trade-off for the spam class among the thresholds tried (precision 0.90, recall 0.94)

### Save the model and vectorizer to disk for predicting on new messages

In [26]:
import joblib

# Write the current `model` and the fitted TF-IDF `vectorizer` to the working
# directory. Both files are needed at prediction time: text has to go through
# the same vectorizer before it reaches the model.
# NOTE(review): `model` is whatever estimator was assigned to that name most
# recently - make sure the logistic regression cell has run before this one.
joblib.dump(model, 'spam_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [34]:
def predict_message(text, threshold=0.4):
    """Classify a single message as spam or ham.

    The text is vectorized with the notebook-level ``vectorizer`` and scored
    with the notebook-level ``model``.

    Parameters
    ----------
    text : str
        The raw message to classify.
    threshold : float, default 0.4
        Minimum spam probability at which the message is labelled "Spam".
        The default is the value that was previously hard-coded here.

    Returns
    -------
    tuple
        ``(label, prob)`` where ``label`` is "Spam" or "Ham" and ``prob`` is
        the second ``predict_proba`` column, i.e. the probability of label 1
        (spam) under this notebook's encoding. It is the spam probability
        even when the returned label is "Ham".

    Raises
    ------
    ValueError
        If ``threshold`` is not between 0 and 1 inclusive.
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")

    vect_text = vectorizer.transform([text])
    prob = model.predict_proba(vect_text)[0][1]
    label = "Spam" if prob >= threshold else "Ham"
    return label, prob

In [35]:
# Try the classifier on a sample message.
msg = "Free entry in 2 a weekly competition..."
label, confidence = predict_message(msg)
# `confidence` is the spam probability returned by predict_message, so for a
# "Ham" result a low number here means the model is confident it is ham.
print(f"{label} ({confidence:.2f} confidence)")

Spam (0.61 confidence)
