In [130]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [131]:
# Read the raw SMS dataset from the working directory. The encoding is passed
# explicitly instead of relying on pandas' default
# (presumably the file is not valid UTF-8 -- confirm).
df = pd.read_csv(filepath_or_buffer='spam.csv', encoding='latin-1')

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [132]:
# Remove the three "Unnamed" columns, which show no values in the df.head() preview.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time is a no-op instead of raising KeyError
# because the columns are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [133]:
# Show the frame's dimensions as (rows, columns) to confirm the column drop.
df.shape

(5572, 2)

In [134]:
# Print per-column dtype and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [135]:
# Count the rows per value of the raw label column `v1` to see the class balance.
df['v1'].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

In [136]:
# Give the raw v1/v2 columns descriptive names, then keep exactly those two
# columns in a fixed order (label first, message second).
df = df.rename(columns={"v1": "label", "v2": "message"})[["label", "message"]]

In [137]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
# Guarded so that re-running this cell is a no-op: mapping an already-encoded
# column a second time would turn every label into NaN, because 0 and 1 are
# not keys of the mapping.
label_ids = {"ham": 0, "spam": 1}
if not df["label"].isin(list(label_ids.values())).all():
    df["label"] = df["label"].map(label_ids)

# Series.map yields NaN for any value missing from the mapping; fail loudly here
# rather than passing NaN targets on to the split and the classifier.
if df["label"].isna().any():
    raise ValueError("label column contains values other than 'ham'/'spam'")

In [138]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): no `stratify` argument is passed, so the ham/spam ratio of each
# part is not enforced -- consider stratify=df["label"] given the class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    df["message"],
    df["label"],
    random_state=42,
    test_size=0.2,
)

In [139]:
# Bag-of-words token counts with scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')

# The vocabulary is learned from the training messages only; the test messages
# are merely transformed with it, so nothing from the test set leaks into fitting.
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)

In [140]:
# Oversample the training set with SMOTE (seeded for repeatability).
# Only the training vectors are resampled; the test vectors are left untouched.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X=X_train_vec, y=y_train)

In [141]:
# Print the per-class counts of the training labels before and after resampling.
for stage, labels in (("Before SMOTE:", y_train), ("After SMOTE:", y_train_bal)):
    print(stage, labels.value_counts())

Before SMOTE: label
0    3860
1     597
Name: count, dtype: int64
After SMOTE: label
0    3860
1    3860
Name: count, dtype: int64


In [142]:
# Fit a multinomial Naive Bayes classifier on the resampled training vectors
# (fit() returns the estimator itself, so construction and fitting are chained).
model = MultinomialNB().fit(X_train_bal, y_train_bal)

# Predict labels for the held-out test vectors.
y_pred = model.predict(X_test_vec)

In [143]:
# Evaluate the test-set predictions: overall accuracy, per-class
# precision/recall/F1, and the confusion matrix (rows = true, columns = predicted).
print("\nAccuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)
print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred),
)


Accuracy: 0.967713004484305

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       965
           1       0.86      0.91      0.88       150

    accuracy                           0.97      1115
   macro avg       0.92      0.94      0.93      1115
weighted avg       0.97      0.97      0.97      1115


Confusion Matrix:
 [[943  22]
 [ 14 136]]


In [144]:
# Persist both fitted artifacts. The classifier only accepts count vectors built
# with the vocabulary learned by `vectorizer`, so the model file alone cannot be
# used to score new raw text -- the vectorizer has to be saved alongside it.
# (`joblib` is imported in the notebook's top import cell.)
joblib.dump(vectorizer, 'sms_spam_vectorizer.pkl')

# Dumped last so the cell's displayed output stays the classifier's file list.
joblib.dump(model, 'sms_spam_classifier.pkl')


['sms_spam_classifier.pkl']