In [8]:
import pandas as pd
df = pd.read_csv('spam.csv',encoding_errors= 'replace')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [10]:
df.v1.value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

In [11]:
spam = df[df["v1"] == "spam"]
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",,,


In [12]:
ham = df[df["v1"] == "ham"]
ham.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
6,ham,Even my brother is not like to speak with me. ...,,,


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['v2'], df['v1'], test_size = 0.2, random_state = 1)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [17]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

clf = GaussianNB()
clf.fit(X_train.toarray(),y_train)

In [18]:
y_true, y_pred = y_test, clf.predict(X_test.toarray())
accuracy_score(y_true, y_pred)

0.9067264573991032

In [19]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         ham       0.99      0.91      0.94       976
        spam       0.58      0.91      0.71       139

    accuracy                           0.91      1115
   macro avg       0.78      0.91      0.83      1115
weighted avg       0.94      0.91      0.92      1115



In [20]:
cmtx = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=['ham', 'spam']), 
    index=['ham', 'spam'], 
    columns=['ham', 'spam']
)
print(cmtx)

      ham  spam
ham   884    92
spam   12   127


In [21]:
from sklearn.model_selection import GridSearchCV
parameters = {"var_smoothing":[1e-9, 1e-5, 1e-1]}
gs_clf = GridSearchCV(
        GaussianNB(), parameters)
gs_clf.fit(X_train.toarray(),y_train)

In [22]:
gs_clf.best_params_

{'var_smoothing': 0.1}

In [23]:
y_true, y_pred = y_test, gs_clf.predict(X_test.toarray())
accuracy_score(y_true, y_pred)

0.9659192825112107

In [24]:
cmtx = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=['ham', 'spam']), 
    index=['ham', 'spam'], 
    columns=['ham', 'spam']
)
print(cmtx)

      ham  spam
ham   942    34
spam    4   135


In [25]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         ham       1.00      0.97      0.98       976
        spam       0.80      0.97      0.88       139

    accuracy                           0.97      1115
   macro avg       0.90      0.97      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [26]:
message = vectorizer.transform(["i'm on my way home"])
message = message.toarray()
gs_clf.predict(message)

array(['ham'], dtype='<U4')

In [27]:
message = vectorizer.transform(["this offer is to good to be true"])
message = message.toarray()
gs_clf.predict(message)

array(['spam'], dtype='<U4')

In [28]:
import joblib
joblib.dump(gs_clf, "model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']