In [13]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,make_scorer,accuracy_score
import pickle

DATA PREPROCESSING

In [14]:
# Read the raw CSV (decoded as latin-1) and append a numeric 'label' column
# derived from the string classes in 'v1': "ham" -> 0, "spam" -> 1.
# Series.map leaves NaN for any 'v1' value that is not a key of the mapping.
spam_data = pd.read_csv('data/spam.csv', encoding="latin-1").assign(
    label=lambda frame: frame['v1'].map({'ham': 0, 'spam': 1})
)

# Features are the 'v2' column; the target is the numeric label built above.
X, y = spam_data['v2'], spam_data['label']

CREATE TRAIN/TEST SPLIT

In [15]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# shuffled split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=50,
)


RUN TRAINING

In [16]:
# Three-stage text classifier: token counts -> TF-IDF transform -> multinomial
# naive Bayes. The step names ('vect', 'tfidf', 'clf') are the prefixes used to
# address each step's parameters in the grid-search parameter dictionary.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [17]:
# Hyper-parameter grid; each key has the form '<pipeline step>__<parameter>'.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (2, 2)],  # unigrams, uni+bigrams, bigrams only
    tfidf__use_idf=(True, False),
    tfidf__norm=('l1', 'l2'),
    clf__alpha=[1, 1e-1, 1e-2],
)

# Scorer object wrapping accuracy_score, suitable for a `scoring=` argument.
acc = make_scorer(accuracy_score)

In [18]:
# Exhaustive search over `parameters` with 10-fold cross-validation,
# ranking candidates by the accuracy scorer.
gsearch = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    scoring=acc,
    cv=10,
)

In [19]:
# Run the cross-validated grid search on the training split only.
gsearch.fit(X=X_train, y=y_train)

In [20]:
# Score the fitted search on the held-out split (uses the accuracy scorer
# passed to GridSearchCV).
gsearch.score(X=X_test, y=y_test)

0.9806173725771715

In [32]:
# Sanity check: predict the class of one held-out message.
# X_test keeps the original row labels of the shuffled split, so `X_test[0]`
# is a *label* lookup and raises KeyError whenever row 0 landed in the
# training split. `.iloc[0]` always selects the first test message by position.
lst = [X_test.iloc[0]]
print(gsearch.predict(lst))

In [21]:
# Predict every held-out message, then print per-class precision / recall / F1.
y_pred = gsearch.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1193
           1       0.97      0.89      0.93       200

    accuracy                           0.98      1393
   macro avg       0.98      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393



Saving MODEL and VECTORIZER (both are contained in the single pickled grid-search object)

In [24]:
# Persist the fitted GridSearchCV object (which wraps the whole pipeline,
# vectorizer included) to 'model.pkl'.
# The context manager guarantees the file handle is flushed and closed, even if
# pickling raises; the bare open(...) call previously leaked the handle.
with open('model.pkl', "wb") as model_file:
    pickle.dump(gsearch, model_file)