In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score,accuracy_score,classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv('sms_spam.csv')
df.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.groupby('type').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4827,4518,"Sorry, I'll call later",30
spam,747,642,Please call our customer service representativ...,4


In [4]:
df['spam'] = df['text'].apply(lambda x:1 if x == 'spam' else 0)
df.head()

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
x_train,x_test,y_train,y_test = train_test_split(df.text,df.spam,test_size=0.25)

In [6]:
count = CountVectorizer()
x_train_count = count.fit_transform(x_train.values)
x_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
model = MultinomialNB()
model.fit(x_train_count,y_train)

MultinomialNB()

In [8]:
x_test_pred = count.transform(x_test)
accuracy_score(model.predict(x_test_pred),y_test)

1.0

In [20]:
print(f"classification report : {classification_report(model.predict(x_test_pred),y_test)}")


classification report :               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1394

    accuracy                           1.00      1394
   macro avg       1.00      1.00      1.00      1394
weighted avg       1.00      1.00      1.00      1394



In [21]:
clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])


In [22]:
clf.fit(x_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [23]:
clf.score(x_test,y_test)

1.0

In [24]:
cross_val_score(MultinomialNB(),x_train_count,y_train)

array([1., 1., 1., 1., 1.])