# Spam SMS Detection

In [2]:
import pandas as pd
df=pd.read_csv("spam.csv",encoding='ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2,Unnamed: 2,Unnamed: 2,Unnamed: 2,Unnamed: 2,Unnamed: 3,Unnamed: 3,Unnamed: 3,Unnamed: 3,Unnamed: 4,Unnamed: 4,Unnamed: 4,Unnamed: 4
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
ham,4825,4516,"Sorry, I'll call later",30,45,39,"bt not his girlfrnd... G o o d n i g h t . . .@""",3,10,9,GE,2,6,5,"GNT:-)""",2.0
spam,747,653,Please call our customer service representativ...,4,5,4,PO Box 5249,2,2,1,"MK17 92H. 450Ppw 16""",2,0,0,,


In [31]:
# Creating a new column as category in the dataframe
df['category']=df['v1'].apply(lambda x:1 if x=='spam' else 0)

In [32]:
df.head(11)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,category
0,ham,"Go until jurong point, crazy.. Available only ...",,,,0
1,ham,Ok lar... Joking wif u oni...,,,,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,1
3,ham,U dun say so early hor... U c already then say...,,,,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,,1
6,ham,Even my brother is not like to speak with me. ...,,,,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,,0
8,spam,WINNER!! As a valued network customer you have...,,,,1
9,spam,Had your mobile 11 months or more? U R entitle...,,,,1


In [33]:
df=df[['v2','category']]

In [34]:
df.head(11)

Unnamed: 0,v2,category
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


In [35]:
# Renaming the v2 column to sms
df.rename(columns={'v2':'sms'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'v2':'sms'},inplace=True)


In [36]:
df.head(4)

Unnamed: 0,sms,category
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0


Training & Testing

In [37]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(df.sms, df.category, test_size=0.25)

Feature Extraction

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
cv=CountVectorizer()
x_traincount=cv.fit_transform(x_train.values)
x_traincount.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Naive Bayes Model

In [40]:
from sklearn.naive_bayes import MultinomialNB

In [41]:
model=MultinomialNB()
model.fit(x_traincount, y_train)

In [44]:
emails={
    'Hey Bro, can we watch a match together?',
    'Upto 50% discount on resturant coupons, exclusive offer just for you',
    'I"ve a discount of 80% can we go to the shopping mall'
}
emailscount=cv.transform(emails)
model.predict(emailscount)

array([1, 0, 0], dtype=int64)

check Accuracy

In [45]:
x_testcount=cv.transform(x_test)
model.score(x_testcount, y_test)

0.9870782483847811

# Creating Pipeline

The need for using a pipeline in this case arises from the desire to avoid repeating the preprocessing steps for both the training and test datasets.

In [46]:
from sklearn.pipeline import Pipeline

In [47]:
clf=Pipeline([
    ('vectorizer',CountVectorizer()),
    ('model',MultinomialNB())
])

In [48]:
clf.fit(x_train, y_train)

In [50]:
clf.score(x_test,y_test)

0.9870782483847811

In [51]:
clf.predict(emails)

array([1, 0, 0], dtype=int64)