## SPAM SMS DETECTION

In [2]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score,precision_score
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [36]:
df = pd.read_csv("data/spam.csv",encoding="cp1252")

In [30]:
with open("data/spam.csv") as f:
    print(f)

<_io.TextIOWrapper name='data/spam.csv' mode='r' encoding='cp1252'>


In [38]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14'],
      dtype='object')

In [40]:
df = df[["v1","v2"]]

In [42]:
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,Nah I dont think he goes to usf
...,...,...
4998,spam,REMINDER FROM O2: To get 2.50 pounds free call...
4999,spam,This is the 2nd time we have tried 2 contact u...
5000,ham,Will ï¿½_ b going to esplanade fr home?
5001,ham,The guy did some bitching but I acted like i'd...


In [44]:
df.rename(columns={"v1":"sms","v2":"text"},inplace=True)

In [46]:
df.duplicated().sum()

233

In [48]:
df.drop_duplicates(inplace=True)

In [50]:
df.duplicated().sum()

0

In [52]:
df.isna().sum()

sms     0
text    0
dtype: int64

In [54]:
df

Unnamed: 0,sms,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,Nah I dont think he goes to usf
...,...,...
4998,spam,REMINDER FROM O2: To get 2.50 pounds free call...
4999,spam,This is the 2nd time we have tried 2 contact u...
5000,ham,Will ï¿½_ b going to esplanade fr home?
5001,ham,The guy did some bitching but I acted like i'd...


In [56]:
def remove_punctuation(col):
    col = col.lower()
    for i in punctuation:
        col = col.replace(i,"")
    return col
    

In [58]:
df.text[1]

'Ok lar... Joking wif u oni...'

In [60]:
df.text = df.text.apply(remove_punctuation)
df.sample(5)

Unnamed: 0,sms,text
2798,ham,deep sigh i miss you i am really surprised ...
2443,ham,oh baby of the house how come you dont have an...
1872,spam,urgent your mobile no won a ï¿½2000 onus call...
4341,ham,hmmm mayb can try e shoppin area one but forgo...
873,ham,hey no i ad a crap nite was borin without ya 2...


In [63]:
def word_Tokenize(col):
    return word_tokenize(col)

In [65]:
df.text = df.text.apply(word_Tokenize)

In [66]:
df.sample(5)

Unnamed: 0,sms,text
3499,ham,"[oh, ok, wait, 4, me, there, my, lect, havent,..."
4311,ham,"[honey, can, you, pls, find, out, how, much, t..."
3461,ham,"[i, agree, so, i, can, stop, thinkin, about, i..."
1301,ham,"[dont, look, back, at, the, building, because,..."
2249,ham,"[i, dont, thnk, its, a, wrong, calling, betwee..."


In [67]:
def remove_stopwords(col):
    l =[]
    for i in col:
        if i not in stopwords.words("english"):
            l.append(i)
    return l

In [68]:
df.text = df.text.apply(remove_stopwords)

In [69]:
df

Unnamed: 0,sms,text
0,ham,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"[nah, dont, think, goes, usf]"
...,...,...
4998,spam,"[reminder, o2, get, 250, pounds, free, call, c..."
4999,spam,"[2nd, time, tried, 2, contact, u, u, ï¿½750, p..."
5000,ham,"[ï¿½, b, going, esplanade, fr, home]"
5001,ham,"[guy, bitching, acted, like, id, interested, b..."


In [70]:
ps = PorterStemmer()
def word_stem(col):
    l = []
    for i in col:
        l.append(ps.stem(i))
    return " ".join(l)


In [71]:
df.text = df.text.apply(word_stem)

In [72]:
x  = df[["text"]]
y= df.sms

In [76]:
oversample = RandomOverSampler(sampling_strategy="minority")
X_over, y_over = oversample.fit_resample(x, y)
print("Oversampled class distribution:", Counter(y_over))


Oversampled class distribution: Counter({'ham': 4142, 'ham"""': 4142, 'spam': 627})


In [77]:
tfidf = TfidfVectorizer(max_features=3000)


In [78]:
x = tfidf.fit_transform(df.text).toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [79]:
y = df.sms.values
y

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

In [80]:
x_train ,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)


In [82]:
mnb = MultinomialNB()
gnb = GaussianNB()

In [83]:
mnb.fit(x_train,y_train)
gnb.fit(x_train,y_train)

In [84]:
y_pred1= mnb.predict(x_test)
y_pred2= gnb.predict(x_test)


In [85]:
y_pred1,y_pred2

(array(['ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham',
        'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
        'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
        'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'ham',
        'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam',
        'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham',
        'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
        'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
        'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
        'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
        'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham',
        'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham',
        'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
        'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham'

In [86]:
accuracy1 = accuracy_score(y_test,y_pred1)
accuracy2 = accuracy_score(y_test,y_pred2)
print(accuracy1)
print(accuracy2)

0.9675052410901468
0.8417190775681341


In [119]:
mnb.predict(tfidf.transform(["Real stories, Real impact: ₹10,000 Off GUVIs Top Programs"]))[0]

'ham'

In [None]:
mnb.predict(tfidf.transform(["We are thrilled to inform you that you have been selected as the winner of our grand prize: a whopping $1,000,000 cash reward! This is an exclusive offer available only to our valued customers."]))[0]