# gmail spam detection

In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd


In [3]:
dt=pd.read_csv("SPAM.csv",encoding='Windows-1252')
dt.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
import chardet
with open("SPAM.csv","rb") as rawdata:
    result=chardet.detect(rawdata.read(100000))

In [5]:
result

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}

In [6]:
dt['spam']=dt['type'].map({'spam':1,'ham':0}).astype(int)

In [7]:
dt.head(10)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [8]:
print("To print columns in given data")
for col in dt.columns:
    print(col)

To print columns in given data
type
text
spam


In [9]:
t=len(dt['type'])
print("Number of rows in review column:",t)
t=len(dt['text'])
print("Number of rows in liked column:",t)


Number of rows in review column: 116
Number of rows in liked column: 116


# Tokenization

In [10]:
dt['text'][5]

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, Â£1.50 to rcv"

In [11]:
def tokenizer(text):
    return text.split()

In [12]:
dt['text']=dt['text'].apply(tokenizer)

In [13]:
dt['text'][5]

['FreeMsg',
 'Hey',
 'there',
 'darling',
 "it's",
 'been',
 '3',
 "week's",
 'now',
 'and',
 'no',
 'word',
 'back!',
 "I'd",
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'it',
 'still?',
 'Tb',
 'ok!',
 'XxX',
 'std',
 'chgs',
 'to',
 'send,',
 'Â£1.50',
 'to',
 'rcv']

# Stemming

In [14]:
from nltk.stem.snowball import SnowballStemmer
porter=SnowballStemmer("english",ignore_stopwords=False)

In [15]:
def stem_it(text):
    return [porter.stem(word) for word in text]

In [16]:
dt['text']=dt['text'].apply(stem_it)

In [17]:
dt['text'][5]

['freemsg',
 'hey',
 'there',
 'darl',
 'it',
 'been',
 '3',
 'week',
 'now',
 'and',
 'no',
 'word',
 'back!',
 "i'd",
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'it',
 'still?',
 'tb',
 'ok!',
 'xxx',
 'std',
 'chgs',
 'to',
 'send,',
 'â£1.50',
 'to',
 'rcv']

# Lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [19]:
def lemmit_it(text):
    return[lemmatizer.lemmatize(word,pos='a') for word in text]

In [20]:
dt['text']=dt['text'].apply(lemmit_it)

In [21]:
dt['text'][5]

['freemsg',
 'hey',
 'there',
 'darl',
 'it',
 'been',
 '3',
 'week',
 'now',
 'and',
 'no',
 'word',
 'back!',
 "i'd",
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'it',
 'still?',
 'tb',
 'ok!',
 'xxx',
 'std',
 'chgs',
 'to',
 'send,',
 'â£1.50',
 'to',
 'rcv']

# Stopword removal

In [22]:
dt['text'][41]

['did',
 'i',
 'forget',
 'to',
 'tell',
 'you',
 '?',
 'i',
 'want',
 'you',
 ',',
 'i',
 'need',
 'you,',
 'i',
 'crave',
 'you',
 '...',
 'but',
 'most',
 'of',
 'all',
 '...',
 'i',
 'love',
 'you',
 'my',
 'sweet',
 'arabian',
 'steed',
 '...',
 'mmmmmm',
 '...',
 'yummi']

In [23]:
from nltk.corpus import stopwords
stop_words=stopwords.words('english')

In [24]:
def stop_it(text):
    review=[word for word in text if not word in stop_words]
    return review

In [25]:
dt['text']=dt['text'].apply(stop_it)

In [26]:
dt['text'][41]

['forget',
 'tell',
 '?',
 'want',
 ',',
 'need',
 'you,',
 'crave',
 '...',
 '...',
 'love',
 'sweet',
 'arabian',
 'steed',
 '...',
 'mmmmmm',
 '...',
 'yummi']

In [27]:
dt.head(10)

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0
5,spam,"[freemsg, hey, darl, 3, week, word, back!, i'd...",1
6,ham,"[even, brother, like, speak, me., treat, like,...",0
7,ham,"[per, request, mell, mell, (oru, minnaminungin...",0
8,spam,"[winner!!, valu, network, custom, select, rece...",1
9,spam,"[mobil, 11, month, more?, u, r, entitl, updat,...",1


In [36]:
dt['text']=dt['text'].apply(' '.join)

In [37]:
dt.head()

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0


# Transform text data into TDF/TF-IDF Vectors

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
y=dt.spam.values
X=tfidf.fit_transform(dt['text'])


In [47]:
from sklearn.model_selection import train_test_split

In [53]:
 x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2,shuffle=False)

In [54]:
X

<116x712 sparse matrix of type '<class 'numpy.float64'>'
	with 1081 stored elements in Compressed Sparse Row format>

# 

In [50]:
y

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0])

In [51]:
x_train

<77x712 sparse matrix of type '<class 'numpy.float64'>'
	with 687 stored elements in Compressed Sparse Row format>

# Logistic Regression

In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lr=LogisticRegression()
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)
acc_log=accuracy_score(y_pred,y_test)*100
print("accuracy:",acc_log)

accuracy: 87.5
