In [1]:
import nltk

In [2]:
#Step 1: Importing Dataset

import numpy as np
import pandas as pd

dt= pd.read_csv("SPAM.csv", encoding='windows-1254')
dt.head(10)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
#Step 2: Preprocessing Dataset
#spliting the term spam to 1 and ham to 0 for easy recognization

dt["spam"]= dt['type'].map({'spam': 1, 'ham': 0}).astype(int)
dt.head(10)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [4]:
#display names of all columns present in dataset

print("Total names of column")
for col in dt.columns:
    print(col)

Total names of column
type
text
spam


In [5]:
#tdisplay length of review and liked rows

t= len(dt['type'])
print("Total number of rows in review column:",t)

t= len(dt['text'])
print("Total number of rows in liked column:", t)

Total number of rows in review column: 116
Total number of rows in liked column: 116


In [6]:
#Step 3: tokenization
#diving each word of sentences into tokens for focusing on some specific words

dt['text'][1]

'Ok lar... Joking wif u oni...'

In [7]:
def tokenizer(text):
    return text.split()

In [8]:
dt['text']=dt['text'].apply(tokenizer)

In [9]:
dt['text'][1]

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [10]:
#Step 4: Stemming or removal of suffix of a word 
from nltk.stem.snowball import SnowballStemmer
porter= SnowballStemmer("english", ignore_stopwords= False)

In [11]:
def stem_it(text):
    return[porter.stem(word) for word in text]

In [12]:
dt['text']= dt['text'].apply(stem_it)

In [13]:
dt['text'][1]

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

In [14]:
dt['text'][109]

['i',
 'know!',
 'grumpi',
 'old',
 'people.',
 'my',
 'mom',
 'was',
 'like',
 'you',
 'better',
 'not',
 'be',
 'lying.',
 'then',
 'again',
 'i',
 'am',
 'alway',
 'the',
 'one',
 'to',
 'play',
 'jokes...']

In [15]:
from nltk.stem import WordNetLemmatizer
Lemmatizer= WordNetLemmatizer()

In [16]:
def lemmit_it(text):
    return[Lemmatizer.lemmatize(word, pos='a')for word in text]

In [17]:
dt['text']= dt['text'].apply(lemmit_it)

In [18]:
dt['text'][109]

['i',
 'know!',
 'grumpi',
 'old',
 'people.',
 'my',
 'mom',
 'was',
 'like',
 'you',
 'good',
 'not',
 'be',
 'lying.',
 'then',
 'again',
 'i',
 'am',
 'alway',
 'the',
 'one',
 'to',
 'play',
 'jokes...']

In [19]:
#Step 5: Stopwords removal
#in this step we remove the words that doesn't matter for detecting whether the mail is spam or not
dt['text'][114]

['gent!',
 'we',
 'are',
 'tri',
 'to',
 'contact',
 'you.',
 'last',
 'weekend',
 'draw',
 'show',
 'that',
 'you',
 'won',
 'a',
 'â£1000',
 'prize',
 'guaranteed.',
 'call',
 '09064012160.',
 'claim',
 'code',
 'k52.',
 'valid',
 '12hrs',
 'only.',
 '150ppm']

In [20]:
from nltk.corpus import stopwords
stop_words= stopwords.words('english')

In [21]:
def stop_it(text):
    review= [word for word in text if not word in stop_words]
    return review

In [22]:
dt['text']= dt['text'].apply(stop_it)

In [23]:
dt['text'][114]

['gent!',
 'tri',
 'contact',
 'you.',
 'last',
 'weekend',
 'draw',
 'show',
 'â£1000',
 'prize',
 'guaranteed.',
 'call',
 '09064012160.',
 'claim',
 'code',
 'k52.',
 'valid',
 '12hrs',
 'only.',
 '150ppm']

In [24]:
dt.head(10)
#as we can see all the articles and other less usefull word are reduced

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0
5,spam,"[freemsg, hey, darl, 3, week, word, back!, i'd...",1
6,ham,"[even, brother, like, speak, me., treat, like,...",0
7,ham,"[per, request, mell, mell, (oru, minnaminungin...",0
8,spam,"[winner!!, valu, network, custom, select, rece...",1
9,spam,"[mobil, 11, month, more?, u, r, entitl, updat,...",1


In [25]:
dt['text']=dt['text'].apply(' '.join)

In [26]:
dt.head(10)
#we join the words and remove , and []

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0
5,spam,freemsg hey darl 3 week word back! i'd like fu...,1
6,ham,even brother like speak me. treat like aid pat...,0
7,ham,per request mell mell (oru minnaminungint nuru...,0
8,spam,winner!! valu network custom select receivea â...,1
9,spam,mobil 11 month more? u r entitl updat late col...,1


In [27]:
#Step 6: Vectorization

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf= TfidfVectorizer()

y= dt.spam.values
x= tfidf.fit_transform(dt['text'])

In [28]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(x,y,random_state=1, test_size= 0.2, shuffle= False)

In [29]:
#Step 7: Classification using logistic Regression

from sklearn.linear_model import LogisticRegression
clf= LogisticRegression()
clf.fit(x_train,y_train)
y_pred =clf.predict(x_test)

from sklearn.metrics import accuracy_score
acc_log= accuracy_score(y_pred, y_test)* 100
print("accuracy:", acc_log)

accuracy: 87.5


In [30]:
#Step 7: Classifcation using SVC Accuracy 
#This is an alternative and also more accurate than logistic regression

from sklearn.svm import LinearSVC
linear_SVC= LinearSVC(random_state=0)
linear_SVC.fit (x_train,y_train)
y_pred = linear_SVC.predict(x_test)
acc_linear_SVC= accuracy_score(y_pred,y_test)*100
print("accuracy:", acc_linear_SVC)

accuracy: 87.5
