Importing libraries

In [20]:
import numpy as np
import pandas as pd
import chardet
import nltk

Importing Dataset

In [21]:
u = chardet.UniversalDetector()
dataset = pd.read_csv('spam.csv',usecols=range(2),encoding='iso-8859-1')
dataset.head(10)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [22]:
dataset['spam'] = dataset['type'].map({'spam':1,'ham':0}).astype(int)
dataset.head(5)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [23]:
print("COLUMNS IN THE GIVEN DATA:")
for col in dataset.columns:
    print(col)

COLUMNS IN THE GIVEN DATA:
type
text
spam


In [24]:
t = len(dataset['type'])
print("NO OF ROWS IN REVIEW COLUMN:",t)
t = len(dataset['text'])
print("NO OF ROWS IN LINKED COLUMN:",t)

NO OF ROWS IN REVIEW COLUMN: 116
NO OF ROWS IN LINKED COLUMN: 116


Tokenizer

In [25]:
dataset['text'][1] #before

'Ok lar... Joking wif u oni...'

In [26]:
def tokenizer(text):
    return text.split()

In [27]:
dataset['text']=dataset['text'].apply(tokenizer)

In [28]:
dataset['text'][1] #after

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

Stemming

In [29]:
dataset['text'][1] #before

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [33]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer("english",ignore_stopwords=False)

In [34]:
def stem_it(text):
    return [porter.stem(word) for word in text]

In [36]:
dataset['text'] = dataset['text'].apply(stem_it)

In [37]:
dataset['text'][1] #after

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

Lemmitization

In [56]:
dataset['text'][100] #before

['pleas',
 "don't",
 'text',
 'me',
 'anymore.',
 'i',
 'have',
 'noth',
 'els',
 'to',
 'say.']

In [57]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [58]:
def lemmit_it(text):
    return [lemmatizer.lemmatize(word, pos = 'a') for word in text]

In [59]:
dataset['text'] = dataset['text'].apply(lemmit_it)

In [60]:
dataset['text']=dataset['text'].apply(lemmit_it)

In [61]:
dataset['text'][100] #after

['pleas',
 "don't",
 'text',
 'me',
 'anymore.',
 'i',
 'have',
 'noth',
 'els',
 'to',
 'say.']

StopWord Removal

In [63]:
dataset['text'][100] #bedore

['pleas',
 "don't",
 'text',
 'me',
 'anymore.',
 'i',
 'have',
 'noth',
 'els',
 'to',
 'say.']

In [64]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [66]:
def stop_it(text):
    review= [word for word in text if not word in stop_words]
    return review

In [67]:
dataset['text']=dataset['text'].apply(stop_it)

In [68]:
dataset['text'][100] #after

['pleas', 'text', 'anymore.', 'noth', 'els', 'say.']

In [69]:
dataset.head(10)

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0
5,spam,"[freemsg, hey, darl, 3, week, word, back!, i'd...",1
6,ham,"[even, brother, like, speak, me., treat, like,...",0
7,ham,"[per, request, mell, mell, (oru, minnaminungin...",0
8,spam,"[winner!!, valu, network, custom, select, rece...",1
9,spam,"[mobil, 11, month, more?, u, r, entitl, updat,...",1


In [90]:
dataset['text']=dataset['text'].apply(' '.join)

In [91]:
dataset.head()

Unnamed: 0,type,text,spam
0,ham,"g o j u r o n g p o i n t , c r a z y . ...",0
1,ham,o k l a r . . . j o k e w i f u o n ...,0
2,spam,f r e e e n t r i 2 w k l i c o m p ...,1
3,ham,u d u n s a y e a r l i h o r . . . ...,0
4,ham,"n a h t h i n k g o e u s f , l i v e ...",0


Transform Text Data into TDF/TF-IDF Vectors

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
y =dataset.spam.values
x=tfidf.fit_transform(dataset['text'])

In [83]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=1,test_size=0.2,shuffle=False)

Classification using Logistic Regression

In [93]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)
from sklearn.metrics import accuracy_score
acc_log = accuracy_score(y_pred,y_test)*100
print("accuracy:",acc_log)

accuracy: 87.5


Classification using LinearSVC Accuracy

In [89]:
from sklearn.svm import LinearSVC
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(x_train,y_train)
y_pred = linear_svc.predict(x_test)
acc_linear_svc = accuracy_score(y_pred,y_test)*100
print("accuracy:",acc_linear_svc)

accuracy: 87.5
