* Importing DataSet

In [1]:
import pandas as pd
data=pd.read_csv('spam.csv',encoding="ISO-8859-1")
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [2]:
data.shape

(5572, 5)

In [3]:
data['v1'].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

* Cleaning Data

Step 1 : Removing Punctuations

In [5]:
import string 
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
def removePunctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
data['cleaned_msg']=data['v2'].apply(lambda x:removePunctuation(x))
data.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,cleaned_msg
0,ham,"Go until jurong point, crazy.. Available only ...",,,,Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,,,,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,Free entry in 2 a wkly comp to win FA Cup fina...


Step 2 : Lowering Text

In [7]:
data['lowerCase_msg']=data['cleaned_msg'].apply(lambda x:x.lower())
data[['cleaned_msg','lowerCase_msg']].set_index(data['v1'])

Unnamed: 0_level_0,cleaned_msg,lowerCase_msg
v1,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...
ham,Ok lar Joking wif u oni,ok lar joking wif u oni
spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
ham,U dun say so early hor U c already then say,u dun say so early hor u c already then say
ham,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...
...,...,...
spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...
ham,Will Ì b going to esplanade fr home,will ì b going to esplanade fr home
ham,Pity was in mood for that Soany other suggest...,pity was in mood for that soany other suggest...
ham,The guy did some bitching but I acted like id ...,the guy did some bitching but i acted like id ...


Step 3 : Tokenization (Word Tokenization)

In [9]:
import nltk 
from nltk.tokenize import word_tokenize
def tokenize_words(text):
 words = word_tokenize(text)
 return words

data['tokenized_msg']=data['lowerCase_msg'].apply(lambda x:tokenize_words(x))

In [11]:
data.head(1)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,cleaned_msg,lowerCase_msg,tokenized_msg
0,ham,"Go until jurong point, crazy.. Available only ...",,,,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."


Step 4 : Removing Stop Words

In [12]:
import nltk
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [13]:
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output
data['no_stopwords']= data['tokenized_msg'].apply(lambda x:remove_stopwords(x))

In [14]:
data.head(1)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,cleaned_msg,lowerCase_msg,tokenized_msg,no_stopwords
0,ham,"Go until jurong point, crazy.. Available only ...",,,,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n..."


Step 5 : Stemming

In [15]:
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text
data['msg_stemmed']=data['no_stopwords'].apply(lambda x: stemming(x))

In [16]:
data.head(1)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,cleaned_msg,lowerCase_msg,tokenized_msg,no_stopwords,msg_stemmed
0,ham,"Go until jurong point, crazy.. Available only ...",,,,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre..."


Step 6 : Lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
data['msg_lemmatized']=data['msg_stemmed'].apply(lambda x:lemmatizer(x))

In [19]:
data.head(1)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,cleaned_msg,lowerCase_msg,tokenized_msg,no_stopwords,msg_stemmed,msg_lemmatized
0,ham,"Go until jurong point, crazy.. Available only ...",,,,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre...","[go, jurong, point, crazi, avail, bugi, n, gre..."


In [20]:
list1 = []
for sublist in data['msg_lemmatized']:
    for value in sublist:
        list1.append(value.split(','))
list1

[['go'],
 ['jurong'],
 ['point'],
 ['crazi'],
 ['avail'],
 ['bugi'],
 ['n'],
 ['great'],
 ['world'],
 ['la'],
 ['e'],
 ['buffet'],
 ['cine'],
 ['got'],
 ['amor'],
 ['wat'],
 ['ok'],
 ['lar'],
 ['joke'],
 ['wif'],
 ['u'],
 ['oni'],
 ['free'],
 ['entri'],
 ['2'],
 ['wkli'],
 ['comp'],
 ['win'],
 ['fa'],
 ['cup'],
 ['final'],
 ['tkt'],
 ['21st'],
 ['may'],
 ['2005'],
 ['text'],
 ['fa'],
 ['87121'],
 ['receiv'],
 ['entri'],
 ['questionstd'],
 ['txt'],
 ['ratetc'],
 ['appli'],
 ['08452810075over18'],
 ['u'],
 ['dun'],
 ['say'],
 ['earli'],
 ['hor'],
 ['u'],
 ['c'],
 ['alreadi'],
 ['say'],
 ['nah'],
 ['dont'],
 ['think'],
 ['goe'],
 ['usf'],
 ['live'],
 ['around'],
 ['though'],
 ['freemsg'],
 ['hey'],
 ['darl'],
 ['3'],
 ['week'],
 ['word'],
 ['back'],
 ['id'],
 ['like'],
 ['fun'],
 ['still'],
 ['tb'],
 ['ok'],
 ['xxx'],
 ['std'],
 ['chg'],
 ['send'],
 ['å£150'],
 ['rcv'],
 ['even'],
 ['brother'],
 ['like'],
 ['speak'],
 ['treat'],
 ['like'],
 ['aid'],
 ['patent'],
 ['per'],
 ['request'],
 [

In [21]:
final=[]
for i in list1:
    for j in i:
        final.append(j)
final

['go',
 'jurong',
 'point',
 'crazi',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amor',
 'wat',
 'ok',
 'lar',
 'joke',
 'wif',
 'u',
 'oni',
 'free',
 'entri',
 '2',
 'wkli',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkt',
 '21st',
 'may',
 '2005',
 'text',
 'fa',
 '87121',
 'receiv',
 'entri',
 'questionstd',
 'txt',
 'ratetc',
 'appli',
 '08452810075over18',
 'u',
 'dun',
 'say',
 'earli',
 'hor',
 'u',
 'c',
 'alreadi',
 'say',
 'nah',
 'dont',
 'think',
 'goe',
 'usf',
 'live',
 'around',
 'though',
 'freemsg',
 'hey',
 'darl',
 '3',
 'week',
 'word',
 'back',
 'id',
 'like',
 'fun',
 'still',
 'tb',
 'ok',
 'xxx',
 'std',
 'chg',
 'send',
 'å£150',
 'rcv',
 'even',
 'brother',
 'like',
 'speak',
 'treat',
 'like',
 'aid',
 'patent',
 'per',
 'request',
 'mell',
 'mell',
 'oru',
 'minnaminungint',
 'nurungu',
 'vettam',
 'set',
 'callertun',
 'caller',
 'press',
 '9',
 'copi',
 'friend',
 'callertun',
 'winner',
 'valu',
 'network',
 