In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

In [2]:
# Load the SMS dataset from spam.csv (resolved relative to the working directory).
# In the preview below, v1 holds the ham/spam label and v2 the raw message text.
# NOTE(review): row 5568 of the preview contains a replacement character, so the
# file may not be in the encoding read_csv assumed - confirm and pass encoding= if needed.
data = pd.read_csv('spam.csv')

In [3]:
# Show the loaded frame as the cell's output to inspect its columns and row count.
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will �_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
# Per column: does it contain at least one missing value?
data.isna().any()

v1            False
v2            False
Unnamed: 2     True
Unnamed: 3     True
Unnamed: 4     True
dtype: bool

In [7]:
# Removing Punctuations
#
# Strip every character listed in string.punctuation from the raw message text.
# The translation table is built once and applied to the whole column, so the
# cell works for any number of rows and any index. Assigning the full column
# (instead of data['punctremov'][i] = ...) avoids chained-indexing assignment,
# which triggers SettingWithCopyWarning and does not write through when pandas
# Copy-on-Write is enabled.
punct_table = str.maketrans('', '', string.punctuation)

data['punctremov'] = data['v2'].str.translate(punct_table)

In [8]:
# Preview the first five rows, now including the punctremov column.
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,punctremov
0,ham,"Go until jurong point, crazy.. Available only ...",,,,Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,,,,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,,,,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,Nah I dont think he goes to usf he lives aroun...


In [9]:
# Lower case
#
# Lower-case the punctuation-free text so that later token comparisons
# (e.g. against the lower-case stop-word list) are case-insensitive.
# A single vectorised column assignment replaces the per-row
# data['sms_lower'][i] = ... loop: no hard-coded row count and no
# chained-indexing assignment (unreliable under pandas Copy-on-Write).
data['sms_lower'] = data['punctremov'].str.lower()

In [10]:
# Preview the first five rows, now including the sms_lower column.
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,punctremov,sms_lower
0,ham,"Go until jurong point, crazy.. Available only ...",,,,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,,,,Ok lar Joking wif u oni,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,,,,U dun say so early hor U c already then say,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...


In [11]:
# Tokenization
#
# Split each lower-cased message into a list of word tokens with NLTK's
# word_tokenize. Series.apply stores one token list per row; assigning the
# resulting Series as a whole column avoids the chained-indexing assignment
# data['sms_token'][i] = ... and the hard-coded row count of 5572.
data['sms_token'] = data['sms_lower'].apply(word_tokenize)

data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,punctremov,sms_lower,sms_token
0,ham,"Go until jurong point, crazy.. Available only ...",,,,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,,,,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,,,,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [12]:
# List of stop words
#
# NLTK's English stop-word list, held as a set so the membership checks in
# the stop-word removal step are constant-time.
# NOTE(review): punctuation is stripped before tokenization, so entries with
# an apostrophe (e.g. "don't") cannot match tokens such as "dont" - confirm
# whether that is intended.
stop_words = {*stopwords.words("english")}
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [13]:
# Removing Stopwords
#
# For every message keep only the tokens that are not in stop_words,
# preserving their original order. The filtered lists are built with
# Series.apply and assigned as one column, which removes the hard-coded
# range(0, 5572) and the chained-indexing assignment
# data['sms_nostop'][i] = ... (a silent no-op under pandas Copy-on-Write).
data['sms_nostop'] = data['sms_token'].apply(
    lambda tokens: [w for w in tokens if w not in stop_words]
)

In [14]:
# Stemming
#
# Reduce every remaining token to its Porter stem (e.g. 'crazy' -> 'crazi').
# One PorterStemmer instance is shared across all rows. The column is
# assigned in a single step instead of data['sms_stem'][i] = ..., so the cell
# no longer depends on the frame having exactly 5572 rows and does not rely
# on chained-indexing assignment.
ps = PorterStemmer()

data['sms_stem'] = data['sms_nostop'].apply(
    lambda tokens: [ps.stem(w) for w in tokens]
)

In [15]:
# Stemmed tokens of the first message (row label 0).
data.loc[0, 'sms_stem']

['go',
 'jurong',
 'point',
 'crazi',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amor',
 'wat']

In [17]:
# Lemmatization
#
# Map every stop-word-free token to its WordNet lemma. lemmatize() is called
# with its default part of speech, exactly as before. The result is assigned
# as a whole column rather than via data['sms_lemmat'][i] = ..., which drops
# the hard-coded 5572 row count and the chained-indexing assignment that
# pandas warns about (and ignores under Copy-on-Write).
lemmatizer = WordNetLemmatizer()

data['sms_lemmat'] = data['sms_nostop'].apply(
    lambda tokens: [lemmatizer.lemmatize(w) for w in tokens]
)

In [18]:
# Lemmatized tokens of the second message (row label 1).
data.loc[1, 'sms_lemmat']

['ok', 'lar', 'joking', 'wif', 'u', 'oni']

In [19]:
# POS Tagging
#
# Tag each message's stop-word-free tokens with nltk.pos_tag, giving a list
# of (token, tag) tuples per row. The tagged lists are assigned as one column
# instead of data['POS'][i] = ..., so the cell works for any row count and
# does not depend on chained-indexing assignment.
data['POS'] = data['sms_nostop'].apply(nltk.pos_tag)

data['POS'][1]

[('ok', 'JJ'),
 ('lar', 'JJ'),
 ('joking', 'NN'),
 ('wif', 'NN'),
 ('u', 'JJ'),
 ('oni', 'NN')]

In [20]:
# TF-IDF

# Corpus size: the number of rows (documents) in the frame, used as the IDF numerator.
N = data.shape[0]
def doc_freq(word):
    """Return the number of documents in data['sms_stem'] that contain `word`.

    A document counts once no matter how many times the word occurs in it.
    """
    containing = 0
    for tokens in data['sms_stem']:
        if word in tokens:
            containing += 1
    return containing

def calc_tf_idf(doc, token):
    """Compute the TF-IDF weight of `token` within one stemmed document.

    Parameters
    ----------
    doc : label used to look the document up in data['sms_stem'].
    token : the (stemmed) term to score.

    Returns
    -------
    tf * idf, where
        tf  = occurrences of `token` in the document / tokens in the document
        idf = log(N / (df + 1)), df being doc_freq(token).
    Returns 0.0 when the document is empty or does not contain `token`
    (previously these cases raised ZeroDivisionError / KeyError).

    Note: because of the +1 smoothing, idf is negative for a term that occurs
    in every document.
    """
    tokens = data['sms_stem'][doc]
    words_count = len(tokens)
    if words_count == 0:
        # A message made up only of stop words leaves nothing to score.
        return 0.0

    # Count only the requested token instead of building a full frequency
    # table for the document on every call.
    occurrences = tokens.count(token)
    if occurrences == 0:
        # tf is zero, so the product is zero; skip the corpus-wide scan.
        return 0.0

    tf = occurrences / words_count
    df = doc_freq(token)
    idf = np.log(N / (df + 1))
    return tf * idf

In [21]:
# Print the TF-IDF weight of every stemmed token of the first message (doc 0).
# A token that occurs more than once in the message is printed once per occurrence.
for i in data['sms_stem'][0]:
  tf_idf = calc_tf_idf(doc=0, token=i)
  print(i," ", tf_idf)

go   0.16202644456880594
jurong   0.49577263464623444
point   0.31869680064272093
crazi   0.3698411958623429
avail   0.3620184994277175
bugi   0.40912923707624127
n   0.24039911511176043
great   0.2482218115463858
world   0.31688507958814266
la   0.40912923707624127
e   0.26842349966333534
buffet   0.47043106538947416
cine   0.40912923707624127
got   0.1968153614355116
amor   0.49577263464623444
wat   0.24762939004922682
