In [1]:
import nltk
import pandas as pd
import numpy as np

In [2]:
from nltk.tokenize import word_tokenize

In [3]:
raw_data=open("D:/Spyder/Data-Envelopment-Analysis-Excel/Data/NLTK/SMSSpamCollection").read()
# open is for raw data extraction
raw_data[0:500]

"ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\nham\tOk lar... Joking wif u oni...\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tU dun say so early hor... U c already then say...\nham\tNah I don't think he goes to usf, he lives around here though\nspam\tFreeMsg Hey there darling it's been 3 week's now and no word bac"

Parsing

In [4]:
parsed_data= raw_data.replace('\t','\n').split('\n')
# .replace(to_be_replaced, replaced_by_whom), .split() to split all
parsed_data[0:500]

['ham',
 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'ham',
 'Ok lar... Joking wif u oni...',
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham',
 'U dun say so early hor... U c already then say...',
 'ham',
 "Nah I don't think he goes to usf, he lives around here though",
 'spam',
 "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, Â£1.50 to rcv",
 'ham',
 'Even my brother is not like to speak with me. They treat me like aids patent.',
 'ham',
 "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
 'spam',
 'WINNER!! As a valued network customer you have been selected to receivea Â£900 prize reward! 

Making two list, one containing the labels : ham & spam
Other for containing the messages

In [5]:
#lable list
label_list=parsed_data[0::2]
# the spam & ham words are on consiqutive & alternate position
# 0,2,4,6... [0::2] means starting from 0, till eof, skipping 2 step
# message list
message_list=parsed_data[1::2]
# message start from odd index 1,3,5... thats why [1::2]
print(label_list[0:5]) # first 5 labels
print(message_list[0:5]) # first five messages

['ham', 'ham', 'spam', 'ham', 'ham']
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though"]


In [6]:
print(len(label_list), len(message_list))

5575 5574


In [7]:
# since we have an extra label, we need to check and then rectify the redundancy
print(label_list[-3:])

['ham', 'ham', '']


Due to wrong append, a blank character has been appendent, so while making the dataframe we will not take the last one

In [8]:
# now combining them to do easy analysis
combined_df= pd.DataFrame({'label':label_list[:-1], 'sms':message_list})
combined_df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Method 2 : using read_csv

In [9]:
dataset=pd.read_csv("D:/Spyder/Data-Envelopment-Analysis-Excel/Data/NLTK/SMSSpamCollection", sep='\t', header=None)
# header = None because if we dont do that, pandas will make the 1st row as the colunm names itself
dataset.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# General Exploration

In [10]:
dataset.columns=['label','sms']

In [11]:
dataset.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
dataset.shape

(5572, 2)

The other way to do the same

In [13]:
print(f'Input data has {len(dataset)} rows, {len(dataset.columns)} columns')

Input data has 5572 rows, 2 columns


In [14]:
# ham/spam
print(f'ham = {len(dataset[dataset["label"]=="ham"])}')
print(f'spam = {len(dataset[dataset["label"]=="spam"])}')

ham = 4825
spam = 747


In [15]:
dataset.isna().sum()

label    0
sms      0
dtype: int64

# NLP Pipeline:
Raw text> Tokenization> Text Cleaning> Vectorization> ML Algorithm> Spam Filter

#### Cleaning consist of clearing/removing of words from token list such as stop words which are those which connects other words. eg : I, am, in, ... & removing of punctuations too
#### steaming: chainging all the child words to their root words: eg: teaching, teaches, teacher -> teach

#### Text to number since ML modes doesnt understand words, this step is called Vectorization
#### making matrix, set, list is a way of doing

## Text Preprocessing: Tokenization + Text cleaning
Things that are done are:
Remove Punctuations, Tokenization, Remove stop words, Stemming/Lemmatizing

### Remove Punctuation

In [16]:
pd.set_option('display.max_colwidth', 100)
dataset.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [17]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
# creating a function that will iterate in the sms, if its a punctuation, discard it otherisw keep it
def remove_punctuation(txt):
    txt_nopunt="".join([c for c in txt if c not in string.punctuation])
    # if we dont use "".join() the function will not only remove the punctuation but also break each word in its individual character
    return txt_nopunt

In [19]:
dataset['mgs_clean']=dataset['sms'].apply(lambda x: remove_punctuation(x))
dataset.head()

Unnamed: 0,label,sms,mgs_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though


### Tokenization
#### spliting text into list of words

In [20]:
import re

In [24]:
def tokenize(text):
    tokens = re.split("\W+", text)
    # We are spliting all non words characters like space and all
    return tokens

In [25]:
#applying the function on clean message= msg_clean and stroingit in new column
dataset['msg_clean_tokenized']=dataset['mgs_clean'].apply(lambda x: tokenize(x.lower()))
dataset.head()

Unnamed: 0,label,sms,mgs_clean,msg_clean_tokenized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"


### Removing Stop words
eg: am, is, the, are etc

NLTK has a list of stop words for various languages itself, we will be using that

In [27]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10] #printing the 1st 10 from the whole list

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [30]:
# function to remove stop words
def remove_stopwords(text_tockenized):
    txt_clean=[word for word in text_tockenized if word not in stopwords]
    # if the word is present in text_tockenized, & not present in stopwords, add it to the LHS
    return txt_clean

In [31]:
# creating new column with no stopwords
dataset['msg_no_sw']=dataset['msg_clean_tokenized'].apply(lambda x: remove_stopwords(x))
dataset.head()

Unnamed: 0,label,sms,mgs_clean,msg_clean_tokenized,msg_no_sw
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"


### Stemming
Process of reducing inflected(or derived) words to their root words or word stem. eg: code : code, coder, coders, coding
where code is the stem word, and the rest are children

Errots is stemming: overstemming & Understemming.
overstemming, too much words are lost, 2 words of diff stems reduced to the same stem
understemming, 2 words of stem mapped to diff stems

eg: university, universities, universal, universe. all these maybe mapped to univers which is not right (eg of over)
eg: data-dat, datum-datu not right again (eg of under)

Stemming algos: porter**, snowball, lancaster regex-based

#### Stemming: porter stemmer

In [32]:
from nltk.stem import PorterStemmer
ps= PorterStemmer()
dir(PorterStemmer)

['MARTIN_EXTENSIONS',
 'NLTK_EXTENSIONS',
 'ORIGINAL_ALGORITHM',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_abc_impl',
 '_apply_rule_list',
 '_contains_vowel',
 '_ends_cvc',
 '_ends_double_consonant',
 '_has_positive_measure',
 '_is_consonant',
 '_measure',
 '_replace_suffix',
 '_step1a',
 '_step1b',
 '_step1c',
 '_step2',
 '_step3',
 '_step4',
 '_step5a',
 '_step5b',
 'stem',
 'unicode_repr']

In [33]:
print(ps.stem('coder')) 
print(ps.stem('coding'))
print(ps.stem('code'))
# ps is smart enough to get that coder is a noun, coding and code are sort of actions
# thats why coder is stemed to coder iself but coding and code are stemed to code

coder
code
code


In [34]:
print(ps.stem('data'))
print(ps.stem('datum'))
#but not intelligent enough

data
datum


In [36]:
# our stemming function:
def stemming(tokenized_text):
    text =[ps.stem(word) for word in tokenized_text]
    return text

In [37]:
dataset['msg_stemmed']=dataset['msg_no_sw'].apply(lambda x: stemming(x))
dataset.head()
#in 1st row, available(child) became avail(stem/root)
#in 2nd row, joking(child) became joke(stem/root)

Unnamed: 0,label,sms,mgs_clean,msg_clean_tokenized,msg_no_sw,msg_stemmed
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"


### Lemmatization
process of grouping together the inflected forms of a word to be analyzed as a single root word or lemma

Unlike stemming, it reduces the inflected words properly ensuring that the root word(lemma) belongs to the language

A lemma isa canonical form, dictionary form or citation form of a set of words

It goes vocabulary analysis, slower but more acurate than stemming

In [38]:
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

In [39]:
print(ps.stem("goose"))
print(ps.stem("geese"))

goos
gees


In [40]:
print(wn.lemmatize("goose"))
print(wn.lemmatize("geese"))

goose
goose


Lemmitizaton is more accurate

In [41]:
print(wn.lemmatize('cactus'))
print(wn.lemmatize('cacti'))
print(ps.stem('cactus'))
print(ps.stem('cacti'))

cactus
cactus
cactu
cacti


In short, lemmitization is focused in dictionary, where as stemming is more on string and chops the words

In [42]:
def lemmatization(token_text):
    text =[wn.lemmatize(word) for word in token_text]
    return text

In [43]:
dataset['msg_lemmatized']=dataset['msg_no_sw'].apply(lambda x: lemmatization(x))
dataset.head()
#in the last column, goes is converted in go
#and lives to life

Unnamed: 0,label,sms,mgs_clean,msg_clean_tokenized,msg_no_sw,msg_stemmed,msg_lemmatized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]"


## Vectorization

process of encoding text as integers to create features vectors
Feature Vector: vector of numerical features that represents an objects

#### Count Vectorization
Creates a document-term matrix

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

Example:

In [48]:
corpus=["This is a sentence is","This is another ",'third document is here']
X = cv.fit(corpus)
print(X.vocabulary_)
print(cv.get_feature_names())

Y = cv.transform(corpus)
print(Y.shape)
print(Y.toarray())
print(Y) #print non-zero values
""" to make it a new dataframe:
df = pd.DataFrame(Y.toarray(), columns=cv.get_feature_names())"""

{'this': 6, 'is': 3, 'sentence': 4, 'another': 0, 'third': 5, 'document': 1, 'here': 2}
['another', 'document', 'here', 'is', 'sentence', 'third', 'this']
(3, 7)
[[0 0 0 2 1 0 1]
 [1 0 0 1 0 0 1]
 [0 1 1 1 0 1 0]]
  (0, 3)	2
  (0, 4)	1
  (0, 6)	1
  (1, 0)	1
  (1, 3)	1
  (1, 6)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1


In [50]:
#now using in our dataset
# Rewriting the clean function with stemming and passing the original
#sms column to count vectorization
def clean_text(txt):
    txt="".join([c for c in txt if c not in string.punctuation])
    tokens = re.split("\W+", txt)
    txt = [ps.stem(word) for word in tokens if word not in stopwords]
    return txt

In [55]:
cv1 = CountVectorizer(analyzer=clean_text)
X= cv1.fit_transform(dataset['sms'])
print(X.shape)
#after cleaning, we have 5572 rows and 8340 unique tokens

(5572, 8340)


In [56]:
print(cv1.get_feature_names())

['', '0', '008704050406', '0089mi', '0121', '01223585236', '01223585334', '0125698789', '02', '020603', '0207', '02070836089', '02072069400', '02073162414', '02085076972', '020903', '021', '050703', '0578', '06', '060505', '061104', '07008009200', '07046744435', '07090201529', '07090298926', '07099833605', '071104', '07123456789', '0721072', '07732584351', '07734396839', '07742676969', '07753741225', '0776xxxxxxx', '07786200117', '077xxx', '078', '07801543489', '07808', '07808247860', '07808726822', '07815296484', '07821230901', '0784987', '0789xxxxxxx', '0794674629107880867867', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '0800', '08000407165', '08000776320', '08000839402', '08000930705', '08000938767', '08001950382', '08002888812', '08002986030', '08002986906', '08002988890', '08006344447', '0808', '08081263000', '08081560665', '0825', '0844', '08448350055', '08448714184', '0845', '08450542832', '08452810071', '08452810073', '08452810075over18', '0870', '08700621170150p', '0870121318

In [57]:
# lets create a smaple data set and work on it
data_sample=dataset[0:10]
cv2=CountVectorizer()
X = cv2.fit_transform(data_sample['sms'])
print(X.shape)

(10, 149)


In [58]:
df = pd.DataFrame(X.toarray(), columns=cv2.get_feature_names())
print(df.head(10))

   08002986030  08452810075over18  09061701461  11  12  2005  21st  50  87121  \
0            0                  0            0   0   0     0     0   0      0   
1            0                  0            0   0   0     0     0   0      0   
2            0                  1            0   0   0     1     1   0      1   
3            0                  0            0   0   0     0     0   0      0   
4            0                  0            0   0   0     0     0   0      0   
5            0                  0            0   0   0     0     0   1      0   
6            0                  0            0   0   0     0     0   0      0   
7            0                  0            0   0   0     0     0   0      0   
8            0                  0            1   0   1     0     0   0      0   
9            1                  0            0   1   0     0     0   0      0   

   900  ...  wif  win  winner  with  wkly  word  world  xxx  you  your  
0    0  ...    0    0       0     0