# Practical 3
## Tokenization: Split the text sentence/paragraph/Data set and generate Tokens.

In [1]:
import nlp_lib #https://github.com/Sh1vam/nlp_lib
from nlp_lib import *

In [2]:
df=pd.read_csv("SMSSpamCollection.tsv",sep='\t')
df.columns = ['label', 'text']

In [3]:
df.head()

Unnamed: 0,label,text
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...
1,ham,"Nah I don't think he goes to usf, he lives aro..."
2,ham,Even my brother is not like to speak with me. ...
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnamin...


### Lower Casing

In [4]:
df['text']=df['text'].str.lower()

In [5]:
df.head()

Unnamed: 0,label,text
0,spam,free entry in 2 a wkly comp to win fa cup fina...
1,ham,"nah i don't think he goes to usf, he lives aro..."
2,ham,even my brother is not like to speak with me. ...
3,ham,i have a date on sunday with will!!
4,ham,as per your request 'melle melle (oru minnamin...


### Removing HTML Tags

In [6]:
df['text']=df['text'].apply(remove_html_tags)
df.head()

Unnamed: 0,label,text
0,spam,free entry in 2 a wkly comp to win fa cup fina...
1,ham,"nah i don't think he goes to usf, he lives aro..."
2,ham,even my brother is not like to speak with me. ...
3,ham,i have a date on sunday with will!!
4,ham,as per your request 'melle melle (oru minnamin...


### Removing URLs

In [7]:
df['text']=df['text'].apply(remove_url)
df.head()

Unnamed: 0,label,text
0,spam,free entry in 2 a wkly comp to win fa cup fina...
1,ham,"nah i don't think he goes to usf, he lives aro..."
2,ham,even my brother is not like to speak with me. ...
3,ham,i have a date on sunday with will!!
4,ham,as per your request 'melle melle (oru minnamin...


### Removing Punctuations

In [8]:
df['text']=df['text'].apply(remove_punctuations)
df.head()

Unnamed: 0,label,text
0,spam,free entry in 2 a wkly comp to win fa cup fina...
1,ham,nah i dont think he goes to usf he lives aroun...
2,ham,even my brother is not like to speak with me t...
3,ham,i have a date on sunday with will
4,ham,as per your request melle melle oru minnaminun...


### Removing Special Characters 

In [9]:
df['text']=df['text'].apply(removes_specials)
df.head()

Unnamed: 0,label,text
0,spam,free entry in 2 a wkly comp to win fa cup fina...
1,ham,nah i dont think he goes to usf he lives aroun...
2,ham,even my brother is not like to speak with me t...
3,ham,i have a date on sunday with will
4,ham,as per your request melle melle oru minnaminun...


### Removing Non-Printable Characters

In [10]:
df['text']=df['text'].apply(removes_non_printables)
df.head()

Unnamed: 0,label,text
0,spam,free entry in 2 a wkly comp to win fa cup fina...
1,ham,nah i dont think he goes to usf he lives aroun...
2,ham,even my brother is not like to speak with me t...
3,ham,i have a date on sunday with will
4,ham,as per your request melle melle oru minnaminun...


### Removing Stop Words

In [11]:
df['stopwords_removed']=df['text'].apply(remove_stopwords)
df.head()

Unnamed: 0,label,text,stopwords_removed
0,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
1,ham,nah i dont think he goes to usf he lives aroun...,nah dont think goes usf lives around though
2,ham,even my brother is not like to speak with me t...,even brother like speak treat like aids patent
3,ham,i have a date on sunday with will,date sunday
4,ham,as per your request melle melle oru minnaminun...,per request melle melle oru minnaminunginte nu...


### Text Tokenization

In [12]:
def tokenized_text(text):
    if pd.isnull(text):  
        return np.nan
    return nltk.word_tokenize(text)
df['tokenized']=df['stopwords_removed'].apply(tokenized_text)
df.head()

Unnamed: 0,label,text,stopwords_removed,tokenized
0,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
1,ham,nah i dont think he goes to usf he lives aroun...,nah dont think goes usf lives around though,"[nah, dont, think, goes, usf, lives, around, t..."
2,ham,even my brother is not like to speak with me t...,even brother like speak treat like aids patent,"[even, brother, like, speak, treat, like, aids..."
3,ham,i have a date on sunday with will,date sunday,"[date, sunday]"
4,ham,as per your request melle melle oru minnaminun...,per request melle melle oru minnaminunginte nu...,"[per, request, melle, melle, oru, minnaminungi..."


### Doing Stemming

In [13]:
df['porter_stemmed']=df['tokenized'].apply(porter_stemming)
df.head()

Unnamed: 0,label,text,stopwords_removed,tokenized,porter_stemmed
0,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
1,ham,nah i dont think he goes to usf he lives aroun...,nah dont think goes usf lives around though,"[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho..."
2,ham,even my brother is not like to speak with me t...,even brother like speak treat like aids patent,"[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,..."
3,ham,i have a date on sunday with will,date sunday,"[date, sunday]","[date, sunday]"
4,ham,as per your request melle melle oru minnaminun...,per request melle melle oru minnaminunginte nu...,"[per, request, melle, melle, oru, minnaminungi...","[per, request, mell, mell, oru, minnaminungint..."


In [14]:
df['lancaster_stemmed']=df['tokenized'].apply(lancaster_stemming)
df.head()

Unnamed: 0,label,text,stopwords_removed,tokenized,porter_stemmed,lancaster_stemmed
0,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[fre, entry, 2, wkly, comp, win, fa, cup, fin,..."
1,ham,nah i dont think he goes to usf he lives aroun...,nah dont think goes usf lives around though,"[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, goe, usf, liv, around, though]"
2,ham,even my brother is not like to speak with me t...,even brother like speak treat like aids patent,"[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...","[ev, broth, lik, speak, tre, lik, aid, pat]"
3,ham,i have a date on sunday with will,date sunday,"[date, sunday]","[date, sunday]","[dat, sunday]"
4,ham,as per your request melle melle oru minnaminun...,per request melle melle oru minnaminunginte nu...,"[per, request, melle, melle, oru, minnaminungi...","[per, request, mell, mell, oru, minnaminungint...","[per, request, mel, mel, oru, minnaminungint, ..."


In [15]:
df['snowball_stemmed']=df['tokenized'].apply(snowball_stemming)
df.head()

Unnamed: 0,label,text,stopwords_removed,tokenized,porter_stemmed,lancaster_stemmed,snowball_stemmed
0,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[fre, entry, 2, wkly, comp, win, fa, cup, fin,...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
1,ham,nah i dont think he goes to usf he lives aroun...,nah dont think goes usf lives around though,"[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, goe, usf, liv, around, though]","[nah, dont, think, goe, usf, live, around, tho..."
2,ham,even my brother is not like to speak with me t...,even brother like speak treat like aids patent,"[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...","[ev, broth, lik, speak, tre, lik, aid, pat]","[even, brother, like, speak, treat, like, aid,..."
3,ham,i have a date on sunday with will,date sunday,"[date, sunday]","[date, sunday]","[dat, sunday]","[date, sunday]"
4,ham,as per your request melle melle oru minnaminun...,per request melle melle oru minnaminunginte nu...,"[per, request, melle, melle, oru, minnaminungi...","[per, request, mell, mell, oru, minnaminungint...","[per, request, mel, mel, oru, minnaminungint, ...","[per, request, mell, mell, oru, minnaminungint..."


In [16]:
df['regexp_stemmed']=df['tokenized'].apply(regexp_stemming)
df.head()

Unnamed: 0,label,text,stopwords_removed,tokenized,porter_stemmed,lancaster_stemmed,snowball_stemmed,regexp_stemmed
0,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[fre, entry, 2, wkly, comp, win, fa, cup, fin,...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[fre, entry, 2, wkly, comp, win, fa, cup, fina..."
1,ham,nah i dont think he goes to usf he lives aroun...,nah dont think goes usf lives around though,"[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, goe, usf, liv, around, though]","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, goe, usf, live, around, tho..."
2,ham,even my brother is not like to speak with me t...,even brother like speak treat like aids patent,"[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...","[ev, broth, lik, speak, tre, lik, aid, pat]","[even, brother, like, speak, treat, like, aid,...","[even, brother, lik, speak, treat, lik, aid, p..."
3,ham,i have a date on sunday with will,date sunday,"[date, sunday]","[date, sunday]","[dat, sunday]","[date, sunday]","[dat, sunday]"
4,ham,as per your request melle melle oru minnaminun...,per request melle melle oru minnaminunginte nu...,"[per, request, melle, melle, oru, minnaminungi...","[per, request, mell, mell, oru, minnaminungint...","[per, request, mel, mel, oru, minnaminungint, ...","[per, request, mell, mell, oru, minnaminungint...","[per, request, mell, mell, oru, minnaminungint..."


### Applying Lemmatization 

In [17]:
df['lemmatized']=df['tokenized'].apply(wordnet_lemmatizing)
df.head()

Unnamed: 0,label,text,stopwords_removed,tokenized,porter_stemmed,lancaster_stemmed,snowball_stemmed,regexp_stemmed,lemmatized
0,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[fre, entry, 2, wkly, comp, win, fa, cup, fin,...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[fre, entry, 2, wkly, comp, win, fa, cup, fina...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
1,ham,nah i dont think he goes to usf he lives aroun...,nah dont think goes usf lives around though,"[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, goe, usf, liv, around, though]","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, go, usf, life, around, though]"
2,ham,even my brother is not like to speak with me t...,even brother like speak treat like aids patent,"[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...","[ev, broth, lik, speak, tre, lik, aid, pat]","[even, brother, like, speak, treat, like, aid,...","[even, brother, lik, speak, treat, lik, aid, p...","[even, brother, like, speak, treat, like, aid,..."
3,ham,i have a date on sunday with will,date sunday,"[date, sunday]","[date, sunday]","[dat, sunday]","[date, sunday]","[dat, sunday]","[date, sunday]"
4,ham,as per your request melle melle oru minnaminun...,per request melle melle oru minnaminunginte nu...,"[per, request, melle, melle, oru, minnaminungi...","[per, request, mell, mell, oru, minnaminungint...","[per, request, mel, mel, oru, minnaminungint, ...","[per, request, mell, mell, oru, minnaminungint...","[per, request, mell, mell, oru, minnaminungint...","[per, request, melle, melle, oru, minnaminungi..."


In [19]:
df['procss']=df['lemmatized'].apply(list_joint)

In [20]:
df.head()

Unnamed: 0,label,text,stopwords_removed,tokenized,porter_stemmed,lancaster_stemmed,snowball_stemmed,regexp_stemmed,lemmatized,procss
0,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[fre, entry, 2, wkly, comp, win, fa, cup, fin,...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[fre, entry, 2, wkly, comp, win, fa, cup, fina...","[free, entry, 2, wkly, comp, win, fa, cup, fin...",free entry 2 wkly comp win fa cup final tkts 2...
1,ham,nah i dont think he goes to usf he lives aroun...,nah dont think goes usf lives around though,"[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, goe, usf, liv, around, though]","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, go, usf, life, around, though]",nah dont think go usf life around though
2,ham,even my brother is not like to speak with me t...,even brother like speak treat like aids patent,"[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...","[ev, broth, lik, speak, tre, lik, aid, pat]","[even, brother, like, speak, treat, like, aid,...","[even, brother, lik, speak, treat, lik, aid, p...","[even, brother, like, speak, treat, like, aid,...",even brother like speak treat like aid patent
3,ham,i have a date on sunday with will,date sunday,"[date, sunday]","[date, sunday]","[dat, sunday]","[date, sunday]","[dat, sunday]","[date, sunday]",date sunday
4,ham,as per your request melle melle oru minnaminun...,per request melle melle oru minnaminunginte nu...,"[per, request, melle, melle, oru, minnaminungi...","[per, request, mell, mell, oru, minnaminungint...","[per, request, mel, mel, oru, minnaminungint, ...","[per, request, mell, mell, oru, minnaminungint...","[per, request, mell, mell, oru, minnaminungint...","[per, request, melle, melle, oru, minnaminungi...",per request melle melle oru minnaminunginte nu...


In [21]:
a=word_pos_tag(df["procss"][0])
a

[('free', 'JJ'),
 ('entry', 'NN'),
 ('2', 'CD'),
 ('wkly', 'JJ'),
 ('comp', 'NN'),
 ('win', 'VBP'),
 ('fa', 'JJ'),
 ('cup', 'NN'),
 ('final', 'JJ'),
 ('tkts', 'NN'),
 ('21st', 'CD'),
 ('may', 'MD'),
 ('2005', 'CD'),
 ('text', 'NN'),
 ('fa', 'NN'),
 ('87121', 'CD'),
 ('receive', 'JJ'),
 ('entry', 'NN'),
 ('questionstd', 'NN'),
 ('txt', 'NN'),
 ('ratetcs', 'NN'),
 ('apply', 'VBP'),
 ('08452810075over18s', 'CD')]

In [22]:
b=get_wordnet_pos('JJ')
b

'a'

In [24]:
df['pos_tagged']=df["procss"].apply(pos_tag)

In [25]:
spacy.explain('CD')

'cardinal number'

In [26]:
spacy.explain('VBD')

'verb, past tense'

In [27]:
def displacy_render(text):
    doc = nlp(text)
    displacy.render(doc, style="dep", jupyter=True, options={"distance": 110,"compact":True})
    return None

#df['procss'].apply(displacy_render)
displacy_render(df['procss'][0])

In [28]:

options={'distance':110,"compact":True,"color":"black","bg":"white","font":"time"}
def displacy_render(text):
    doc = nlp(text)
    displacy.render(doc, style="dep", jupyter=True, options=options)
    return None
displacy_render(df['text'][0])