In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth',100)
stopwords=nltk.corpus.stopwords.words('english')
ln=nltk.WordNetLemmatizer()
ps=nltk.PorterStemmer()

In [2]:
rawdata=pd.read_csv('nlpdata.tsv',sep="\t",header=None)
rawdata.columns=['label','body_text']
rawdata.head()

Unnamed: 0,label,body_text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


# Remove punctuation

In [3]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
#remove punctuation function
def remove_punctuation(text):#text=gautam kumar shahi
    text_nonpunctuation="".join([char for char in text if char not in string.punctuation])
    return text_nonpunctuation

In [5]:
rawdata['text_after_punc']=rawdata['body_text'].apply(lambda x:remove_punctuation(x))
rawdata.head()

Unnamed: 0,label,body_text,text_after_punc
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though


In [6]:
rawdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   label            5572 non-null   object
 1   body_text        5572 non-null   object
 2   text_after_punc  5572 non-null   object
dtypes: object(3)
memory usage: 130.7+ KB


# tokenization

In [7]:
from nltk.tokenize import sent_tokenize,word_tokenize
rawdata['text_after_tokenize']=rawdata['text_after_punc'].apply(nltk.word_tokenize)
rawdata

Unnamed: 0,label,body_text,text_after_punc,text_after_tokenize
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[Go, until, jurong, point, crazy, Available, only, in, bugis, n, great, world, la, e, buffet, Ci..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[Free, entry, in, 2, a, wkly, comp, to, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, to..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[Nah, I, dont, think, he, goes, to, usf, he, lives, around, here, though]"
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy...,This is the 2nd time we have tried 2 contact u U have won the £750 Pound prize 2 claim is easy c...,"[This, is, the, 2nd, time, we, have, tried, 2, contact, u, U, have, won, the, £750, Pound, prize..."
5568,ham,Will ü b going to esplanade fr home?,Will ü b going to esplanade fr home,"[Will, ü, b, going, to, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other suggestions?",Pity was in mood for that Soany other suggestions,"[Pity, was, in, mood, for, that, Soany, other, suggestions]"
5570,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week ...,The guy did some bitching but I acted like id be interested in buying something else next week a...,"[The, guy, did, some, bitching, but, I, acted, like, id, be, interested, in, buying, something, ..."


# stopwords

In [8]:
stopwords=nltk.corpus.stopwords.words("English")
rawdata['text_after_stopwords'] = rawdata['text_after_tokenize'].apply(lambda x: [item for item in x if item not in stopwords])
rawdata.head()

Unnamed: 0,label,body_text,text_after_punc,text_after_tokenize,text_after_stopwords
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[Go, until, jurong, point, crazy, Available, only, in, bugis, n, great, world, la, e, buffet, Ci...","[Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[Free, entry, in, 2, a, wkly, comp, to, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, to...","[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, then, say]","[U, dun, say, early, hor, U, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[Nah, I, dont, think, he, goes, to, usf, he, lives, around, here, though]","[Nah, I, dont, think, goes, usf, lives, around, though]"


In [9]:
#total words count after stopwords
rawdata["TotalWord_count"] = rawdata["text_after_stopwords"].str.len()

# POS tagging

In [10]:
rawdata['POS']=rawdata['text_after_stopwords'].apply(nltk.pos_tag)

In [11]:
#counting Noun
def NounCounter(x):
    noun=("NN","NNS","NNP","NNPS")
    nouns = []
    for (word, pos) in x:
        if pos.startswith(noun):
            nouns.append(word)
    return nouns

rawdata["nouns"] = rawdata["POS"].apply(NounCounter)
rawdata["noun_count"] = rawdata["nouns"].str.len()

In [12]:
#counting pronoun
def pronoun_count(x):
    pronoun=("PRP","PRP$")
    pronouns = []
    for (word, pos) in x:
        if pos.startswith(pronoun):
            pronouns.append(word)
    return pronouns

rawdata["pronoun"] = rawdata["POS"].apply(pronoun_count)
rawdata["pronoun_count"] = rawdata["pronoun"].str.len()

In [13]:
#counting verb
def verb_count(x):
    verb=("VB","VBD","VBG","VBG","VBN","VBZ")
    verbs = []
    for (word, pos) in x:
        if pos.startswith(verb):
            verbs.append(word)
    return verbs

rawdata["verb"] = rawdata["POS"].apply(verb_count)
rawdata["verb_count"] = rawdata["verb"].str.len()

In [14]:
#counting adverb
def adverb_count(x):
    adverb=("RB","RBR","RBS")
    adverbs = []
    for (word, pos) in x:
        if pos.startswith(adverb):
            adverbs.append(word)
    return adverbs

rawdata["adverb"] = rawdata["POS"].apply(adverb_count)
rawdata["adverb_count"] = rawdata["adverb"].str.len()

In [15]:
#counting adjective
def adjective_count(x):
    adj=("JJ","JJR","JJS")
    adjs = []
    for (word, pos) in x:
        if pos.startswith(adj):
            adjs.append(word)
    return adjs

rawdata["adjective"] = rawdata["POS"].apply(adjective_count)
rawdata["adjective_count"] = rawdata["adjective"].str.len()

In [16]:
#counting conjuction
def conjuction_count(x):
    conjuction = []
    for (word, pos) in x:
        if pos.startswith("cc"):
            conjuction.append(word)
    return conjuction

rawdata["conjuction"] = rawdata["POS"].apply(conjuction_count)
rawdata["conjuction_count"] = rawdata["conjuction"].str.len()

In [17]:
#counting preposition
def preposition_count(x):
    preposition = []
    for (word, pos) in x:
        if pos.startswith("IN"):
            preposition.append(word)
    return preposition

rawdata["preposition"] = rawdata["POS"].apply(preposition_count)
rawdata["preposition_count"] = rawdata["conjuction"].str.len()

In [18]:
#counting interjection
def interjection_count(x):
    interjection = []
    for (word, pos) in x:
        if pos.startswith("UH"):
            interjection.append(word)
    return interjection

rawdata["interjection"] = rawdata["POS"].apply(preposition_count)
rawdata["interjection_count"] = rawdata["interjection"].str.len()

In [19]:
#rawdata.head()

In [20]:
rawdata_set=rawdata[['TotalWord_count','noun_count','pronoun_count','verb_count','adverb_count','adjective_count','conjuction_count','preposition_count','interjection_count']]
rawdata_set.head()

Unnamed: 0,TotalWord_count,noun_count,pronoun_count,verb_count,adverb_count,adjective_count,conjuction_count,preposition_count,interjection_count
0,16,7,0,3,2,4,0,0,0
1,6,5,0,0,0,1,0,0,0
2,23,12,0,2,0,4,0,0,0
3,9,4,0,2,1,2,0,0,0
4,9,2,1,3,0,1,0,0,2


In [21]:
#import matplotlib.pyplot as plt
#rawdata_set.plot(x="TotalWord_count", 
#                y=["noun_count", "pronoun_count","verb_count","adverb_count","adjective_count"], kind="bar") 

In [22]:
def NER_(sent):
    chunked = []
    grammar = "NP: {<DT>?<JJ>*<NN>}"
    cp = nltk.RegexpParser(grammar)
    for s in sent:
        #chunked.append(cp.parse(sent))
        results=cp.parse(sent)
        #print(type(results))
        return results
    #results.draw()
rawdata['ner']=rawdata['POS'].apply(NER_)

In [23]:
rawdata.head()

Unnamed: 0,label,body_text,text_after_punc,text_after_tokenize,text_after_stopwords,TotalWord_count,POS,nouns,noun_count,pronoun,...,adverb_count,adjective,adjective_count,conjuction,conjuction_count,preposition,preposition_count,interjection,interjection_count,ner
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[Go, until, jurong, point, crazy, Available, only, in, bugis, n, great, world, la, e, buffet, Ci...","[Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]",16,"[(Go, VB), (jurong, JJ), (point, NN), (crazy, NN), (Available, NNP), (bugis, NN), (n, RB), (grea...","[point, crazy, Available, bugis, world, la, Cine]",7,[],...,2,"[jurong, great, buffet, wat]",4,[],0,[],0,[],0,"[(Go, VB), [(jurong, JJ), (point, NN)], [(crazy, NN)], (Available, NNP), [(bugis, NN)], (n, RB),..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]",6,"[(Ok, NNP), (lar, JJ), (Joking, NNP), (wif, NN), (u, NN), (oni, NN)]","[Ok, Joking, wif, u, oni]",5,[],...,0,[lar],1,[],0,[],0,[],0,"[(Ok, NNP), (lar, JJ), (Joking, NNP), [(wif, NN)], [(u, NN)], [(oni, NN)]]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[Free, entry, in, 2, a, wkly, comp, to, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, to...","[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv...",23,"[(Free, JJ), (entry, NN), (2, CD), (wkly, JJ), (comp, NN), (win, VBP), (FA, NNP), (Cup, NNP), (f...","[entry, comp, FA, Cup, tkts, May, Text, FA, entry, questionstd, txt, rateTCs]",12,[],...,0,"[Free, wkly, final, receive]",4,[],0,[],0,[],0,"[[(Free, JJ), (entry, NN)], (2, CD), [(wkly, JJ), (comp, NN)], (win, VBP), (FA, NNP), (Cup, NNP)..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, then, say]","[U, dun, say, early, hor, U, c, already, say]",9,"[(U, JJ), (dun, NNS), (say, VBP), (early, JJ), (hor, NN), (U, NNP), (c, NN), (already, RB), (say...","[dun, hor, U, c]",4,[],...,1,"[U, early]",2,[],0,[],0,[],0,"[(U, JJ), (dun, NNS), (say, VBP), [(early, JJ), (hor, NN)], (U, NNP), [(c, NN)], (already, RB), ..."
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[Nah, I, dont, think, he, goes, to, usf, he, lives, around, here, though]","[Nah, I, dont, think, goes, usf, lives, around, though]",9,"[(Nah, NNP), (I, PRP), (dont, VBP), (think, VB), (goes, VBZ), (usf, JJ), (lives, NNS), (around, ...","[Nah, lives]",2,[I],...,0,[usf],1,[],0,"[around, though]",0,"[around, though]",2,"[(Nah, NNP), (I, PRP), (dont, VBP), (think, VB), (goes, VBZ), (usf, JJ), (lives, NNS), (around, ..."
