In [1]:
import pandas as pd
# Widen the per-cell display so printed DataFrames show up to 100 characters
# of text (the pandas default is 50).
pd.set_option('display.max_colwidth', 100)

# The file is tab-separated and has no header row; column names are assigned
# right after loading.
data = pd.read_csv(
    "SMSSpamCollection.tsv",
    sep='\t',
    header=None,
)
data.columns = ['label', 'body_text']

data.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [2]:
import string
# Display the set of characters that will be treated as punctuation below.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [3]:
def remove_punct(text):
    """Return `text` with every character listed in string.punctuation removed.

    All other characters (letters, digits, whitespace) are kept in their
    original order and case.
    """
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

# Strip punctuation from every message and keep the result in a new column.
data['body_text_clean'] = data['body_text'].apply(remove_punct)

data.head()

Unnamed: 0,label,body_text,body_text_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL


### Tokenize

In [4]:
import nltk 
def tokenize_2(text):
    """Split `text` into tokens using nltk.word_tokenize and return them."""
    return nltk.word_tokenize(text)

# Lower-case each cleaned message, then tokenize it.
data['body_text_tokenized_2'] = data['body_text_clean'].str.lower().apply(tokenize_2)

data.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized_2
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]"


### Morphological analysis

In [5]:
def tagged(text):
    """Run nltk.pos_tag over `text` (a token list here) and return the result.

    In this notebook the result displays as a list of (token, tag) pairs.
    """
    return nltk.pos_tag(text)

# Part-of-speech tag every tokenized message.
data['body_text_tagged'] = data['body_text_tokenized_2'].apply(tagged)

data.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized_2,body_text_tagged
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ...","[(ive, JJ), (been, VBN), (searching, VBG), (for, IN), (the, DT), (right, JJ), (words, NNS), (to,..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[(free, JJ), (entry, NN), (in, IN), (2, CD), (a, DT), (wkly, JJ), (comp, NN), (to, TO), (win, VB..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[(nah, NN), (i, NN), (dont, NN), (think, VBP), (he, PRP), (goes, VBZ), (to, TO), (usf, VB), (he,..."
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[(even, RB), (my, PRP$), (brother, NN), (is, VBZ), (not, RB), (like, IN), (to, TO), (speak, VB),..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[(i, NNS), (have, VBP), (a, DT), (date, NN), (on, IN), (sunday, NN), (with, IN), (will, MD)]"


### Semantic analysis

In [6]:
def entities(text):
    """Pass `text` (POS-tagged tokens here) to nltk.chunk.ne_chunk and return its result."""
    return nltk.chunk.ne_chunk(text)

# Run named-entity chunking over every POS-tagged message.
data['body_text_entities'] = data['body_text_tagged'].apply(entities)

data.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized_2,body_text_tagged,body_text_entities
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ...","[(ive, JJ), (been, VBN), (searching, VBG), (for, IN), (the, DT), (right, JJ), (words, NNS), (to,...","[(ive, JJ), (been, VBN), (searching, VBG), (for, IN), (the, DT), (right, JJ), (words, NNS), (to,..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[(free, JJ), (entry, NN), (in, IN), (2, CD), (a, DT), (wkly, JJ), (comp, NN), (to, TO), (win, VB...","[(free, JJ), (entry, NN), (in, IN), (2, CD), (a, DT), (wkly, JJ), (comp, NN), (to, TO), (win, VB..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[(nah, NN), (i, NN), (dont, NN), (think, VBP), (he, PRP), (goes, VBZ), (to, TO), (usf, VB), (he,...","[(nah, NN), (i, NN), (dont, NN), (think, VBP), (he, PRP), (goes, VBZ), (to, TO), (usf, VB), (he,..."
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[(even, RB), (my, PRP$), (brother, NN), (is, VBZ), (not, RB), (like, IN), (to, TO), (speak, VB),...","[(even, RB), (my, PRP$), (brother, NN), (is, VBZ), (not, RB), (like, IN), (to, TO), (speak, VB),..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[(i, NNS), (have, VBP), (a, DT), (date, NN), (on, IN), (sunday, NN), (with, IN), (will, MD)]","[(i, NNS), (have, VBP), (a, DT), (date, NN), (on, IN), (sunday, NN), (with, IN), (will, MD)]"


### Syntactic analysis

In [7]:
# Show the parse tree of the first sentence in the treebank file 'wsj_0001.mrg'.
from nltk.corpus import treebank

wsj_trees = treebank.parsed_sents('wsj_0001.mrg')
t = wsj_trees[0]
t.draw()

In [None]:
# CFG.fromstring() requires the grammar text as its argument; calling it with
# no input raises a TypeError. The toy grammar below is illustrative only: it
# covers the tokens of sample row 4 ("i have a date on sunday with will").
# Replace it with the grammar you actually want to parse with.
grammar = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP | VP PP
    NP -> 'i' | 'sunday' | 'will' | Det N | NP PP
    PP -> P NP
    Det -> 'a'
    N -> 'date'
    V -> 'have'
    P -> 'on' | 'with'
""")

In [None]:
# Build a chart parser from the `grammar` object.
parser = nltk.ChartParser(grammar)