# Natural Language Processing-Step-06

### 01.Read the data

In [18]:
import pandas as pd
data_text = pd.read_csv('SMSSpamCollection.tsv', sep='\t',names=['label','body_text'],header=None)
data_text.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### 02.Remove Punctuations

In [19]:
import string
string.punctuation
def remove_punctuation(text):
    text_nopunct="".join([char for char in text if char not in string.punctuation])
    return text_nopunct

data_text['body_text_clean']=data_text['body_text'].apply(lambda x: remove_punctuation(x))
data_text.head()


Unnamed: 0,label,body_text,body_text_clean
0,ham,I've been searching for the right words to tha...,Ive been searching for the right words to than...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...
3,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL


### 03.Tokenization

In [20]:
import re
def tokenize(text):
    tokens=re.split('\W+',text)
    return tokens
data_text['body_text_tokenized']=data_text['body_text_clean'].apply(lambda x:tokenize(x))
data_text.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized
0,ham,I've been searching for the right words to tha...,Ive been searching for the right words to than...,"[Ive, been, searching, for, the, right, words,..."
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
2,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l..."
3,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...,"[Even, my, brother, is, not, like, to, speak, ..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[I, HAVE, A, DATE, ON, SUNDAY, WITH, WILL]"


### 04.Remove stop words

In [21]:
import nltk
stopword = nltk.corpus.stopwords.words('english')
def remove_stopwords(tokenized_list):
    text=[word for word in tokenized_list if word not in stopword]
    return text
data_text['body_text_nostop']=data_text['body_text_tokenized'].apply(lambda x:remove_stopwords(x))
data_text.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized,body_text_nostop
0,ham,I've been searching for the right words to tha...,Ive been searching for the right words to than...,"[Ive, been, searching, for, the, right, words,...","[Ive, searching, right, words, thank, breather..."
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[Free, entry, 2, wkly, comp, win, FA, Cup, fin..."
2,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l...","[Nah, I, dont, think, goes, usf, lives, around..."
3,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...,"[Even, my, brother, is, not, like, to, speak, ...","[Even, brother, like, speak, They, treat, like..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[I, HAVE, A, DATE, ON, SUNDAY, WITH, WILL]","[I, HAVE, A, DATE, ON, SUNDAY, WITH, WILL]"


### 05.Stemming

In [22]:
ps=nltk.PorterStemmer()
def stemming(tokenized_text):
    text=[ps.stem(word) for word in tokenized_text]
    return text
data_text['body_text_stemmed']=data_text['body_text_nostop'].apply(lambda x:stemming(x))
data_text.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed
0,ham,I've been searching for the right words to tha...,Ive been searching for the right words to than...,"[Ive, been, searching, for, the, right, words,...","[Ive, searching, right, words, thank, breather...","[ive, search, right, word, thank, breather, i,..."
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[Free, entry, 2, wkly, comp, win, FA, Cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
2,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l...","[Nah, I, dont, think, goes, usf, lives, around...","[nah, i, dont, think, goe, usf, live, around, ..."
3,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...,"[Even, my, brother, is, not, like, to, speak, ...","[Even, brother, like, speak, They, treat, like...","[even, brother, like, speak, they, treat, like..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[I, HAVE, A, DATE, ON, SUNDAY, WITH, WILL]","[I, HAVE, A, DATE, ON, SUNDAY, WITH, WILL]","[i, have, a, date, on, sunday, with, will]"


### 06.Lemmatization

In [30]:
from nltk.stem.wordnet import WordNetLemmatizer
wn=WordNetLemmatizer()

def lemmatizing(tokenized_text):
    text=[wn.lemmatize(word) for word in tokenized_text]
    return text
data_text['body_text_lemmatized']=data_text['body_text_stemmed'].apply(lambda x: lemmatizing(x))
data_text.head()


BadZipFile: File is not a zip file