# NLP Basics: Implement a pipeline to clean text

## Pre-Processing text data
1. Remove punctuation 
2. Tokenization 
3. Remove stopwords
4. Lemmatize/Stem (they are helpful but not always critical in an NLP pipeline) 

In [1]:
import pandas as pd
import string 
import re
import nltk

In [2]:
# Show up to 100 characters per column so message text is truncated later in DataFrame output.
pd.set_option('display.max_colwidth', 100)

In [3]:
# Read the tab-separated file; header=None tells pandas not to treat the first row as column names.
data = pd.read_csv('SMSSpamCollection.tsv', sep='\t', header = None)
# Assign explicit column names in place of the default integer labels.
data.columns = ['label', 'body_text']

# Preview the first rows.
data.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [4]:
# Inspect the punctuation characters provided by the standard library's string module.
punct = string.punctuation
print(punct)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [5]:
def remove_punctuation(text):
    """Return *text* with every punctuation character removed.

    Characters listed in ``string.punctuation`` are dropped; every other
    character is kept in its original order.
    """
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    return ''.join(kept_chars)

In [6]:
# Strip punctuation from every message and store the result in a new column.
# remove_punctuation takes a single string, so it can be handed to apply directly.
data['body_text_clean'] = data['body_text'].apply(remove_punctuation)
data.head()

Unnamed: 0,label,body_text,body_text_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL


## Tokenization

In [7]:
def tokenize(text):
    """Split *text* into word tokens.

    The text is split on runs of non-word characters (anything other than
    letters, digits and underscore). Empty strings, which ``re.split``
    yields when the text is empty or starts/ends with a separator, are
    discarded so that no blank tokens reach later pipeline steps.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty token strings, in order of appearance.
    """
    # Raw string: '\W' inside a normal string literal is an invalid escape sequence.
    tokens = [token for token in re.split(r'\W+', text) if token]
    return tokens

In [8]:
# Lowercase each punctuation-free message, then tokenize it; the column now holds token lists instead of strings.
data['body_text_clean'] = data['body_text_clean'].apply(lambda x: tokenize(x.lower()))

In [9]:
# Preview the first rows to check the tokenized column.
data.head()

Unnamed: 0,label,body_text,body_text_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[i, have, a, date, on, sunday, with, will]"


## Remove Stopwords

In [12]:
# English stopword list from NLTK's stopwords corpus.
# NOTE(review): presumably the corpus must already be available locally — confirm in your environment.
stopwords = nltk.corpus.stopwords.words('english')

In [13]:
def remove_stopwords(tokenized_list, stopword):
    """Return the tokens of *tokenized_list* that do not occur in *stopword*.

    Token order and duplicates are preserved; matching is exact, so callers
    should normalise case beforehand if needed.
    """
    kept = []
    for token in tokenized_list:
        if token in stopword:
            continue
        kept.append(token)
    return kept

In [14]:
# Drop stopwords from each token list and keep the result in a separate column for comparison.
data['body_text_noStop'] = data['body_text_clean'].apply(lambda x: remove_stopwords(x,stopwords))
data.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_noStop
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ...","[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[i, have, a, date, on, sunday, with, will]","[date, sunday]"


# Stemming  

In [15]:
# Porter stemmer instance shared by the stemming cells that follow.
ps = nltk.PorterStemmer()

In [16]:
# Load the file again into a separate DataFrame used for the stemming walkthrough.
raw_data = pd.read_csv('SMSSpamCollection.tsv', sep='\t', header = None)

In [17]:
# List the stemmer's attributes; 'stem' is the method used in the cells below.
dir(ps)

['MARTIN_EXTENSIONS',
 'NLTK_EXTENSIONS',
 'ORIGINAL_ALGORITHM',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_apply_rule_list',
 '_contains_vowel',
 '_ends_cvc',
 '_ends_double_consonant',
 '_has_positive_measure',
 '_is_consonant',
 '_measure',
 '_replace_suffix',
 '_step1a',
 '_step1b',
 '_step1c',
 '_step2',
 '_step3',
 '_step4',
 '_step5a',
 '_step5b',
 'mode',
 'pool',
 'stem',
 'vowels']

In [18]:
# Regular suffixes are stripped: 'grows' and 'growing' both reduce to 'grow'.
print(ps.stem('grows'))
print(ps.stem('grown')) # The stemmer leaves 'grown' unchanged, so it does not match 'grow'
print(ps.stem('growing'))

grow
grown
grow


In [19]:
# All three forms reduce to the same stem.
print(ps.stem('hoping'))
print(ps.stem('hoped'))
print(ps.stem('hopeful'))

hope
hope
hope


In [20]:
# 'running' is reduced to 'run', while the noun 'runner' is kept as-is.
print(ps.stem('run'))
print(ps.stem('running')) 
print(ps.stem('runner')) # noun: stays 'runner'

run
run
runner


In [21]:
# Two words with different meanings collapse to the same stem 'mean'.
print(ps.stem('meaning'))
print(ps.stem('meanness')) 

mean
mean


In [22]:
# Assign explicit column names in place of the default integer labels, then preview.
raw_data.columns  = ['label', 'body_text']
raw_data.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [23]:
def clean_text(text, stopword):
    """Strip punctuation, tokenize, and drop stopwords from *text*.

    Args:
        text: The message to clean. Case is not changed here, so lowercase
            the text beforehand if *stopword* holds lowercase words.
        stopword: Collection of words to discard.

    Returns:
        A list of the remaining non-empty tokens, in order of appearance.
    """
    text = ''.join([char for char in text if char not in string.punctuation])
    # Split on runs of non-word characters. Raw string: '\W' inside a normal
    # string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # Remove stopwords, and also the empty strings re.split yields when the
    # text is empty or starts/ends with a separator.
    text = [word for word in tokens if word and word not in stopword]

    return text

In [24]:
# Lowercase each message before cleaning, since clean_text itself does not change case.
raw_data['body_text_clean'] = raw_data['body_text'].apply(lambda x: clean_text(x.lower(),stopwords))
raw_data.head()

Unnamed: 0,label,body_text,body_text_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]"


## Stemming
- Note: Stemming does not always work well with text messages (lots of abbreviations & slang) 

In [25]:
def stemming(tokenized_text):
    """Return a list with the stem of each token in *tokenized_text*.

    Uses the module-level Porter stemmer ``ps``; token order is preserved.
    """
    stems = []
    for token in tokenized_text:
        stems.append(ps.stem(token))
    return stems

In [26]:
# Stem every cleaned token list; stemming takes a single token list,
# so it can be handed to apply directly.
raw_data['body_text_stemmed'] = raw_data['body_text_clean'].apply(stemming)
raw_data.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_stemmed
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom...","[ive, search, right, word, thank, breather, promis, wont, take, help, grant, fulfil, promis, won..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]","[date, sunday]"


In [37]:
# Lemmatizing 

In [38]:
# Load the file again into a separate DataFrame used for the lemmatizing walkthrough.
lemmat_data = pd.read_csv('SMSSpamCollection.tsv', sep='\t', header = None)

In [39]:
# Quick check: whitespace splitting plus stopword removal on a sample sentence

test_str = 'the crowd believed that the man was lying'
# Raw string so '\s' reaches the regex engine instead of being an invalid string escape
test_str = re.split(r'\s',test_str)
test_str = [word for word in test_str if word not in stopwords]
print(test_str)

['crowd', 'believed', 'man', 'lying']


In [40]:
# WordNet lemmatizer
# WordNet lemmatizer instance shared by the lemmatizing cells that follow.
wn = nltk.WordNetLemmatizer()

In [41]:
# List the lemmatizer's attributes; 'lemmatize' is the method used in the cells below.
dir(wn)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'lemmatize']

In [42]:
# Stemmer: both words collapse to the same stem 'mean'.
print(ps.stem('meanness'))
print(ps.stem('meaning'))

mean
mean


In [43]:
# Lemmatizer: both words are returned unchanged, so they stay distinct.
print(wn.lemmatize('meanness'))
print(wn.lemmatize('meaning'))

meanness
meaning


In [44]:
print('Stemming\n')
# The stemmer only trims endings, so 'goose' and 'geese' end up as different stems.
print(ps.stem('goose'))
print(ps.stem('geese'))
print('\nLemmatize\n')
# The lemmatizer maps both the singular and the plural to 'goose'.
print(wn.lemmatize('goose'))
print(wn.lemmatize('geese'))
# The second argument is the part-of-speech tag ('v' = verb, 'n' = noun);
# with 'v', both 'ran' and 'running' resolve to 'run'.
print(wn.lemmatize('ran','v'))
print(wn.lemmatize('running','v'))
print(wn.lemmatize('runner','n'))

Stemming

goos
gees

Lemmatize

goose
goose
run
run
runner


In [45]:
# Assign explicit column names in place of the default integer labels.
lemmat_data.columns =['label', 'body_text']
# Lowercase each message before cleaning, since clean_text itself does not change case.
lemmat_data['body_text_clean']=lemmat_data['body_text'].apply(lambda x: clean_text(x.lower(),stopwords))
lemmat_data.head()

Unnamed: 0,label,body_text,body_text_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]"


In [46]:
def lemmatizing(tokenized_text, pos='n'):
    """Return a list with the WordNet lemma of each token in *tokenized_text*.

    Uses the module-level lemmatizer ``wn``; token order is preserved.

    Args:
        tokenized_text: Iterable of token strings.
        pos: Part-of-speech tag forwarded to ``wn.lemmatize`` for every
            token. The default ``'n'`` (noun) keeps the previous behaviour
            of calling the lemmatizer without a tag; pass ``'v'`` to
            resolve verb forms such as 'ran' or 'running' to 'run'.

    Returns:
        A list of lemmatized tokens.
    """
    text = [wn.lemmatize(word, pos) for word in tokenized_text]
    return text

In [47]:
# Lemmatize every cleaned token list; lemmatizing can be called with just a token list,
# so it can be handed to apply directly.
lemmat_data['body_text_lemmat'] = lemmat_data['body_text_clean'].apply(lemmatizing)
lemmat_data.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_lemmat
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom...","[ive, searching, right, word, thank, breather, promise, wont, take, help, granted, fulfil, promi..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, go, usf, life, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]","[date, sunday]"
