# NLP (Basic)

In [1]:
import pandas as pd


# nltk:
import nltk
import re  # use if tor tokenizing
import string

#sklearn
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
#ML Algorithms:
from sklearn.ensemble  import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier



In [2]:
# Read in the raw text
rawData = open("SMSSpamCollection.tsv").read()
print(rawData[0:300])

ham	I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receiv


In [7]:
pd.set_option('display.max_colwidth',150)
dataSet = pd.read_csv("SMSSpamCollection.tsv", sep= '\t', header = None)
dataSet.columns=['label','body_text']
dataSet[0:10]
dataSet_Steps = dataSet.copy()
dataSet_cleaned = dataSet.copy()
print(len(dataSet))

5568


## Implementation :

### Data Cleaning ( Step by Step)

I ) Removing punctuation:

In [5]:
string.punctuation
def remove_punc (text):
    text_noPunc="".join([char for char in text if char not in string.punctuation])
    return text_noPunc

dataSet_Steps['body_text_noPunc'] = dataSet_Steps['body_text'].apply(lambda x : remove_punc(x))
dataSet_Steps

Unnamed: 0,label,body_text,body_text_noPunc
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. Yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your help for granted and will fulfil my promise You h...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL
...,...,...,...
5563,spam,"This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-...",This is the 2nd time we have tried 2 contact u U have won the £750 Pound prize 2 claim is easy call 087187272008 NOW1 Only 10p per minute BTnation...
5564,ham,Will ü b going to esplanade fr home?,Will ü b going to esplanade fr home
5565,ham,"Pity, * was in mood for that. So...any other suggestions?",Pity was in mood for that Soany other suggestions
5566,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free,The guy did some bitching but I acted like id be interested in buying something else next week and he gave it to us for free


II ) Tokenization : split string in to a list of words.

Regular expression or abbreviated Regex is a text string that describes a specific search pattern. Regex is useful for dealing with text data to identify and search for specific criteria. for instance, using Regex in order to tokenize or split a sentence into a list of words.

In [6]:
def tokeize_text(text):
    token = re.split('\W+',text) # '\W+': will split whenever it sees one or more non-word chars
    return token
    
dataSet_Steps['body_text_tokenized'] = dataSet_Steps['body_text_noPunc'].apply(lambda x : tokeize_text(x.lower()))   # lower() has a vital rule here
dataSet_Steps

Unnamed: 0,label,body_text,body_text_noPunc,body_text_tokenized
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. Yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your help for granted and will fulfil my promise You h...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, i, wont, take, your, help, for, granted, and, will..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to, 87121, to, receive, entry, questionstd, txt, rat..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]"
...,...,...,...,...
5563,spam,"This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-...",This is the 2nd time we have tried 2 contact u U have won the £750 Pound prize 2 claim is easy call 087187272008 NOW1 Only 10p per minute BTnation...,"[this, is, the, 2nd, time, we, have, tried, 2, contact, u, u, have, won, the, 750, pound, prize, 2, claim, is, easy, call, 087187272008, now1, onl..."
5564,ham,Will ü b going to esplanade fr home?,Will ü b going to esplanade fr home,"[will, ü, b, going, to, esplanade, fr, home]"
5565,ham,"Pity, * was in mood for that. So...any other suggestions?",Pity was in mood for that Soany other suggestions,"[pity, was, in, mood, for, that, soany, other, suggestions]"
5566,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free,The guy did some bitching but I acted like id be interested in buying something else next week and he gave it to us for free,"[the, guy, did, some, bitching, but, i, acted, like, id, be, interested, in, buying, something, else, next, week, and, he, gave, it, to, us, for, ..."


III ) Removing Stop words :

In [7]:
stopWords = nltk.corpus.stopwords.words('english')
def remove_stopWords(text):
    text_noStop=[word for word in text if word not in stopWords]
    return text_noStop
    
dataSet_Steps['body_text_noStop'] = dataSet_Steps['body_text_tokenized'].apply(lambda x : remove_stopWords(x) )
dataSet_Steps

Unnamed: 0,label,body_text,body_text_noPunc,body_text_tokenized,body_text_noStop
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. Yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your help for granted and will fulfil my promise You h...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, i, wont, take, your, help, for, granted, and, will...","[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, promise, wonderful, blessing, times]"
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to, 87121, to, receive, entry, questionstd, txt, rat...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receive, entry, questionstd, txt, ratetcs, apply, 084528..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]"
...,...,...,...,...,...
5563,spam,"This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-...",This is the 2nd time we have tried 2 contact u U have won the £750 Pound prize 2 claim is easy call 087187272008 NOW1 Only 10p per minute BTnation...,"[this, is, the, 2nd, time, we, have, tried, 2, contact, u, u, have, won, the, 750, pound, prize, 2, claim, is, easy, call, 087187272008, now1, onl...","[2nd, time, tried, 2, contact, u, u, 750, pound, prize, 2, claim, easy, call, 087187272008, now1, 10p, per, minute, btnationalrate]"
5564,ham,Will ü b going to esplanade fr home?,Will ü b going to esplanade fr home,"[will, ü, b, going, to, esplanade, fr, home]","[ü, b, going, esplanade, fr, home]"
5565,ham,"Pity, * was in mood for that. So...any other suggestions?",Pity was in mood for that Soany other suggestions,"[pity, was, in, mood, for, that, soany, other, suggestions]","[pity, mood, soany, suggestions]"
5566,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free,The guy did some bitching but I acted like id be interested in buying something else next week and he gave it to us for free,"[the, guy, did, some, bitching, but, i, acted, like, id, be, interested, in, buying, something, else, next, week, and, he, gave, it, to, us, for, ...","[guy, bitching, acted, like, id, interested, buying, something, else, next, week, gave, us, free]"


IV ) Stemming and lemmatizing :

In [9]:
# a: Stemming
ps = nltk.PorterStemmer()
#dir(ps)
def stem_text(text):
   stem_tokens= [ps.stem(word) for word in text]
   return stem_tokens
dataSet_Steps['body_text_Stem'] = dataSet_Steps['body_text_noStop'].apply(lambda x: stem_text(x))
dataSet_Steps


Unnamed: 0,label,body_text,body_text_noPunc,body_text_tokenized,body_text_noStop,body_text_Stem
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. Yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your help for granted and will fulfil my promise You h...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, i, wont, take, your, help, for, granted, and, will...","[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, promise, wonderful, blessing, times]","[ive, search, right, word, thank, breather, promis, wont, take, help, grant, fulfil, promis, wonder, bless, time]"
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to, 87121, to, receive, entry, questionstd, txt, rat...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receive, entry, questionstd, txt, ratetcs, apply, 084528...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv, entri, questionstd, txt, ratetc, appli, 084528100..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]","[date, sunday]"
...,...,...,...,...,...,...
5563,spam,"This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-...",This is the 2nd time we have tried 2 contact u U have won the £750 Pound prize 2 claim is easy call 087187272008 NOW1 Only 10p per minute BTnation...,"[this, is, the, 2nd, time, we, have, tried, 2, contact, u, u, have, won, the, 750, pound, prize, 2, claim, is, easy, call, 087187272008, now1, onl...","[2nd, time, tried, 2, contact, u, u, 750, pound, prize, 2, claim, easy, call, 087187272008, now1, 10p, per, minute, btnationalrate]","[2nd, time, tri, 2, contact, u, u, 750, pound, prize, 2, claim, easi, call, 087187272008, now1, 10p, per, minut, btnationalr]"
5564,ham,Will ü b going to esplanade fr home?,Will ü b going to esplanade fr home,"[will, ü, b, going, to, esplanade, fr, home]","[ü, b, going, esplanade, fr, home]","[ü, b, go, esplanad, fr, home]"
5565,ham,"Pity, * was in mood for that. So...any other suggestions?",Pity was in mood for that Soany other suggestions,"[pity, was, in, mood, for, that, soany, other, suggestions]","[pity, mood, soany, suggestions]","[piti, mood, soani, suggest]"
5566,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free,The guy did some bitching but I acted like id be interested in buying something else next week and he gave it to us for free,"[the, guy, did, some, bitching, but, i, acted, like, id, be, interested, in, buying, something, else, next, week, and, he, gave, it, to, us, for, ...","[guy, bitching, acted, like, id, interested, buying, something, else, next, week, gave, us, free]","[guy, bitch, act, like, id, interest, buy, someth, els, next, week, gave, us, free]"


In [10]:
# b: Lemmatizing
wn = nltk.WordNetLemmatizer()
dir(wn)
def lem_text(text):
   lem_tokens= [wn.lemmatize(word) for word in text]
   return lem_tokens
dataSet_Steps['body_text_lem'] = dataSet_Steps['body_text_noStop'].apply(lambda x: lem_text(x))
dataSet_Steps


Unnamed: 0,label,body_text,body_text_noPunc,body_text_tokenized,body_text_noStop,body_text_Stem,body_text_lem
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. Yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your help for granted and will fulfil my promise You h...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, i, wont, take, your, help, for, granted, and, will...","[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, promise, wonderful, blessing, times]","[ive, search, right, word, thank, breather, promis, wont, take, help, grant, fulfil, promis, wonder, bless, time]","[ive, searching, right, word, thank, breather, promise, wont, take, help, granted, fulfil, promise, wonderful, blessing, time]"
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to, 87121, to, receive, entry, questionstd, txt, rat...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receive, entry, questionstd, txt, ratetcs, apply, 084528...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv, entri, questionstd, txt, ratetc, appli, 084528100...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receive, entry, questionstd, txt, ratetcs, apply, 084528..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]","[even, brother, like, speak, treat, like, aid, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]","[date, sunday]","[date, sunday]"
...,...,...,...,...,...,...,...
5563,spam,"This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-...",This is the 2nd time we have tried 2 contact u U have won the £750 Pound prize 2 claim is easy call 087187272008 NOW1 Only 10p per minute BTnation...,"[this, is, the, 2nd, time, we, have, tried, 2, contact, u, u, have, won, the, 750, pound, prize, 2, claim, is, easy, call, 087187272008, now1, onl...","[2nd, time, tried, 2, contact, u, u, 750, pound, prize, 2, claim, easy, call, 087187272008, now1, 10p, per, minute, btnationalrate]","[2nd, time, tri, 2, contact, u, u, 750, pound, prize, 2, claim, easi, call, 087187272008, now1, 10p, per, minut, btnationalr]","[2nd, time, tried, 2, contact, u, u, 750, pound, prize, 2, claim, easy, call, 087187272008, now1, 10p, per, minute, btnationalrate]"
5564,ham,Will ü b going to esplanade fr home?,Will ü b going to esplanade fr home,"[will, ü, b, going, to, esplanade, fr, home]","[ü, b, going, esplanade, fr, home]","[ü, b, go, esplanad, fr, home]","[ü, b, going, esplanade, fr, home]"
5565,ham,"Pity, * was in mood for that. So...any other suggestions?",Pity was in mood for that Soany other suggestions,"[pity, was, in, mood, for, that, soany, other, suggestions]","[pity, mood, soany, suggestions]","[piti, mood, soani, suggest]","[pity, mood, soany, suggestion]"
5566,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free,The guy did some bitching but I acted like id be interested in buying something else next week and he gave it to us for free,"[the, guy, did, some, bitching, but, i, acted, like, id, be, interested, in, buying, something, else, next, week, and, he, gave, it, to, us, for, ...","[guy, bitching, acted, like, id, interested, buying, something, else, next, week, gave, us, free]","[guy, bitch, act, like, id, interest, buy, someth, els, next, week, gave, us, free]","[guy, bitching, acted, like, id, interested, buying, something, else, next, week, gave, u, free]"


### Data Cleaning ( in a function )

In [11]:
dataSet_cleaned

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. Yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
...,...,...
5563,spam,"This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-..."
5564,ham,Will ü b going to esplanade fr home?
5565,ham,"Pity, * was in mood for that. So...any other suggestions?"
5566,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free


In [12]:
def cleaned_text (text):
    text_noPunc="".join([char.lower() for char in text if char not in string.punctuation])
    text_tokenized = re.split('\W+' , text_noPunc)
    text_noStopWords = [ word for word in text_tokenized if word not in stopWords]
    text_lemmatized =  [wn.lemmatize(word) for word in text_noStopWords]
    return text_lemmatized

dataSet_cleaned['body_text_cleaned'] = dataSet_cleaned['body_text'].apply ( lambda x: cleaned_text(x))
dataSet_cleaned

Unnamed: 0,label,body_text,body_text_cleaned
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. Yo...,"[ive, searching, right, word, thank, breather, promise, wont, take, help, granted, fulfil, promise, wonderful, blessing, time]"
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receive, entry, questionstd, txt, ratetcs, apply, 084528..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, go, usf, life, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aid, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]"
...,...,...,...
5563,spam,"This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-...","[2nd, time, tried, 2, contact, u, u, 750, pound, prize, 2, claim, easy, call, 087187272008, now1, 10p, per, minute, btnationalrate]"
5564,ham,Will ü b going to esplanade fr home?,"[ü, b, going, esplanade, fr, home]"
5565,ham,"Pity, * was in mood for that. So...any other suggestions?","[pity, mood, soany, suggestion]"
5566,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free,"[guy, bitching, acted, like, id, interested, buying, something, else, next, week, gave, u, free]"


### Vectorizing

#### I ) Count_Vectorizing:

In [13]:
# count_Vectorizing allow to to pass data in a function .
cleaned_text_countVect = dataSet_cleaned.copy()
count_vect = CountVectorizer (analyzer = cleaned_text)
x_count_vect = count_vect.fit_transform (cleaned_text_countVect['body_text'])
print (x_count_vect .shape) # how many unique columns we have
print ( count_vect.get_feature_names()[0:100]) # all unique words across the dataset.
x_count_df = pd.DataFrame(x_count_vect.toarray())
x_count_df

(5568, 8914)
['', '0', '008704050406', '0089my', '0121', '01223585236', '01223585334', '0125698789', '02', '020603', '0207', '02070836089', '02072069400', '02073162414', '02085076972', '020903', '021', '050703', '0578', '06', '060505', '061104', '07008009200', '07046744435', '07090201529', '07090298926', '07099833605', '071104', '07123456789', '0721072', '07732584351', '07734396839', '07742676969', '07753741225', '0776xxxxxxx', '07786200117', '077xxx', '078', '07801543489', '07808', '07808247860', '07808726822', '07815296484', '07821230901', '0784987', '0789xxxxxxx', '0794674629107880867867', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '0800', '08000407165', '08000776320', '08000839402', '08000930705', '08000938767', '08001950382', '08002888812', '08002986030', '08002986906', '08002988890', '08006344447', '0808', '08081263000', '08081560665', '0825', '0844', '08448350055', '08448714184', '0845', '08450542832', '08452810071', '08452810073', '08452810075over18s', '0870', '08700621170150p



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8904,8905,8906,8907,8908,8909,8910,8911,8912,8913
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5564,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5566,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### II ) N-gram_Vectorizing:

In [14]:
cleaned_text_NGram = dataSet_cleaned.copy()
nGram_vect = CountVectorizer(ngram_range=(1, 1)) # search only for unigram
nGram= nGram_vect.fit_transform(cleaned_text_NGram['body_text'])
nGram_df = pd.DataFrame(nGram.toarray())#convert the spares matrix to datafram
nGram_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5564,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5566,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### III ) TF-IDF Vectorizing:

In [15]:
cleaned_text_TFIDF = dataSet_cleaned.copy()
tfIdf_vect = TfidfVectorizer(analyzer = cleaned_text)
tfIDF = tfIdf_vect.fit_transform(cleaned_text_TFIDF['body_text'])
tfIdf_df = pd.DataFrame ( tfIDF.toarray())
tfIdf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8904,8905,8906,8907,8908,8909,8910,8911,8912,8913
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
5564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.329062,0.0,0.0
5565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
5566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


## Machine learning Classifier:

#### Random Forest Classifier : 

##### Using Grid search:

In [16]:
X_train ,X_test , y_train , y_test = train_test_split(tfIdf_df,cleaned_text_TFIDF['label'],test_size =0.2)
def train_RF ( n_est , depth):
    rf = RandomForestClassifier(n_estimators = n_est, max_depth = depth , n_jobs = -1) # n_jobs,bulding Dtrees in parallel
    rf_model = rf.fit(X_train ,y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score (y_test ,y_pred, pos_label = 'spam', average = 'binary')
    print ('Est_num : {} / depth : {}  ---- precision : {}, recall : {}, Accuracy : {}'.format (n_est, depth, round(precision, 3), round(recall , 3), round(((y_pred == y_test).sum() / len(y_pred)),3)))

In [17]:
for n_est in [10,20,50,100]:
    for depth in [10,20,30,50,None]:
        train_RF (n_est , depth)

Est_num : 10 / depth : 10  ---- precision : 1.0, recall : 0.235, Accuracy : 0.909
Est_num : 10 / depth : 20  ---- precision : 1.0, recall : 0.47, Accuracy : 0.937
Est_num : 10 / depth : 30  ---- precision : 1.0, recall : 0.742, Accuracy : 0.969
Est_num : 10 / depth : 50  ---- precision : 0.971, recall : 0.773, Accuracy : 0.97
Est_num : 10 / depth : None  ---- precision : 1.0, recall : 0.803, Accuracy : 0.977
Est_num : 20 / depth : 10  ---- precision : 1.0, recall : 0.235, Accuracy : 0.909
Est_num : 20 / depth : 20  ---- precision : 1.0, recall : 0.538, Accuracy : 0.945
Est_num : 20 / depth : 30  ---- precision : 1.0, recall : 0.697, Accuracy : 0.964
Est_num : 20 / depth : 50  ---- precision : 1.0, recall : 0.765, Accuracy : 0.972
Est_num : 20 / depth : None  ---- precision : 1.0, recall : 0.788, Accuracy : 0.975
Est_num : 50 / depth : 10  ---- precision : 1.0, recall : 0.136, Accuracy : 0.898
Est_num : 50 / depth : 20  ---- precision : 1.0, recall : 0.53, Accuracy : 0.944
Est_num : 50 

##### Tune Hyperparameters by Using Grid SearchCv:

In [18]:
# By using TF-IDF:
rf_CV = RandomForestClassifier()
param = {'n_estimator' : [10,100,300],'max_depth' : [30,50,90,None]}
# dir(GridSearchCV)
rf_GrSearchCV = GridSearchCV( rf_CV,param, cv= 5,n_jobs =-1)
rf_GrSearchCV_fit = rf_GrSearchCV.fit(tfIdf_df,cleaned_text_TFIDF['label'])
GrSearchCV_df = pd.DataFrame(rf_GrSearchCV_fit.cv_results_).sort_values('mean_test_score', ascending = False)[0:5]

##### Tune Hyperparameters by Using Grid SearchCv::

In [None]:
# By using TF-IDF:
gb_CV = GradientBoostingClassifier()
param = {'n_estimator' : [100,150],'max_depth' : [7,10,15]}
# dir(GridSearchCV)
gb_GrSearchCV = GridSearchCV( gb_CV,param, cv= 5,n_jobs =-1)
gb_GrSearchCV_fit = gb_GrSearchCV.fit(tfIdf_df,cleaned_text_TFIDF['label'])
gb_GrSearchCV_df = pd.DataFrame(gb_GrSearchCV_fit.cv_results_).sort_values('mean_test_score', ascending = False)[0:5]