## SMS Spam Analysis using TF-IDF

Data Source:https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [1]:
import numpy as np
np.random.seed(500)
import pandas as pd
df=pd.read_csv('smsspamcollection/SMSSpamCollection',sep='\t',names=["class","text"])

In [2]:
df.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
class_count=df.groupby('class').count()
print(class_count)
import matplotlib.pyplot as plt
plt.bar(class_count.index.values, class_count['text'])
plt.xlabel('Review Sentiments')
plt.ylabel('Number of Review')
plt.show()

       text
class      
ham    4825
spam    747


<Figure size 640x480 with 1 Axes>

In [4]:
#Remove number
import re # import all Regular expression functions
df['text_RN']=[re.sub('\d','', i)for i in df['text']]
df.head(10)

Unnamed: 0,class,text,text_RN
0,ham,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup final...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...,FreeMsg Hey there darling it's been week's no...
6,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...,Had your mobile months or more? U R entitled ...


In [5]:
# Replace punctuations with a white space
import string
df['text_RP']=[re.sub('[%s]' % re.escape(string.punctuation), ' ', i) for i in df['text_RN']]
df.head(10)

Unnamed: 0,class,text,text_RN,text_RP
0,ham,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup final...,Free entry in a wkly comp to win FA Cup final...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...","Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives aro...
5,spam,FreeMsg Hey there darling it's been 3 week's n...,FreeMsg Hey there darling it's been week's no...,FreeMsg Hey there darling it s been week s no...
6,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...,As per your request 'Melle Melle (Oru Minnamin...,As per your request Melle Melle Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...,WINNER!! As a valued network customer you have...,WINNER As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...,Had your mobile months or more? U R entitled ...,Had your mobile months or more U R entitled ...


In [6]:
df['text_lw']=[i.lower() for i in df['text_RN']]
df.head()

Unnamed: 0,class,text,text_RN,text_RP,text_lw
0,ham,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...,"go until jurong point, crazy.. available only ..."
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,ok lar... joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup final...,Free entry in a wkly comp to win FA Cup final...,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,u dun say so early hor... u c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives aro...,"nah i don't think he goes to usf, he lives aro..."


In [7]:
# import pandas as pd 
import pandas as pd 
#Word Tokenization
import nltk # import package for tokenization
#nltk.download('punkt') # download all spporting function /files for NLTK package
from nltk.tokenize import word_tokenize
df['text_wt'] = [word_tokenize(i) for i in df['text_lw']]
df.head()

Unnamed: 0,class,text,text_RN,text_RP,text_lw,text_wt
0,ham,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...,"go until jurong point, crazy.. available only ...","[go, until, jurong, point, ,, crazy.., availab..."
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,ok lar... joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup final...,Free entry in a wkly comp to win FA Cup final...,free entry in a wkly comp to win fa cup final...,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,u dun say so early hor... u c already then say...,"[u, dun, say, so, early, hor, ..., u, c, alrea..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives aro...,"nah i don't think he goes to usf, he lives aro...","[nah, i, do, n't, think, he, goes, to, usf, ,,..."


In [8]:
#To show the stop words
#nltk.download('stopwords') #download Stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
#Remove All Stop Word
df['text_SW'] = [[i for i in j if not i in stop_words] for j in df['text_wt']]# remove the word which is aviable in stopword libr
df.head()

Unnamed: 0,class,text,text_RN,text_RP,text_lw,text_wt,text_SW
0,ham,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...,"go until jurong point, crazy.. available only ...","[go, until, jurong, point, ,, crazy.., availab...","[go, jurong, point, ,, crazy.., available, bug..."
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,ok lar... joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]","[ok, lar, ..., joking, wif, u, oni, ...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup final...,Free entry in a wkly comp to win FA Cup final...,free entry in a wkly comp to win fa cup final...,"[free, entry, in, a, wkly, comp, to, win, fa, ...","[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,u dun say so early hor... u c already then say...,"[u, dun, say, so, early, hor, ..., u, c, alrea...","[u, dun, say, early, hor, ..., u, c, already, ..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives aro...,"nah i don't think he goes to usf, he lives aro...","[nah, i, do, n't, think, he, goes, to, usf, ,,...","[nah, n't, think, goes, usf, ,, lives, around,..."


In [9]:
#nltk.download('tagsets')
#nltk.help.upenn_tagset()# tagset documentation
#nltk.download('wordnet')
from collections import defaultdict #Default Dictionary is imported from collections
from nltk.corpus import wordnet as wn #the corpus reader wordnet is imported.
from nltk.tag import pos_tag
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. 
#By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN) #Dictionary is created where pos_tag (first letter) are the key values 
tag_map['J'] = wn.ADJ                   #whose values are mapped with the value 
tag_map['V'] = wn.VERB                  #from wordnet dictionary. We have taken the only first letter as 
tag_map['R'] = wn.ADV
# we will use it later in the loop.
#tag_map

In [10]:
#lemmatization
from nltk.stem import WordNetLemmatizer 
 # Initializing WordNetLemmatizer()
lemmatizer = WordNetLemmatizer()

df['lemma']=[[lemmatizer.lemmatize(word,tag_map[tag[0]]) for word ,tag in pos_tag(i)] for i in df['text_SW']] 
df.head()

Unnamed: 0,class,text,text_RN,text_RP,text_lw,text_wt,text_SW,lemma
0,ham,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...,"go until jurong point, crazy.. available only ...","[go, until, jurong, point, ,, crazy.., availab...","[go, jurong, point, ,, crazy.., available, bug...","[go, jurong, point, ,, crazy.., available, bug..."
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,ok lar... joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]","[ok, lar, ..., joking, wif, u, oni, ...]","[ok, lar, ..., joke, wif, u, oni, ...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup final...,Free entry in a wkly comp to win FA Cup final...,free entry in a wkly comp to win fa cup final...,"[free, entry, in, a, wkly, comp, to, win, fa, ...","[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,u dun say so early hor... u c already then say...,"[u, dun, say, so, early, hor, ..., u, c, alrea...","[u, dun, say, early, hor, ..., u, c, already, ...","[u, dun, say, early, hor, ..., u, c, already, ..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives aro...,"nah i don't think he goes to usf, he lives aro...","[nah, i, do, n't, think, he, goes, to, usf, ,,...","[nah, n't, think, goes, usf, ,, lives, around,...","[nah, n't, think, go, usf, ,, live, around, th..."


In [11]:
df['lemma2']= df['lemma'].apply(lambda x: ' '.join(x))

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer(max_features=5000)
Tfidf= tf.fit_transform(df['lemma2']).toarray()


In [13]:
pd.DataFrame(Tfidf, columns=tf.get_feature_names()).head()

Unnamed: 0,____,aa,aah,aaniye,aaooooright,aathi,ab,abbey,abdomen,abeg,...,zed,zero,zf,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.model_selection import train_test_split
Train_X, Test_X, Train_Y1, Test_Y1 = train_test_split(Tfidf,df['class'],test_size=0.3)

In [15]:
from sklearn.preprocessing import LabelEncoder
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y1)
Test_Y = Encoder.fit_transform(Test_Y1)
print(Train_Y1[1:5])
print(Train_Y[1:5])
print(Test_Y1[1:5])
print(Test_Y[1:5])

1481     ham
3894     ham
4050     ham
3010    spam
Name: class, dtype: object
[0 0 0 1]
3758    spam
3089     ham
1298     ham
3574    spam
Name: class, dtype: object
[1 0 0 1]


In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(Train_X, Train_Y)
predicted= clf.predict(Test_X)
print("MultinomialNB Accuracy:",round(accuracy_score(predicted,Test_Y)*100,2),"%")

MultinomialNB Accuracy: 96.17 %


In [17]:
from sklearn import model_selection, svm
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",round(accuracy_score(predictions_SVM, Test_Y)*100,2),"%")

SVM Accuracy Score ->  97.79 %


In [18]:
# Fitting Random Forest Classification 
# to the Training set 
from sklearn.ensemble import RandomForestClassifier 

# n_estimators can be said as number of 
# trees, experiment with n_estimators 
# to get better results 
model = RandomForestClassifier(n_estimators = 501, criterion = 'entropy') 
model.fit(Train_X, Train_Y) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=501,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
# Predicting the Test set results 
y_pred = model.predict(Test_X) 
# Making the Confusion Matrix 
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(Test_Y, y_pred) 
print("Confusion Matrix ->",cm)
# Use accuracy_score function to get the accuracy
print("Random forest Accuracy Score -> ",round(accuracy_score(y_pred, Test_Y)*100,2),"%")

Confusion Matrix -> [[1436    0]
 [  44  192]]
Random forest Accuracy Score ->  97.37 %


In [20]:
# 1. import
from sklearn.linear_model import LogisticRegression
# 2. instantiate a logistic regression model
logreg = LogisticRegression()
# 3. train the model using X_train_dtm
%time logreg.fit(Train_X, Train_Y)

Wall time: 113 ms




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
#4. make class predictions for Test_X
predictions_log = logreg.predict(Test_X)

print(" ------ Confusion Matrix-----[TN FP  FN TP]")

print(metrics.confusion_matrix(predictions_log, Test_Y))
print(metrics.classification_report(predictions_log, Test_Y))
# Use accuracy_score function to get the accuracy
print("Logistic Regression Accuracy Score -> ",round(accuracy_score(predictions_log, Test_Y)*100,2),"%")
print("Logistic Regression Area under curve -> ",round(metrics.roc_auc_score(predictions_log, Test_Y),2))

 ------ Confusion Matrix-----[TN FP  FN TP]
[[1435   77]
 [   1  159]]
              precision    recall  f1-score   support

           0       1.00      0.95      0.97      1512
           1       0.67      0.99      0.80       160

    accuracy                           0.95      1672
   macro avg       0.84      0.97      0.89      1672
weighted avg       0.97      0.95      0.96      1672

Logistic Regression Accuracy Score ->  95.33 %
Logistic Regression Area under curve ->  0.97
