# SMS Spam Classification using NLP


References for dataset:
https://www.kaggle.com/uciml/sms-spam-collection-dataset

In [1]:
import numpy as np
import pandas as pd
import nltk
import re

In [2]:
# Load the SMS spam CSV. The file is decoded as ISO-8859-1 (Latin-1) instead
# of pandas' default UTF-8 — presumably because it contains bytes that are not
# valid UTF-8; confirm against the raw file.
dataset = pd.read_csv("SMSspam_dataset.csv", encoding="ISO-8859-1")
# Preview the first ten rows: v1 holds the ham/spam label, v2 the message text.
dataset.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [3]:
# Remove the three trailing "Unnamed" columns (shown empty in the preview)
# from the frame in place, leaving only the label (v1) and the message
# text (v2).
for unused_column in ('Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'):
    del dataset[unused_column]

In [4]:
dataset.head(5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
dataset.shape

(5572, 2)

In [6]:
from nltk.corpus import stopwords

# Removing punctuation, numbers, and special characters

In [7]:
# Keep only ASCII letters and spaces in every message.
# The whole column is rewritten with one vectorised assignment. The earlier
# per-row `dataset['v2'][i] = ...` form is chained indexing: it raises
# SettingWithCopyWarning and stops updating the frame under pandas
# copy-on-write.
dataset['v2'] = dataset['v2'].str.replace('[^a-zA-Z ]', '', regex=True)

dataset.head(10)

Unnamed: 0,v1,v2
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in a wkly comp to win FA Cup final...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives aroun...
5,spam,FreeMsg Hey there darling its been weeks now ...
6,ham,Even my brother is not like to speak with me T...
7,ham,As per your request Melle Melle Oru Minnaminun...
8,spam,WINNER As a valued network customer you have b...
9,spam,Had your mobile months or more U R entitled t...


# Lowercasing the text and removing stopwords

In [8]:
# Build `text`: one cleaned string per message, lowercased and with English
# stop words removed.

# Load the stop-word list once and keep it as a set. Calling
# stopwords.words('english') inside the comprehension rebuilt the list for
# every single word and then scanned it linearly.
english_stopwords = set(stopwords.words('english'))

# Same character filter as the cleaning step above, applied to a local copy
# only. The DataFrame is no longer written to from this loop, which removes
# the chained assignment `dataset['v2'][i] = ...`.
non_letter_pattern = re.compile('[^a-zA-Z ]')

text = []
for message in dataset['v2']:
    letters_only = non_letter_pattern.sub('', message)
    kept_words = [
        word
        for word in letters_only.lower().split()
        if word not in english_stopwords
    ]
    text.append(" ".join(kept_words))

In [9]:
text[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [10]:
type(text)

list

# Tokenization

In [11]:
# Split each cleaned message on whitespace so that every entry of `text`
# becomes a list of tokens. Slice assignment keeps the same list object.
text[:] = [message.split() for message in text]

In [12]:
text[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

# Stemming

In [13]:
from nltk import PorterStemmer

In [14]:
st = PorterStemmer()

In [15]:
# Replace every token of every message with its Porter stem, writing the
# stemmed token list back into the same slot of `text`.
for position, tokens in enumerate(text):
    text[position] = list(map(st.stem, tokens))

# Recombining Tokens

In [16]:
# Join each message's stemmed tokens back into one space-separated string,
# the input format the TF-IDF vectorizer is given next.
text[:] = [" ".join(tokens) for tokens in text]

In [17]:
text[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

# Using Tf-Idf Vectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Turn the cleaned, stemmed messages into a TF-IDF document-term matrix using
# the vectorizer's default settings.
# NOTE(review): the vectorizer is fitted on every message, before the
# train/test split further down, so the vocabulary and IDF weights are learned
# from rows that later land in the test set. Fitting on the training messages
# only, and calling transform() on the test messages, would avoid that leakage.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text)

In [20]:
type(X)

scipy.sparse.csr.csr_matrix

In [21]:
# Convert the sparse TF-IDF matrix into a dense NumPy array.
# NOTE(review): X is rebound to the dense array, so re-running this cell on
# its own fails (an ndarray has no .toarray()). Of the models below,
# GaussianNB is the one expected to need dense input — confirm before
# keeping the dense copy for the others.
X = X.toarray()

In [22]:
type(X)

numpy.ndarray

In [23]:
X.shape

(5572, 7055)

In [24]:
Y = dataset['v1']
Y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object

# Splitting into train & test dataset

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Hold out 33% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
# NOTE(review): no stratify= argument is passed, so the ham/spam ratio in each
# split is left to the shuffle. Given the class imbalance, stratify=Y is worth
# considering.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.33,random_state=2)

In [27]:
X_train.shape

(3733, 7055)

In [28]:
Y_train.shape

(3733,)

# Applying Gaussian Naive Bayes

In [29]:
from sklearn.naive_bayes import GaussianNB

In [30]:
# Fit a Gaussian Naive Bayes classifier with default settings on the dense
# TF-IDF training features.
# NOTE(review): GaussianNB models each feature as normally distributed. For
# sparse, non-negative TF-IDF weights, MultinomialNB or ComplementNB is the
# more usual choice — worth comparing.
nv=GaussianNB()
nv.fit(X_train,Y_train)

GaussianNB()

In [31]:
Y_pred=nv.predict(X_test)

In [32]:
from sklearn.metrics import *

In [33]:
f1_score(Y_test,Y_pred,pos_label="ham")

0.9194187582562748

In [34]:
precision_score(Y_test,Y_pred,pos_label="ham")

0.96398891966759

In [35]:
recall_score(Y_test,Y_pred,pos_label="ham")

0.8787878787878788

In [36]:
accuracy_score(Y_test,Y_pred)

0.8673191952147906

In [37]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

         ham       0.96      0.88      0.92      1584
        spam       0.51      0.80      0.62       255

    accuracy                           0.87      1839
   macro avg       0.74      0.84      0.77      1839
weighted avg       0.90      0.87      0.88      1839



# Applying Logistic Regression

In [38]:
# Imported at first use rather than with the imports at the top of the notebook.
from sklearn.linear_model import LogisticRegression
# Logistic regression with a fixed seed. solver='lbfgs' is spelled out
# explicitly; the repr printed after fitting omits it, which suggests it
# matches the installed default.
Log_Reg = LogisticRegression(random_state=0,solver='lbfgs')
Log_Reg.fit(X_train,Y_train)

LogisticRegression(random_state=0)

In [39]:
Y_pred=Log_Reg.predict(X_test)

In [40]:
accuracy_score(Y_test,Y_pred)

0.9494290375203915

In [41]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

         ham       0.95      1.00      0.97      1584
        spam       0.99      0.64      0.78       255

    accuracy                           0.95      1839
   macro avg       0.97      0.82      0.88      1839
weighted avg       0.95      0.95      0.94      1839



# Applying Random Forest

In [42]:
# Imported at first use rather than with the imports at the top of the notebook.
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters and a fixed seed.
# The forest bootstraps rows and samples candidate features at random, so
# without random_state each run of this cell trains a different model and
# reports different scores. The value 0 matches the seed used for the
# logistic regression model.
rf=RandomForestClassifier(random_state=0)
rf.fit(X_train,Y_train)

RandomForestClassifier()

In [43]:
Y_pred=rf.predict(X_test)

In [44]:
f1_score(Y_test,Y_pred,pos_label="ham")

0.9811087023846392

In [45]:
precision_score(Y_test,Y_pred,pos_label="ham")

0.9629179331306991

In [46]:
recall_score(Y_test,Y_pred,pos_label="ham")

1.0

In [47]:
accuracy_score(Y_test,Y_pred)

0.9668297988036977

In [48]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1584
        spam       1.00      0.76      0.86       255

    accuracy                           0.97      1839
   macro avg       0.98      0.88      0.92      1839
weighted avg       0.97      0.97      0.96      1839



# Applying XGBoost

In [49]:
# Imported at first use rather than with the imports at the top of the notebook.
from xgboost import XGBClassifier
# Gradient-boosted trees with a fixed seed and an explicit learning_rate of
# 0.9; every other hyper-parameter is left at the library default.
# NOTE(review): Y_train holds the string labels 'ham'/'spam'. Newer xgboost
# releases are reported to require integer-encoded class labels in fit() —
# confirm against the installed version. If so, encode the labels before
# fitting and decode the predictions before scoring.
xg = XGBClassifier(random_state=22,learning_rate=0.9)
xg.fit(X_train,Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.9, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=22,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [50]:
Y_pred=xg.predict(X_test)

In [51]:
accuracy_score(Y_test,Y_pred)

0.9624796084828712

In [52]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

         ham       0.97      0.99      0.98      1584
        spam       0.92      0.80      0.86       255

    accuracy                           0.96      1839
   macro avg       0.94      0.90      0.92      1839
weighted avg       0.96      0.96      0.96      1839



# Applying RBF-kernel SVM

In [53]:
# Imported at first use rather than with the imports at the top of the notebook.
from sklearn.svm import SVC
# Support-vector classifier with an RBF kernel; no other arguments are
# passed, so the remaining settings are whatever the library defaults to.
sv=SVC(kernel='rbf')
sv.fit(X_train,Y_train)

SVC()

In [54]:
Y_pred=sv.predict(X_test)

In [55]:
accuracy_score(Y_test,Y_pred)

0.967373572593801

In [56]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1584
        spam       0.99      0.77      0.87       255

    accuracy                           0.97      1839
   macro avg       0.98      0.89      0.92      1839
weighted avg       0.97      0.97      0.97      1839

