# SMS Spam Classification using NLP


Dataset: SMS Spam Collection (UCI Machine Learning), available on Kaggle:
https://www.kaggle.com/uciml/sms-spam-collection-dataset

In [1]:
import numpy as np
import pandas as pd
import nltk
import re

In [2]:
# Load the SMS corpus from the local CSV, decoding it as ISO-8859-1 (Latin-1),
# then preview the first ten rows.
dataset = pd.read_csv(
    "SMSspam_dataset.csv",
    encoding="ISO-8859-1",
)
dataset.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [3]:
# Remove the three trailing "Unnamed" columns from the frame in place;
# only the label (v1) and message text (v2) columns are kept.
dataset.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [4]:
# Preview the frame after the column removal (head() shows five rows by default).
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# (rows, columns) of the frame after the unused columns were removed.
dataset.shape

(5572, 2)

In [6]:
from nltk.corpus import stopwords

In [7]:
# Strip every character that is not an ASCII letter or a space (digits,
# punctuation, symbols) from each message.
# The whole column is assigned in one vectorised call. The previous
# `dataset['v2'][i] = ...` form was chained indexing, which triggers
# SettingWithCopyWarning and does not update `dataset` at all when pandas
# Copy-on-Write is enabled.
dataset['v2'] = dataset['v2'].str.replace('[^a-zA-Z ]', '', regex=True)

dataset.head(10)

Unnamed: 0,v1,v2
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in a wkly comp to win FA Cup final...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives aroun...
5,spam,FreeMsg Hey there darling its been weeks now ...
6,ham,Even my brother is not like to speak with me T...
7,ham,As per your request Melle Melle Oru Minnaminun...
8,spam,WINNER As a valued network customer you have b...
9,spam,Had your mobile months or more U R entitled t...


In [8]:
# Fetch the NLTK resources this notebook relies on:
#   - 'stopwords': read through `stopwords.words('english')` in the cleaning cell;
#     without it a fresh environment raises LookupError there.
#   - 'wordnet': backing data for WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
from nltk.stem.wordnet import WordNetLemmatizer
# Instantiate the WordNet lemmatizer.
# NOTE(review): `lem` is not referenced by any other cell visible in this notebook,
# so no lemmatization is actually applied — confirm whether it was meant to be
# used in the text-cleaning loop.
lem = WordNetLemmatizer()

In [10]:
# Build `text`: one cleaned string per message, in the same row order as `dataset`.
# Each message is reduced to ASCII letters and spaces, lower-cased, split on
# whitespace, and stripped of English stop words.
#
# The stop-word list is loaded once into a set. Calling
# `stopwords.words('english')` inside the comprehension re-read the corpus and
# scanned a list for every single token.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z ]')

text = []
for message in dataset['v2']:
    # Apply the regex to the local value only. No chained-indexing write back
    # into `dataset` is needed to produce `text`.
    tokens = non_letters.sub('', message).lower().split()
    text.append(" ".join(token for token in tokens if token not in stop_words))

In [11]:
# Spot-check the first cleaned message.
text[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [12]:
# Confirm the cleaned corpus is a plain Python list.
type(text)

list

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Turn the cleaned messages into TF-IDF features using the vectorizer's default settings.
# NOTE(review): the vectorizer is fitted on the full corpus, before the train/test
# split performed later in the notebook, so the vocabulary and IDF weights also
# see the held-out messages — consider fitting on the training portion only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text)

In [15]:
# Inspect the container type returned by fit_transform.
type(X)

scipy.sparse.csr.csr_matrix

In [16]:
# Convert the sparse TF-IDF matrix to a dense NumPy array for the cells below.
# The guard keeps the cell idempotent: once X is already an ndarray it has no
# `.toarray()` method, so re-running the unguarded assignment raised AttributeError.
if hasattr(X, "toarray"):
    X = X.toarray()

In [17]:
# Re-check the container type of X after the dense conversion.
type(X)

numpy.ndarray

In [18]:
# (number of messages, number of TF-IDF features).
X.shape

(5572, 8390)

In [19]:
# Target labels: the 'v1' column carries the class of each message
# ("ham" or "spam" in the rows displayed).
Y = dataset['v1']
Y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 33% of the messages for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=2,
)

In [22]:
# (training messages, features) after the split.
X_train.shape

(3733, 8390)

In [23]:
# Number of training labels; should match the row count of X_train.
Y_train.shape

(3733,)

In [24]:
from sklearn.naive_bayes import GaussianNB

In [25]:
# Fit a Gaussian Naive Bayes classifier on the training features.
# fit() returns the estimator itself, so it is bound directly and then echoed.
nv = GaussianNB().fit(X_train, Y_train)
nv

GaussianNB()

In [26]:
# Naive Bayes predictions for the held-out messages.
# NOTE(review): `Y_pred` is reassigned later with the random-forest predictions,
# so the metric cells report whichever model was predicted most recently.
Y_pred=nv.predict(X_test)

In [27]:
from sklearn.metrics import *

In [28]:
# F1 score with "ham" treated as the positive class.
f1_score(y_true=Y_test, y_pred=Y_pred, pos_label="ham")

0.928430728824688

In [29]:
# Precision with "ham" treated as the positive class.
precision_score(y_true=Y_test, y_pred=Y_pred, pos_label="ham")

0.9671682626538988

In [30]:
# Recall with "ham" treated as the positive class.
recall_score(y_true=Y_test, y_pred=Y_pred, pos_label="ham")

0.8926767676767676

In [31]:
from sklearn.ensemble import RandomForestClassifier
# Train a random forest on the same training split.
# The forest is randomised, so it is seeded here. Without a fixed random_state
# every run builds a different model and the metrics reported below cannot be
# reproduced. The seed matches the one used for the train/test split.
rf = RandomForestClassifier(random_state=2)
rf.fit(X_train, Y_train)

RandomForestClassifier()

In [32]:
# Random-forest predictions for the held-out messages.
# NOTE(review): this overwrites the `Y_pred` produced earlier by the Naive Bayes
# model; the metric cells that follow therefore describe the forest only.
Y_pred=rf.predict(X_test)

In [33]:
# F1 score of the current predictions, with "ham" as the positive class.
f1_score(y_true=Y_test, y_pred=Y_pred, pos_label="ham")

0.9829351535836178

In [34]:
# Precision of the current predictions, with "ham" as the positive class.
precision_score(y_true=Y_test, y_pred=Y_pred, pos_label="ham")

0.9664429530201343

In [35]:
# Recall of the current predictions, with "ham" as the positive class.
recall_score(y_true=Y_test, y_pred=Y_pred, pos_label="ham")

1.0

In [36]:
# Overall fraction of held-out messages labelled correctly.
accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.9700924415443176

In [38]:
# Per-class precision/recall/F1 table. classification_report returns a
# multi-line string, so it is printed rather than echoed as a repr.
report = classification_report(y_true=Y_test, y_pred=Y_pred)
print(report)

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1584
        spam       1.00      0.78      0.88       255

    accuracy                           0.97      1839
   macro avg       0.98      0.89      0.93      1839
weighted avg       0.97      0.97      0.97      1839

