# Use-case : SMS Spam Classification

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('SMSSpamCollection' , sep='\t', names = ['label','messages'])

In [3]:
data.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   label     5572 non-null   object
 1   messages  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
#Check whether the dataset is balanced or unbalanced
data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [6]:
#NLP packages:
# NLTK (Natural Language ToolKit)
# Spacy
# Sklearn

In [4]:
import nltk

In [8]:
# Download Stopwords
# nltk.download('stopwords')

In [5]:
from nltk.corpus import stopwords

In [10]:
#Text Preprocessing

In [6]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
example="Welcome to the session!!! Nice to meet you!"

In [9]:
# remove_pct_ex = [character for character in example if character not in string.punctuation]
# remove_pct_ex

In [11]:
#''.join(remove_pct_ex).split(" ")

In [20]:
import string
from nltk.corpus import stopwords
def preprocessing(feature):
    # 1. Remove Punctuations
    removePunctuations = [character for character in feature if character not in string.punctuation]
    # 2. Convert Sentences to Words
    sentencesWithoutPunctuations = ''.join(removePunctuations)
    words = sentencesWithoutPunctuations.split(" ")
    # 3. Remove the Stopwords and convert word in lower case
    finalWords = [word.lower() for word in words if word not in stopwords.words('english')]
    return finalWords

In [21]:
preprocessing(example)

['welcome', 'session', 'nice', 'meet']

In [17]:
# Use sklearn to create BOW

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
wordVector = CountVectorizer(analyzer=preprocessing)
#Build the Vocabulory
finalWordVector = wordVector.fit(data['messages'])

In [24]:
# finalWordVector.vocabulary_

In [25]:
#BUILD BOW
bagOfWords = wordVector.transform(data['messages'])

In [26]:
bagOfWords

<5572x9649 sparse matrix of type '<class 'numpy.int64'>'
	with 56616 stored elements in Compressed Sparse Row format>

In [30]:
representBOW = bagOfWords.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [32]:
representBOW.shape

(5572, 9649)

In [34]:
#Apply TFIDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidfObject = TfidfTransformer().fit(bagOfWords)

In [35]:
#Final Feature Array
featureArray = tfidfObject.transform(bagOfWords)

In [36]:
featureArray

<5572x9649 sparse matrix of type '<class 'numpy.float64'>'
	with 56616 stored elements in Compressed Sparse Row format>

In [37]:
representFeatureArray = featureArray.toarray()
representFeatureArray

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.1538349, 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

In [32]:
#Modelling
from sklearn.linear_model import LogisticRegression
modelSMSSpamIdentifier = LogisticRegression()
modelSMSSpamIdentifier.fit(featureArray,data.label.values)

LogisticRegression()

In [38]:
feature=featureArray
label=data.label.values
modelSMSSpamIdentifier.score(feature,label)

0.9723618090452262

In [40]:
predicted=modelSMSSpamIdentifier.predict(featureArray)

In [41]:
from sklearn.metrics import confusion_matrix
confusion_matrix(label,predicted)

array([[4821,    4],
       [ 150,  597]], dtype=int64)

In [43]:
from sklearn.metrics import classification_report
print(classification_report(label,predicted))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      4825
        spam       0.99      0.80      0.89       747

    accuracy                           0.97      5572
   macro avg       0.98      0.90      0.94      5572
weighted avg       0.97      0.97      0.97      5572

