In [None]:
# importing necessary modules for classification
import pandas as pd
import numpy as np

In [14]:
# loading the data on 'spam.csv' onto messages DataFrame
messages = pd.read_csv('spam.csv',usecols=["category","message"],encoding='latin-1')
messages.head()

Unnamed: 0,category,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
#learning about the size and datatypes of the attributes
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  5572 non-null   object
 1   message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [16]:
# Finding if there are any missing values
messages.isnull().sum()

category    0
message     0
dtype: int64

In [17]:
# Determining the number of rows and columns in the dataframe
messages.shape

(5572, 2)

In [18]:
# Determining the number of spam and ham (valid) messages
messages['category'].value_counts()

category
ham     4825
spam     747
Name: count, dtype: int64

In [19]:
# Code for calculating the length of messages
length=[]
for i in range(len(messages)):
    length.append(len(messages['message'][i]))

# Updating the DataFrame with the length column
messages['length']=length

messages.head()

Unnamed: 0,category,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [22]:
# Calculating the number of punctuations in each message
# Use the string module to check for punctuations
import string
punctuation=[]
for i in range(len(messages)):
    count=0
    for j in messages['message'][i]:
        if j in string.punctuation:
            count+=1
    punctuation.append(count)
# Updating the DataFrame with the punctuations column
messages['punctuation']=punctuation

messages.head()

Unnamed: 0,category,message,length,punctuation
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [30]:
# Text Cleaning - Removal of stop words, punctuations and lemmatization of words
# For Regex
import re
# For Stopwords
import nltk
nltk.download('stopwords')
# For lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...


True

In [104]:
# Object for WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# Array to store the message after text cleaning
result=[]
# Traverse the messages
for i in range (0,len(messages)):
    # Eliminate characters other than the uppercase and lowercase alphabet
    words = re.sub('[^a-zA-Z]',' ',messages['message'][i])

    # Convert the messages into lowercase
    words = words.lower()

    # Split the message
    words = words.split()

    # Lemmatize the words if it is not contained in the set of stopwords
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]

    # Join back all the words
    words = ' '.join(words)

    # Append the message back to result
    result.append(words)

# Update the 'message' column with the result
messages['message'] = result

messages.head()

Unnamed: 0,category,message,length,punctuation
0,ham,ong point crazy available bugis n great world ...,111,9
1,ham,r joking wif u oni,29,6
2,spam,entry wkly comp win fa cup final tkts st may t...,155,6
3,ham,say early hor u c already say,49,6
4,ham,hink go usf life around though,61,2


In [105]:
# Splitting messages into spam and ham
spam_messages = messages[messages['category'] == 'spam']
ham_messages = messages[messages['category'] == 'ham']

In [106]:
spam_messages.head()

Unnamed: 0,category,message,length,punctuation
2,spam,entry wkly comp win fa cup final tkts st may t...,155,6
5,spam,msg hey darling week word back like fun still ...,148,8
8,spam,er valued network customer selected receivea p...,158,6
9,spam,bile month u r entitled update latest colour m...,154,2
11,spam,hance win cash pound txt csh send cost p day d...,136,8


In [107]:
ham_messages.head()

Unnamed: 0,category,message,length,punctuation
0,ham,ong point crazy available bugis n great world ...,111,9
1,ham,r joking wif u oni,29,6
3,ham,say early hor u c already say,49,6
4,ham,hink go usf life around though,61,2
6,ham,brother like speak treat like aid patent,77,2


In [111]:
spam_messages['punctuation'].mean()

5.714859437751004

In [112]:
ham_messages['punctuation'].mean()

3.9745077720207256

In [113]:
spam_messages['length'].mean()

138.8661311914324

In [114]:
ham_messages['length'].mean()

71.02362694300518

In [115]:
# Upon analysing the average number of punctuations and length of messages in both the categories, 
# we find that average number of punctuations and length of the messages is greater in spam category.

In [116]:
# Splitting data into X (independent variables) and Y (dependent variables)
X = messages['message']
Y = messages['category']

In [117]:
X.head()

0    ong point crazy available bugis n great world ...
1                                   r joking wif u oni
2    entry wkly comp win fa cup final tkts st may t...
3                        say early hor u c already say
4                       hink go usf life around though
Name: message, dtype: object

In [118]:
Y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: category, dtype: object

In [119]:
# Training the model (33% testing data, 77% training data)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.33, random_state=42)

In [120]:
# TF-IDF Vectorization - CountVectorizer(Bag of Words) + TFIDF Transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
count_vectorizer = CountVectorizer()
tfidf = TfidfVectorizer()
X_train_tfidf_vector = count_vectorizer.fit_transform(X_train).toarray()

In [127]:
X_train_tfidf_vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [128]:
X_train_tfidf_vector.shape

(3733, 6038)

In [129]:
# There are 3733 sentences and 6038 words in the sentences

In [130]:
# Pipelining
from sklearn.pipeline import Pipeline

In [131]:
# Naive Bayer Classifier
from sklearn.naive_bayes import MultinomialNB

In [132]:
# Pipelining for easy processing
textnbc = Pipeline([('tfidf',TfidfVectorizer()),('mnb',MultinomialNB())])

In [133]:
# Passing the training data
textnbc.fit(X_train,Y_train)

In [134]:
X_test.head()

3245    fact nobody teach volcano erupt tsunami arise ...
944     score sophas secondary application school thin...
1044             omeone know fancy call find pobox l hb p
2484    romise getting soon text morning let know made ok
812     ratulations ur awarded either cd gift voucher ...
Name: message, dtype: object

In [135]:
# Vectorize and predict from the X_test
Y_Pred = textnbc.predict(X_test)
Y_Pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'spam'], dtype='<U4')

In [136]:
# Training score
textnbc.score(X_train,Y_train)

0.9766943477096169

In [137]:
# Testing score
textnbc.score(X_test,Y_test)

0.9630233822729745

In [138]:
# Evaluation Metrics
from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y_test,Y_Pred))

[[1586    1]
 [  67  185]]


In [139]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_Pred))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1587
        spam       0.99      0.73      0.84       252

    accuracy                           0.96      1839
   macro avg       0.98      0.87      0.91      1839
weighted avg       0.96      0.96      0.96      1839



In [140]:
# SVM Classifier
from sklearn.svm import LinearSVC

In [141]:
# Pipelining for easy processing
textsvm = Pipeline([('tfidf',TfidfVectorizer()),('svm',LinearSVC(dual=False))])

In [142]:
# Passing the training data
textsvm.fit(X_train,Y_train)

In [143]:
X_test.head()

3245    fact nobody teach volcano erupt tsunami arise ...
944     score sophas secondary application school thin...
1044             omeone know fancy call find pobox l hb p
2484    romise getting soon text morning let know made ok
812     ratulations ur awarded either cd gift voucher ...
Name: message, dtype: object

In [144]:
# Vectorize and predict from the X_test
Y_Pred = textsvm.predict(X_test)
Y_Pred

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'spam'], dtype=object)

In [145]:
# Training score
textsvm.score(X_train,Y_train)

0.999732118939191

In [146]:
# Testing score
textsvm.score(X_test,Y_test)

0.9820554649265906

In [147]:
# Evaluation metrics
from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y_test,Y_Pred))

[[1583    4]
 [  29  223]]


In [148]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_Pred))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1587
        spam       0.98      0.88      0.93       252

    accuracy                           0.98      1839
   macro avg       0.98      0.94      0.96      1839
weighted avg       0.98      0.98      0.98      1839



In [149]:
# Prediction for a new SMS
# Function for text cleaning
def cleaned_text(text):
    # Eliminate characters other than the uppercase and lowercase alphabet
    words = re.sub('[^a-zA-Z]',' ',text)

    # Convert the messages into lowercase
    words = words.lower()

    # Split the message
    words = words.split()

    # Lemmatize the words if it is not contained in the set of stopwords
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]

    # Join back all the words
    words = ' '.join(words)

    return words

In [158]:
text = "Congratulations, you have won a lottery of Rs 5000 @ gamble.com. To Win Text on 5677895 or call 38494939"
clean_text = cleaned_text(text)
clean_text = [clean_text]
clean_text

['congratulation lottery r gamble com win text call']

In [160]:
# Predict using the Naive Bayes Classifier
textnbc.predict(clean_text)

array(['spam'], dtype='<U4')