In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import string
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from gensim.models import Word2Vec


In [3]:
df = pd.read_csv('mail_data.csv')
print(df.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
print(f'The number of null values in dataset is {df.isnull().sum().sum()}')

The number of null values in dataset is 0


In [5]:
## NO NULL VALUES

## Data Preprocessing

In [6]:
print(f'The number of duplicate values in dataset is {df.duplicated().sum()}')

The number of duplicate values in dataset is 415


In [7]:
df.drop_duplicates(inplace=True)
print(f'The number of duplicate values in dataset is {df.duplicated().sum()}')

The number of duplicate values in dataset is 0


In [8]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [9]:
#Observing starting lines
print(df['Message'][0])
print(df['Message'][1])
print(df['Message'][2])

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Ok lar... Joking wif u oni...
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


## Text Preprocessing

In [10]:
## Converting capital to small letters
df['Message'] = df['Message'].str.lower()
df.head()

Unnamed: 0,Category,Message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [11]:
df['Message'] = df['Message'].str.replace('#','')
df['Message'] = df['Message'].str.replace('@','')
# Remove urls
df['Message'] = df['Message'].str.replace(r'^https?:\/\/.*[\r\n]*','')

  df['Message'] = df['Message'].str.replace(r'^https?:\/\/.*[\r\n]*','')


In [12]:
# Remove Punctuations
df['Message'] = df['Message'].str.translate(str.maketrans('','',string.punctuation))

df.head()

Unnamed: 0,Category,Message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


In [13]:
# Initialize stopwords
stop_words = set(stopwords.words('english'))

df['Message'] = df['Message'].apply(lambda x : ' '.join([word for word in x.split() if word not in (stop_words)]))

df.head()

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [14]:
# Handling ChatWords
chat_words = {
    "LOL": "laugh out loud",
    "BRB": "be right back",
    "OMG": "oh my god",
    "IDK": "i don't know",
    "BTW": "by the way"
}

def chat_conversion(text):
    new_text = []
    for word in text.split():
        if word.upper() in chat_words:
            new_text.append(chat_words[word.upper()])
        else:
            new_text.append(word)
    return' '.join(new_text)

df['Message'] = df['Message'].apply(chat_conversion)

df.head()

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [15]:
from nltk.tokenize import sent_tokenize

# Apply sent_tokenize
df['text_sent_token'] = df['Message'].apply(sent_tokenize)

# Head
df.head()

Unnamed: 0,Category,Message,text_sent_token
0,ham,go jurong point crazy available bugis n great ...,[go jurong point crazy available bugis n great...
1,ham,ok lar joking wif u oni,[ok lar joking wif u oni]
2,spam,free entry 2 wkly comp win fa cup final tkts 2...,[free entry 2 wkly comp win fa cup final tkts ...
3,ham,u dun say early hor u c already say,[u dun say early hor u c already say]
4,ham,nah dont think goes usf lives around though,[nah dont think goes usf lives around though]


In [16]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\deeps/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deeps/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\deeps/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df['stem_msg'] = df['Message'].apply(stem_words)

df.head()

Unnamed: 0,Category,Message,text_sent_token,stem_msg
0,ham,go jurong point crazy available bugis n great ...,[go jurong point crazy available bugis n great...,go jurong point crazi avail bugi n great world...
1,ham,ok lar joking wif u oni,[ok lar joking wif u oni],ok lar joke wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...,[free entry 2 wkly comp win fa cup final tkts ...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say early hor u c already say,[u dun say early hor u c already say],u dun say earli hor u c alreadi say
4,ham,nah dont think goes usf lives around though,[nah dont think goes usf lives around though],nah dont think goe usf live around though


## Model Building
#### Converting text to numbers

In [19]:
## Text Vectorization
cv = CountVectorizer()

X = cv.fit_transform(df['stem_msg']).toarray()

In [20]:
X.shape

(5157, 8091)

In [21]:
y = df['Category']

In [22]:
y.shape

(5157,)

In [23]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y = le.fit_transform(y)

y

array([0, 0, 1, ..., 0, 0, 0])

In [24]:
## Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [32]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [34]:
## Initialize Models
# Support Vector Machine
svc = SVC(kernel='sigmoid',gamma=1.0)
# KNeighbours
knc = KNeighborsClassifier()
# Decision Tree
dtc = DecisionTreeClassifier()
# Random Forest
rfc = RandomForestClassifier(n_estimators=50,random_state=2)
# Multinomial Naive Bayes
mnb = MultinomialNB()
# Logistic Regression
lrc = LogisticRegression(solver='liblinear',penalty='l1')
# AdaBoost
abc = AdaBoostClassifier(n_estimators=50,random_state=2)
# Extra Tree Classifier
etc = ExtraTreesClassifier(n_estimators=50,random_state=2)
#XGB Classifier
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [35]:
# Support Vector Machine
svc.fit(X_train ,y_train)
# Pred 
svc_pred = svc.predict(X_test)
#-----------------------------
# 2. KNeighbours
knc.fit(X_train ,y_train)
# Pred 
knn_pred = knc.predict(X_test)
#-----------------------------
# 3. Multinomial NaiveBayes
mnb.fit(X_train ,y_train)
# Pred 
mnb_pred = mnb.predict(X_test)
#-----------------------------
# 4. Decision Tree
dtc.fit(X_train ,y_train)
# Pred 
dtc_pred = dtc.predict(X_test)
#-----------------------------
# 5. Logistic Regression
lrc.fit(X_train ,y_train)
# Pred 
lrc_pred = lrc.predict(X_test)
#-----------------------------
# 6. Random Forest Classifier
rfc.fit(X_train ,y_train)
# Pred 
rfc_pred = rfc.predict(X_test)
#-----------------------------
# 7. AddaBoost Classifier
abc.fit(X_train ,y_train)
# Pred 
abc_pred = abc.predict(X_test)
#-----------------------------
# 8.Extra Tree Classifier a Ensemble Method
etc.fit(X_train ,y_train)
# Pred 
etc_pred = etc.predict(X_test)
#-----------------------------
# 9. XGB Classifier
xgb.fit(X_train ,y_train)
# Pred 
xgb_pred = xgb.predict(X_test)

## Evaluation

In [36]:
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix

def evaluate_model(y_test,y_pred):
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    confusion = confusion_matrix(y_test,y_pred)
    return accuracy,precision,confusion

In [37]:
#Evaluate Models
# 1. SVC
accuracy_svc,precision_svc,confusion_svc = evaluate_model(y_test,svc_pred)
print(f'Support Vector Machine Accuracy: {accuracy_svc}')
print(f'Support Vector Machine Precision: {precision_svc}')
print(f'Support Vector Machine Confusion Matrix: \n{confusion_svc}')
print('--------------------------------------------------')

# 2. KNeighbours
accuracy_knc, precision_knc, confusion_knc = evaluate_model(y_test,knn_pred)
print(f'KNeighbours Accuracy: {accuracy_knc}')
print(f'KNeighbours Precision: {precision_knc}')
print(f'KNeighbours Confusion Matrix: \n{confusion_knc}')
print('--------------------------------------------------')

# 3. Multinomial NaiveBayes
accuracy_mnb, precision_mnb, confusion_mnb = evaluate_model(y_test,mnb_pred)
print(f'Multinomial Naive Bayes Accuracy: {accuracy_mnb}')
print(f'Multinomial Naive Bayes Precision: {precision_mnb}')
print(f'Multinomial Naive Bayes Confusion Matrix: \n{confusion_mnb}')
print('--------------------------------------------------')

# 4. Decision Tree
accuracy_dtc, precision_dtc, confusion_dtc = evaluate_model(y_test,dtc_pred)
print(f'Decision Tree Accuracy: {accuracy_dtc}')
print(f'Decision Tree Precision: {precision_dtc}')
print(f'Decision Tree Confusion Matrix: \n{confusion_dtc}')
print('--------------------------------------------------')

# 5. Logistic Regression
accuracy_lrc, precision_lrc, confusion_lrc = evaluate_model(y_test,lrc_pred)
print(f'Logistic Regression Accuracy: {accuracy_lrc}')
print(f'Logistic Regression Precision: {precision_lrc}')
print(f'Logistic Regression Confusion Matrix: \n{confusion_lrc}')
print('--------------------------------------------------')

# 6. Random Forest Classifier
accuracy_rfc, precision_rfc, confusion_rfc = evaluate_model(y_test,rfc_pred)
print(f'Random Forest Classifier Accuracy: {accuracy_rfc}')
print(f'Random Forest Classifier Precision: {precision_rfc}')
print(f'Random Forest Classifier Confusion Matrix: \n{confusion_rfc}')
print('--------------------------------------------------')

# 7. AdaBoost Classifier
accuracy_abc, precision_abc, confusion_abc = evaluate_model(y_test,abc_pred)
print(f'AdaBoost Classifier Accuracy: {accuracy_abc}')
print(f'AdaBoost Classifier Precision: {precision_abc}')
print(f'AdaBoost Classifier Confusion Matrix: \n{confusion_abc}')
print('--------------------------------------------------')

# 8. Extra Tree Classifier
accuracy_etc, precision_etc, confusion_etc = evaluate_model(y_test,etc_pred)
print(f'Extra Tree Classifier Accuracy: {accuracy_etc}')
print(f'Extra Tree Classifier Precision: {precision_etc}')
print(f'Extra Tree Classifier Confusion Matrix: \n{confusion_etc}')
print('--------------------------------------------------')

# 9. XGB Classifier
accuracy_xgb, precision_xgb, confusion_xgb = evaluate_model(y_test,xgb_pred)
print(f'XGB Classifier Accuracy: {accuracy_xgb}')
print(f'XGB Classifier Precision: {precision_xgb}')
print(f'XGB Classifier Confusion Matrix: \n{confusion_xgb}')
print('--------------------------------------------------')

Support Vector Machine Accuracy: 0.9273255813953488
Support Vector Machine Precision: 0.75
Support Vector Machine Confusion Matrix: 
[[867  30]
 [ 45  90]]
--------------------------------------------------
KNeighbours Accuracy: 0.9069767441860465
KNeighbours Precision: 1.0
KNeighbours Confusion Matrix: 
[[897   0]
 [ 96  39]]
--------------------------------------------------
Multinomial Naive Bayes Accuracy: 0.9709302325581395
Multinomial Naive Bayes Precision: 0.8671328671328671
Multinomial Naive Bayes Confusion Matrix: 
[[878  19]
 [ 11 124]]
--------------------------------------------------
Decision Tree Accuracy: 0.9660852713178295
Decision Tree Precision: 0.9032258064516129
Decision Tree Confusion Matrix: 
[[885  12]
 [ 23 112]]
--------------------------------------------------
Logistic Regression Accuracy: 0.9757751937984496
Logistic Regression Precision: 0.9661016949152542
Logistic Regression Confusion Matrix: 
[[893   4]
 [ 21 114]]
-----------------------------------------

## Dataframe for storing Results

In [None]:
evaluation_data = {
    'Model' : ['SVC', 'KNeighbours', 'Multinomial Naive Bayes', 'Decision Tree', 'Logistic Regression', 'Random Forest Classifier', 'AdaBoost Classifier', 'Extra Tree Classifier', 'XGB Classifier'],
    'Accuracy' : [accuracy_svc, accuracy_knc, accuracy_mnb, accuracy_dtc, accuracy_lrc, accuracy_rfc, accuracy_abc, accuracy_etc, accuracy_xgb],
    'Precision' : [precision_svc, precision_knc, precision_mnb, precision_dtc, precision_lrc, precision_rfc, precision_abc, precision_etc, precision_xgb]
}

evaluation_df = pd.DataFrame(evaluation_data)

In [39]:
evaluation_df =  evaluation_df.sort_values(by=['Accuracy','Precision'],ascending=False)

In [40]:
evaluation_df

Unnamed: 0,Model,Accuracy,Precision
4,Logistic Regression,0.975775,0.966102
7,Extra Tree Classifier,0.974806,1.0
8,XGB Classifier,0.974806,1.0
2,Multinomial Naive Bayes,0.97093,0.867133
5,Random Forest Classifier,0.968992,1.0
3,Decision Tree,0.966085,0.903226
6,AdaBoost Classifier,0.957364,0.925234
0,SVC,0.927326,0.75
1,KNeighbours,0.906977,1.0


In [42]:
print(f"We can clearly see out of all the models Extra Tree Classifier is the best model with accuracy of {accuracy_etc} and precision of {precision_etc}")

We can clearly see out of all the models Extra Tree Classifier is the best model with accuracy of 0.9748062015503876 and precision of 1.0


In [43]:
import plotly.graph_objects as go

# Define the models and their accuracies and precisions
models = ['SVC', 'KNN', 'MultinomialNB', 'Decision Tree', 'Logistic Regression', 'Random Forest', 'AdaBoost', 'Extra Tree', 'XGBoost']
accuracies = [accuracy_svc, accuracy_knc, accuracy_mnb, accuracy_dtc, accuracy_lrc, accuracy_rfc, accuracy_abc, accuracy_etc, accuracy_xgb]
precisions = [precision_svc, precision_knc, precision_mnb, precision_dtc, precision_lrc, precision_rfc, precision_abc, precision_etc, precision_xgb]

# Create the figure
fig = go.Figure()

# Add bar traces for accuracy and precision
fig.add_trace(go.Bar(
    x=models,
    y=accuracies,
    name='Accuracy',
    marker_color='skyblue'
))
fig.add_trace(go.Bar(
    x=models,
    y=precisions,
    name='Precision',
    marker_color='salmon'
))

# Update layout
fig.update_layout(
    title='Accuracy and Precision of Different Models',
    xaxis=dict(title='Models'),
    yaxis=dict(title='Score'),
    barmode='group'  # Group bars for each model
)

# Show the plot
fig.show()

## Predictions

In [44]:
df.head()

Unnamed: 0,Category,Message,text_sent_token,stem_msg
0,ham,go jurong point crazy available bugis n great ...,[go jurong point crazy available bugis n great...,go jurong point crazi avail bugi n great world...
1,ham,ok lar joking wif u oni,[ok lar joking wif u oni],ok lar joke wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...,[free entry 2 wkly comp win fa cup final tkts ...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say early hor u c already say,[u dun say early hor u c already say],u dun say earli hor u c alreadi say
4,ham,nah dont think goes usf lives around though,[nah dont think goes usf lives around though],nah dont think goe usf live around though


In [45]:
df['stem_msg'][2]

'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18'