# MINI-PROJECT: SPAM MAIL CLASSIFIER 

In [1]:
#Importing libraries
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [2]:
#Read dataset
# Source: Kaggle
df = pd.read_csv("emails.csv")
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
# (rows, columns) of the dataset as loaded, before any cleaning
df.shape

(5728, 2)

In [4]:
# Column labels available in the frame
df.columns

Index(['text', 'spam'], dtype='object')

In [5]:
#Drop exact duplicate rows (reassign instead of mutating the frame in place)
df = df.drop_duplicates()
#Printing shape after removal of duplicates
df.shape

(5695, 2)

In [6]:
#Check missing values: number of null entries in each column
df.isnull().sum()

text    0
spam    0
dtype: int64

##### Downloading stopwords package

In [7]:
# Fetch the NLTK "stopwords" corpus that stopwords.words('english') reads from
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jayar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Function to process the text
def process_text(text):
    """Strip punctuation from ``text`` and drop English stopwords.

    Parameters
    ----------
    text : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        Whitespace-separated tokens in their original order and casing,
        excluding every token whose lowercase form is in NLTK's English
        stopword list.
    """
    # Remove every character listed in string.punctuation.
    punctuation = frozenset(string.punctuation)
    punc_rem = ''.join(char for char in text if char not in punctuation)

    # Load the stopword list once per call (previously it was re-read for
    # every token) and keep it in a set for constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    clean_words = [word for word in punc_rem.split() if word.lower() not in stop_words]

    return clean_words

In [9]:
#list of tokens: preview process_text on the first five emails
# The cleaned tokens are only displayed here; nothing is written back to df.
# NOTE(review): the vectorizers further down are constructed with default
# arguments, so process_text is not part of the modelling pipeline.
df['text'].head().apply(process_text)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [10]:
#Splitting data: 80% train / 20% test with a fixed seed so the split is repeatable
# NOTE(review): no stratify= argument is passed, so the spam/ham ratio is not
# forced to match between the two splits.
X_train, X_test, y_train, y_test = train_test_split(df.text, df.spam, test_size = 0.2, random_state = 0)

### ----- USING COUNTVECTORIZER -----

In [11]:
#dict used to store scores of models using countvectorizer
# keys: model label, values: accuracy on the test split
cv_score = {}

In [12]:
# Learn the token vocabulary from the training split only.
# fit() is enough here: the transformed matrices are built in the next cells,
# so calling fit_transform() and discarding its result was wasted work.
cv = CountVectorizer()
cv.fit(X_train)
print("The total vocabulary size in the train data is", len(cv.vocabulary_))

The total vocabulary size in the train data is 33126


In [13]:
# Encode the training emails with the fitted vocabulary; show (n_emails, vocabulary_size)
X_train_ct = cv.transform(X_train)
X_train_ct.shape

(4556, 33126)

In [14]:
# Encode the test emails with the vocabulary learned from the training split
X_test_ct = cv.transform(X_test)
X_test_ct.shape

(1139, 33126)

#### TRYING VARIOUS ML ALGORITHMS

##### MODEL 1: NAIVE BAYES

In [15]:
#MODELLING
# Fit multinomial Naive Bayes on the CountVectorizer training features
model1 = MultinomialNB().fit(X_train_ct, y_train)

In [16]:
#PREDICTING
# Quick look at predicted vs. actual labels on the training split (long arrays print truncated)
print ("Predictions:",model1.predict(X_train_ct))
print ("Actual:", y_train.values )

Predictions: [0 0 0 ... 0 0 0]
Actual: [0 0 0 ... 0 0 0]


In [17]:
#EVALUATION
# Training-split metrics for Naive Bayes on CountVectorizer features
pred = model1.predict(X_train_ct)
print(classification_report(y_train, pred), end="\n\n")
print("Confusion Matrix: \n", confusion_matrix(y_train, pred), end="\n\n")
print("Accuracy:", accuracy_score(y_train, pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      3457
           1       0.98      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      0.99      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix: 
 [[3436   21]
 [   1 1098]]

Accuracy: 0.995171202809482


###### TRYING ON TEST DATA

In [18]:
#PREDICTING
# Quick look at predicted vs. actual labels on the test split (long arrays print truncated)
print ("Predictions:",model1.predict(X_test_ct))
print ("Actual:", y_test.values )

Predictions: [1 0 0 ... 0 0 0]
Actual: [1 0 0 ... 0 0 0]


In [19]:
#EVALUATION
# Test-split metrics for Naive Bayes on CountVectorizer features.
# `pred` is reused by the next cell to record the test accuracy.
pred = model1.predict(X_test_ct)
print(classification_report(y_test, pred), end="\n\n")
print("Confusion Matrix: \n", confusion_matrix(y_test, pred), end="\n\n")
print("Accuracy:", accuracy_score(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       870
           1       0.98      0.98      0.98       269

    accuracy                           0.99      1139
   macro avg       0.99      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Confusion Matrix: 
 [[865   5]
 [  6 263]]

Accuracy: 0.990342405618964


In [20]:
# Record the test accuracy; `pred` must still hold the test-split predictions from the cell above
cv_score["CV - Naive Bayes"] = accuracy_score(y_test,pred)
cv_score

{'CV - Naive Bayes': 0.990342405618964}

##### MODEL 2: KNN 

In [21]:
#MODELLING
# Fit k-nearest neighbours (default settings) on the CountVectorizer training features
model2 = KNeighborsClassifier().fit(X_train_ct, y_train)

In [22]:
#PREDICTING
# Quick look at predicted vs. actual labels on the training split (long arrays print truncated)
print ("Predictions:",model2.predict(X_train_ct))
print ("Actual:", y_train.values )

Predictions: [0 0 0 ... 0 0 0]
Actual: [0 0 0 ... 0 0 0]


In [23]:
#EVALUATION
# Training-split metrics for KNN on CountVectorizer features
pred2 = model2.predict(X_train_ct)
print(classification_report(y_train, pred2), end="\n\n")
print("Confusion Matrix: \n", confusion_matrix(y_train, pred2), end="\n\n")
print("Accuracy:", accuracy_score(y_train, pred2))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96      3457
           1       0.97      0.79      0.87      1099

    accuracy                           0.94      4556
   macro avg       0.95      0.89      0.92      4556
weighted avg       0.94      0.94      0.94      4556


Confusion Matrix: 
 [[3431   26]
 [ 235  864]]

Accuracy: 0.9427129060579456


###### TRYING ON TEST DATA

In [24]:
#PREDICTING
# Quick look at predicted vs. actual labels on the test split (long arrays print truncated)
print ("Predictions:",model2.predict(X_test_ct))
print ("Actual:", y_test.values )

Predictions: [1 0 0 ... 0 0 0]
Actual: [1 0 0 ... 0 0 0]


In [25]:
#EVALUATION
# Test-split metrics for KNN on CountVectorizer features.
# `pred2` is reused by the next cell to record the test accuracy.
pred2 = model2.predict(X_test_ct)
print(classification_report(y_test, pred2), end="\n\n")
print("Confusion Matrix: \n", confusion_matrix(y_test, pred2), end="\n\n")
print("Accuracy:", accuracy_score(y_test, pred2))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94       870
           1       0.93      0.66      0.77       269

    accuracy                           0.91      1139
   macro avg       0.92      0.82      0.86      1139
weighted avg       0.91      0.91      0.90      1139


Confusion Matrix: 
 [[857  13]
 [ 92 177]]

Accuracy: 0.9078138718173837


In [26]:
# Record the test accuracy; `pred2` must still hold the test-split predictions from the cell above
cv_score["CV - KNN"] = accuracy_score(y_test,pred2)
cv_score

{'CV - Naive Bayes': 0.990342405618964, 'CV - KNN': 0.9078138718173837}

##### MODEL 3: SVC 

In [27]:
#MODELLING
# Fit a support vector classifier (default settings) on the CountVectorizer training features
model3 = SVC().fit(X_train_ct, y_train)

In [28]:
#PREDICTING
# Quick look at predicted vs. actual labels on the training split (long arrays print truncated)
print ("Predictions:",model3.predict(X_train_ct))
print ("Actual:", y_train.values )

Predictions: [0 0 0 ... 0 0 0]
Actual: [0 0 0 ... 0 0 0]


In [29]:
#EVALUATION
# Training-split metrics for SVC on CountVectorizer features
pred3 = model3.predict(X_train_ct)
print(classification_report(y_train, pred3), end="\n\n")
print("Confusion Matrix: \n", confusion_matrix(y_train, pred3), end="\n\n")
print("Accuracy:", accuracy_score(y_train, pred3))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3457
           1       0.99      0.87      0.92      1099

    accuracy                           0.97      4556
   macro avg       0.97      0.93      0.95      4556
weighted avg       0.97      0.97      0.96      4556


Confusion Matrix: 
 [[3446   11]
 [ 148  951]]

Accuracy: 0.9651009657594382


###### TRYING ON TEST DATA

In [30]:
#PREDICTING
# Quick look at predicted vs. actual labels on the test split (long arrays print truncated)
print ("Predictions:",model3.predict(X_test_ct))
print ("Actual:", y_test.values )

Predictions: [1 0 0 ... 0 0 0]
Actual: [1 0 0 ... 0 0 0]


In [31]:
#EVALUATION
# Test-split metrics for SVC on CountVectorizer features.
# `pred3` is reused by the next cell to record the test accuracy.
pred3 = model3.predict(X_test_ct)
print(classification_report(y_test, pred3), end="\n\n")
print("Confusion Matrix: \n", confusion_matrix(y_test, pred3), end="\n\n")
print("Accuracy:", accuracy_score(y_test, pred3))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       870
           1       0.97      0.82      0.89       269

    accuracy                           0.95      1139
   macro avg       0.96      0.91      0.93      1139
weighted avg       0.95      0.95      0.95      1139


Confusion Matrix: 
 [[864   6]
 [ 49 220]]

Accuracy: 0.95171202809482


In [32]:
# Record the test accuracy; `pred3` must still hold the test-split predictions from the cell above
cv_score["CV - SVC"] = accuracy_score(y_test,pred3)
cv_score

{'CV - Naive Bayes': 0.990342405618964,
 'CV - KNN': 0.9078138718173837,
 'CV - SVC': 0.95171202809482}

#### With the CountVectorizer method, the most accurate algorithm on the test set is Naive Bayes

### ----- USING TF-IDF VECTORIZER -----

In [33]:
#dict used to store scores of models using TF-IDF vectorizer
# keys: model label, values: accuracy on the test split
tv_score = {}

In [34]:
# Fit TF-IDF on the training split only, then reuse the fitted vectorizer for the test split.
# fit_transform() fits and encodes the training emails in one pass instead of two.
tv = TfidfVectorizer()
X_train_t = tv.fit_transform(X_train)
X_test_t = tv.transform(X_test)

#### TRYING VARIOUS ML ALGORITHMS

##### MODEL 1: NAIVE BAYES

In [35]:
#MODELLING
# Fit multinomial Naive Bayes on the TF-IDF training features
# (rebinds model1, which previously held the CountVectorizer model)
model1 = MultinomialNB().fit(X_train_t, y_train)

In [36]:
#PREDICTING
# Quick look at predicted vs. actual labels on the training split (long arrays print truncated)
print ("Predictions:",model1.predict(X_train_t))
print ("Actual:", y_train.values )

Predictions: [0 0 0 ... 0 0 0]
Actual: [0 0 0 ... 0 0 0]


In [37]:
#EVALUATION
# Training-split metrics for Naive Bayes on TF-IDF features
pred = model1.predict(X_train_t)
print(classification_report(y_train, pred), end="\n\n")
print("Confusion Matrix: \n", confusion_matrix(y_train, pred), end="\n\n")
print("Accuracy:", accuracy_score(y_train, pred))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94      3457
           1       1.00      0.59      0.74      1099

    accuracy                           0.90      4556
   macro avg       0.94      0.79      0.84      4556
weighted avg       0.91      0.90      0.89      4556


Confusion Matrix: 
 [[3455    2]
 [ 450  649]]

Accuracy: 0.9007901668129938


###### TRYING ON TEST DATA

In [38]:
#PREDICTING
# Quick look at predicted vs. actual labels on the test split (long arrays print truncated)
print ("Predictions:",model1.predict(X_test_t))
print ("Actual:", y_test.values )

Predictions: [0 0 0 ... 0 0 0]
Actual: [1 0 0 ... 0 0 0]


In [39]:
#EVALUATION
# Test-split metrics for Naive Bayes on TF-IDF features.
# `pred` is reused by the next cell to record the test accuracy.
pred = model1.predict(X_test_t)
print(classification_report(y_test, pred), end="\n\n")
print("Confusion Matrix: \n", confusion_matrix(y_test, pred), end="\n\n")
print("Accuracy:", accuracy_score(y_test, pred))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92       870
           1       1.00      0.47      0.64       269

    accuracy                           0.87      1139
   macro avg       0.93      0.73      0.78      1139
weighted avg       0.89      0.87      0.86      1139


Confusion Matrix: 
 [[870   0]
 [143 126]]

Accuracy: 0.874451273046532


In [40]:
# Record the test accuracy; `pred` must still hold the test-split predictions from the cell above
tv_score["TV - Naive Bayes"] = accuracy_score(y_test,pred)

##### MODEL 2: KNN 

In [41]:
#MODELLING
# Fit k-nearest neighbours (default settings) on the TF-IDF training features
# (rebinds model2, which previously held the CountVectorizer model)
model2 = KNeighborsClassifier().fit(X_train_t, y_train)

In [42]:
#PREDICTING
# Quick look at predicted vs. actual labels on the training split (long arrays print truncated)
print ("Predictions:",model2.predict(X_train_t))
print ("Actual:", y_train.values )

Predictions: [0 0 0 ... 0 0 0]
Actual: [0 0 0 ... 0 0 0]


In [43]:
#EVALUATION
# Training-split metrics for KNN on TF-IDF features
pred2 = model2.predict(X_train_t)
print(classification_report(y_train, pred2), end="\n\n")
print("Confusion Matrix: \n", confusion_matrix(y_train, pred2), end="\n\n")
print("Accuracy:", accuracy_score(y_train, pred2))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3457
           1       0.96      0.97      0.97      1099

    accuracy                           0.98      4556
   macro avg       0.98      0.98      0.98      4556
weighted avg       0.98      0.98      0.98      4556


Confusion Matrix: 
 [[3418   39]
 [  32 1067]]

Accuracy: 0.9844161545215101


###### TRYING ON TEST DATA

In [44]:
#PREDICTING
# Quick look at predicted vs. actual labels on the test split (long arrays print truncated)
print ("Predictions:",model2.predict(X_test_t))
print ("Actual:", y_test.values )

Predictions: [1 0 0 ... 0 0 0]
Actual: [1 0 0 ... 0 0 0]


In [45]:
#EVALUATION
# Test-split metrics for KNN on TF-IDF features.
# `pred2` is reused by the next cell to record the test accuracy.
pred2 = model2.predict(X_test_t)
print(classification_report(y_test, pred2), end="\n\n")
print("Confusion Matrix: \n", confusion_matrix(y_test, pred2), end="\n\n")
print("Accuracy:", accuracy_score(y_test, pred2))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       870
           1       0.93      0.94      0.94       269

    accuracy                           0.97      1139
   macro avg       0.96      0.96      0.96      1139
weighted avg       0.97      0.97      0.97      1139


Confusion Matrix: 
 [[852  18]
 [ 15 254]]

Accuracy: 0.971027216856892


In [46]:
# Record the test accuracy; `pred2` must still hold the test-split predictions from the cell above
tv_score["TV - KNN"] = accuracy_score(y_test,pred2)

##### MODEL 3: SVC 

In [47]:
#MODELLING
# Fit a support vector classifier (default settings) on the TF-IDF training features
# (rebinds model3, which previously held the CountVectorizer model)
model3 = SVC().fit(X_train_t, y_train)

In [48]:
#PREDICTING
# Quick look at predicted vs. actual labels on the training split (long arrays print truncated)
print ("Predictions:",model3.predict(X_train_t))
print ("Actual:", y_train.values )

Predictions: [0 0 0 ... 0 0 0]
Actual: [0 0 0 ... 0 0 0]


In [49]:
#EVALUATION
# Training-split metrics for SVC on TF-IDF features
pred3 = model3.predict(X_train_t)
print(classification_report(y_train, pred3), end="\n\n")
print("Confusion Matrix: \n", confusion_matrix(y_train, pred3), end="\n\n")
print("Accuracy:", accuracy_score(y_train, pred3))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       1.00      1.00      1.00      1099

    accuracy                           1.00      4556
   macro avg       1.00      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix: 
 [[3457    0]
 [   0 1099]]

Accuracy: 1.0


###### TRYING ON TEST DATA

In [50]:
#PREDICTING
# Quick look at predicted vs. actual labels on the test split (long arrays print truncated)
print ("Predictions:",model3.predict(X_test_t))
print ("Actual:", y_test.values )

Predictions: [1 0 0 ... 0 0 0]
Actual: [1 0 0 ... 0 0 0]


In [51]:
#EVALUATION
# Test-split metrics for SVC on TF-IDF features.
# `pred3` is reused by the next cell to record the test accuracy.
pred3 = model3.predict(X_test_t)
print(classification_report(y_test, pred3), end="\n\n")
print("Confusion Matrix: \n", confusion_matrix(y_test, pred3), end="\n\n")
print("Accuracy:", accuracy_score(y_test, pred3))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       870
           1       0.99      0.97      0.98       269

    accuracy                           0.99      1139
   macro avg       0.99      0.98      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Confusion Matrix: 
 [[868   2]
 [  8 261]]

Accuracy: 0.9912203687445127


In [52]:
# Record the test accuracy; `pred3` must still hold the test-split predictions from the cell above
tv_score["TV - SVC"] = accuracy_score(y_test,pred3)
tv_score

{'TV - Naive Bayes': 0.874451273046532,
 'TV - KNN': 0.971027216856892,
 'TV - SVC': 0.9912203687445127}

#### With the TF-IDF Vectorizer method, the most accurate algorithm on the test set is SVC

In [53]:
# Show the test-set accuracies collected for each vectorization method
print ("Scores using CountVectorizer Method\n",cv_score)
print ("Scores using TF-IDF Vectorizer Method\n",tv_score)

Scores using CountVectorizer Method
 {'CV - Naive Bayes': 0.990342405618964, 'CV - KNN': 0.9078138718173837, 'CV - SVC': 0.95171202809482}
Scores using TF-IDF Vectorizer Method
 {'TV - Naive Bayes': 0.874451273046532, 'TV - KNN': 0.971027216856892, 'TV - SVC': 0.9912203687445127}


In [54]:
# max(..., key=cv_score.get) yields the label of the top-scoring model, not the numeric score
print ("Best Score using CountVectorizer:", max(cv_score, key = cv_score.get))

Best Score using CountVectorizer: CV - Naive Bayes


In [55]:
# max(..., key=tv_score.get) yields the label of the top-scoring model, not the numeric score
print ("Best Score using TF-IDF Vectorizer Method:", max(tv_score, key = tv_score.get))

Best Score using IF-IDF Vectorizer Method: TV - SVC


### Comparing the best test-set accuracies from both vectorization methods and the different algorithms used, SVC on TF-IDF features scores highest (0.9912), narrowly ahead of Naive Bayes on CountVectorizer features (0.9903).