In [44]:
#Import Libraries
import numpy as np
import pandas as pd
import nltk
import string
import vaex as vx
import matplotlib.pyplot as plt

In [45]:
dataset=pd.read_csv("completeSpamAssassin.csv")

In [46]:
dataset

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,3,##############################################...,1
4,4,I thought you might like these:\n1) Slim Down ...,1
...,...,...,...
6041,6041,empty,0
6042,6042,___ ___ ...,0
6043,6043,IN THIS ISSUE:01. Readers write\n02. Extension...,0
6044,6044,empty,0


In [47]:
dataset=dataset.drop(["Unnamed: 0"],axis=1)

In [48]:
dataset

Unnamed: 0,Body,Label
0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,##############################################...,1
4,I thought you might like these:\n1) Slim Down ...,1
...,...,...
6041,empty,0
6042,___ ___ ...,0
6043,IN THIS ISSUE:01. Readers write\n02. Extension...,0
6044,empty,0


In [49]:
dataset["Body"]=dataset.drop_duplicates(subset=["Body"])

In [50]:
#To check if there is any missing data
print(dataset.isnull().sum())

Body     754
Label      0
dtype: int64


In [51]:
dataset.dropna(axis=0,how="any",inplace=True)

In [52]:
#Remove the data having missing values
print(dataset.isnull().sum())

Body     0
Label    0
dtype: int64


In [53]:
#Count number of spam and ham mails
dataset["Label"].value_counts() 

0    3914
1    1378
Name: Label, dtype: int64

In [54]:
#This shows that there are 4150 spam mails and 1896 ham mails

In [55]:
#Removal of stop words ,lowering case and tokenise the data

In [56]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jyoti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [57]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [58]:
stopwords=set(stopwords.words("english"))
def clean_tokens(text):
    no_punc=[words for words in text if words not in string.punctuation]
    text="".join(no_punc)
    return [words.lower() for words in text.split() if words not in stopwords]

In [59]:
dataset["Body"]=dataset["Body"].apply(clean_tokens)

In [60]:
dataset["Body"]

0       [save, 70, life, insurance, why, spend, more, ...
1       [1, fight, the, risk, cancer, httpwwwadclickws...
2       [1, fight, the, risk, cancer, httpwwwadclickws...
3       [adult, club, offers, free, membership, instan...
4       [i, thought, might, like, 1, slim, down, guara...
                              ...                        
6033    [1, isilotm, 325, palm, os, pocket, pc, window...
6034    [effector, vol, 15, no, 35, november, 8, 2002,...
6039    [we, extended, free, seat, sale, thursday, 21s...
6042    [oneten, 271102, insignificant, matters, heavi...
6043    [in, this, issue01, readers, write, 02, extens...
Name: Body, Length: 5292, dtype: object

In [61]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [62]:
#CV=CountVectorizer(analyzer=clean_tokens)
Tfd=TfidfVectorizer(analyzer=clean_tokens)

In [63]:
#Vectors=CV.fit_transform(dataset["Body"])
Vectors=Tfd.fit_transform(dataset["Body"])

In [64]:
Vectors.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [65]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(Vectors.toarray(), dataset["Label"], test_size=0.20, random_state=5)

In [66]:
print(xtrain)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [67]:
print(ytest)

2582    0
5479    0
2573    0
3701    0
5478    0
       ..
4665    0
5090    0
5604    0
1525    1
4142    0
Name: Label, Length: 1059, dtype: int64


In [68]:
from sklearn.naive_bayes import MultinomialNB 
MNB=MultinomialNB()

In [69]:
MNB.fit(xtrain,ytrain)

MultinomialNB()

In [70]:
Y_Pred=MNB.predict(xtest)

In [71]:
Y_Pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [72]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [73]:
classification_report(ytest,Y_Pred)

'              precision    recall  f1-score   support\n\n           0       0.76      1.00      0.86       799\n           1       1.00      0.02      0.03       260\n\n    accuracy                           0.76      1059\n   macro avg       0.88      0.51      0.45      1059\nweighted avg       0.82      0.76      0.66      1059\n'

In [74]:
print("Confusion Matrix: \n", confusion_matrix(ytest, Y_Pred))

Confusion Matrix: 
 [[799   0]
 [256   4]]


In [75]:
accuracy_score(ytest, Y_Pred)

0.7582625118035883

In [76]:
from sklearn.metrics import mean_squared_error

In [77]:
np.sqrt(mean_squared_error(ytest,Y_Pred))

0.4916680670904017

In [78]:
from sklearn.linear_model import LogisticRegression

In [79]:
LR=LogisticRegression(random_state=0)

In [80]:
LR.fit(xtrain,ytrain)

LogisticRegression(random_state=0)

In [81]:
ypred=LR.predict(xtest)

In [82]:
ypred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [83]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [84]:
classification_report(ytest,ypred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           0       0.75      1.00      0.86       799\n           1       0.00      0.00      0.00       260\n\n    accuracy                           0.75      1059\n   macro avg       0.38      0.50      0.43      1059\nweighted avg       0.57      0.75      0.65      1059\n'

In [85]:
accuracy_score(ytest, ypred)

0.7544853635505193

In [86]:
np.sqrt(mean_squared_error(ytest,Y_Pred))

0.4916680670904017