In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./spam_assassin.csv')
df.head()

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


##### Label: 0 means ham, 1 means spam.

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5796 non-null   object
 1   target  5796 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.7+ KB


In [6]:
df.columns = ['main','label']
df.head()

Unnamed: 0,main,label
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


#### Pre-Processing

In [7]:
df.main = df.main.str.lower()

In [8]:
df.main.isnull().sum()

0

In [9]:
df.main.fillna(" ", inplace=True)

In [10]:
# Remove Punctuations
import string
def remove_punctuations(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('', '', punctuations))
df['clean_text'] = df['main'].apply(lambda x: remove_punctuations(x))
df.head()

Unnamed: 0,main,label,clean_text
0,from ilug-admin@linux.ie mon jul 29 11:28:02 2...,0,from ilugadminlinuxie mon jul 29 112802 2002 r...
1,from gort44@excite.com mon jun 24 17:54:21 200...,1,from gort44excitecom mon jun 24 175421 2002 re...
2,from fork-admin@xent.com mon jul 29 11:39:57 2...,1,from forkadminxentcom mon jul 29 113957 2002 r...
3,from dcm123@btamail.net.cn mon jun 24 17:49:23...,1,from dcm123btamailnetcn mon jun 24 174923 2002...
4,from ilug-admin@linux.ie mon aug 19 11:02:47 2...,0,from ilugadminlinuxie mon aug 19 110247 2002 r...


In [11]:
# Stop word removal
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,main,label,clean_text
0,from ilug-admin@linux.ie mon jul 29 11:28:02 2...,0,ilugadminlinuxie mon jul 29 112802 2002 return...
1,from gort44@excite.com mon jun 24 17:54:21 200...,1,gort44excitecom mon jun 24 175421 2002 returnp...
2,from fork-admin@xent.com mon jul 29 11:39:57 2...,1,forkadminxentcom mon jul 29 113957 2002 return...
3,from dcm123@btamail.net.cn mon jun 24 17:49:23...,1,dcm123btamailnetcn mon jun 24 174923 2002 retu...
4,from ilug-admin@linux.ie mon aug 19 11:02:47 2...,0,ilugadminlinuxie mon aug 19 110247 2002 return...


In [18]:
# Frequent Words 
from collections import Counter
word_count = Counter()
for text in df['clean_text']:
    for word in text.split():
        word_count[word] += 1
        
word_count.most_common(10)

[('2002', 46596),
 ('received', 32310),
 ('id', 24307),
 ('esmtp', 18746),
 ('0100', 16531),
 ('localhost', 15510),
 ('aug', 14847),
 ('127001', 12941),
 ('sep', 11881),
 ('td', 10299)]

In [19]:
FREQUENT_WORDS = set(word for (word, wc) in word_count.most_common(10))
def remove_freq_words(text):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

df['clean_text'] = df['clean_text'].apply(lambda x: remove_freq_words(x))
df.head()



Unnamed: 0,main,label,clean_text
0,from ilug-admin@linux.ie mon jul 29 11:28:02 2...,0,ilugadminlinuxie mon jul 29 112802 returnpath ...
1,from gort44@excite.com mon jun 24 17:54:21 200...,1,gort44excitecom mon jun 24 175421 returnpath g...
2,from fork-admin@xent.com mon jul 29 11:39:57 2...,1,forkadminxentcom mon jul 29 113957 returnpath ...
3,from dcm123@btamail.net.cn mon jun 24 17:49:23...,1,dcm123btamailnetcn mon jun 24 174923 returnpat...
4,from ilug-admin@linux.ie mon aug 19 11:02:47 2...,0,ilugadminlinuxie mon 19 110247 returnpath ilug...


In [13]:
#  Stemming
# from nltk.stem.porter import PorterStemmer
# ps = PorterStemmer()
# def stem_words(text):
#     return " ".join([ps.stem(word) for word in text.split()])
# df['stemmed_text'] = df['result'].apply(lambda x: stem_words(x))
# df.head()

In [21]:
# Lemmatization & POS Tagging

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text):
    # find pos tags
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

df['lemmatized_text'] = df['clean_text'].apply(lambda x: lemmatize_words(x))
df.head()

Unnamed: 0,main,label,clean_text,lemmatized_text
0,from ilug-admin@linux.ie mon jul 29 11:28:02 2...,0,ilugadminlinuxie mon jul 29 112802 returnpath ...,ilugadminlinuxie mon jul 29 112802 returnpath ...
1,from gort44@excite.com mon jun 24 17:54:21 200...,1,gort44excitecom mon jun 24 175421 returnpath g...,gort44excitecom mon jun 24 175421 returnpath g...
2,from fork-admin@xent.com mon jul 29 11:39:57 2...,1,forkadminxentcom mon jul 29 113957 returnpath ...,forkadminxentcom mon jul 29 113957 returnpath ...
3,from dcm123@btamail.net.cn mon jun 24 17:49:23...,1,dcm123btamailnetcn mon jun 24 174923 returnpat...,dcm123btamailnetcn mon jun 24 174923 returnpat...
4,from ilug-admin@linux.ie mon aug 19 11:02:47 2...,0,ilugadminlinuxie mon 19 110247 returnpath ilug...,ilugadminlinuxie mon 19 110247 returnpath ilug...


In [22]:
df.sample(frac=1).head(10)

Unnamed: 0,main,label,clean_text,lemmatized_text
4425,from des34newsa@hotmail.com fri aug 23 11:03:2...,1,des34newsahotmailcom fri 23 110327 returnpath ...,des34newsahotmailcom fri 23 110327 returnpath ...
1148,from ilug-admin@linux.ie fri aug 16 15:02:05 2...,0,ilugadminlinuxie fri 16 150205 returnpath ilug...,ilugadminlinuxie fri 16 150205 returnpath ilug...
1909,received: from mail.escorts.co.in (ident:root@...,1,mailescortscoin identroot2032007575 linuxmidra...,mailescortscoin identroot2032007575 linuxmidra...
5373,from mort239o@xum9.xumx.com mon jun 24 17:48:2...,1,mort239oxum9xumxcom mon jun 24 174823 returnpa...,mort239oxum9xumxcom mon jun 24 174823 returnpa...
1292,from razor-users-admin@lists.sourceforge.net f...,0,razorusersadminlistssourceforgenet fri 9 15335...,razorusersadminlistssourceforgenet fri 9 15335...
96,from fork-admin@xent.com thu sep 19 11:04:48 2...,0,forkadminxentcom thu 19 110448 returnpath fork...,forkadminxentcom thu 19 110448 returnpath fork...
2751,from ilug-admin@linux.ie fri jul 19 16:19:05 2...,0,ilugadminlinuxie fri jul 19 161905 returnpath ...,ilugadminlinuxie fri jul 19 161905 returnpath ...
2290,from fork-admin@xent.com mon jul 29 11:28:59 2...,0,forkadminxentcom mon jul 29 112859 returnpath ...,forkadminxentcom mon jul 29 112859 returnpath ...
939,from yourmembership2@aeopublishing.com sat jun...,1,yourmembership2aeopublishingcom sat jun 30 073...,yourmembership2aeopublishingcom sit jun 30 073...
3069,from fork-admin@xent.com mon jul 29 11:41:22 2...,1,forkadminxentcom mon jul 29 114122 returnpath ...,forkadminxentcom mon jul 29 114122 returnpath ...


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [24]:
X = df.lemmatized_text
X

0       ilugadminlinuxie mon jul 29 112802 returnpath ...
1       gort44excitecom mon jun 24 175421 returnpath g...
2       forkadminxentcom mon jul 29 113957 returnpath ...
3       dcm123btamailnetcn mon jun 24 174923 returnpat...
4       ilugadminlinuxie mon 19 110247 returnpath ilug...
                              ...                        
5791    ilugadminlinuxie mon jul 22 181245 returnpath ...
5792    forkadminxentcom mon oct 7 203702 returnpath f...
5793    hqpronsnet hqpronsnet 81258125 g6llbbcu047091 ...
5794    razorusersadminlistssourceforgenet thu 12 1844...
5795    rssfeedsjmasonorg mon 30 134410 returnpath rss...
Name: lemmatized_text, Length: 5796, dtype: object

In [25]:
y = df.label
y

0       0
1       1
2       1
3       1
4       0
       ..
5791    0
5792    0
5793    1
5794    0
5795    0
Name: label, Length: 5796, dtype: int64

In [26]:
X_train , X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2, stratify=y)

In [27]:
# transform text data to feature vectors
featureX = TfidfVectorizer(min_df=1, lowercase=True)
X_train_feature = featureX.fit_transform(X_train)
X_test_feature = featureX.transform(X_test)

# lable values conversion

y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [28]:
X = featureX.transform(X)

In [29]:
print(X_train)

468     razorusersadminlistssourceforgenet fri 9 15342...
4124    hqpronsnet hqpronsnet 81258125 g6o753hy015406 ...
5723    rssfeedsjmasonorg thu 26 164312 returnpath rss...
1090    olheie31usanet mon 6 130409 2001 returnpath ol...
4549    rssfeedsjmasonorg thu 26 164306 returnpath rss...
                              ...                        
226     returnpath skippoboxcom deliverydate fri 6 161...
803     forkadminxentcom thu 8 143734 returnpath forka...
4224    rssfeedsjmasonorg tue 24 104729 returnpath rss...
2721    jbgasparhotmailcom thu jun 28 040445 2001 retu...
5179    newadulttoys0463b54yahoocom wed 28 110402 retu...
Name: lemmatized_text, Length: 4636, dtype: object


In [30]:
print(X_train_feature)

  (0, 94069)	0.035223837591099996
  (0, 94159)	0.031537460848176706
  (0, 88546)	0.031074770630239543
  (0, 86105)	0.030636605597285586
  (0, 145522)	0.029848540034760486
  (0, 58697)	0.031537460848176706
  (0, 133078)	0.02505338147091036
  (0, 128557)	0.026618115722851665
  (0, 123488)	0.07233781379957914
  (0, 121512)	0.027816492579933728
  (0, 58141)	0.04294066676027578
  (0, 55702)	0.05903028886975995
  (0, 65915)	0.031537460848176706
  (0, 123515)	0.0689852538824583
  (0, 141731)	0.01623165645620696
  (0, 48633)	0.037510131051896586
  (0, 98976)	0.0344273305473604
  (0, 97609)	0.03775882438905693
  (0, 54514)	0.07233781379957914
  (0, 68709)	0.03494972088119787
  (0, 147239)	0.039909251291246436
  (0, 113935)	0.04294066676027578
  (0, 56246)	0.042823382096905095
  (0, 128080)	0.07917621368630985
  (0, 108088)	0.03915442439320226
  :	:
  (4635, 137720)	0.029939405432652205
  (4635, 143681)	0.029151059868192077
  (4635, 131968)	0.06003226260696458
  (4635, 145444)	0.0759443510727617

In [31]:
# importing models
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB , GaussianNB , BernoulliNB

In [32]:
# list of models
from sklearn.model_selection import cross_val_score
models = [LogisticRegression(max_iter=1000), MultinomialNB(), BernoulliNB(),svm.SVC(kernel='linear')]

In [33]:
def compare_models_cross_validation():

  for model in models:

    cv_score = cross_val_score(model, X, y, cv=5)
    mean_accuracy = sum(cv_score)/len(cv_score)
    mean_accuracy = mean_accuracy*100
    mean_accuracy = round(mean_accuracy, 2)

    print('Cross Validation accuracies for the',model,'=', cv_score)
    print('Acccuracy score of the ',model,'=',mean_accuracy,'%')
    print('---------------------------------------------------------------')

In [34]:
compare_models_cross_validation()

Cross Validation accuracies for the LogisticRegression(max_iter=1000) = [0.98793103 0.98101812 0.986195   0.98446937 0.98705781]
Acccuracy score of the  LogisticRegression(max_iter=1000) = 98.53 %
---------------------------------------------------------------
Cross Validation accuracies for the MultinomialNB() = [0.93793103 0.9257981  0.93356342 0.93356342 0.94219154]
Acccuracy score of the  MultinomialNB() = 93.46 %
---------------------------------------------------------------
Cross Validation accuracies for the BernoulliNB() = [0.94396552 0.93701467 0.94823123 0.9456428  0.95081967]
Acccuracy score of the  BernoulliNB() = 94.51 %
---------------------------------------------------------------
Cross Validation accuracies for the SVC(kernel='linear') = [0.99568966 0.99396031 0.99568594 0.99741156 0.99568594]
Acccuracy score of the  SVC(kernel='linear') = 99.57 %
---------------------------------------------------------------


### As we can see SVM has highest accuracy of 99.57%

In [35]:
# Training Model
classfier = svm.SVC(kernel='linear')
classfier.fit(X_train_feature,y_train)

In [36]:
from sklearn.metrics import accuracy_score
x_train_pred = classfier.predict(X_train_feature)
training_data_acc_score = accuracy_score(x_train_pred, y_train)
print(f"Accuracy score of the trainig data: {training_data_acc_score} ")

# Acc score for the test set
x_test_pred = classfier.predict(X_test_feature)
test_data_acc_score = accuracy_score(x_test_pred, y_test)
print(f"Accuracy score of the testing data: {test_data_acc_score} ")

Accuracy score of the trainig data: 1.0 
Accuracy score of the testing data: 0.993103448275862 


### Building Predictive System

In [37]:
input_mail = ["new sequenc window"]
input_mail = [lemmatize_words(remove_stopwords(remove_punctuations(input_mail[0])))]
# input_mail

In [38]:
input_data_feature = featureX.transform(input_mail)

In [39]:
#making prediction 
prediction = classfier.predict(input_data_feature)
print(prediction)

[0]


In [40]:
import pickle

In [41]:
# pipeline = [lemmatize_words(remove_stopwords(remove_punctuations(input_mail[0])))]

In [42]:
pickle.dump(classfier, open("spam.pkl", "wb"))
# pickle.dump(pipeline,open("pipe.pkl","wb"))
pickle.dump(featureX, open("feature.pkl", "wb"))

In [43]:
clf = pickle.load(open("spam.pkl", "rb"))
clf

In [44]:
input_mail  = " You won 10k dollar"
clean = [lemmatize_words(remove_stopwords(remove_punctuations(input_mail[0])))]

feat = featureX.transform(clean)
result = classfier.predict(feat)
print(result)



[1]


In [45]:
df.label.value_counts()

0    3900
1    1896
Name: label, dtype: int64