In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./Spam mail/Spam_Ham_data.csv')
df.head()

Unnamed: 0,email,label,Subject,content
0,b'From exmh-workers-admin@redhat.com Thu Aug ...,0.0,new sequenc window,date wed number aug number number number chri ...
1,b'Return-Path: <Online#3.19578.34-UgGTgZFN19NA...,0.0,cnet newscom cabl compani crack wifi,htmlhead titlec compani crack nametopa logo ad...
2,b'Return-Path: <Online#3.19584.83-p1SYlJ1blFvQ...,0.0,save extra number ipaq number pda cnet shopper,htmlhead newslett hreftopa header tabl widthnu...
3,b'From Steve_Burt@cursor-system.com Thu Aug 2...,0.0,zzzzteana alexand,martin greek sculptor behind plan judg limesto...
4,b'Return-Path: <Online#3.19586.b5-9w0blztbvHPd...,0.0,week deck texedit plu boom,cnet download dispatchmac editionjuli number n...


##### Label: 0 means ham, 1 means spam.

In [3]:
df.shape

(3063, 4)

In [4]:
df.drop(columns=['email'],inplace=True, axis=1)
df.head()

Unnamed: 0,label,Subject,content
0,0.0,new sequenc window,date wed number aug number number number chri ...
1,0.0,cnet newscom cabl compani crack wifi,htmlhead titlec compani crack nametopa logo ad...
2,0.0,save extra number ipaq number pda cnet shopper,htmlhead newslett hreftopa header tabl widthnu...
3,0.0,zzzzteana alexand,martin greek sculptor behind plan judg limesto...
4,0.0,week deck texedit plu boom,cnet download dispatchmac editionjuli number n...


#### Pre-Processing

In [5]:
df['main'] = df.Subject +" "+df.content
df.head()

Unnamed: 0,label,Subject,content,main
0,0.0,new sequenc window,date wed number aug number number number chri ...,new sequenc window date wed number aug number ...
1,0.0,cnet newscom cabl compani crack wifi,htmlhead titlec compani crack nametopa logo ad...,cnet newscom cabl compani crack wifi htmlhead ...
2,0.0,save extra number ipaq number pda cnet shopper,htmlhead newslett hreftopa header tabl widthnu...,save extra number ipaq number pda cnet shopper...
3,0.0,zzzzteana alexand,martin greek sculptor behind plan judg limesto...,zzzzteana alexand martin greek sculptor behind...
4,0.0,week deck texedit plu boom,cnet download dispatchmac editionjuli number n...,week deck texedit plu boom cnet download dispa...


In [6]:
df.drop(columns=['Subject','content'],axis=1,inplace=True)
df.head(2)

Unnamed: 0,label,main
0,0.0,new sequenc window date wed number aug number ...
1,0.0,cnet newscom cabl compani crack wifi htmlhead ...


In [7]:
df.main = df.main.str.lower()

In [8]:
df.main.fillna(" ", inplace=True)

In [9]:
df.main.isnull().sum()

0

In [10]:
# Remove Punctuations
import string
def remove_punctuations(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('', '', punctuations))
df['clean_text'] = df['main'].apply(lambda x: remove_punctuations(x))
df.head()

Unnamed: 0,label,main,clean_text
0,0.0,new sequenc window date wed number aug number ...,new sequenc window date wed number aug number ...
1,0.0,cnet newscom cabl compani crack wifi htmlhead ...,cnet newscom cabl compani crack wifi htmlhead ...
2,0.0,save extra number ipaq number pda cnet shopper...,save extra number ipaq number pda cnet shopper...
3,0.0,zzzzteana alexand martin greek sculptor behind...,zzzzteana alexand martin greek sculptor behind...
4,0.0,week deck texedit plu boom cnet download dispa...,week deck texedit plu boom cnet download dispa...


In [11]:
# Stop word removal
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,label,main,clean_text
0,0.0,new sequenc window date wed number aug number ...,new sequenc window date wed number aug number ...
1,0.0,cnet newscom cabl compani crack wifi htmlhead ...,cnet newscom cabl compani crack wifi htmlhead ...
2,0.0,save extra number ipaq number pda cnet shopper...,save extra number ipaq number pda cnet shopper...
3,0.0,zzzzteana alexand martin greek sculptor behind...,zzzzteana alexand martin greek sculptor behind...
4,0.0,week deck texedit plu boom cnet download dispa...,week deck texedit plu boom cnet download dispa...


In [12]:
# Frequent Words 
from collections import Counter
word_count = Counter()
for text in df['clean_text']:
    for word in text.split():
        word_count[word] += 1
        
word_count.most_common(10)

[('number', 21007),
 ('widthnumb', 11100),
 ('td', 4673),
 ('tr', 3387),
 ('use', 2826),
 ('helvetica', 2462),
 ('tabl', 2112),
 ('email', 2103),
 ('get', 2057),
 ('faceari', 1926)]

In [13]:
#  Stemming
# from nltk.stem.porter import PorterStemmer
# ps = PorterStemmer()
# def stem_words(text):
#     return " ".join([ps.stem(word) for word in text.split()])
# df['stemmed_text'] = df['result'].apply(lambda x: stem_words(x))
# df.head()

In [14]:
# Lemmatization & POS Tagging

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text):
    # find pos tags
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

df['lemmatized_text'] = df['clean_text'].apply(lambda x: lemmatize_words(x))
df.head()

Unnamed: 0,label,main,clean_text,lemmatized_text
0,0.0,new sequenc window date wed number aug number ...,new sequenc window date wed number aug number ...,new sequenc window date wed number aug number ...
1,0.0,cnet newscom cabl compani crack wifi htmlhead ...,cnet newscom cabl compani crack wifi htmlhead ...,cnet newscom cabl compani crack wifi htmlhead ...
2,0.0,save extra number ipaq number pda cnet shopper...,save extra number ipaq number pda cnet shopper...,save extra number ipaq number pda cnet shopper...
3,0.0,zzzzteana alexand martin greek sculptor behind...,zzzzteana alexand martin greek sculptor behind...,zzzzteana alexand martin greek sculptor behind...
4,0.0,week deck texedit plu boom cnet download dispa...,week deck texedit plu boom cnet download dispa...,week deck texedit plu boom cnet download dispa...


In [15]:
df.sample(frac=1).head(10)

Unnamed: 0,label,main,clean_text,lemmatized_text
124,0.0,person laser pick student soho user zdnet shop...,person laser pick student soho user zdnet shop...,person laser pick student soho user zdnet shop...
2080,0.0,light read url phil ringnalda put numberso ans...,light read url phil ringnalda put numberso ans...,light read url phil ringnalda put numberso ans...
98,0.0,,,
1007,0.0,liber defnit depend much spend vs much type bi...,liber defnit depend much spend vs much type bi...,liber defnit depend much spend v much type big...
2274,0.0,underwat highvoltag photographi url suppliedim...,underwat highvoltag photographi url suppliedim...,underwat highvoltag photographi url suppliedim...
341,0.0,eagl lieuten governor' race competit uniqu uns...,eagl lieuten governor race competit uniqu unsu...,eagl lieuten governor race competit uniqu unsu...
591,0.0,gecko adhes final suss via robot wisdom mayb u...,gecko adhes final suss via robot wisdom mayb u...,gecko adhes final sus via robot wisdom mayb uc...
2861,1.0,pleas help home compani survey win dvr pwe tha...,pleas help home compani survey win dvr pwe tha...,plea help home compani survey win dvr pwe than...
2836,1.0,fw make money fast legal seen tv number ultim ...,fw make money fast legal seen tv number ultim ...,fw make money fast legal see tv number ultim w...
338,0.0,problem aptget mon number oct number thoma van...,problem aptget mon number oct number thoma van...,problem aptget mon number oct number thoma van...


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [17]:
X = df.lemmatized_text
X

0       new sequenc window date wed number aug number ...
1       cnet newscom cabl compani crack wifi htmlhead ...
2       save extra number ipaq number pda cnet shopper...
3       zzzzteana alexand martin greek sculptor behind...
4       week deck texedit plu boom cnet download dispa...
                              ...                        
3058    fwddirect market work number stumbl great way ...
3059    see compani sale sky rocket number stumbl grea...
3060    number hour watch emmerci joke frontpag number...
3061    make fortun ebay number htmlbodi tr td p numbe...
3062    faeri uncommon exot pleasur marvel sweet treat...
Name: lemmatized_text, Length: 3063, dtype: object

In [18]:
y = df.label
y

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
3058    1.0
3059    1.0
3060    1.0
3061    1.0
3062    1.0
Name: label, Length: 3063, dtype: float64

In [19]:
X_train , X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2, stratify=y)

In [20]:
# transform text data to feature vectors
featureX = TfidfVectorizer(min_df=1)
X_train_feature = featureX.fit_transform(X_train)
X_test_feature = featureX.transform(X_test)

# lable values conversion

y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [21]:
X = featureX.transform(X)

In [22]:
print(X_train)

335     subject still time enter lifetime new season s...
2735    low price smoke dear sir madamif feed rip brit...
345     webdev thank info aj find weblog clickpath rep...
2537    footbal player addict video footbal url suppli...
662     io gasp adam l beberg write tue number sep num...
                              ...                        
802     storag bit u udhay shankar n udhay write u num...
2307    brown budget plan disarray url admit growth ta...
882     aa meet hot place meet woman big buck less obs...
1604    satalk problem spam assassin mailscann work sm...
1855    spambay deploy mail give nonspam train need se...
Name: lemmatized_text, Length: 2450, dtype: object


In [23]:
print(X_train_feature)

  (0, 35671)	0.02834518541897588
  (0, 20470)	0.035773021765899374
  (0, 22559)	0.05384134006463584
  (0, 5588)	0.038730784010003884
  (0, 12541)	0.033411095198019096
  (0, 24532)	0.0516495959973709
  (0, 20679)	0.051750908986799446
  (0, 7226)	0.04405047282198867
  (0, 19557)	0.06317614824782632
  (0, 337)	0.04626279723376499
  (0, 18980)	0.05785645943584153
  (0, 27587)	0.0362948202142518
  (0, 24058)	0.12023892351063376
  (0, 17765)	0.03132417166383032
  (0, 18360)	0.028430881131455525
  (0, 25985)	0.07088788985662467
  (0, 25998)	0.042256200088864646
  (0, 35324)	0.09851508640111468
  (0, 26176)	0.05691892019113189
  (0, 12265)	0.025919010452250424
  (0, 31209)	0.0692457535594919
  (0, 1727)	0.04716320096844075
  (0, 25569)	0.04235521917725827
  (0, 18368)	0.07913521468378068
  (0, 12201)	0.04554639280067175
  :	:
  (2449, 27487)	0.05349765856486555
  (2449, 27467)	0.04461484569365248
  (2449, 17983)	0.059088248506488264
  (2449, 13217)	0.07613120979419996
  (2449, 27673)	0.0612626

In [24]:
# importing models
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB , GaussianNB , BernoulliNB

In [25]:
# list of models
from sklearn.model_selection import cross_val_score
models = [LogisticRegression(max_iter=1000), MultinomialNB(), BernoulliNB(),svm.SVC(kernel='linear')]

In [26]:
def compare_models_cross_validation():

  for model in models:

    cv_score = cross_val_score(model, X, y, cv=5)
    mean_accuracy = sum(cv_score)/len(cv_score)
    mean_accuracy = mean_accuracy*100
    mean_accuracy = round(mean_accuracy, 2)

    print('Cross Validation accuracies for the',model,'=', cv_score)
    print('Acccuracy score of the ',model,'=',mean_accuracy,'%')
    print('---------------------------------------------------------------')

In [27]:
compare_models_cross_validation()

Cross Validation accuracies for the LogisticRegression(max_iter=1000) = [0.79771615 0.93800979 0.92169657 0.91503268 0.94607843]
Acccuracy score of the  LogisticRegression(max_iter=1000) = 90.37 %
---------------------------------------------------------------
Cross Validation accuracies for the MultinomialNB() = [0.76835237 0.86786297 0.86786297 0.87254902 0.86764706]
Acccuracy score of the  MultinomialNB() = 84.89 %
---------------------------------------------------------------
Cross Validation accuracies for the BernoulliNB() = [0.59869494 0.8319739  0.91680261 0.89705882 0.91339869]
Acccuracy score of the  BernoulliNB() = 83.16 %
---------------------------------------------------------------
Cross Validation accuracies for the SVC(kernel='linear') = [0.77487765 0.97389886 0.97389886 0.95424837 0.97712418]
Acccuracy score of the  SVC(kernel='linear') = 93.08 %
---------------------------------------------------------------


### As we can see SVM has highest accuracy of 93.08%

In [28]:
# Training Model
classfier = svm.SVC(kernel='linear')
classfier.fit(X_train_feature,y_train)

In [29]:
from sklearn.metrics import accuracy_score
x_train_pred = classfier.predict(X_train_feature)
training_data_acc_score = accuracy_score(x_train_pred, y_train)
print(f"Accuracy score of the trainig data: {training_data_acc_score} ")

# Acc score for the test set
x_test_pred = classfier.predict(X_test_feature)
test_data_acc_score = accuracy_score(x_test_pred, y_test)
print(f"Accuracy score of the testing data: {test_data_acc_score} ")

Accuracy score of the trainig data: 0.9926530612244898 
Accuracy score of the testing data: 0.9820554649265906 


### Building Predictive System

In [30]:
input_mail = ["new sequenc window"]
input_mail = [lemmatize_words(remove_stopwords(remove_punctuations(input_mail[0])))]
# input_mail

In [31]:
input_data_feature = featureX.transform(input_mail)

In [32]:
#making prediction 
prediction = classfier.predict(input_data_feature)
print(prediction)

[0]
