Spam Mail Prediction Model

Importing Dependencies

In [121]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

Data Preprocessing

In [122]:
# Load the Spam-Ham dataset (downloaded from Kaggle) into a pandas DataFrame
# and drop the 'label_num' and 'Unnamed: 0' columns in the same step.
raw_mail_data = pd.read_csv(
    "./Kaggle Spam-Ham Dataset/spam_ham_dataset.csv"
).drop(columns=['label_num', 'Unnamed: 0'])

# Keep every non-null cell unchanged and put an empty string wherever a value is missing.
mail_data = raw_mail_data.where(raw_mail_data.notna(), "")

# Preview the first rows of the cleaned frame.
print(mail_data.head())

  label                                               text
0   ham  Subject: enron methanol ; meter # : 988291\r\n...
1   ham  Subject: hpl nom for january 9 , 2001\r\n( see...
2   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...
3  spam  Subject: photoshop , windows , office . cheap ...
4   ham  Subject: re : indian springs\r\nthis deal is t...


In [123]:
# Labelling
# Encode the target in a new 'category' column: spam mail -> 1, non-spam (ham) mail -> 0.
mail_data.loc[mail_data['label'] == 'spam', 'category'] = 1
mail_data.loc[mail_data['label'] == 'ham', 'category'] = 0

# Rows whose label is neither 'spam' nor 'ham' are left as NaN by the two
# assignments above. Fail here with a clear message rather than letting the
# NaN surface later, when the labels are cast to integers.
unlabelled = mail_data['category'].isna()
if unlabelled.any():
    unexpected = sorted(mail_data.loc[unlabelled, 'label'].astype(str).unique())
    raise ValueError("Unexpected values in 'label' column: {}".format(unexpected))

In [124]:
# Split the frame into the model input (X = mail text) and the target (Y = category).
X, Y = mail_data['text'], mail_data['category']

# Show both series, one after the other, with a dashed rule in between.
print(X, '-------------------------------------------------', Y, sep='\n')

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object
-------------------------------------------------
0       0.0
1       0.0
2       0.0
3       1.0
4       0.0
       ... 
5166    0.0
5167    0.0
5168    0.0
5169    0.0
5170    1.0
Name: category, Length: 5171, dtype: float64


Train Test Split

In [125]:
# Hold out 20% of the mails for testing and train on the remaining 80%.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=3,
)

Feature Extraction

In [126]:
# Turn the raw mail text into TF-IDF feature vectors that can be fed to the
# SVM (Support Vector Machine). English stop words are removed and
# lower-casing is enabled.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training mails only, then reuse the fitted
# vectorizer to transform the test mails.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast both label series to integers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')



Training the Model -- Support Vector Machine


In [127]:
# Train a linear Support Vector Machine classifier on the training features
# so it can separate the two classes (spam / ham).
# random_state is fixed (same seed as the train/test split) because the
# LinearSVC solver can shuffle the data internally; without a seed, repeated
# runs of the notebook are not guaranteed to fit the same model.
model = LinearSVC(random_state=3)
model.fit(X_train_features, Y_train)

In [128]:
# Evaluation of the model
# Score the fitted model on the same data it was trained on.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    Y_train,
    prediction_on_training_data,
)
print("Accuracy on training data:", accuracy_on_training_data)

Accuracy on training data: 1.0


In [129]:
# Score the fitted model on the held-out test data.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    Y_test,
    prediction_on_test_data,
)
print("Accuracy on test data:", accuracy_on_test_data)

Accuracy on test data: 0.9864734299516909


Prediction on new mail

In [130]:
# A single sample mail, wrapped in a list (one element per mail).
input_mail = ['Hi-tech industry leader in an emerging 90 billion dollar environmental marketplace seeks U.S.']

# Convert the text to feature vectors with the already-fitted vectorizer.
input_mail_features = feature_extraction.transform(input_mail)

# Predict the class of the mail: 1 is reported as spam, anything else as ham.
prediction = model.predict(input_mail_features)
print("Spam Mail" if prediction[0] == 1 else "Ham Mail")

Spam Mail
