In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import seaborn as sns

Data Collection and Pre-processing

In [25]:
# Read the spam/ham CSV file into a pandas DataFrame
dataset_path = '/content/drive/MyDrive/Major Project/spam_ham_dataset.csv'
raw_data = pd.read_csv(dataset_path)

In [26]:
# Select and display only the rows labelled 'spam'
spam_mask = raw_data['Category'] == 'spam'
raw_data.loc[spam_mask]

Unnamed: 0,Category,Message
3,spam,"photoshop , windows , office . cheap . main t..."
7,spam,looking for medication ? we ` re the best sou...
10,spam,vocable % rnd - word asceticism\r\nvcsc - bra...
11,spam,report 01405 !\r\nwffur attion brom est inst ...
13,spam,vic . odin n ^ ow\r\nberne hotbox carnal brid...
...,...,...
5159,spam,pictures\r\nstreamlined denizen ajar chased\r...
5161,spam,penny stocks are about timing\r\nnomad intern...
5162,spam,anomaly boys from 3881\r\nuosda apaproved mle...
5164,spam,slutty milf wants to meet you\r\ntake that !\...


In [27]:
# Select and display only the rows labelled 'ham'
ham_mask = raw_data['Category'] == 'ham'
raw_data.loc[ham_mask]

Unnamed: 0,Category,Message
0,ham,enron methanol ; meter # : 988291\r\nthis is ...
1,ham,"hpl nom for january 9 , 2001\r\n( see attache..."
2,ham,"neon retreat\r\nho ho ho , we ' re around to ..."
4,ham,re : indian springs\r\nthis deal is to book t...
5,ham,ehronline web address change\r\nthis message ...
...,...,...
5165,ham,"fw : crosstex energy , driscoll ranch # 1 , #..."
5166,ham,put the 10 on the ft\r\nthe transport volumes...
5167,ham,3 / 4 / 2000 and following noms\r\nhpl can ' ...
5168,ham,calpine daily gas nomination\r\n>\r\n>\r\njul...


In [28]:
# Print the raw mail data as plain text (the middle rows are elided in the output)
print(raw_data)

     Category                                            Message
0         ham   enron methanol ; meter # : 988291\r\nthis is ...
1         ham   hpl nom for january 9 , 2001\r\n( see attache...
2         ham   neon retreat\r\nho ho ho , we ' re around to ...
3        spam   photoshop , windows , office . cheap . main t...
4         ham   re : indian springs\r\nthis deal is to book t...
...       ...                                                ...
5166      ham   put the 10 on the ft\r\nthe transport volumes...
5167      ham   3 / 4 / 2000 and following noms\r\nhpl can ' ...
5168      ham   calpine daily gas nomination\r\n>\r\n>\r\njul...
5169      ham   industrial worksheets for august 2000 activit...
5170     spam   important online banking alert\r\ndear valued...

[5171 rows x 2 columns]


In [31]:
# Replace every missing value with an empty string; the result is a new frame
mail_data = raw_data.where(raw_data.notna(), other='')

Checking the Dataset

In [30]:
# Display the first 5 rows of the null-free dataframe
mail_data.head()

Unnamed: 0,Category,Message
0,ham,enron methanol ; meter # : 988291\r\nthis is ...
1,ham,"hpl nom for january 9 , 2001\r\n( see attache..."
2,ham,"neon retreat\r\nho ho ho , we ' re around to ..."
3,spam,"photoshop , windows , office . cheap . main t..."
4,ham,re : indian springs\r\nthis deal is to book t...


In [32]:
# Display the last 5 rows of the null-free dataframe
mail_data.tail()

Unnamed: 0,Category,Message
5166,ham,put the 10 on the ft\r\nthe transport volumes...
5167,ham,3 / 4 / 2000 and following noms\r\nhpl can ' ...
5168,ham,calpine daily gas nomination\r\n>\r\n>\r\njul...
5169,ham,industrial worksheets for august 2000 activit...
5170,spam,important online banking alert\r\ndear valued...


In [33]:
# Check the size of the dataframe as a (rows, columns) tuple
mail_data.shape

(5171, 2)

Assigning Label / Label Encoding

In [34]:
# Encode the labels numerically: spam -> 0, ham -> 1
is_spam = mail_data['Category'] == 'spam'
is_ham = mail_data['Category'] == 'ham'

mail_data.loc[is_spam, 'Category'] = 0
mail_data.loc[is_ham, 'Category'] = 1

Reminder

spam = 0

ham = 1

Separating the data

In [35]:
# Split the frame into the message texts (X) and their labels (Y)
X, Y = mail_data['Message'], mail_data['Category']

Verifying after Separating

In [36]:
# Inspect the message texts that will be used as model input
print(X)

0        enron methanol ; meter # : 988291\r\nthis is ...
1        hpl nom for january 9 , 2001\r\n( see attache...
2        neon retreat\r\nho ho ho , we ' re around to ...
3        photoshop , windows , office . cheap . main t...
4        re : indian springs\r\nthis deal is to book t...
                              ...                        
5166     put the 10 on the ft\r\nthe transport volumes...
5167     3 / 4 / 2000 and following noms\r\nhpl can ' ...
5168     calpine daily gas nomination\r\n>\r\n>\r\njul...
5169     industrial worksheets for august 2000 activit...
5170     important online banking alert\r\ndear valued...
Name: Message, Length: 5171, dtype: object


In [37]:
# Inspect the encoded labels (0 = spam, 1 = ham)
print(Y)

0       1
1       1
2       1
3       0
4       1
       ..
5166    1
5167    1
5168    1
5169    1
5170    0
Name: Category, Length: 5171, dtype: object


Splitting the data into Training and Testing data

In [38]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [39]:
# Report the size of the full set, then of the training and test splits
for messages in (X, X_train, X_test):
    print(messages.shape)

(5171,)
(4136,)
(1035,)


Feature Extraction

In [40]:
# Transform the text data into TF-IDF feature vectors that can be used as
# input to the Logistic Regression model.
# `lowercase` must be the boolean True: the string 'True' only worked because
# it is truthy, and scikit-learn releases that validate constructor
# parameters reject it with InvalidParameterError.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vocabulary and IDF weights on the training split only, then reuse
# them for the test split so no test information leaks into the features.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Convert Y_train and Y_test values to integers (the labels are stored as object dtype)

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [41]:
# Inspect the training messages; the index shows the shuffled row order
print(X_train)

2209     hplc to wellhead\r\ndaren here is the list of...
2000     mobil chemical - hpl meter # 1256 - expense t...
5030     revised nom 5 / 5 - eastrans ; revised nom fo...
1376     re : exxon company , usa global # 96035668 / ...
1564     your pharmacy nx\r\nwant a cheap pain killers...
                              ...                        
789      incr ' ease yo ' ur man ' hood by 4 - 5 inch ...
968      subscribers receive first notice on run - awa...
1667     neon for march 28\r\nhere is the neon lesson ...
3321     re : first delivery - pure resources , l . p ...
1688     enhance your chest size\r\nemail is loading ....
Name: Message, Length: 4136, dtype: object


In [42]:
# Print the TF-IDF training matrix; each entry is shown as (row, feature index) followed by its weight
print(X_train_features)

  (0, 3871)	0.13389614076826556
  (0, 531)	0.14558291649540278
  (0, 30451)	0.08470120335031706
  (0, 43273)	0.14558291649540278
  (0, 3890)	0.14558291649540278
  (0, 548)	0.14558291649540278
  (0, 37262)	0.11277398913484075
  (0, 2908)	0.11537303948641482
  (0, 456)	0.14558291649540278
  (0, 26297)	0.09507351214164073
  (0, 36190)	0.11402348314483417
  (0, 2478)	0.13874659094068928
  (0, 521)	0.14558291649540278
  (0, 16808)	0.11844706359591517
  (0, 22041)	0.13389614076826556
  (0, 2706)	0.14558291649540278
  (0, 522)	0.14558291649540278
  (0, 32060)	0.07312873621956524
  (0, 19411)	0.042116273278479076
  (0, 2537)	0.13874659094068928
  (0, 517)	0.14558291649540278
  (0, 19429)	0.14558291649540278
  (0, 16637)	0.24441873008225665
  (0, 3875)	0.14558291649540278
  (0, 836)	0.14558291649540278
  :	:
  (4135, 6786)	0.07327688767013263
  (4135, 8873)	0.07154593958976478
  (4135, 16161)	0.07327688767013263
  (4135, 26936)	0.07154593958976478
  (4135, 15691)	0.07812663033583245
  (4135, 14

Training the Model

Logistic Regression

In [43]:
# Create a Logistic Regression classifier with the library's default settings
model = LogisticRegression()

In [44]:
# Train the Logistic Regression model on the TF-IDF training features and their integer labels
model.fit(X_train_features, Y_train)

LogisticRegression()

Evaluation of Trained Model

In [45]:
# prediction on training data
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

# prediction on the held-out test data: training accuracy alone cannot reveal
# overfitting, so the model is also scored on messages it was not trained on
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print('Accuracy on test data : ', accuracy_on_test_data)

In [46]:
# Report the accuracy measured on the training data
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9968568665377177


Designing Spam Mail Predictive System

In [47]:
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# Vectorize the message with the already-fitted TF-IDF vectorizer
input_data_features = feature_extraction.transform(input_mail)

# Predict the label (1 = ham, 0 = spam) and show the raw prediction
prediction = model.predict(input_data_features)
print(prediction)

# Translate the numeric label into a readable verdict
verdict = 'Ham mail' if prediction[0] == 1 else 'Spam mail'
print(verdict)

[0]
Spam mail
