## Workflow

- data preprocessing
- train-test split
- feature extraction
- logistic regression model
- trained logistic regression model

## Importing dependencies

In [9]:
import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split

# convert text data into numerical value
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

# evaluation of testing data
from sklearn.metrics import accuracy_score

## Data preprocessing

In [5]:
# Load the labelled mail corpus; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer="spam_ham_dataset.csv")

In [16]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [14]:
# Number of rows and columns in the loaded frame.
df.shape

(5171, 4)

In [13]:
# Flag every column that contains at least one missing value.
df.isna().any()

Unnamed: 0    False
label         False
text          False
label_num     False
dtype: bool

In [24]:
# Features are the raw mail texts; targets are the numeric labels (0 = ham, 1 = spam).
x, y = df["text"], df["label_num"]

In [39]:
# Inspect the raw mail texts that serve as model input.
print(x)

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object


In [40]:
# Inspect the numeric labels that serve as the prediction target.
print(y)

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64


In [34]:
# Hold out 25% of the mails for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=5,
)

# Report the size of each partition, one per line.
print(x_train.shape, x_test.shape, sep="\n")

(3878,)
(1293,)


## Feature extraction

In [46]:
# Convert the raw text into TF-IDF numeric features that can be used as input
# to the logistic regression.
# `lowercase` must be a real boolean: the string 'True' only worked by being
# truthy, and scikit-learn's parameter validation rejects it in recent releases.
feature_extraction = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)



In [47]:
# Fit the vectorizer on the training mails only and transform them in one step.
x_train_features = feature_extraction.fit_transform(x_train)
# Reuse the already-fitted vectorizer for the test mails (transform only, no refit),
# so nothing learned from the test set ends up in the features.
x_test_features = feature_extraction.transform(x_test)

In [56]:
# Each printed entry is a (row, feature index) pair followed by its TF-IDF weight.
print(x_train_features)

  (0, 20319)	0.2794977745221887
  (0, 35355)	0.173024466639272
  (0, 28468)	0.23297130805344218
  (0, 41033)	0.14404538374323975
  (0, 16277)	0.23297130805344218
  (0, 27279)	0.27882701079856054
  (0, 2969)	0.3758610487062901
  (0, 13419)	0.1177762213299618
  (0, 35944)	0.15425010061300629
  (0, 11910)	0.3454618202891332
  (0, 35244)	0.16738371795488569
  (0, 28330)	0.28187645923680804
  (0, 37603)	0.17545544140008396
  (0, 25743)	0.15314324524506515
  (0, 27940)	0.1190762362540078
  (0, 5478)	0.18286461138738433
  (0, 981)	0.21463020626793478
  (0, 13476)	0.19932193993116026
  (0, 3053)	0.2706969967377999
  (0, 26623)	0.12765417803744888
  (0, 37360)	0.04385681608428319
  (1, 31674)	0.0319409931099472
  (1, 1085)	0.048754622942930204
  (1, 1505)	0.03062202379975014
  (1, 19432)	0.0344349261059447
  :	:
  (3876, 19393)	0.01665594400499775
  (3876, 19066)	0.03750228110802161
  (3876, 17279)	0.02297525524629861
  (3876, 9740)	0.02128936447276058
  (3876, 34092)	0.015300458759910845
  (38

## Training logistic regression model

In [55]:
# Logistic regression classifier with default settings.
model = LogisticRegression()

# Fit on the TF-IDF training features and their labels.
model.fit(x_train_features, y_train)

LogisticRegression()

## Evaluating model

Note:

We check the accuracy of the predictions on both the training data and the testing data so that we can detect overfitting if it occurs.

Overfitting is when the model learns the training data too well, capturing noise and random fluctuations in the training data instead of the underlying patterns, and therefore performs extremely well on the training data but poorly on the testing data.

In [57]:
# Predict labels for the same mails the model was trained on ...
predic = model.predict(x_train_features)

# ... and score them against the true training labels.
accur = accuracy_score(y_true=y_train, y_pred=predic)

In [63]:
# Report the accuracy measured on the training split.
print(f"Accuracy on training data is {accur}")

Accuracy on training data is 0.9958741619391439


In [68]:
# Predict labels for the held-out mails the model has never seen ...
predic1 = model.predict(x_test_features)

# ... and score them against the true test labels.
accur1 = accuracy_score(y_true=y_test, y_pred=predic1)

In [66]:
# Report the accuracy measured on the held-out test split (`accur1`); the message
# previously said "training data", which mislabelled this number.
print(f"Accuracy on testing data is {accur1}")

Accuracy on training data is 0.9907192575406032


## Build a predictive system

In [82]:
def spam_detection(mail):
    """Classify mail text as spam or ham using the fitted vectorizer and model.

    Parameters
    ----------
    mail : str or iterable of str
        Raw mail text. A bare string is treated as a single mail. For an
        iterable of mails, only the prediction for the first one is reported.

    Returns
    -------
    str
        "Spam mail" if the predicted label is 1, otherwise "Ham mail".
    """
    # The vectorizer expects an iterable of documents and rejects a bare
    # string, so wrap a single mail in a list before transforming it.
    if isinstance(mail, str):
        mail = [mail]

    # Reuse the vectorizer fitted earlier in the notebook so the features line
    # up with the ones the model was trained on.
    mail_features = feature_extraction.transform(mail)

    prediction = model.predict(mail_features)

    return "Spam mail" if prediction[0] == 1 else "Ham mail"

In [83]:
# Try the predictive system on one sample message, passed as a single-element list.
spam_detection(["on the calender that i handed out at the beginning of the fall semester , the retreat was scheduled for the weekend of january 5 - 6 . but because of a youth ministers conference that brad and dustin are connected with that week , we ' re going to change the date to the following weekend , january 12 - 13 . now comes the part you need to think about ."])

'Ham mail'