**Importing the Libraries**

In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

**Loading the Dataset**

In [59]:
# Read the spam/ham e-mail corpus from its CSV file and preview the first two rows.
df = pd.read_csv(filepath_or_buffer="/content/spam_ham_dataset.csv")
df.head(n=2)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0


In [60]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

**Exploratory data analysis**

In [61]:
# Summarize the DataFrame: per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [62]:
# Check the number of rows and columns in the DataFrame.
df.shape

(5171, 4)

**Label Encoding**

In [63]:
# Encode the string labels as integers: ham -> 1, spam -> 0.
# A single Series.map with an explicit astype replaces two chained replace() calls,
# which relied on pandas silently downcasting the object column to an integer dtype
# (deprecated since pandas 2.2). The identity entries keep this cell safe to re-run
# once the column is already encoded. Any unexpected label maps to NaN, so astype
# raises here instead of letting a bad value slip through.
LABEL_ENCODING = {'ham': 1, 'spam': 0, 1: 1, 0: 0}
df['label'] = df['label'].map(LABEL_ENCODING).astype('int64')


In [64]:
# Separate the frame into the model input (e-mail text) and the target (encoded label).
X, Y = df['text'], df['label']

In [65]:
# Display the e-mail text Series used as the model input.
X

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object

In [66]:
# Display the encoded label Series used as the target.
Y

0       1
1       1
2       1
3       0
4       1
       ..
5166    1
5167    1
5168    1
5169    1
5170    0
Name: label, Length: 5171, dtype: int64

**Splitting the data into training and test data**

In [67]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [68]:
# Number of training texts.
x_train.shape

(4136,)

In [69]:
# Number of training labels.
y_train.shape

(4136,)

In [70]:
# Number of test texts.
x_test.shape

(1035,)

In [71]:
# Number of test labels.
y_test.shape

(1035,)

**Feature Extraction**

In [72]:
# Build the TF-IDF vectorizer that turns raw e-mail text into numeric feature vectors
# usable as input to Logistic Regression: text is lower-cased, English stop words are
# dropped, and a term only has to occur in a single document to be kept.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [73]:
# Fit the vectorizer on the training texts only, then reuse the fitted vectorizer
# on the test texts so that nothing is learned from the test split.
x_train_features = feature_extraction.fit_transform(raw_documents=x_train)
x_test_features = feature_extraction.transform(raw_documents=x_test)

# Make sure both label vectors hold integers.
y_train, y_test = y_train.astype(int), y_test.astype(int)

In [74]:
# Print the raw training texts (pandas abbreviates the long Series).
print(x_train)

4282           Subject: hl & p\r\nthe most recent numbers
2247    Subject: re : your account # bj 535\r\nhi agai...
32      Subject: to : all domestic employees who parti...
4254    Subject: re : first delivery - cummings & walk...
2860    Subject: aol instant messenger reconfirmation\...
                              ...                        
2763    Subject: re [ 13 ]\r\ndriving at ? in 1876\r\n...
905     Subject: 29 th changes\r\n- - - - - - - - - - ...
3980    Subject: buy regalis , also known as superviag...
235     Subject: 24 x 5 products\r\nplease make sure t...
5157    Subject: 5 th changes @ duke and air liquide\r...
Name: text, Length: 4136, dtype: object


In [75]:
# Print the TF-IDF feature matrix built from the training texts.
# NOTE(review): printing the whole matrix produces a long entry-by-entry listing;
# consider showing x_train_features.shape instead.
print(x_train_features)

  (0, 29574)	0.47778654265448484
  (0, 34010)	0.5748890303250579
  (0, 21439)	0.6555210728666094
  (0, 38532)	0.10730678216216491
  (1, 25575)	0.19855234617038361
  (1, 17218)	0.21360460802391193
  (1, 38087)	0.22134253461294806
  (1, 41624)	0.23224852960324222
  (1, 26741)	0.12059596193972569
  (1, 19477)	0.13779676002245456
  (1, 34244)	0.12059596193972569
  (1, 7928)	0.10011042819455333
  (1, 39670)	0.09783604642461713
  (1, 18669)	0.1275683195788644
  (1, 36832)	0.1362370481538148
  (1, 25842)	0.11404886332474426
  (1, 42111)	0.11576265900925982
  (1, 24293)	0.08551406697060127
  (1, 27871)	0.15376781116260807
  (1, 32701)	0.11228182553503797
  (1, 30490)	0.19855234617038361
  (1, 6127)	0.19855234617038361
  (1, 31036)	0.132036715959402
  (1, 28236)	0.08557160173736271
  (1, 1706)	0.17468611451187457
  :	:
  (4135, 15623)	0.24067696282347756
  (4135, 154)	0.09641535926030442
  (4135, 5137)	0.30233776856689054
  (4135, 1192)	0.1006832934980391
  (4135, 71)	0.17058711589526676
  (413

**Training the Model**

Logistic regression

In [76]:
# Create a logistic regression classifier; no arguments are passed, so library defaults apply.
model = LogisticRegression()

In [77]:
# Fit the logistic regression model on the TF-IDF training features and their labels.
model.fit(X=x_train_features, y=y_train)

**Evaluating the trained model**

In [78]:
# Score the fitted model on the same samples it was trained on.
prediction_on_training_data = model.predict(X=x_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [79]:
# Report the accuracy measured on the training data.
print(accuracy_on_training_data)

0.995164410058027


In [80]:
# Score the fitted model on the held-out test samples.
prediction_on_test_data = model.predict(X=x_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)

In [81]:
# Report the accuracy measured on the test data.
print(accuracy_on_test_data)

0.991304347826087


**Building a Predictive System**

In [55]:
# One-element list holding the raw text of a sample e-mail to classify.
input_mail = ["Subject: vic . odin n ^ ow berne hotbox carnal bride cutworm dyadic guardia continuous born gremlin akin counterflow hereafter vocabularian pessimum yaounde cannel bitch penetrate demagogue arbitrary egregious adenosine rubin gil luminosity delicti yarmulke sauterne selfadjoint agleam exeter picofarad consulate dichotomous boyhood balfour spheric frey pillory hoosier fibonacci cat handful"]

In [56]:
# Vectorize the sample e-mail with the already-fitted TF-IDF vectorizer.
input_data_features = feature_extraction.transform(raw_documents=input_mail)

In [57]:
# Classify the vectorized sample e-mail and show the predicted label array.
prediction = model.predict(X=input_data_features)
print(prediction)

[0]


In [82]:
# One-element list holding the raw text of a second sample e-mail to classify.
input_mail = ['Subject: enron / hpl actuals for august 28 , 2000 teco tap 20 . 000 / enron ; 120 . 000 / hpl gas daily ls hpl lsk ic 20 . 000 / enron']

In [83]:
# Vectorize the second sample e-mail with the already-fitted TF-IDF vectorizer.
input_data_features = feature_extraction.transform(raw_documents=input_mail)

In [85]:
# Classify the vectorized sample e-mail and show the predicted label array.
prediction = model.predict(X=input_data_features)
print(prediction)

# A predicted label of 1 is reported as ham; any other value is reported as spam.
print('ham mail' if prediction[0] == 1 else 'spam mail')

[1]
ham mail
