In [40]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [42]:
# Read the mail dataset from 'mail.csv' (a path relative to the working
# directory) into a DataFrame, then print it.
df = pd.read_csv('mail.csv')
print(df)

      Unnamed: 0 label                                               text  \
0            605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1           2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2           3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3           4685  spam  Subject: photoshop , windows , office . cheap ...   
4           2030   ham  Subject: re : indian springs\r\nthis deal is t...   
...          ...   ...                                                ...   
5166        1518   ham  Subject: put the 10 on the ft\r\nthe transport...   
5167         404   ham  Subject: 3 / 4 / 2000 and following noms\r\nhp...   
5168        2933   ham  Subject: calpine daily gas nomination\r\n>\r\n...   
5169        1409   ham  Subject: industrial worksheets for august 2000...   
5170        4807  spam  Subject: important online banking alert\r\ndea...   

      label_num  
0             0  
1             0  
2             0  
3  

In [3]:
# Keep every non-missing cell of `df` and substitute an empty string wherever
# a value is absent; the result is bound to `data`.
data = df.where(df.notna(), other='')


In [48]:
# Display the first 10 rows of `data`.
data.head(10)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,0,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,0,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,0,"Subject: photoshop , windows , office . cheap ...",1
4,2030,0,Subject: re : indian springs\r\nthis deal is t...,0
5,2949,0,Subject: ehronline web address change\r\nthis ...,0
6,2793,0,Subject: spring savings certificate - take 30 ...,0
7,4185,0,Subject: looking for medication ? we ` re the ...,1
8,2641,0,Subject: noms / actual flow for 2 / 26\r\nwe a...,0
9,1870,0,"Subject: nominations for oct . 21 - 23 , 2000\...",0


In [44]:
# Print a summary of `data`: index range, columns, non-null counts, dtypes
# and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [6]:
# (rows, columns) of `data`.
data.shape

(5171, 4)

In [49]:
# Features come from the 'text' column, targets from the 'label_num' column.
X, Y = data['text'], data['label_num']

In [28]:
# Print the feature Series X (the 'text' column of `data`).
print(X)

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object


In [52]:
# Print the target Series Y (the 'label_num' column of `data`).
print(Y)

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64


In [54]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=3,
)

In [55]:
# One shape per line: the full feature Series, then its train and test splits.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(5171,)
(3619,)
(1552,)


In [56]:
# One shape per line: the full target Series, then its train and test splits.
print(Y.shape, Y_train.shape, Y_test.shape, sep='\n')

(5171,)
(3619,)
(1552,)


In [57]:
# TF-IDF vectorizer: lowercases the text and drops English stop words.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# The vocabulary and IDF weights are learned from the training mails only;
# the test mails are projected with that already-fitted vectorizer.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast both label splits to integer dtype.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [58]:
# Print the training split of the text Series.
print(X_train)

1876    Subject: may wellheads\r\ndavid here is the ot...
1704    Subject: new hpl meter - vidor city gate - che...
4876    Subject: your mother may find out\r\nremove\r\...
25      Subject: re : enron / hpl actuals for october ...
2193    Subject: hpl noms for june 06 , 2000\r\n( see ...
                              ...                        
789     Subject: incr ' ease yo ' ur man ' hood by 4 -...
968     Subject: subscribers receive first notice on r...
1667    Subject: neon for march 28\r\nhere is the neon...
3321    Subject: re : first delivery - pure resources ...
1688    Subject: enhance your chest size\r\nemail is l...
Name: text, Length: 3619, dtype: object


In [23]:
# Print the TF-IDF training matrix returned by fit_transform; each printed
# entry is a (row, column) position followed by its weight.
print(X_train_features)

  (0, 469)	0.04234497175924303
  (0, 521)	0.09357075330914992
  (0, 3829)	0.09357075330914992
  (0, 29506)	0.08910783023045304
  (0, 39995)	0.08910783023045304
  (0, 3938)	0.04447828877586618
  (0, 569)	0.09357075330914992
  (0, 3836)	0.09357075330914992
  (0, 29162)	0.08594133591368462
  (0, 38336)	0.08147841283498775
  (0, 1195)	0.08910783023045304
  (0, 3916)	0.09357075330914992
  (0, 24347)	0.06413660599354906
  (0, 36812)	0.03834594227552464
  (0, 3790)	0.09357075330914992
  (0, 36681)	0.08910783023045304
  (0, 1022)	0.0391784774424746
  (0, 1121)	0.08910783023045304
  (0, 3762)	0.09357075330914992
  (0, 36184)	0.06533855415269353
  (0, 3530)	0.05577312179813425
  (0, 499)	0.09357075330914992
  (0, 3853)	0.09357075330914992
  (0, 34944)	0.09357075330914992
  (0, 3774)	0.08348521210646197
  :	:
  (3618, 13169)	0.0634574032194839
  (3618, 4945)	0.07085860257936485
  (3618, 25157)	0.0747398329785886
  (3618, 39904)	0.059824817760223914
  (3618, 33314)	0.06697737218014109
  (3618, 243

In [59]:
# Create a logistic-regression classifier with the library's default settings.
model = LogisticRegression()

In [60]:
# Fit the classifier on the TF-IDF training features and their labels.
model.fit(X_train_features, Y_train)

In [62]:
# Score the fitted model on the same rows it was trained on.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [63]:
# Print the accuracy measured on the training split.
print(accuracy_on_training_data)

0.9969604863221885


In [66]:
# Score the fitted model on the held-out test rows and report the accuracy.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)
print(accuracy_on_test_data)

0.9845360824742269


In [80]:
# Classify one hand-written message with the already-fitted vectorizer and model.
# The list is named `input_mail` rather than `input` so that the built-in
# `input()` is not shadowed for the rest of the kernel session.
input_mail = ["lottery you have won 500 million dollars"]
input_data_features = feature_extraction.transform(input_mail)

prediction = model.predict(input_data_features)
# Label 0 is reported as ham; any other predicted label is reported as spam.
# NOTE(review): the saved output for this message is "ham" — worth confirming
# how the model behaves on short text that lacks a "Subject:" header.
if prediction[0] == 0:
    print("ham")
else:
    print("spam")

ham
