In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
import warnings
warnings.filterwarnings('ignore')


In [16]:
# Load the e-mail corpus from disk, decoding the file as Latin-1.
df = pd.read_csv(
    'emails.csv',
    encoding='latin-1',
)

In [17]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [18]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5728, 2)

In [19]:
# Count how many messages carry each value of the 'spam' label.
df.loc[:, 'spam'].value_counts()

spam
0    4360
1    1368
Name: count, dtype: int64

In [20]:
# Per-column count of missing values (isna is the method isnull aliases).
df.isna().sum()

text    0
spam    0
dtype: int64

In [21]:
# Separate the model input (the 'text' column) from the target (the 'spam'
# label column).
X, Y = df['text'], df['spam']

In [14]:
# Print the message-text Series for a quick visual check.
print(X)

0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5728, dtype: object


In [15]:
# Print the label Series for a quick visual check.
print(Y)

0       1
1       1
2       1
3       1
4       1
       ..
5723    0
5724    0
5725    0
5726    0
5727    0
Name: spam, Length: 5728, dtype: int64


In [22]:
# Hold out 20% of the messages for evaluation; random_state is pinned so the
# same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

Feature Extraction


In [None]:
# Transform the text data into feature vectors

In [23]:
# TF-IDF vectorizer configured to lowercase the text and drop English stop
# words.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit on the training messages only, then reuse the fitted vectorizer to
# encode the test messages.
x_train_features = feature_extraction.fit_transform(X_train)
x_test_features = feature_extraction.transform(X_test)

In [24]:
# Cast both label Series to integer dtype before they are given to the model.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')


In [25]:
# Print the training feature matrix produced by the TF-IDF vectorizer.
print(x_train_features)

  (0, 1631)	0.11948158504424804
  (0, 2175)	0.1139366811829867
  (0, 2270)	0.09287045505205638
  (0, 7353)	0.0655788157361805
  (0, 15304)	0.0663995869621661
  (0, 28666)	0.06431166772110039
  (0, 28707)	0.06016733162188114
  (0, 12928)	0.06467963832229102
  (0, 21474)	0.05636789397124751
  (0, 21089)	0.03528233862778253
  (0, 33086)	0.043228163741262066
  (0, 14600)	0.044761414388067036
  (0, 31687)	0.05286353166660429
  (0, 8779)	0.15915971536413323
  (0, 8929)	0.2046990616246818
  (0, 31574)	0.08608147431672733
  (0, 29719)	0.09538081914466627
  (0, 15873)	0.06034727389686767
  (0, 24009)	0.059727569756054304
  (0, 25478)	0.100523433523679
  (0, 3685)	0.1139366811829867
  (0, 30510)	0.07784647543051056
  (0, 15183)	0.07546212040150725
  (0, 32265)	0.044560090228177425
  (0, 6690)	0.06742389347978599
  :	:
  (4581, 429)	0.07532196662652381
  (4581, 18624)	0.062223221403916826
  (4581, 32353)	0.10192506274469543
  (4581, 4987)	0.11815520710746134
  (4581, 16993)	0.10310140246650003
  

Training the model

In [26]:
# Logistic-regression classifier constructed with no arguments, i.e. the
# library's default settings.
model = LogisticRegression()

In [27]:
# Train the classifier on the TF-IDF training features and their labels.
model.fit(X=x_train_features, y=Y_train)

In [28]:
# Predict labels for the training messages and score them against the truth.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels go first and the predictions second.
X_train_prediction = model.predict(x_train_features)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [29]:
# Report the accuracy measured on the data the model was fitted on.
print('The accuracy on training data is :', training_data_accuracy)

The accuracy on training data is : 0.9965080750763858


In [31]:
# Predict labels for the held-out messages and score them against the truth.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels go first and the predictions second.
X_test_prediction = model.predict(x_test_features)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [32]:
# Report the accuracy measured on the held-out test split.
print('The accuracy on test data is :', test_data_accuracy)

The accuracy on test data is : 0.9834205933682374


Predictive system


In [34]:
# A single raw e-mail to classify. It is wrapped in a list because the
# vectorizer is called with a collection of documents, not a bare string.
input_data = ["Subject: fpa notice : ebay misrepresentation of identity - user suspension - section 9 -  dear ebay member ,  in an effort to protect your ebay  account security , we have suspended your account until such time that it can  be safely restored to you . we have taken this action because your account  may have been compromised . although we cannot disclose our investigative  procedures that led to this conclusion , please know that we took this action  in order to maintain the safety of your account . for instructions on  getting your account reinstated , please click the button bellow :  thank you for your patience and  cooperation . regards ,  safeharbor departmentebay  inc . "]

# Convert the text to a feature vector with the vectorizer that was fitted on
# the training data (transform only, no re-fitting).
input_data_features = feature_extraction.transform(input_data)

# Make the prediction and show the raw label array.
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 is reported as spam; any other label is reported as not spam.
if prediction[0] == 1:
    print('This is a spam mail')
else:
    print('This is not a spam mail')



[1]
This is a spam mail
