Importing Libraries

In [212]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

Data Collection and Preprocessing

In [213]:
# Load the raw messages; the path is relative, so the CSV must sit in the notebook's working directory.
raw_mail_data = pd.read_csv('mail_data.csv')

In [214]:
# Preview the first rows to check the column layout (Category, Message).
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [215]:
# Inspect dtypes and non-null counts of the raw frame before any cleaning.
raw_mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [216]:
# Keep every non-null cell as-is and substitute an empty string for missing values.
# The raw frame is left untouched; the cleaned copy is bound to a new name.
mail_data = raw_mail_data.where(raw_mail_data.notna(), '')

In [217]:
# Re-inspect dtypes and non-null counts after replacing missing values with empty strings.
mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [218]:
# Preview the cleaned frame (labels are still the original 'ham'/'spam' strings here).
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [219]:
# (rows, columns) of the cleaned frame.
mail_data.shape

(5572, 2)

Label Encoding

In [220]:
# Encode the target labels in place as integers: spam -> 0, ham -> 1.
# The column keeps its object dtype at this point; the splits are cast to int later.
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code

In [221]:
# Confirm that the Category column now holds the numeric codes (0 = spam, 1 = ham).
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [222]:
# Hold out 30% of the messages for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    mail_data['Message'],
    mail_data['Category'],
    test_size=0.3,
    random_state=10,
)

In [223]:
# Print the shape of each split, in the order: train/test messages, then train/test labels.
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)

(3900,)
(1672,)
(3900,)
(1672,)


In [224]:
# Summarize the held-out message Series (entry count, non-null count, dtype).
X_test.info()

<class 'pandas.core.series.Series'>
Int64Index: 1672 entries, 4635 to 2407
Series name: Message
Non-Null Count  Dtype 
--------------  ----- 
1672 non-null   object
dtypes: object(1)
memory usage: 26.1+ KB


Feature Extraction

In [225]:
# Turn the raw message text into TF-IDF feature vectors usable as model input.
# lowercase=True lower-cases the text before tokenizing; English stop words are dropped.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Learn the vocabulary and IDF weights from the training messages only,
# then apply that same fitted transform to the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The labels already hold 0/1 values but in an object-dtype Series; cast them to int.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [226]:
# Printing the feature matrix lists its stored entries as "(row, column)  weight" lines.
print(X_train_features)

  (0, 378)	0.29868500672190157
  (0, 46)	0.2285086449862687
  (0, 2654)	0.27454362285226114
  (0, 6423)	0.29868500672190157
  (0, 1420)	0.22027554183909337
  (0, 6655)	0.21848906539588253
  (0, 6409)	0.24630001429375273
  (0, 4051)	0.24263044926552294
  (0, 1425)	0.21362134495226823
  (0, 1676)	0.2241493810880265
  (0, 3579)	0.20554800655674957
  (0, 4432)	0.29868500672190157
  (0, 2650)	0.14415891914732595
  (0, 6424)	0.23628043470336652
  (0, 253)	0.24630001429375273
  (0, 4050)	0.33701400374329904
  (1, 3516)	0.4816899186843001
  (1, 3623)	0.8763417268611101
  (2, 4007)	0.5239955964975372
  (2, 3609)	0.42909472268498017
  (2, 6233)	0.5017753242574331
  (2, 1460)	0.5380779290971132
  (3, 5619)	0.7617605048713944
  (3, 6045)	0.647858729367814
  (4, 3025)	0.253194811622687
  :	:
  (3897, 2481)	0.27244373355202384
  (3897, 915)	0.24933052461249822
  (3897, 6121)	0.22621731567297262
  (3897, 3887)	0.2441903221115819
  (3897, 2660)	0.24933052461249822
  (3897, 1825)	0.22621731567297262
  

In [227]:
# Persist the fitted vectorizer so new messages can be transformed with the same vocabulary.
# The `with` block guarantees the file is flushed and closed even if pickling fails.
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(feature_extraction, vectorizer_file)

Training The Model

In [228]:
# Fit a logistic-regression classifier (library defaults) on the TF-IDF training features.
model = LogisticRegression()
model.fit(X=X_train_features, y=Y_train)

Evaluating The Model

In [229]:
# Predict labels for the training features (the same data the model was fitted on).
prediction_on_training_data = model.predict(X_train_features)

In [230]:
# Fraction of training messages whose predicted label matches the true label.
acc_train = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [231]:
# Display the training accuracy.
acc_train

0.9702564102564103

In [232]:
# Predict labels for the held-out test features, which were not used to fit the model.
prediction_on_test_data = model.predict(X_test_features)

In [233]:
# Fraction of held-out test messages whose predicted label matches the true label.
acc_test = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [234]:
# Display the test accuracy.
acc_test

0.9551435406698564

Building a Predictive System

In [235]:
# Sample message to classify; it is wrapped in a list because the vectorizer expects an iterable of documents.
input_mail=["Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."]

# Vectorize the sample with the vectorizer fitted on the training data (transform only, no refit),
# then classify it with the trained model.
input_mail_features = feature_extraction.transform(input_mail)
input_mail_prediction = model.predict(input_mail_features)

# Labels follow the encoding applied to the Category column: 0 = spam, 1 = ham.
print('Ham mail' if input_mail_prediction[0] == 1 else 'Spam mail')

In [236]:

# Persist the trained classifier alongside the saved vectorizer.
# The `with` block guarantees the file is flushed and closed even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)