In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings; consider a narrower filter.
warnings.filterwarnings("ignore")

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
raw_mail_data  = pd.read_csv('mail_data.csv')

In [5]:
# Preview 10 randomly chosen rows (no random_state is passed, so the rows differ on every run)
raw_mail_data.sample(10)

Unnamed: 0,Category,Message
3104,ham,"U so lousy, run already come back then half de..."
4625,ham,Jus finish blowing my hair. U finish dinner al...
1251,ham,Ummmmmaah Many many happy returns of d day my ...
3033,ham,Jokin only lar... :-) depends on which phone m...
647,ham,Do you mind if I ask what happened? You dont h...
4339,ham,Yes when is the appt again?
480,ham,When're you guys getting back? G said you were...
5145,ham,Aiyar u so poor thing... I give u my support k...
3008,ham,Ah you see. You have to be in the lingo. I wil...
83,ham,You will be in the place of that man


In [7]:
# Column names, non-null counts and dtypes of the raw data
raw_mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [9]:
# Dimensions of the raw data as (rows, columns)
raw_mail_data.shape

(5572, 2)

In [11]:
# Summary of the object-typed columns: count, number of unique values, most frequent value and its frequency
raw_mail_data.describe(include='object')

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [13]:
from sklearn.model_selection import train_test_split

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,classification_report

In [21]:
# Count the missing values in each column of the raw data
raw_mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [25]:
# Build the working frame: every null/missing cell becomes a single-space string,
# all other cells are carried over unchanged. raw_mail_data itself is not modified.
df = raw_mail_data.mask(raw_mail_data.isna(), ' ')

In [27]:
# Confirm that no missing values remain in the working frame
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [29]:
# Shape of the working frame as (rows, columns)
df.shape

(5572, 2)

In [34]:
# Number of rows per class in the target column
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

#### Supervised Machine Learning problem

In [32]:
## Label Encoding for output column
from sklearn.preprocessing import LabelEncoder


In [38]:
# Encode the target as integers: spam -> 0, ham -> 1.
# The labels are read from raw_mail_data rather than from df['Category'] itself so that
# re-running this cell is idempotent; mapping an already-encoded column would match
# neither key and silently turn every label into NaN.
df['Category'] = raw_mail_data['Category'].map({'spam': 0, 'ham': 1})

In [40]:
# Number of rows per class after encoding (0 = spam, 1 = ham)
df['Category'].value_counts()

Category
1    4825
0     747
Name: count, dtype: int64

In [42]:
# Preview 5 random rows of the encoded frame
df.sample(5)

Unnamed: 0,Category,Message
221,1,Ok no prob. Take ur time.
1397,1,Shall i start from hear.
3507,1,Nite...
3530,0,Xmas & New Years Eve tickets are now on sale f...
2775,1,"Dude u knw also telugu..thts gud..k, gud nyt.."


In [44]:
# Separate the input (message text) from the target (encoded category)
X, y = df['Message'], df['Category']

In [46]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): the classes are imbalanced (4825 ham vs 747 spam); consider passing stratify=y.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [48]:
# Sizes of the train/test inputs and labels
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((4457,), (1115,), (4457,), (1115,))

In [50]:
# Feature extraction: convert the message text into TF-IDF vectors.
#   min_df=1             -> keep every term that appears in at least one document
#   stop_words='english' -> drop scikit-learn's built-in list of common English words
#   binary=True          -> term counts are clipped to 0/1 before the IDF weighting is applied
feature_extraction = TfidfVectorizer(min_df=1,stop_words='english',binary=True)

In [52]:
# Learn the vocabulary and IDF weights from the training messages only, then apply the
# fitted vectorizer to the test messages (transform, not fit) so that no test information
# leaks into the features.
# Both results are kept as sparse matrices. Do not append a bare `.toarray` here: without
# the call parentheses it stores the bound method instead of a matrix and model.fit() fails.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [54]:
# Re-check the dtypes of the working frame (Category is now numeric)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   int64 
 1   Message   5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [58]:
# df['Category'] = df['Category'].astype('int64')
# y_train = y_train.astype("int64")
# y_test = y_test.astype("int64")

In [60]:
# Dense view of the training features, for inspection only.
# transform() is used instead of fit_transform() so that this display cell does not re-fit
# the already-fitted vectorizer; on the same training data the resulting matrix is identical.
feature_extraction.transform(X_train).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [62]:
# Dense view of the test features produced by the fitted vectorizer (inspection only)
feature_extraction.transform(X_test).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [66]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier (default hyperparameters) on the training features and labels
model = LogisticRegression()
model.fit(X_train_features,y_train)

In [68]:
print(X_train_features)
# For a sparse feature matrix each printed line has the form:
#   (row index of the message, column index of the vocabulary term)   TF-IDF weight
# The first number is the message's position in the training set, not its spam/ham label.

  (0, 5512)	0.1898892037332199
  (0, 7222)	0.2173884735352799
  (0, 258)	0.2379428657041507
  (0, 7162)	0.2550284465664535
  (0, 354)	0.3544175987866074
  (0, 2724)	0.3544175987866074
  (0, 7300)	0.24288153842988894
  (0, 2049)	0.3034375179183143
  (0, 3262)	0.33791755486732394
  (0, 5800)	0.17558937755823417
  (0, 6264)	0.1898892037332199
  (0, 694)	0.3171299579602537
  (0, 2497)	0.2442158912653505
  (0, 5818)	0.22682143517864364
  (1, 3267)	0.26787130770292167
  (1, 6109)	0.32397626344658004
  (1, 6738)	0.28986069568917994
  (1, 2335)	0.21623212751660786
  (1, 5650)	0.3604441444703179
  (1, 3333)	0.20665394084233094
  (1, 2440)	0.3387054464839871
  (1, 4509)	0.40282459910606705
  (1, 3932)	0.24325511357721422
  (1, 3804)	0.19029023465152678
  (1, 2555)	0.3840709491751003
  :	:
  (4452, 3084)	0.22948428918295163
  (4452, 3290)	0.26370969643076225
  (4452, 3978)	0.4574160733416501
  (4452, 7280)	0.3968991650168732
  (4452, 2438)	0.4574160733416501
  (4452, 7264)	0.479751063610169
  (44

In [70]:
# Sparse test features, printed as (message row, vocabulary column)   TF-IDF weight
print(X_test_features)

  (0, 1751)	0.34896165336060586
  (0, 2065)	0.36113324080559445
  (0, 2173)	0.30145841567028486
  (0, 3225)	0.402169324846608
  (0, 3395)	0.402169324846608
  (0, 3955)	0.3774291665065587
  (0, 4100)	0.3392428284838497
  (0, 4942)	0.27552235188443686
  (1, 1361)	0.37034060973735533
  (1, 1975)	0.3578586983359201
  (1, 3443)	0.3234324946551934
  (1, 4044)	0.3234324946551934
  (1, 5430)	0.387052012561607
  (1, 6544)	0.2204999931204713
  (1, 6642)	0.326271353777915
  (1, 6986)	0.2493471978387002
  (1, 7158)	0.3981347747267476
  (2, 1292)	0.31502044528879175
  (2, 2377)	0.42309928191578644
  (2, 3067)	0.21988546741069182
  (2, 3084)	0.21988546741069182
  (2, 3510)	0.4016985150384896
  (2, 4369)	0.42309928191578644
  (2, 5597)	0.4382833676588088
  (2, 6570)	0.30427433251497293
  :	:
  (1110, 2380)	0.3362376691126707
  (1110, 2437)	0.24437455884042017
  (1110, 2749)	0.2379337409312386
  (1110, 3084)	0.16868944269743877
  (1110, 3180)	0.3526556865484764
  (1110, 3938)	0.24167410415901527
  (11

In [76]:
# Accuracy on the training data itself (an optimistic estimate: the model was fitted on these rows)
prediction_train_data = model.predict(X_train_features)
accuracy_train_data = accuracy_score(y_train,prediction_train_data)

In [88]:
# Report the fraction of training messages classified correctly
print("Accuracy on Train data : ",accuracy_train_data)

Accuracy on Train data :  0.9649988781691721


In [82]:
# Accuracy on the held-out test data, which the model did not see during fitting
y_pred = model.predict(X_test_features)
accuracy_test_data = accuracy_score(y_test,y_pred)

In [86]:
# Report the fraction of test messages classified correctly
print("Accuracy on Test data : ",accuracy_test_data)

Accuracy on Test data :  0.9659192825112107


In [106]:
# Predictive system: classify a single new message.
# The message is wrapped in a list because the vectorizer expects an iterable of documents,
# and it is vectorized with transform() so the vocabulary learned from the training data is reused.
input_user_mail = [" Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."]
input_data_feature = feature_extraction.transform(input_user_mail)


In [108]:
# Predict the class of the new message; the result is an array with one entry per input message
prediction = model.predict(input_data_feature)

In [112]:
# Translate the numeric prediction into a readable label: 1 means ham, anything else is reported as spam
print('Ham Mail' if prediction[0] == 1 else 'Spam Mail')

Ham Mail


In [114]:
import pickle

# Persist the trained classifier and the fitted vectorizer; both are needed to score new messages.
# Context managers guarantee each file is flushed and closed, even if pickling raises.
# NOTE: only unpickle these files from a trusted location; pickle.load can execute arbitrary code.
with open('Logistic_Regression.pkl','wb') as model_file:
    pickle.dump(model, model_file)
with open('feature_extraction.pkl','wb') as vectorizer_file:
    pickle.dump(feature_extraction, vectorizer_file)