# Project: Logistic Regression – Spam Mail Detection
### Author: Sathyakala D

### Import libraries

In [80]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Reading the dataset

In [81]:
# Load the labelled messages from the CSV, decoding the file with the
# "Latin" codec, then display the resulting frame.
df = pd.read_csv('email_spams.csv', encoding="Latin")
df

Unnamed: 0,title,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
1334,ham,"Oh... Icic... K lor, den meet other day..."
1335,ham,Oh ! A half hour is much longer in Syria than ...
1336,ham,"Sometimes we put walls around our hearts,not j..."
1337,ham,"Sweet, we may or may not go to 4U to meet carl..."


### Data Preprocessing

In [82]:
# (rows, columns) of the loaded frame.
df.shape

(1339, 2)

In [83]:
# Column names, non-null counts and dtypes - a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    1339 non-null   object
 1   Message  1339 non-null   object
dtypes: object(2)
memory usage: 21.1+ KB


In [84]:
# Summary of the two text columns: count, number of unique values,
# most frequent value and its frequency.
df.describe()

Unnamed: 0,title,Message
count,1339,1339
unique,2,1297
top,ham,"Sorry, I'll call later"
freq,1143,8


In [85]:
# Encode the text labels in place as numbers: spam -> 0, ham -> 1.
# The pairs are applied in this order, one boolean mask at a time.
for label, code in (('spam', 0), ('ham', 1)):
    df.loc[df['title'] == label, 'title'] = code
df

Unnamed: 0,title,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
1334,1,"Oh... Icic... K lor, den meet other day..."
1335,1,Oh ! A half hour is much longer in Syria than ...
1336,1,"Sometimes we put walls around our hearts,not j..."
1337,1,"Sweet, we may or may not go to 4U to meet carl..."


In [86]:
# Count the messages in each class: spam (0) and ham (1).
df['title'].value_counts()

title
1    1143
0     196
Name: count, dtype: int64

In [87]:
# Separate the message text (features) from the encoded labels (targets).
X, Y = df['Message'], df['title']

In [88]:
# Inspect the feature series (raw message text).
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
1334           Oh... Icic... K lor, den meet other day...
1335    Oh ! A half hour is much longer in Syria than ...
1336    Sometimes we put walls around our hearts,not j...
1337    Sweet, we may or may not go to 4U to meet carl...
1338         Then she buying today? Ü no need to c meh...
Name: Message, Length: 1339, dtype: object


In [89]:
# Inspect the label series (0 = spam, 1 = ham).
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
1334    1
1335    1
1336    1
1337    1
1338    1
Name: title, Length: 1339, dtype: object


### Train Test Split

In [90]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split repeatable across runs.
# NOTE(review): the classes are imbalanced (1143 ham vs 196 spam in the counts
# above) and the split is not stratified - consider passing stratify=Y.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [91]:
# Sanity-check the split sizes: full set, training part, test part.
for split in (X, X_train, X_test):
    print(split.shape)

(1339,)
(1071,)
(268,)


### Feature extraction (TF-IDF)

In [92]:
# Convert the raw text into TF-IDF feature vectors, ignoring English stop words.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english')

# Learn the vocabulary from the training messages only, then reuse that fitted
# vocabulary to transform the held-out test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The label column has dtype object (see the printed Y above); cast both
# splits to integers before passing them to the classifier.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')


In [93]:
# Inspect the training messages; the index shows the rows were shuffled by the split.
print(X_train)

46          Didn't you get hep b immunisation in nigeria.
537     Probably gonna be here for a while, see you la...
671                  Ho ho - big belly laugh! See ya tomo
147     FreeMsg Why haven't you replied to my text? I'...
936                            Then dun wear jeans lor...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
256     Don't necessarily expect it to be done before ...
968     What do u want when i come back?.a beautiful n...
952                              Shb b ok lor... Thanx...
1273                                                Ok...
Name: Message, Length: 1071, dtype: object


In [94]:
# Inspect the sparse TF-IDF matrix; each printed entry is
# (row index, column index) followed by the stored weight.
print(X_train_features)

  (0, 1976)	0.5248882294643128
  (0, 1502)	0.5248882294643128
  (0, 1416)	0.5248882294643128
  (0, 958)	0.41650574991163875
  (1, 1767)	0.3496759675502978
  (1, 2867)	0.4394188407378411
  (1, 1663)	0.4074798335070685
  (1, 1318)	0.46528369946383935
  (1, 2233)	0.5496444891206227
  (2, 2862)	0.304021840790591
  (2, 3176)	0.27281808402415547
  (2, 1665)	0.3671695548999616
  (2, 525)	0.3467313751548408
  (2, 540)	0.3117920761058863
  (2, 1434)	0.6934627503096816
  (3, 1069)	0.2149503621489096
  (3, 2679)	0.19122854457474367
  (3, 2343)	0.17419187422415539
  (3, 1915)	0.1832136811260221
  (3, 33)	0.2850729172189991
  (3, 1963)	0.2850729172189991
  (3, 1401)	0.24207746942823247
  (3, 1775)	0.2262091289167887
  (3, 1726)	0.2692045767075553
  (3, 1718)	0.2220857246941159
  :	:
  (1066, 3082)	0.17905466222963903
  (1066, 1229)	0.15909371017811133
  (1066, 2800)	0.16085876622763337
  (1067, 1121)	0.5308578300737817
  (1067, 1952)	0.5308578300737817
  (1067, 1399)	0.501308082265555
  (1067, 1004

### Model training and evaluation

In [95]:
# Logistic-regression classifier, constructed with no arguments (library defaults).
model = LogisticRegression()

In [96]:
# Fit the classifier on the TF-IDF features and integer labels of the training split.
model.fit(X_train_features, Y_train)

In [97]:
# Score the fitted model on the same messages it was trained on;
# accuracy is the fraction of labels predicted correctly.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [98]:
# Report the training-set accuracy.
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9122315592903828


In [99]:
# Score the fitted model on the held-out test messages it has not seen.
# NOTE(review): always predicting ham would already be right for 1143 of the
# 1339 messages (about 0.85), so accuracy alone says little on this imbalanced
# data - consider also reporting precision/recall or a confusion matrix.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [100]:
# Report the test-set accuracy.
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9067164179104478


### Making a predictive system

In [101]:
# A single message to classify, wrapped in a list because the vectorizer
# expects an iterable of documents.
input_mail = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

# Vectorise the message with the already-fitted TF-IDF vocabulary (transform
# only, no re-fitting), then ask the trained model for its label.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 was assigned to ham; anything else is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[0]
Spam mail
