In [1]:
# Import all the Dependencies

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
# Data Collection & Preprocessing
# Load csv file into a pandas dataframe
from google.colab import files
uploaded = files.upload()

Saving emails.csv to emails.csv


In [3]:
# Load csv file into a pandas dataframe
raw_mail_data = pd.read_csv('/content/emails.csv')

In [4]:
# Print the dataset
print(raw_mail_data)

                                                   text  spam
0     Subject: naturally irresistible your corporate...     1
1     Subject: the stock trading gunslinger  fanny i...     1
2     Subject: unbelievable new homes made easy  im ...     1
3     Subject: 4 color printing special  request add...     1
4     Subject: do not have money , get software cds ...     1
...                                                 ...   ...
5723  Subject: re : research and development charges...     0
5724  Subject: re : receipts from visit  jim ,  than...     0
5725  Subject: re : enron case study update  wow ! a...     0
5726  Subject: re : interest  david ,  please , call...     0
5727  Subject: news : aurora 5 . 2 update  aurora ve...     0

[5728 rows x 2 columns]


In [5]:
# replace the null values with a null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [6]:
# Print the first five rows of the dataframe
mail_data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [7]:
# Checking the number of rows & columns
mail_data.shape

(5728, 2)

In [8]:
# how many are spam and ham
mail_data.spam.value_counts()

Unnamed: 0_level_0,count
spam,Unnamed: 1_level_1
0,4360
1,1368


In [9]:
# Split the data into feautures & targets
X = mail_data['text']
Y = mail_data['spam']

In [None]:
print(Y)

0       1
1       1
2       1
3       1
4       1
       ..
5723    0
5724    0
5725    0
5726    0
5727    0
Name: spam, Length: 5728, dtype: int64


In [10]:
print(X)

0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5728, dtype: object


In [11]:
# Splitting the data into training data and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [12]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5728,)
(4582,)
(1146,)


In [14]:
# Convert text data into meaningful numerical values
# Feature extraction
# Transform text data into feature vectors that can be used in our logistic regression model
# TfidfVectorizer - if a word repeated several ties its given a score. if a word appears miniscule times its given a score.

feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Convert Y train and Y test as intergers
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [15]:
# Print X test & X train
print(X_train_features)

  (0, 29045)	0.027350831183146494
  (0, 12734)	0.2731330732901836
  (0, 9702)	0.2434779314354512
  (0, 26414)	0.20469906162468185
  (0, 11945)	0.16768019627120936
  (0, 21847)	0.3221661388560145
  (0, 29259)	0.19594745660827412
  (0, 7447)	0.26830087740346925
  (0, 8787)	0.13947216511966962
  (0, 17361)	0.14810582386362775
  (0, 8653)	0.07145974179954041
  (0, 7094)	0.07086072273068604
  (0, 33101)	0.060807769107754726
  (0, 8799)	0.13947216511966962
  (0, 25415)	0.04547507625063386
  (0, 14190)	0.20469906162468185
  (0, 14051)	0.18776782473981463
  (0, 30580)	0.1357750225763003
  (0, 24306)	0.048890118949862775
  (0, 4932)	0.10871375473306225
  (0, 14192)	0.11948158504424808
  (0, 15229)	0.07973851459430858
  (0, 27608)	0.09497852966241772
  (0, 19873)	0.04042751503749571
  (0, 30155)	0.04986444772716652
  :	:
  (4581, 29985)	0.05731960277359003
  (4581, 2039)	0.11904749726535191
  (4581, 17129)	0.11305487382993415
  (4581, 112)	0.08190880155517595
  (4581, 1267)	0.10098266909611253
 

In [16]:
# Training the Logistic Regression model
model = LogisticRegression()

In [17]:
# Training Logistic Regression model with training data
model.fit(X_train_features, Y_train)

In [18]:
# Model evaluation of the trained data
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [19]:
print('accuracy_on_training_data:', accuracy_on_training_data)

accuracy_on_training_data: 0.9958533391532082


In [None]:
# Accuracy score on training data = 99.5%
# Model performed well.

In [21]:
# Model Evaluation on test data
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [22]:
print('accuracy_on_test_data:' , accuracy_on_test_data)

accuracy_on_test_data: 0.9834205933682374


In [None]:
# Accuracy score on test data is 98.3$
# Model performed well.

In [25]:
# Building a predictive system
input_mail = [" naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  marketing break - through shouldn ' t make gaps in your budget . 100 % satisfaction  guaranteed "]

# Convert text to feature vectors

input_data_features = feature_extraction.transform(input_mail)

# Makung predictions

prediction = model.predict(input_data_features)

# Print the predicted value
print(prediction)

if prediction[0]==1:
  print('Ham mail')
else:
  print('Spam mail')

[1]
Ham mail
