In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Colab-only step: open the browser file picker so the CSV can be copied into
# the runtime's working directory (the cell output shows it saved as mail_data.csv).
from google.colab import files
# The return value is kept in `uploaded`, but the visible cells read the file
# back from disk by name rather than using this variable.
uploaded = files.upload()

Saving mail_data.csv to mail_data.csv


In [4]:
# Load the uploaded CSV from the working directory into a DataFrame.
dataset = pd.read_csv('mail_data.csv')
# Preview the first rows; as the last expression, this is rendered as the cell output.
dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Modelling utilities: data splitting, TF-IDF vectorisation (for converting
# mail text data to numerics), the classifier, and the evaluation metric.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Replace null values with empty strings, since our dataset holds text values

In [7]:
# Keep every cell that already holds a value; substitute an empty string
# wherever the mask reports a missing entry.
has_value = dataset.notna()
dataset = dataset.where(has_value, other='')

In [8]:
# (rows, columns) of the frame after the null replacement.
dataset.shape

(5572, 2)

Label encoding: spam mail is encoded as 0 and ham mail as 1

In [9]:
# Overwrite the text labels in place: spam -> 0, ham -> 1.
# The spam rows are rewritten first; the ham mask is computed afterwards,
# which is unaffected because those rows now hold 0 rather than 'ham'.
spam_rows = dataset['Category'] == 'spam'
dataset.loc[spam_rows, 'Category'] = 0

ham_rows = dataset['Category'] == 'ham'
dataset.loc[ham_rows, 'Category'] = 1

In [10]:
# Preview again to confirm the Category column now shows the numeric labels.
dataset.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# X: the raw message text (model input); Y: the encoded category label (target).
X, Y = dataset['Message'], dataset['Category']

In [12]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# shuffle, and therefore the split, the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=4,
)

In [13]:
# Total number of messages before splitting.
X.shape

(5572,)

In [14]:
# Number of messages in the training split.
X_train.shape

(4179,)

In [15]:
# Number of messages in the held-out test split (test_size=0.25 of the total).
X_test.shape

(1393,)

Convert text data to numerical values using feature extraction

In [18]:
# TF-IDF scores each term by how often it occurs in a message, discounted by
# how many messages contain it.
#   min_df=1             -> keep every term found in at least one training
#                           message (nothing is discarded for being rare)
#   stop_words='english' -> drop common English function words before scoring
#   lowercase=True       -> convert all text to lower case before tokenising
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Learn the vocabulary and weights from the training messages only, then apply
# that same fitted mapping to the test messages (transform only, no refit).
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The encoded labels may still be stored with a non-numeric dtype, so cast
# both label series to integers before training.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')


Train the logistic regression model

In [20]:
# Create the classifier; no arguments are passed, so the library defaults apply.
model = LogisticRegression()

In [21]:
model.fit(X_train_features,Y_train)
# X_train_features holds the TF-IDF features of the training messages;
# Y_train holds the matching labels cast to int (0 = spam, 1 = ham).


Evaluate model

In [22]:
# Score the model on the same data it was fitted to. accuracy_score is the
# fraction of predictions that match the true labels.
prediction_on_training = model.predict(X_train_features)
accuracy_on_training = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training,
)
print(accuracy_on_training)

0.9674563292653745


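*A training accuracy above 95% shows that the model fits the training data well; the accuracy on the held-out test set (next cell) is what indicates whether it generalizes.*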

In [23]:
# Score the model on the held-out messages it did not see during fitting.
prediction_on_test = model.predict(X_test_features)
accuracy_on_test = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test,
)
print(accuracy_on_test)

0.9504666188083274


Now build a predictive system

In [28]:
# One raw message, wrapped in a list because the vectorizer is given a
# collection of documents rather than a bare string.
input_mail = ["hi! Happy holidays"]

# Reuse the vectorizer fitted on the training data so the new message is
# mapped onto the same feature columns the model was trained with.
input_mail_feature = feature_extraction.transform(input_mail)
prediction = model.predict(input_mail_feature)
print(prediction)

# prediction holds one label per input message; index 0 is the only one here.
# Labels follow the earlier encoding: 1 = ham, anything else is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[1]
Ham mail
