In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the labelled messages from the CSV file in the working directory.
raw_mail_data = pd.read_csv(filepath_or_buffer='mail_data.csv')
# Preview the first five rows.
raw_mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Keep every non-null cell and substitute an empty string for missing ones.
# `where` returns a new frame, so raw_mail_data itself is left untouched.
mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')

In [4]:
# Preview the first rows of the null-replaced frame.
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# (rows, columns) of the null-replaced frame.
mail_data.shape

(5572, 2)

In [6]:
# Label encoding: spam -> 0, ham -> 1.
# The codes are mapped from the untouched raw labels and the whole column is
# replaced, so 'Category' ends up with a proper integer dtype instead of an
# object column that mixes ints with leftover strings (masked `.loc` writes of
# ints only work while the column accepts non-string values).
# Because raw_mail_data is never modified, re-running this cell is safe.
mail_data['Category'] = raw_mail_data['Category'].map({'spam': 0, 'ham': 1})
# Any label other than 'spam'/'ham' (including a missing one) maps to NaN;
# fail here with a clear message rather than later during the integer cast.
assert mail_data['Category'].notna().all(), "unexpected label in 'Category'"
# Class balance after encoding.
mail_data['Category'].value_counts()

1    4825
0     747
Name: Category, dtype: int64

In [7]:
# Features are the raw message text; targets are the encoded labels.
X, Y = mail_data['Message'], mail_data['Category']

In [8]:
# Hold out 20% of the messages for testing. Stratifying on Y keeps the
# spam/ham ratio the same in both partitions; the fixed seed makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
    stratify=Y,
)

In [9]:
# Feature extraction: turn message text into TF-IDF vectors.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)
# Learn the vocabulary and IDF weights from the training messages only ...
X_train_feature = feature_extraction.fit_transform(X_train)
# ... and reuse them unchanged for the held-out messages.
X_test_feature = feature_extraction.transform(X_test)

In [10]:
# The label Series may still have dtype object at this point, so cast both
# Y_train and Y_test to integers before handing them to the classifier.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [11]:
# print(X_train_feature)

In [12]:
# Model training: logistic regression with scikit-learn's default settings,
# fitted on the TF-IDF vectors of the training messages.
model = LogisticRegression()
model.fit(X=X_train_feature, y=Y_train)

LogisticRegression()

In [13]:
# Accuracy on the data the model was fitted on.
train_prediction = model.predict(X_train_feature)
# accuracy_score's signature is (y_true, y_pred). Accuracy itself is
# symmetric, so the value is unchanged, but passing the arguments in the
# documented order keeps this safe to adapt to asymmetric metrics
# (precision, recall, confusion_matrix, ...).
train_accuracy = accuracy_score(Y_train, train_prediction)
train_accuracy

0.9670181736594121

In [14]:
# Accuracy on the held-out messages the model has not seen.
test_prediction = model.predict(X_test_feature)
# accuracy_score's signature is (y_true, y_pred). Accuracy itself is
# symmetric, so the value is unchanged, but passing the arguments in the
# documented order keeps this safe to adapt to asymmetric metrics
# (precision, recall, confusion_matrix, ...).
test_accuracy = accuracy_score(Y_test, test_prediction)
test_accuracy

0.9721973094170404

In [16]:
# Predictive system: classify a single hand-entered message.
input_mail = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
# Vectorise with the already-fitted TF-IDF vocabulary (transform only, no refit).
input_mail_features = feature_extraction.transform(input_mail)
prediciton = model.predict(input_mail_features)
# Label 1 was assigned to ham during encoding; anything else is reported as spam.
print("It's not a spam mail" if prediciton[0] == 1 else "It's a spam mail")

It's a spam mail
