In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the labelled message dataset (columns: Category, Message).
# The path is a raw string so the Windows backslashes are taken literally;
# in a normal string, sequences such as "\C" and "\m" are invalid escapes
# that Python warns about and will eventually reject.
df = pd.read_csv(r'C:\Computer Science\Machine Learning\Projects\Spam Mail Prediction\mail_data.csv')
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Keep every non-null cell and substitute an empty string for missing ones,
# so downstream cells receive strings rather than NaN.
df = df.where(df.notna(), '')

In [4]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [5]:
# Learn the mapping from the string labels in 'Category' to integer codes.
le = LabelEncoder()
le.fit(df['Category'])

In [6]:
# Replace the string labels with their integer codes (see le.classes_).
# The dtype guard keeps this cell safe to re-run: once the column already
# holds integer codes, transforming it again would raise because those codes
# are not labels the encoder was fitted on.
if not pd.api.types.is_integer_dtype(df['Category']):
    df['Category'] = le.transform(df['Category'])

In [7]:
# The position of each label in this array is the integer code assigned to it.
le.classes_

array(['ham', 'spam'], dtype=object)

In [8]:
# Features are the raw message text; the target is the encoded category.
x, y = df['Message'], df['Category']

In [9]:
# Hold out 20% of the messages for testing. Stratifying on y keeps the
# ham/spam ratio the same in both splits; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

Feature Extraction

In [10]:
# TF-IDF turns each message into a numeric vector: a term scores higher the
# more often it appears in that message and lower the more messages contain
# it. The ham/spam labels play no part in this weighting; the classifier
# learns the link between these scores and the labels later.
# stop_words='english' drops very common English words (e.g. "is", "the")
# so they do not influence the features.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

In [11]:
# Fit the vectorizer on the training messages only, then vectorize them.
x_train_feature = feature_extraction.fit_transform(x_train)
# Reuse the already-fitted vectorizer for the test messages so that nothing
# is learned from the held-out data.
x_test_feature = feature_extraction.transform(x_test)

In [12]:
# Cast both label series to integers.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

In [13]:
# Sparse-matrix printout: each line is (row index, column index) followed by
# the TF-IDF value stored at that position.
print(x_train_feature)

  (0, 2010)	0.19832877373922167
  (0, 2111)	0.12604809188195232
  (0, 2366)	0.13596858368909262
  (0, 4070)	0.10942895360863092
  (0, 5518)	0.2316496926325036
  (0, 5048)	0.6625954635560332
  (0, 1956)	0.44173030903735544
  (0, 1920)	0.2316496926325036
  (0, 3135)	0.11348430467691432
  (0, 2578)	0.21321340295490812
  (0, 5937)	0.1947771132773127
  (0, 6701)	0.15131260159017793
  (0, 2066)	0.2316496926325036
  (1, 4358)	0.46909811323167033
  (1, 5456)	0.6842437498161793
  (1, 4114)	0.4242562585988265
  (1, 4805)	0.3629932203770568
  (2, 2791)	0.35291632283354923
  (2, 3275)	0.219045164731788
  (2, 3384)	0.30174704574497613
  (2, 1353)	0.370148735718615
  (2, 3957)	0.3062249055442335
  (2, 4416)	0.2845146328599104
  (2, 5335)	0.3406897313143648
  (2, 3743)	0.25100491154391114
  :	:
  (4453, 6799)	0.44021473146824863
  (4453, 5180)	0.44021473146824863
  (4453, 1418)	0.35404770247824413
  (4453, 7279)	0.32136372886390163
  (4453, 1072)	0.26050854444934685
  (4453, 3073)	0.31461434258698556

Training the model

In [14]:
# Logistic regression classifier, constructed with no arguments (library defaults).
model = LogisticRegression()

In [15]:
# Train the logistic regression model on the TF-IDF features of the training set.
model.fit(x_train_feature,y_train)

In [16]:
# Predictions on the same data the model was trained on.
y_pred_training = model.predict(x_train_feature)

In [17]:
# Training accuracy. accuracy_score is documented as (y_true, y_pred); the
# value is the same either way round, but the documented order stays correct
# if this call is later swapped for a metric where the order matters.
accuracy_score(y_train, y_pred_training)

0.9672425398249944

In [18]:
# Evaluate the trained model: predict labels for the held-out test messages.
y_pred = model.predict(x_test_feature)

In [19]:
# Test accuracy. accuracy_score is documented as (y_true, y_pred); the value
# is the same either way round, but the documented order stays correct if
# this call is later swapped for a metric where the order matters.
accuracy_score(y_test, y_pred)

0.9605381165919282

Build a Predictive System

In [26]:
# Example message (reads like spam) for trying out the predictor; left commented out.
# input_mail = ["Urgent Please call 09066612661 from landline. £5000 cash or a luxury 4* Canary Islands Holiday await collection. T&Cs SAE award. 20M12AQ. 150ppm. 16+"]

In [38]:
# Read the message to classify from the user (waits until text is entered).
input_ = input("Enter Text: ")

Enter Text: Yup... Ok i go home look at the timings then i msg ü again... Xuhui going to learn on 2nd may too but her lesson is at 8am


In [39]:
# Wrap the single message in a list, the form passed to the vectorizer below.
input_mail = [input_]

In [40]:
# Convert the text into a feature vector using the vectorizer fitted on the training data.
input_data_feature = feature_extraction.transform(input_mail)

In [41]:
# Predict the encoded class for the message; the result is an array with one entry.
ans = model.predict(input_data_feature)
print(ans)

[0]


In [42]:
# model.predict returns an array, so compare its single element rather than
# relying on the truth value of the whole array (which raises as soon as the
# array holds more than one prediction).
# Code 1 is 'spam' and code 0 is 'ham', per the order shown by le.classes_.
if ans[0] == 1:
    print("Spam")
else:
    print("Ham")

Ham
