In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the mail dataset from the CSV file into a pandas DataFrame
mail_data = pd.read_csv("mail_data.csv")

In [3]:
# Display the first 5 rows of the data
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Display the last 5 rows of the data
mail_data.tail()

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [5]:
# Check whether any message is an empty string.
# Note: this does not detect missing (NaN) values; that is a separate check.
mail_data['Message'].eq('').any()

False

In [6]:
# Check whether the Message column contains any missing (null) values
print(mail_data['Message'].isna().any())

False


In [7]:
# Replace any missing values with an empty string.
# fillna returns a new DataFrame, so mail_data itself is left unmodified.
new_mail_data = mail_data.fillna('')

In [8]:
# Display the first 5 rows of the dataframe after null handling
new_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Check the size of the dataset as a (rows, columns) tuple
new_mail_data.shape

(5572, 2)

In [10]:
# Count the number of messages in each class of the label (Category) column
new_mail_data['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [11]:
# Encode the labels in place: spam -> 0, ham -> 1.
# In the recorded run the column keeps dtype object after this step, which is
# why the labels are cast to int before training.
new_mail_data.loc[new_mail_data['Category'].eq('spam'), 'Category'] = 0
new_mail_data.loc[new_mail_data['Category'].eq('ham'), 'Category'] = 1

In [12]:
# Display the first 5 rows to confirm the labels are now encoded as 0 (spam) / 1 (ham)
new_mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# Separate the data into texts (X) and labels (y)
X = new_mail_data['Message']
y = new_mail_data['Category']

In [14]:
# Print the label Series y (1 = ham, 0 = spam)
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [15]:
# Hold out 20% of the messages as a test set; random_state fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [16]:
# Print the shape of each split, one per line, in the order
# X_train, X_test, y_train, y_test
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(4457,)
(1115,)
(4457,)
(1115,)


In [17]:
# Build the TF-IDF vectorizer that turns the text data into feature vectors
# usable as input to the Logistic Regression model.
#   lowercase=True       : convert all characters to lowercase before tokenizing.
#   stop_words='english' : use the built-in English stop-word list. 'english' is
#                          currently the only supported string value; it has
#                          several known issues, so consider an alternative.
#   min_df=1             : when building the vocabulary, ignore terms whose
#                          document frequency is strictly lower than this
#                          threshold (also called the cut-off).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [18]:
# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to transform the test messages into the same feature space.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [19]:
# Cast the training and test labels to an integer dtype
# (the label column was still dtype object when it was printed earlier).
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [20]:
# Print the sparse TF-IDF matrix of the training data; each printed entry is
# a (row, column) index pair followed by the stored weight
print(X_train_features)

  (0, 6023)	0.31295501407186926
  (0, 3068)	0.20722043882108684
  (0, 3189)	0.2695003791316419
  (0, 3094)	0.1960160348955552
  (0, 6220)	0.40714919169918795
  (0, 7166)	0.23411798769212422
  (0, 4787)	0.29950623963635054
  (0, 1705)	0.3274543662048457
  (0, 5531)	0.3423416769137198
  (0, 4189)	0.40714919169918795
  (0, 4076)	0.21814163878169243
  (1, 4061)	0.4900712309801611
  (1, 5773)	0.7151217422337083
  (1, 5240)	0.49842861309340514
  (2, 6977)	0.17165112662617582
  (2, 230)	0.25960869981277335
  (2, 3871)	0.18868130288704416
  (2, 1873)	0.2722850313233416
  (2, 6566)	0.23311905039120562
  (2, 3344)	0.18280209804262273
  (2, 248)	0.18192137275151332
  (2, 1)	0.19592827444073843
  (2, 1621)	0.16393337207218853
  (2, 1136)	0.20537868697819087
  (2, 1839)	0.19731800013429093
  :	:
  (4454, 6645)	0.5888192010990861
  (4455, 4470)	0.2851349711027913
  (4455, 3483)	0.24151640507435573
  (4455, 2856)	0.2516758365847381
  (4455, 1268)	0.27394793268337375
  (4455, 1353)	0.324981330584005
 

In [21]:
# Print the sparse TF-IDF matrix of the test data; each printed entry is
# a (row, column) index pair followed by the stored weight
print(X_test_features)

  (0, 929)	0.2085278792709853
  (0, 1279)	0.34214311139979436
  (0, 1849)	0.16878737748053055
  (0, 2382)	0.5259762486367688
  (0, 3083)	0.32621451083338754
  (0, 3975)	0.2789167248847912
  (0, 4443)	0.2876828663523952
  (0, 4891)	0.34214311139979436
  (0, 6453)	0.2308409974214785
  (0, 6633)	0.18846021584608846
  (0, 6824)	0.24794236456194046
  (1, 6557)	1.0
  (2, 4762)	1.0
  (3, 488)	0.32023198244739365
  (3, 999)	0.22592152905944315
  (3, 1354)	0.6404639648947873
  (3, 1604)	0.32023198244739365
  (3, 1654)	0.29474569958063007
  (3, 1844)	0.19771037506290576
  (3, 2954)	0.27416922366319624
  (3, 3410)	0.24377313384710314
  (3, 4323)	0.1864818173754996
  (3, 7125)	0.19579156055392108
  (5, 3130)	0.6295559559861159
  (5, 7207)	0.7769551456052065
  :	:
  (1111, 7416)	0.5650866161494663
  (1112, 806)	0.7596260821351761
  (1112, 2607)	0.6503600659173062
  (1113, 3578)	0.4991822285943939
  (1113, 3894)	0.31021414761701144
  (1113, 4027)	0.26598850251005907
  (1113, 4356)	0.5712397286438355

In [22]:
# Train a Logistic Regression model (constructed with its default settings)
# on the TF-IDF features and integer labels of the training data
model = LogisticRegression()
model.fit(X_train_features,y_train)

In [23]:
# Evaluate the model on the data it was trained on.
# In the recorded run the score is about 0.969, i.e. roughly 97 of every
# 100 training messages are classified correctly.
train_pred = model.predict(X_train_features)
score = accuracy_score(y_true=y_train, y_pred=train_pred)
print("Accuracy score on Trained Data : ",score)

Accuracy score on Trained Data :  0.9685887368184878


In [24]:
# Evaluate the model on the held-out test data.
# In the recorded run the score is about 0.951, i.e. roughly 95 of every
# 100 test messages are classified correctly.
# NOTE(review): ham makes up 4825 of the 5572 messages in the full dataset
# (about 87%), so accuracy alone can look high even if spam is often missed.
test_pred = model.predict(X_test_features)
score_test = accuracy_score(y_true=y_test, y_pred=test_pred)
print("Accuracy score on Test Data : ",score_test)

Accuracy score on Test Data :  0.9506726457399103


In [50]:
# Example message to classify, wrapped in a list so it is passed as a
# collection containing a single document
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times."]
# Convert the text to feature vectors using the already-fitted vectorizer
input_data_feature = feature_extraction.transform(input_mail)
# Predict the label for the message (1 = ham, 0 = spam)
test_output = model.predict(input_data_feature)

In [58]:
# Print the raw prediction array, then a readable label for each prediction
# (1 = ham, anything else = spam).
# Iterating over the array instead of writing `if test_output == 1` avoids
# relying on the truth value of a NumPy array, which raises ValueError as soon
# as more than one message is predicted at once.
print(test_output)

for prediction in test_output:
    if prediction == 1:
        print("Ham Mail")
    else:
        print("Spam Mail")

[1]
Ham Mail
