In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read the e-mail dataset from the CSV file into a DataFrame
raw_mail_data = pd.read_csv(filepath_or_buffer='email.csv')

# Show the first five rows as a quick sanity check of the load
raw_mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# (rows, columns) of the table exactly as loaded, before any cleaning
raw_mail_data.shape

(5573, 2)

In [5]:
# Keep every non-null cell as is and put an empty string wherever a value is missing
mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')

In [6]:
# Preview the first five rows after missing values were replaced with empty strings
mail_data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Numeric encoding of the target used throughout this notebook: spam -> 0, ham -> 1.
# The already-encoded values map to themselves so that re-running this cell is a
# no-op instead of filtering out every row.
LABEL_TO_ID = {'spam': 0, 'ham': 1, 0: 0, 1: 1}

# Drop any row whose Category is not a recognised label. `.copy()` makes the
# result an independent frame, so the column assignment below does not write
# into a slice of the previous frame (which triggers SettingWithCopyWarning).
mail_data = mail_data[mail_data['Category'].isin(list(LABEL_TO_ID))].copy()

# Replace the whole column in one step and give it a real integer dtype rather
# than writing ints cell-by-cell into a string column.
mail_data['Category'] = mail_data['Category'].map(LABEL_TO_ID).astype(int)

In [8]:
# Preview the first five rows; Category now holds the numeric codes (spam = 0, ham = 1)
mail_data.head(5)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Features are the raw message texts; the target is the encoded Category column
X, Y = mail_data['Message'], mail_data['Category']

In [10]:
# Hold out 10% of the messages for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=232,
)

In [11]:
# Sizes of the full set, the training split and the test split, in that order
print(*(messages.shape for messages in (X, X_train, X_test)))

(5572,) (5014,) (558,)


In [15]:
# Turn raw message text into TF-IDF feature vectors: tokens are lower-cased,
# English stop words are dropped, and min_df=1 keeps every remaining term.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then
# apply that same fitted transform to the test messages so nothing from the
# test split influences the features.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Report the vocabulary size plus a short alphabetical sample of terms instead
# of printing the entire term -> column-index dict (thousands of entries).
print(len(feature_extraction.vocabulary_))
print(sorted(feature_extraction.vocabulary_)[:10])

# Make sure the labels are integers before they are handed to the classifier
# (they may still be stored with object dtype at this point).
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

7932


In [46]:
# Print the TF-IDF training matrix; as a sparse matrix it prints as a summary
# header followed by (row, column) coordinates and their stored values
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 39238 stored elements and shape (5014, 7932)>
  Coords	Values
  (0, 3129)	0.5303945739722681
  (0, 2860)	0.2720289116949356
  (0, 3282)	0.19129216302599938
  (0, 5954)	0.37608855113879736
  (0, 3882)	0.35881313845346463
  (0, 4201)	0.28490801449333
  (0, 7640)	0.21070905205738122
  (0, 4898)	0.1988934728488023
  (0, 1469)	0.37608855113879736
  (0, 3373)	0.17699608939800263
  (1, 3411)	0.4406941928663958
  (1, 5056)	0.493948374698412
  (1, 7126)	0.34595643482301924
  (1, 7642)	0.4676000858399185
  (1, 4210)	0.4727239537337622
  (2, 7577)	0.16781625867947236
  (2, 4170)	0.20590057302898865
  (2, 7251)	0.15886837588955374
  (2, 1858)	0.2518138991757599
  (2, 1803)	0.32350480154384464
  (2, 4679)	0.13549173160946
  (2, 4321)	0.2518138991757599
  (2, 7291)	0.2006993850639259
  (2, 7776)	0.1685770385247751
  (2, 6193)	0.12441246640896633
  :	:
  (5009, 4686)	0.348266076905156
  (5009, 490)	0.348266076905156
  (5010, 3282)	0.286488

In [17]:
# Logistic-regression classifier created with no arguments, i.e. library defaults
model = LogisticRegression()

In [18]:
# Train the classifier on the TF-IDF training features and their labels
model.fit(X_train_features, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [19]:
# Score the fitted model on the same messages it was trained on
prediction_on_training_data = model.predict(X_train_features)
accuracy_score_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [20]:
# Accuracy of the model's predictions on the training split
print(accuracy_score_on_training_data)

0.9704826485839649


In [21]:
# Score the fitted model on the held-out messages it has not seen during training
prediction_on_test_data = model.predict(X_test_features)
accuracy_score_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [22]:
# Accuracy of the model's predictions on the held-out test split
print(accuracy_score_on_test_data)

0.967741935483871


In [23]:
# One message to classify, wrapped in a list because the vectorizer takes a collection of documents
input_mail = [
    "07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow"
]

# Vectorize with the already-fitted TF-IDF transform, then classify
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
print(prediction)

# Label encoding used in this notebook: 1 = ham, anything else is reported as spam
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[0]
Spam mail
