##Getting the data

In [40]:
import pandas as pd
raw_data = pd.read_csv("mail_data.csv")
raw_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [41]:
raw_data.Category.value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [42]:
data = raw_data.where((pd.notnull(raw_data)),'')
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Categorizing the data

In [43]:
data.loc[data.Category == 'spam', 'Category'] = 0
data.loc[data.Category == 'ham', 'Category'] = 1
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [44]:
x = data['Message']
y = data['Category']

##Vectorizing the text data

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase = True)
vector_x = vectorizer.fit_transform(x)

In [46]:
print(vector_x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 43529 stored elements and shape (5572, 8440)>
  Coords	Values
  (0, 4244)	0.35097479377088364
  (0, 5775)	0.27451666535585145
  (0, 2278)	0.27180581777101714
  (0, 1281)	0.26251769953088055
  (0, 1715)	0.29650492406235857
  (0, 3551)	0.19387866945820545
  (0, 8281)	0.23740715800944148
  (0, 4370)	0.29650492406235857
  (0, 1713)	0.3350433781715565
  (0, 2003)	0.29650492406235857
  (0, 3511)	0.16453831818791093
  (0, 1061)	0.35097479377088364
  (0, 8079)	0.1961033223643189
  (1, 5373)	0.2718944069420321
  (1, 4406)	0.4083258549263009
  (1, 4212)	0.5236804332035243
  (1, 8187)	0.43162957585464123
  (1, 5399)	0.5466243141314314
  (2, 3276)	0.11676028650249681
  (2, 2885)	0.36440225960212075
  (2, 8239)	0.19287984407221892
  (2, 2119)	0.19686982823560253
  (2, 8199)	0.14953315491852773
  (2, 3014)	0.47550942852592687
  (2, 2337)	0.20418515380343544
  :	:
  (5567, 2777)	0.23210746089026935
  (5567, 307)	0.24294734175129457
  (5567

##Splitting the data

In [47]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(vector_x, y, test_size = 0.2, random_state = 3)
print(x.shape, x_train.shape, x_test.shape)

(5572,) (4457, 8440) (1115, 8440)


Changing the types to Int

In [48]:
y_train.value_counts()
y_train.info()
y_train_int = y_train.astype('int')
y_test_int = y_test.astype('int')

<class 'pandas.core.series.Series'>
Index: 4457 entries, 3075 to 1688
Series name: Category
Non-Null Count  Dtype 
--------------  ----- 
4457 non-null   object
dtypes: object(1)
memory usage: 69.6+ KB


In [49]:
x_train_int = x_train.astype('int')
x_test_int = x_test.astype('int')

##Model: Logistic Regression

In [50]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train_int, y_train_int)

In [51]:
predic_y = model.predict(x_test_int)

##Prediction Score

In [52]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test_int, predic_y)

0.8609865470852018

##Real time prediction

In [53]:
input = ("Nah I don't think he goes to usf, he lives around here though")
vector_input = vectorizer.transform([input])
prediction = model.predict(vector_input)
print(prediction)
if prediction[0] == 1:
  print("Ham mail")
else:
  print("Spam mail")

[1]
Ham mail
