In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
raw_mail_data = pd.read_csv('mail_data.csv')

In [7]:
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
raw_mail_data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [9]:
raw_mail_data.isnull().count()

Category    5572
Message     5572
dtype: int64

In [10]:
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [11]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
mail_data.loc[mail_data['Category'] == 'spam', 'Category',]= 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category',]= 1

In [13]:
## segregating the values into X and Y

In [19]:
X = mail_data['Message']
Y = mail_data['Category']

In [20]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [21]:
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [22]:
## now we are splitting the data into test and train

In [23]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.2,random_state=3)

In [28]:
print(X_train.shape)
print(X_test.shape)
print(X.shape)

(4457,)
(1115,)
(5572,)


In [29]:
## now we have done feature extraction which can be used for input

In [60]:
feature_ex = TfidfVectorizer(min_df=1,binary=True,stop_words='english')

X_train_fe = feature_ex.fit_transform(X_train)
X_test_fe = feature_ex.fit_transform(X_test)

## now we are converting Y data values as integer

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [61]:
print(X_train_fe)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.2572302747018002
  (4, 5497)	0.1713556864939963
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (445

# Logistic Regression

In [62]:
model = LogisticRegression()

In [75]:
model.fit(X_train_fe,Y_train)


In [76]:
#predicting on trained data

prediction_on_trained_data = model.predict(X_train_fe)

In [77]:
# comparing the outcome with Y_train data

accuracy_on_train_data = accuracy_score(Y_train,prediction_on_trained_data)

In [79]:
print("Accuracy is :",accuracy_on_train_data)

Accuracy is : 0.9645501458380076


In [80]:
model.fit(X_test_fe,Y_test)

In [81]:
#predicting on test data
prediction_on_test_data = model.predict(X_test_fe)

# comparing the outcome with Y_test data
accuracy_on_test_data = accuracy_score(Y_test,prediction_on_test_data)

In [82]:
print("Accuracy is :" ,accuracy_on_test_data)

Accuracy is : 0.9165919282511211


# Building the predictive model

In [101]:
input_email = ["07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow"]

In [102]:
# let's convert the given text into feature extraction

input_data = feature_ex.transform(input_email)

In [103]:
prediction = model.predict(input_data)

if (prediction[0] == 1):
    print("ham mail.")
else:
    print("spam mail.")

spam mail.
