In [None]:
### Importing the necessary modules ###
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
### Load the labelled training emails (df1) and the unlabelled test emails (df2) ###
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('Email_Dataset.csv', 'Email Test Data.csv')
)

In [None]:
# Preview the first five rows of the raw training data
df1.head(n=5)

Unnamed: 0.1,Unnamed: 0,CATEGORY,MESSAGE
0,1000,Spam,\n\nThe Internet's Online Pharmacy\n\n\n\nViag...
1,1001,Spam,------=_NextPart_000_00B0_35C58D0E.D7267B06\n\...
2,1002,Spam,"<html>\n\n\n\n<head>\n\n<meta http-equiv=""Cont..."
3,1003,Spam,------=_NextPart_000_00E4_86E61E0A.B5488E11\n\...
4,1004,Spam,BARRISTER ADEWALE COKER CHAMBERS\n\nLegal Prac...


In [None]:
# Preview the first five rows of the raw test data
df2.head(n=5)

Unnamed: 0.1,Unnamed: 0,MESSAGE
0,0,"Dear Homeowner,\n\n \n\nInterest Rates are at ..."
1,1,ATTENTION: This is a MUST for ALL Computer Use...
2,2,This is a multi-part message in MIME format.\n...
3,3,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...
4,4,This is the bottom line. If you can GIVE AWAY...


In [None]:
### Blank out missing values: every null cell in either frame becomes an empty string ###
df_train = df1.mask(df1.isna(), '')
df_test = df2.mask(df2.isna(), '')

In [None]:
# Preview the training frame after the missing-value replacement
df_train.head(n=5)

Unnamed: 0.1,Unnamed: 0,CATEGORY,MESSAGE
0,1000,Spam,\n\nThe Internet's Online Pharmacy\n\n\n\nViag...
1,1001,Spam,------=_NextPart_000_00B0_35C58D0E.D7267B06\n\...
2,1002,Spam,"<html>\n\n\n\n<head>\n\n<meta http-equiv=""Cont..."
3,1003,Spam,------=_NextPart_000_00E4_86E61E0A.B5488E11\n\...
4,1004,Spam,BARRISTER ADEWALE COKER CHAMBERS\n\nLegal Prac...


In [None]:
# Preview the test frame after the missing-value replacement
df_test.head(n=5)

Unnamed: 0.1,Unnamed: 0,MESSAGE
0,0,"Dear Homeowner,\n\n \n\nInterest Rates are at ..."
1,1,ATTENTION: This is a MUST for ALL Computer Use...
2,2,This is a multi-part message in MIME format.\n...
3,3,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...
4,4,This is the bottom line. If you can GIVE AWAY...


In [None]:
### Dropping the leftover 'Unnamed: 0' column ###
# NOTE(review): presumably a row index that was written into the CSV when it
# was saved; it is not used as a feature anywhere below.
# Reassign instead of dropping in place, and ignore the column already being
# absent, so re-running this cell does not raise a KeyError.
df_train = df_train.drop(columns='Unnamed: 0', errors='ignore')
df_test = df_test.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Preview the training frame once the extra column has been removed
df_train.head(n=5)

Unnamed: 0,CATEGORY,MESSAGE
0,Spam,\n\nThe Internet's Online Pharmacy\n\n\n\nViag...
1,Spam,------=_NextPart_000_00B0_35C58D0E.D7267B06\n\...
2,Spam,"<html>\n\n\n\n<head>\n\n<meta http-equiv=""Cont..."
3,Spam,------=_NextPart_000_00E4_86E61E0A.B5488E11\n\...
4,Spam,BARRISTER ADEWALE COKER CHAMBERS\n\nLegal Prac...


In [None]:
# Report (rows, columns) for the training frame, then for the test frame
print(df_train.shape, df_test.shape, sep='\n')

(4000, 2)
(1000, 1)


In [None]:
### Let's convert the Spam and Not Spam to 1 and 0 ###
df_train.loc[df_train.CATEGORY=='Spam', 'CATEGORY'] = 1
df_train.loc[df_train.CATEGORY=='Not Spam', 'CATEGORY'] = 0

# Fail fast on any label that was not mapped (e.g. a blank produced by the
# missing-value replacement, or a differently spelled category). Without this
# check such values pass through silently and only surface later as an opaque
# error when the labels are cast to int. The check accepts already-encoded
# 0/1 values, so the cell stays safe to re-run.
unmapped_labels = df_train.loc[~df_train['CATEGORY'].isin([0, 1]), 'CATEGORY'].unique()
if len(unmapped_labels):
    raise ValueError(
        f"Unexpected CATEGORY labels (expected 'Spam' or 'Not Spam'): {unmapped_labels.tolist()}"
    )

df_train.head()

Unnamed: 0,CATEGORY,MESSAGE
0,1,\n\nThe Internet's Online Pharmacy\n\n\n\nViag...
1,1,------=_NextPart_000_00B0_35C58D0E.D7267B06\n\...
2,1,"<html>\n\n\n\n<head>\n\n<meta http-equiv=""Cont..."
3,1,------=_NextPart_000_00E4_86E61E0A.B5488E11\n\...
4,1,BARRISTER ADEWALE COKER CHAMBERS\n\nLegal Prac...


In [None]:
# Raw message text is the model input; CATEGORY is the training target.
# The test frame has no CATEGORY column, so only its messages are taken.
X_train = df_train['MESSAGE']
Y_train = df_train['CATEGORY']
X_test = df_test['MESSAGE']

In [None]:
# Make sure the training labels are integers
Y_train = Y_train.astype(int)

# TF-IDF vectorizer that lowercases the text and removes English stop words
feature_extraction = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

# Fit on the training messages only, then apply the same fitted vectorizer
# to the test messages
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [None]:
# Print the training feature matrix; the output lists each stored entry as
# "(row, column)  value".
print(X_train_features)

  (0, 50100)	0.06853224077760911
  (0, 45200)	0.07936311552620393
  (0, 17217)	0.03410456929946066
  (0, 57234)	0.08992338744792722
  (0, 53719)	0.2050677486861983
  (0, 22667)	0.05532015555337328
  (0, 47501)	0.07797564505073296
  (0, 35209)	0.11297612571147951
  (0, 26004)	0.09086126773145262
  (0, 45208)	0.08418983950675601
  (0, 37982)	0.05094079865831264
  (0, 39416)	0.2050677486861983
  (0, 41461)	0.2152130621470623
  (0, 28962)	0.030941625560311502
  (0, 34240)	0.08226909468455003
  (0, 16784)	0.1322874164807977
  (0, 31162)	0.19786953423141534
  (0, 21042)	0.14153901273400876
  (0, 45890)	0.07462801164697835
  (0, 47927)	0.14153901273400876
  (0, 35915)	0.1749426374152584
  (0, 20903)	0.15201574059910145
  (0, 11337)	0.14506899983609062
  (0, 54875)	0.20414296650547026
  (0, 39183)	0.11563576919164867
  :	:
  (3999, 38041)	0.061203379474315686
  (3999, 52259)	0.14443841683221662
  (3999, 54087)	0.04714498326630463
  (3999, 34212)	0.05177190590624254
  (3999, 13377)	0.0505958470

In [None]:
# Let's define the model: a logistic regression classifier created with the
# library's default settings (no arguments are passed).
model = LogisticRegression()

In [None]:
# Train the classifier on the TF-IDF features of the training messages
model.fit(X=X_train_features, y=Y_train)

In [None]:
# Training accuracy: predict on the training features and compare the
# predictions against the training labels
prediction_training_data = model.predict(X=X_train_features)
accuracy_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_training_data,
)

In [None]:
# Display the accuracy computed on the training data (not a held-out set)
accuracy_training_data

0.97175

In [None]:
### Let's make predictions on the test data. ###
# Note: the accuracy measured so far is on the training data only, so it does
# not by itself show how well the model generalizes to these messages.
prediction_test_data = model.predict(X=X_test_features)

In [None]:
# Show the shape of the prediction array, then its first ten entries
print(prediction_test_data.shape, prediction_test_data[:10], sep='\n')

(1000,)
[1 0 1 1 1 1 1 1 1 1]


In [None]:
### Turn the numeric predictions back into 'Spam' / 'Not Spam' strings ###
prediction_test_data = prediction_test_data.astype(str)

# Locate each class by its stringified code, then overwrite it with its label
not_spam_mask = prediction_test_data == '0'
spam_mask = prediction_test_data == '1'
prediction_test_data[not_spam_mask] = 'Not Spam'
prediction_test_data[spam_mask] = 'Spam'

prediction_test_data[:10]

array(['Spam', 'Not Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam',
       'Spam', 'Spam'], dtype='<U11')

In [None]:
### Wrap the predictions in a one-column DataFrame, ready to be saved as CSV ###
pred_series = pd.Series(data=prediction_test_data)
df = pred_series.rename('Predictions').to_frame()
df.head(n=5)


Unnamed: 0,Predictions
0,Spam
1,Not Spam
2,Spam
3,Spam
4,Spam


In [None]:
# Save the predictions; no `index` argument is given, so the row index is
# written as well, as an extra first column.
df.to_csv(path_or_buf='Q7_predictions.csv')