# Predicting spam mail 

In [53]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Pre-Processing

In [54]:
# loading the data from csv file to a pandas Dataframe
filepath = '/Users/ramezalghazawi/Desktop/machine learning projects/Classification/mail_data.csv'
raw_mail_data = pd.read_csv(filepath)

In [55]:
raw_mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [101]:
raw_mail_data.isnull().sum().all()

False

In [102]:
#since we don't have any missing values no need to write a code the replace the null values with a null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [103]:
# printing the dataframe
mail_data

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [83]:
# checking the number of rows and columns in the dataframe
mail_data.shape

(5572, 2)

Label Encoding

In [84]:
# label spam mail as 0;  ham mail as 1;

mail_data.loc[mail_data['Category'] == 'spam', 'Category',] = 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category',] = 1

### spam = 0, and ham = 1

In [85]:
# separating the data as texts and label

X = mail_data['Message']

Y = mail_data['Category']

In [61]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [62]:
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

### Splitting the data into training data and test data

In [63]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [64]:
print(f"shape of X :  {X.shape}, shape of X_train : {X_train.shape}, shape of X_test {X_test.shape}")

shape of X :  (5572,), shape of X_train : (4457,), shape of X_test (1115,)


### Extraction the Feature 

In [65]:
# transform the text data to feature vectors using TfidfVectorizer so that it can be used as input to the Logistic regression

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase='True')

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [66]:
X_train

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object

In [67]:
X_train_features

<4457x7431 sparse matrix of type '<class 'numpy.float64'>'
	with 34775 stored elements in Compressed Sparse Row format>

### Training the Model using Logistic Regression

In [68]:
model = LogisticRegression()

In [69]:
# training the Logistic Regression model with the training data
model.fit(X_train_features, Y_train)

LogisticRegression()

Evaluating the trained model

In [70]:
# prediction on training data

prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [71]:
print(f"Accuracy on training data : {accuracy_on_training_data}")

Accuracy on training data : 0.9670181736594121


In [72]:
# prediction on test data

prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [73]:
print(f"Accuracy on test data : {accuracy_on_test_data}")

Accuracy on test data : 0.9659192825112107


### Building a Predictive System

In [74]:
#testing an accual spam email from my mailbox 
input_mail = ["We the IMF wish to inform you that your total compensation fund of $2.7Million USA Dollars will be transferring to you daily $5,100 through Western Union. So contact Mr. Terry Young."]

# convert text to feature vectors using a simple if else statment to diplay spam or ham 
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

[1]
Ham mail


### a real world example produceed a False Positive, it might be because of the dataset not having wide range of samples, so i will use another dataset and try to see if it will predict the spam email correctly 

In [76]:
#but before doing that we will try another example 


#testing an accual spam email from my mailbox 
input_mail = ["We have determined that you are eligible to receive a tax refund of 209.27 GBP. Please submit the tax refund request"]

# convert text to feature vectors using a simple if else statment to diplay spam or ham 
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

[1]
Ham mail


In [97]:
# loading the data from csv file to a pandas Dataframe
filepath = '/Users/ramezalghazawi/Desktop/machine learning projects/Classification/spam_ham_dataset.csv'
raw_mail_data_2 = pd.read_csv(filepath)

In [98]:
raw_mail_data_2

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [99]:
#checking if there is a missing values 
raw_mail_data_2.isnull().sum().all()
#no missing data

False

In [106]:
# separating the data as texts and label

X_2 = raw_mail_data_2['text']

Y_2 = raw_mail_data_2['label_num']

In [107]:
X_train_2, X_test_2, Y_train_2, Y_test_2 = train_test_split(X_2, Y_2, test_size=0.2, random_state=3)

In [108]:
# transform the text data to feature vectors using TfidfVectorizer so that it can be used as input to the Logistic regression

feature_extraction_2 = TfidfVectorizer(min_df = 1, stop_words='english', lowercase='True')

X_train_features_2 = feature_extraction_2.fit_transform(X_train_2)
X_test_features_2 = feature_extraction_2.transform(X_test_2)

# convert Y_train and Y_test values as integers

Y_train_2 = Y_train_2.astype('int')
Y_test_2 = Y_test_2.astype('int')

In [109]:
model_2 = LogisticRegression()

In [110]:
# training the Logistic Regression model with the training data
model_2.fit(X_train_features_2, Y_train_2)

LogisticRegression()

In [111]:
# prediction on training data

prediction_on_training_data_2 = model_2.predict(X_train_features_2)
accuracy_on_training_data_2 = accuracy_score(Y_train_2, prediction_on_training_data_2)

In [112]:
print(f"Accuracy on training data 2 : {accuracy_on_training_data_2}")

Accuracy on training data 2 : 0.9970986460348162


In [113]:
# prediction on test data

prediction_on_test_data_2 = model_2.predict(X_test_features_2)
accuracy_on_test_data_2 = accuracy_score(Y_test_2, prediction_on_test_data_2)

In [114]:
print(f"Accuracy on test data : {accuracy_on_test_data_2}")

Accuracy on test data : 0.9806763285024155


In [119]:
#testing an accual spam email from my mailbox 
input_mail_2 = ["We the IMF wish to inform you that your total compensation fund of $2.7Million USA Dollars will be transferring to you daily $5,100 through Western Union. So contact Mr. Terry Young."]
input_mail_2_e = ["We have determined that you are eligible to receive a tax refund of 209.27 GBP. Please submit the tax refund request"]
# convert text to feature vectors using a simple if else statment to diplay spam or ham 
input_data_features_2 = feature_extraction.transform(input_mail_2)
input_data_features_2_e = feature_extraction.transform(input_mail_2_e)
# making prediction

prediction_2 = model.predict(input_data_features_2)
print(prediction)
if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')
prediction_2_e = model.predict(input_data_features_2_e)
print(prediction)

if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

[1]
Ham mail
[1]
Ham mail


### also in the second example it did not recognises the spam email 
### in the two examples it output the same as the last dataset 
### need to see the look more into this 