***The goal  is to implement a “logistic regression” model from scratch for email classification. Emails are available as  “spam_ham_dataset.csv” dataset. Each email is labeled as either “spam” or “ham.” ***

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re                              
import string   
import random  
from collections import Counter
from itertools import chain
from numpy.random import rand

In [2]:
from google.colab import drive
drive.mount('/content/drive')

dataset_path = '/content/drive/My Drive/Assignment_1_NLP'

Mounted at /content/drive


## **Read dataset into a pandas dataframe**

In [3]:
col_list = ["text", "label_num"]
dataset_pd = pd.read_csv(os.path.join(dataset_path,'spam_ham_dataset.csv'), usecols=col_list)
print(dataset_pd)

                                                   text  label_num
0     Subject: enron methanol ; meter # : 988291\r\n...          0
1     Subject: hpl nom for january 9 , 2001\r\n( see...          0
2     Subject: neon retreat\r\nho ho ho , we ' re ar...          0
3     Subject: photoshop , windows , office . cheap ...          1
4     Subject: re : indian springs\r\nthis deal is t...          0
...                                                 ...        ...
5166  Subject: put the 10 on the ft\r\nthe transport...          0
5167  Subject: 3 / 4 / 2000 and following noms\r\nhp...          0
5168  Subject: calpine daily gas nomination\r\n>\r\n...          0
5169  Subject: industrial worksheets for august 2000...          0
5170  Subject: important online banking alert\r\ndea...          1

[5171 rows x 2 columns]


In [4]:
#dataset_pd = dataset_pd_original.sample(frac=1)
#print(dataset_pd)

#### Check if the dataset has any samples with N/A labels (unlabled samples)

In [5]:
Total_unique_labels = dataset_pd.groupby('label_num').size()
print(Total_unique_labels)

label_num
0    3672
1    1499
dtype: int64


This dataset does not have any unlabeled samples. So, nothing to drop.

In [6]:
3672/1499

2.4496330887258173

#### Check the number of hams and spams in the dataset

In [7]:
Total_samples = dataset_pd.shape[0]
Total_hams = Total_unique_labels[0]
Total_spams = Total_unique_labels[1]
print("The total number of hams in the dataset is: ",Total_hams)
print("The total number of spams in the dataset is: ",Total_spams)

The total number of hams in the dataset is:  3672
The total number of spams in the dataset is:  1499


In [8]:
list_of_emails = list(dataset_pd['text'])
list_of_labels = list(dataset_pd['label_num'])
type(list_of_emails)

list

# **Data Pre-processing**


### Define a class that contains various methods for text pre-processing

In [9]:
class DataPreprocessing_NLP():
  def __init__(self, input_emails_list):
    self.list_2_preprocess = input_emails_list

  def remove_punctuation(self, emails_list):  # Pnctuations are not important in ham/spam classification
    # Remove all punctuation
    remove_punctuation = lambda x : re.sub('[%s]' % re.escape(string.punctuation), '', x)
    email_list_without_punc = list((map(remove_punctuation, list_of_emails)))   
    return email_list_without_punc

  def Remove_newline(self,emails_list): # get rid of /n and /r as they don't help distinguish ham/spam
    remove_newline_tabs = lambda x : re.sub(r'[\r?\n\r]',' ', x)
    email_list_no_newine_ch = list((map(remove_newline_tabs, emails_list)))
    return email_list_no_newine_ch

  def Remove_digits(self, email_list): # digits are not important for this porblem
    remove_digits = lambda x : re.sub(r'\d','', x)
    email_list_without_digits = list((map(remove_digits, email_list)))
    return email_list_without_digits

  def Tokenization(self,emails_list): 
    words = lambda x: re.sub("[^\w]", " ",  x).split()
    tokenized_email_list = list(map(words, emails_list))
    return tokenized_email_list

  def lower_case(self, email_list): #  lowercase all words so that words to reduce the size of the vocabulary
    lowercase = lambda x :  [w.lower() for w in x]
    email_list_lowercase = list((map(lowercase,email_list)))
    return email_list_lowercase
  
  def Remove_stop_words(self, email_list): # create a list of stopwords based on tje input data and remove them to reduce the size of the vocabulary
  # Define a list of stop words
    stop_words =  ['xls', 'subject', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    # Remove stopwords from a given text
    remove_stopwords = lambda x :  [w for w in x if w not in stop_words]
    email_list_without_stopwords = list((map(remove_stopwords,email_list)))
    return email_list_without_stopwords

### Perform pre-processing

In [10]:
subset_emails = list_of_emails
print(subset_emails[0])
preProcess = DataPreprocessing_NLP(list_of_emails)

emails_no_punc = preProcess.remove_punctuation(subset_emails)
print(emails_no_punc[0])
emails_sp_ch_remove = preProcess.Remove_newline(emails_no_punc)
print(emails_sp_ch_remove[0])
emails_no_digits = preProcess.Remove_digits(emails_sp_ch_remove)
print(emails_no_digits[0])
emails_Tokens = preProcess.Tokenization(emails_no_digits)
print(emails_Tokens[0])
emails_lower = preProcess.lower_case(emails_Tokens)
print(emails_lower[0])
emails_no_stopWords = preProcess.Remove_stop_words(emails_lower)
print(emails_no_stopWords[0])

Preprocessed_emails = emails_no_stopWords

Subject: enron methanol ; meter # : 988291
this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary
flow data provided by daren } .
please override pop ' s daily volume { presently zero } to reflect daily
activity you can obtain from gas control .
this change is needed asap for economics purposes .
Subject enron methanol  meter   988291
this is a follow up to the note i gave you on monday  4  3  00  preliminary
flow data provided by daren  
please override pop  s daily volume  presently zero  to reflect daily
activity you can obtain from gas control 
this change is needed asap for economics purposes 
Subject enron methanol  meter   988291  this is a follow up to the note i gave you on monday  4  3  00  preliminary  flow data provided by daren    please override pop  s daily volume  presently zero  to reflect daily  activity you can obtain from gas control   this change is needed asap for economics purposes 
Subject enron methanol  meter     this is a fo

# **Feature extraction**

In [11]:
# Make a separate list of hams and spams
def split_spam_ham(all_emails, labels):
  hams = []
  spams = []
  for i in range (0,len(labels)):
    if labels[i]==0:
      hams.append(all_emails[i])
    else:
      spams.append(all_emails[i])
  return hams, spams

In [12]:
hams , spams = split_spam_ham(Preprocessed_emails, list_of_labels)
print(len(hams))
print(len(spams))

3672
1499


In [13]:
class FeatureExtraction():
  #Refer to the slides to understand what features ae being used for this problem
  def __init__(self, preProcessedEmails, all_labels):
    self.preProcessedEmails = preProcessedEmails
    self.labels = all_labels

  def word_freq_dict(self, emailSet): #create a dictionary that maps (word,class) to frequency 
    dict_ =  Counter(chain(*emailSet))
    return dict_
  
  def count_spam_ham_words(self, all_data_in, dict_pos, dict_neg): #Create a feature matrix using the dictionary
    pos_feature_list = []
    neg_feature_list = []
    for i in range(0, len(all_data_in)):
      email = all_data_in[i]
      email = list(set(email))
      #print(email)
      Total_spam_words_in_email = 0
      Total_ham_words_in_email = 0

      for word in email:
        if word in dict_pos: #check the dictionary to see how many times this word appears in hams
          counter_spam = dict_pos[word] 
        else:
          counter_spam = 0

        if word in dict_neg: #check the dictionary to see how many times this word appears in spams
          counter_ham = dict_neg[word] 
        else:
          counter_ham = 0

        Total_spam_words_in_email = Total_spam_words_in_email + counter_spam
        Total_ham_words_in_email = Total_ham_words_in_email + counter_ham
      pos_feature_list.append(Total_spam_words_in_email)
      neg_feature_list.append(Total_ham_words_in_email)
    return pos_feature_list, neg_feature_list


In [14]:
Features = FeatureExtraction(Preprocessed_emails, list_of_labels)
hams, spams = split_spam_ham(Preprocessed_emails,list_of_labels)
print(len(hams))
print(len(spams))
labels_hams = list(np.zeros(len(hams)))
labels_spams = list(np.ones(len(spams)))
labels = labels_hams + labels_spams
labels = [int(a) for a in labels]
print(len(labels))

dict_hams = Features.word_freq_dict(hams)
dict_spams = Features.word_freq_dict(spams)
print(dict_hams)
print(dict_spams)

# create postive(hams) and negative(spams feature vector for the entire input set of emails)
ham_words_count, spam_words_count = Features.count_spam_ham_words(hams+spams, dict_hams, dict_spams)
print(ham_words_count)
print(spam_words_count)

3672
1499
5171
[21284, 6306, 22606, 8331, 12647, 20045, 23218, 3176, 14146, 10856, 89118, 48851, 60757, 9873, 33664, 12130, 6306, 2953, 41291, 23313, 11040, 23200, 64178, 44495, 10928, 20854, 51576, 13721, 11178, 6863, 21893, 40554, 9945, 10928, 60221, 68191, 39961, 3837, 10187, 33347, 28618, 10645, 24983, 14507, 22496, 12617, 28439, 13318, 12647, 70751, 5706, 13276, 8913, 9550, 34250, 20992, 26233, 60505, 22266, 40161, 33939, 19115, 22161, 6304, 5057, 11937, 10877, 12398, 2762, 19684, 7056, 6640, 7330, 9076, 26664, 3113, 8213, 15518, 19611, 29130, 59600, 47057, 7878, 57026, 2408, 17832, 8187, 3566, 7245, 31820, 18709, 44375, 19492, 44868, 13359, 941, 22216, 64559, 40706, 69160, 49767, 16090, 14049, 24347, 21230, 9266, 81637, 17564, 3540, 88250, 10894, 10759, 72760, 10638, 15782, 13642, 23240, 20461, 54730, 24229, 20041, 4694, 11767, 6254, 10599, 3317, 14306, 70111, 48036, 44624, 10453, 2383, 82383, 9748, 7095, 68470, 9606, 21175, 20847, 15452, 12081, 25555, 14108, 36048, 11165, 51500,

## **Shuffle the datasetand split into test/train**

So far, we have extracted the features from the emails. We no longer need the original set of emails. We can simply start using the extracted features for feeding into the model. The features are **[SumOfPosFrequencies, SumOfNegFrequencies]**

In [15]:
# Split hams and spams into test and train sets
def test_train_split(ham_word_count, spam_word_count, tot_hams, tot_spams):

  hams_pos = ham_word_count[0:tot_hams]
  #print(hams_pos)
  hams_neg = spam_word_count[0: tot_hams]
  #print(hams_neg)

  spams_pos = ham_word_count[tot_hams: len(ham_word_count)]
  #print(spams_pos)
  spams_neg = spam_word_count[tot_hams: len(ham_word_count) ]
  #print(spams_neg)

  total_hams = len(hams_pos)
  split_idx_hams = int(np.ceil(total_hams*0.7))
  train_hams_pos = hams_pos[0:split_idx_hams]
  #print(train_hams_pos)
  test_hams_pos = hams_pos[split_idx_hams: len(hams)]
  #print(test_hams_pos)
  train_hams_neg = hams_neg[0:split_idx_hams]
  #print(train_hams_neg)
  test_hams_neg = hams_neg[split_idx_hams: len(hams)]
  #print(test_hams_neg)
  train_hams_labels = list(np.zeros(len(train_hams_pos)))
  test_hams_labels = list(np.zeros(len(test_hams_pos)))


  total_spams = len(spams_pos)
  split_idx_spams = int(np.ceil(total_spams*0.7))
  train_spams_pos = spams_pos[0:split_idx_spams]
  #print(train_spams_pos)
  test_spams_pos = spams_pos[split_idx_spams: len(spams)]
  #print(test_spams_pos)
  train_spams_neg = spams_neg[0:split_idx_spams]
  #print(train_spams_neg)
  test_spams_neg = spams_neg[split_idx_spams: len(spams)]
  #print(test_spams_neg)
  train_spams_labels = list(np.ones(len(train_spams_pos)))
  test_spams_labels = list(np.ones(len(test_spams_pos)))

  train_data_pos = train_hams_pos + train_spams_pos
  test_data_pos =  test_hams_pos + test_spams_pos
  train_data_neg = train_hams_neg + train_spams_neg
  test_data_neg =  test_hams_neg + test_spams_neg
  train_labels = train_hams_labels + train_spams_labels 
  test_labels = test_hams_labels + test_spams_labels

  return train_data_pos, test_data_pos, train_data_neg, test_data_neg, train_labels, test_labels

# Randomly shuffle the test and train dataset
def random_shuffle(train_data_pos, test_data_pos, train_data_neg, test_data_neg, train_labels, test_labels):
  
  temp1 = list(zip(train_data_pos, train_data_neg, train_labels ))
  random.shuffle(temp1)
  train_data_pos, train_data_neg, train_labels = zip(*temp1)
  train_data_pos, train_data_neg, train_labels = list(train_data_pos), list(train_data_neg), list(train_labels)
  train_labels = [int(item) for item in train_labels]

  temp2 = list(zip(test_data_pos, test_data_neg, test_labels))
  random.shuffle(temp2)
  test_data_pos, test_data_neg, test_labels = zip(*temp2)
  test_data_pos, test_data_neg, test_labels = list(test_data_pos), list(test_data_neg), list(test_labels)
  test_labels = [int(item) for item in test_labels]

  return train_data_pos, train_data_neg, train_labels, test_data_pos, test_data_neg,  test_labels

In [16]:
train_data_pos, test_data_pos, train_data_neg, test_data_neg, train_labels, test_labels = test_train_split(ham_words_count, spam_words_count, len(hams), len(spams))
train_data_pos, train_data_neg, train_labels, test_data_pos, test_data_neg,  test_labels = random_shuffle(train_data_pos, test_data_pos, train_data_neg, test_data_neg, train_labels, test_labels)

print(train_data_pos[0:10])
print(test_data_pos[0:10])
print(train_data_neg[0:10])
print(test_data_neg[0:10])
print(train_labels[0:10])
print(test_labels[0:10])

[26664, 42324, 18905, 17892, 12, 7999, 12308, 80536, 241, 16674]
[63426, 15539, 63314, 10346, 44035, 6084, 13198, 16786, 17680, 9439]
[5160, 15790, 1335, 2667, 98, 779, 762, 5176, 589, 1602]
[3453, 1437, 5368, 65, 2404, 4671, 729, 1236, 1141, 6846]
[0, 0, 0, 0, 1, 0, 0, 0, 1, 0]
[0, 0, 0, 0, 0, 1, 0, 0, 0, 1]


In [17]:
# Data statistics after test and train split
total_hams_train = train_labels.count(0)
total_spams_train = train_labels.count(1)
total_hams_test = test_labels.count(0)
total_spams_test = test_labels.count(1)

print(total_hams_train)
print(total_spams_train)
print(total_hams_test)
print(total_spams_test)

2571
1050
1101
449


In [18]:
# Create input matrices for feeding into the model
X_train = np.array((train_data_pos, train_data_neg), dtype=int)
X_train = X_train.T
y_train = train_labels
print(np.shape(X_train))

X_test = np.array((test_data_pos, test_data_neg), dtype=int)
X_test = X_test.T
y_test = test_labels
print(np.shape(X_test))

(3621, 2)
(1550, 2)


# **Logistic Regression model**

In [19]:
class LogisticRegression_model():
  def __init__(self, max_itr, learning_rate):
    self.max_itr = max_itr
    self.alpha = learning_rate
  
  def _sigmoid_function(self, x):
    if x >= 0:
        z = np.exp(-x)
        return 1 / (1 + z)
    else:
        z = np.exp(x)
        return z / (1 + z)

  def _sigmoid(self, x):
    return np.array([self._sigmoid_function(value) for value in x])
  
  def fit_LogisticRegression(self, X, y):
    n_features = X.shape[1]
    m_samples = X.shape[0]
    W = rand(n_features)
    bias = 0

    #gradient descent steps
    for itr in range(0, self.max_itr):
      sig_term = np.matmul(W, X.transpose()) + bias
      y_pred = self._sigmoid(sig_term)
      difference = y_pred - y

      # binary cross entropy loss / cost function
      #loss = np.array(y) * np.log(np.array(y_pred) + 1e-9) + (1-np.array(y)) * np.log(1 - np.array(y_pred) + 1e-9)
      #loss =  -np.mean(loss)
      #print("The loss is: ",loss)
      
      #compute gradients
      gradient_b = np.mean(difference)
      gradients_w = np.matmul(X.transpose(), difference)
      gradients_w = np.array([np.mean(grad) for grad in gradients_w])

      #update model parameters , w and b
      W = W - self.alpha*gradients_w
      bias = bias - self.alpha*gradient_b

      pred_to_class = [1 if p>0.5 else 0 for p in y_pred]

    return pred_to_class, W, bias

  def evaluate_LogisticRegression(self, weights, bias, X):
   x_dot_weights = np.matmul(X, weights.transpose()) + bias
   prob_pred = self._sigmoid(x_dot_weights)
   pred_to_class = [1 if p>0.5 else 0 for p in prob_pred]
   return pred_to_class

In [20]:
#find Accuracy, Precision, Recall, F1-score

def Model_metrics(y_true, y_pred):
  TP=0
  TN=0
  FP=0
  FN=0

  y_true = list(y_true)

  for i in range(0, len(y_pred)):
    if y_true[i]==0 and y_pred[i]==0:
      TP = TP+1
    elif y_true[i]==1 and y_pred[i]==1:
      TN = TN+1
    elif y_true[i]==0 and y_pred[i]==1:
      FN = FN+1
    elif y_true[i]==1 and y_pred[i]==0:
      FP = FP+1
    else:
      print("something went wrong!")
  Accuracy = (TP+TN)/ (TP+TN+FP+FN)
  Recall = TP/(TP+FN)
  Precision = TP/(TP+FP)
  F1_score = (2*Precision*Recall) /(Precision + Recall)

  print("The TP is: ", TP)
  print("The TN is: ", TN)
  print("The FN is: ", FN)
  print("The FP is: ", FP)

  return Accuracy, Recall, Precision, F1_score, 

In [21]:
Model_LR =  LogisticRegression_model(10,0.001)
prediction_train, model_weights, model_bias = Model_LR.fit_LogisticRegression(X_train, y_train)
#print(prediction_train)
print("The learned model weights are: ",model_weights)
print("The learned model bias is: ",model_bias)

The learned model weights are:  [-13176.76552518  28285.3670796 ]
The learned model bias is:  0.0011704552413194463


## Check the metrics on the train set

In [22]:
Train_acc, Train_recall, Train_precision, Train_F1score = Model_metrics(y_train, prediction_train)
print(Train_acc)
print(Train_recall)
print(Train_precision)
print(Train_F1score)

The TP is:  2522
The TN is:  940
The FN is:  49
The FP is:  110
0.956089478044739
0.9809412679891093
0.9582066869300911
0.9694407072842591


## Evaluate the model on the test set

In [23]:
# evaluate the model on the test set
prediction_test = Model_LR.evaluate_LogisticRegression(model_weights, model_bias,X_test)
Test_acc, Test_recall, Test_precision, Test_F1score = Model_metrics(y_test, prediction_test)
print(Test_acc)
print(Test_recall)
print(Test_precision)
print(Test_F1score)

The TP is:  1080
The TN is:  409
The FN is:  21
The FP is:  40
0.9606451612903226
0.9809264305177112
0.9642857142857143
0.9725348941918056


#**Implement the Logistic Regression using sklearn library**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
predictions = clf.predict(X_test)

In [None]:
Test_acc_sklearn, Test_recall_sklearn, Test_precision_sklearn, Test_F1score_sklearn = Model_metrics(y_test, predictions)
print(Test_acc_sklearn)
print(Test_recall_sklearn)
print(Test_precision_sklearn)
print(Test_F1score_sklearn)

The TP is:  1054
The TN is:  430
The FN is:  47
The FP is:  19
0.9574193548387097
0.9573115349682108
0.9822926374650512
0.9696412143514259


#### **The performance of the model built from scratch is comparable to the sklearn implementation of the LR model**