In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount='True')
import pandas as pd
import numpy as np
import math
import re

Mounted at /content/drive


In [None]:
def data_prep_bernoulli(sms_spam, sms_spam_test):
  #Data Cleaning
  sms_spam['SMS'] = sms_spam['SMS'].str.replace('\W', ' ', regex=True) # Removes punctuation
  sms_spam['SMS'] = sms_spam['SMS'].str.lower() # Lowercase
  sms_spam['SMS'] = sms_spam['SMS'].str.split() # Word Split
  sms_spam_test['SMS'] = sms_spam_test['SMS'].str.replace('\W', ' ', regex=True) # Removes punctuation
  sms_spam_test['SMS'] = sms_spam_test['SMS'].str.lower() # Lowercase
  sms_spam_test['SMS'] = sms_spam_test['SMS'].str.split()
  sms_spam['Label'] = sms_spam['Label'].replace({'ham': 0, 'spam': 1})
  sms_spam_test['Label'] = sms_spam_test['Label'].replace({'ham': 0, 'spam': 1})
  vocabulary = []
  for sms in sms_spam['SMS']:
    for word in sms:
        vocabulary.append(word)

  vocabulary = list(set(vocabulary)) # Gives Unique Vocabulary

  word_counts_per_sms = {unique_word: [0] * len(sms_spam['SMS']) for unique_word in vocabulary}
  word_counts_per_sms_spam = {unique_word: [0] * len(sms_spam_test['SMS']) for unique_word in vocabulary}

  for index, sms in enumerate(sms_spam['SMS']):
    for word in sms:
        if word_counts_per_sms[word][index] == 0:
          word_counts_per_sms[word][index] += 1

  word_counts = pd.DataFrame(word_counts_per_sms)
  sms_spam_clean = pd.concat([sms_spam, word_counts], axis=1)

  for index1, sms1 in enumerate(sms_spam_test['SMS']):
    for word1 in sms1:
      if word1 in word_counts_per_sms_spam:
        if word_counts_per_sms_spam[word1][index1] == 0:
          word_counts_per_sms_spam[word1][index1] += 1

  word_counts_spam = pd.DataFrame(word_counts_per_sms_spam)
  train_set_X = word_counts.to_numpy()
  train_set_X = train_set_X.reshape(train_set_X.shape[0], -1).T
  train_set_Y = sms_spam['Label'].to_numpy()
  train_set_Y = train_set_Y.reshape(train_set_Y.shape[0], -1).T
  test_set_X = word_counts_spam.to_numpy()
  test_set_X = test_set_X.reshape(test_set_X.shape[0], -1).T
  test_set_Y = sms_spam_test['Label'].to_numpy()
  test_set_Y = test_set_Y.reshape(test_set_Y.shape[0], -1).T

  return sms_spam_clean, train_set_X, train_set_Y, test_set_X, test_set_Y

def data_prep_bow(sms_spam, sms_spam_test):
  #Data Cleaning
  sms_spam['SMS'] = sms_spam['SMS'].str.replace('\W', ' ', regex=True) # Removes punctuation
  sms_spam['SMS'] = sms_spam['SMS'].str.lower() # Lowercase
  sms_spam['SMS'] = sms_spam['SMS'].str.split() # Word Split
  sms_spam_test['SMS'] = sms_spam_test['SMS'].str.replace('\W', ' ', regex=True) # Removes punctuation
  sms_spam_test['SMS'] = sms_spam_test['SMS'].str.lower() # Lowercase
  sms_spam_test['SMS'] = sms_spam_test['SMS'].str.split()
  sms_spam['Label'] = sms_spam['Label'].replace({'ham': 0, 'spam': 1})
  sms_spam_test['Label'] = sms_spam_test['Label'].replace({'ham': 0, 'spam': 1})
  vocabulary = []
  for sms in sms_spam['SMS']:
    for word in sms:
        vocabulary.append(word)

  vocabulary = list(set(vocabulary)) # Gives Unique Vocabulary

  word_counts_per_sms = {unique_word: [0] * len(sms_spam['SMS']) for unique_word in vocabulary}
  word_counts_per_sms_spam = {unique_word: [0] * len(sms_spam_test['SMS']) for unique_word in vocabulary}

  for index, sms in enumerate(sms_spam['SMS']):
    for word in sms:
      word_counts_per_sms[word][index] += 1

  word_counts = pd.DataFrame(word_counts_per_sms)
  sms_spam_clean = pd.concat([sms_spam, word_counts], axis=1)

  for index1, sms1 in enumerate(sms_spam_test['SMS']):
    for word1 in sms1:
      if word1 in word_counts_per_sms_spam:
        word_counts_per_sms_spam[word1][index1] += 1

  word_counts_spam = pd.DataFrame(word_counts_per_sms_spam)
  train_set_X = word_counts.to_numpy()
  train_set_X = train_set_X.reshape(train_set_X.shape[0], -1).T
  train_set_Y = sms_spam['Label'].to_numpy()
  train_set_Y = train_set_Y.reshape(train_set_Y.shape[0], -1).T
  test_set_X = word_counts_spam.to_numpy()
  test_set_X = test_set_X.reshape(test_set_X.shape[0], -1).T
  test_set_Y = sms_spam_test['Label'].to_numpy()
  test_set_Y = test_set_Y.reshape(test_set_Y.shape[0], -1).T

  return sms_spam_clean, train_set_X, train_set_Y, test_set_X, test_set_Y

def sigmoid(z):

    return 1 / (1 + np.exp(-z))

def initialize_with_zeros(dim):
  
    w = np.zeros((dim, 1))
    b = 0
    
    return w, b

def optimize(w, b, X, Y, num_iterations, learning_rate, reg, print_cost = False):

    costs = []   
    for i in range(num_iterations):
        # Cost and gradient calculation
        grads, cost = propagate(w,b,X,Y,reg)
        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]
        # update 
        w = w - learning_rate * dw 
        b = b - learning_rate * db
        # Record the costs
        if i % 100 == 0:
            costs.append(cost)
        # Print the cost every 100 training iterations
        #if print_cost and i % 100 == 0:
            #print ("Cost after iteration %i: %f" %(i, cost))
    params = {"w": w,"b": b}
    grads = {"dw": dw,"db": db}
    
    return params, grads, costs

def predict(w, b, X, Y):

    tp = 0
    tn = 0
    fp = 0
    fn = 0
    m = X.shape[1]
    Y_prediction = np.zeros((1,m))
    w = w.reshape(X.shape[0], 1)
    A = sigmoid(np.dot(w.T, X) + b)
    
    for i in range(A.shape[1]):
      
        if A[0,i] >= 0.5:
            Y_prediction[0,i] = 1
            if Y_prediction[0,i] == Y[0][i]:
              tp += 1
            if Y_prediction[0,i] != Y[0][i]:
              fp += 1
        else:
            Y_prediction[0,i] = 0
            if Y_prediction[0,i] == Y[0][i]:
              tn += 1
            if Y_prediction[0,i] != Y[0][i]:
              fn += 1
    
    return Y_prediction, tp, tn, fp, fn

def model(X_train, Y_train, X_test, Y_test, num_iterations, learning_rate, reg, print_cost = False):

    w, b = initialize_with_zeros(X_train.shape[0])
    parameters, grads, costs = optimize(w,b,X_train,Y_train,num_iterations,learning_rate,reg,print_cost)
    w = parameters["w"]
    b = parameters["b"]
    Y_prediction_test, tp, tn, fp, fn = predict(w,b,X_test, Y_test)
    accuracy = (tp+tn)/Y_test.shape[1]
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1_score = (2 * precision *recall)/(precision + recall)
    # Print train/test Errors
    d = {"costs": costs,
         #"Y_prediction_test": Y_prediction_test, 
         #"Y_prediction_train" : Y_prediction_train, 
         "w" : w, 
         "b" : b,
         "learning_rate" : learning_rate,
         "num_iterations": num_iterations}
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 Score:', f1_score)
    return d

def propagate(w, b, X, Y, reg):
    
    m = X.shape[1]
    # FORWARD PROPAGATION (FROM X TO COST)
    A = sigmoid(np.dot(w.T, X) + b)  # compute activation
    cost = (-1/m) * np.sum((Y * np.log(A)) + (1 - Y) * np.log(1 - A), axis = 1) + (reg/(2*m))*(np.linalg.norm(w, ord=2)) # compute cost and regularize
    # BACKWARD PROPAGATION (TO FIND GRAD)
    dw = (1/m) * np.dot(X, (A-Y).T) + (reg)*(np.linalg.norm(w, ord=2))/m
    db = (1/m) * np.sum(A - Y, axis = 1)
    cost = np.squeeze(cost)
    
    grads = {"dw": dw,
             "db": db}
    #print("Cost = ",cost)
    #print("Reg = ",(reg)*(np.square(w))/m)
    return grads, cost

def tuning(X_train, Y_train, X_test, Y_test, num_iterations, learning_rate, reg, print_cost = False):

    w, b = initialize_with_zeros(X_train.shape[0])
    parameters, grads, costs = optimize(w,b,X_train,Y_train,num_iterations,learning_rate,reg,print_cost)
    w = parameters["w"]
    b = parameters["b"]
    Y_prediction_test, tp, tn, fp, fn = predict(w,b,X_test, Y_test)
    accuracy = (tp+tn)/Y_test.shape[1]
    return accuracy

def cross_valdation(sms_spam_clean, test_set_X, test_set_Y):
  my_dict = {}
  #Cross Validation
  validation_data = sms_spam_clean.sample(frac=0.3, random_state=25)
  validation_data = validation_data.drop('SMS', axis = 1)
  validation_label = validation_data['Label']
  validation_data = validation_data.drop('Label', axis = 1)
  validation_data = validation_data.to_numpy()
  validation_data = validation_data.reshape(validation_data.shape[0], -1).T
  validation_label = validation_label.to_numpy()
  validation_label = validation_label.reshape(validation_label.shape[0], -1).T
  reg_param = [0.001,  0.003,  0.005, 0.01,  0.03,  0.05, 0.1, 0.3, 0.5, 1]
  for param in reg_param:
    accuracy = tuning(validation_data, validation_label, test_set_X, test_set_Y, num_iterations = 1000, learning_rate = 0.005,reg = param, print_cost = True)
    my_dict.update({param : accuracy})
  print(max(my_dict, key=my_dict.get))
  return max(my_dict, key=my_dict.get)

In [None]:
# HW Dataset Bernoulli
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/hwtrain.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/hwtest.csv', header=None, names=['Label', 'SMS'])
sms_spam_clean, train_set_X, train_set_Y, test_set_X, test_set_Y = data_prep_bernoulli(sms_spam, sms_spam_test)
reg_param = cross_valdation(sms_spam_clean, test_set_X, test_set_Y)
d = model(train_set_X, train_set_Y, test_set_X, test_set_Y, num_iterations = 2000, learning_rate = 0.005,reg = reg_param, print_cost = True)

0.001
Accuracy: 0.895397489539749
Precision: 0.925531914893617
Recall: 0.6692307692307692
F1 Score: 0.7767857142857142


In [None]:
# HW Dataset Bag of Words
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/hwtrain.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/hwtest.csv', header=None, names=['Label', 'SMS'])
sms_spam_clean, train_set_X, train_set_Y, test_set_X, test_set_Y = data_prep_bow(sms_spam, sms_spam_test)
reg_param = cross_valdation(sms_spam_clean, test_set_X, test_set_Y)
d = model(train_set_X, train_set_Y, test_set_X, test_set_Y, num_iterations = 2000, learning_rate = 0.005,reg = reg_param, print_cost = True)

0.001
Accuracy: 0.9016736401673641
Precision: 0.8672566371681416
Recall: 0.7538461538461538
F1 Score: 0.8065843621399177


In [None]:
# Enron1 Dataset Bernoulli
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron1train.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron1test.csv', header=None, names=['Label', 'SMS'])
sms_spam_clean, train_set_X, train_set_Y, test_set_X, test_set_Y = data_prep_bernoulli(sms_spam, sms_spam_test)
reg_param = cross_valdation(sms_spam_clean, test_set_X, test_set_Y)
d = model(train_set_X, train_set_Y, test_set_X, test_set_Y, num_iterations = 2000, learning_rate = 0.005,reg = reg_param, print_cost = True)

0.001
Accuracy: 0.9276315789473685
Precision: 0.9461538461538461
Recall: 0.825503355704698
F1 Score: 0.881720430107527


In [None]:
# Enron1 Dataset Bag of Words
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron1train.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron1test.csv', header=None, names=['Label', 'SMS'])
sms_spam_clean, train_set_X, train_set_Y, test_set_X, test_set_Y = data_prep_bow(sms_spam, sms_spam_test)
reg_param = cross_valdation(sms_spam_clean, test_set_X, test_set_Y)
d = model(train_set_X, train_set_Y, test_set_X, test_set_Y, num_iterations = 2000, learning_rate = 0.005,reg = reg_param, print_cost = True)

0.01
Accuracy: 0.9232456140350878
Precision: 0.9253731343283582
Recall: 0.8322147651006712
F1 Score: 0.8763250883392225


In [None]:
# Enron4 Dataset Bernoulli
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron4train.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron4test.csv', header=None, names=['Label', 'SMS'])
sms_spam_clean, train_set_X, train_set_Y, test_set_X, test_set_Y = data_prep_bernoulli(sms_spam, sms_spam_test)
reg_param = cross_valdation(sms_spam_clean, test_set_X, test_set_Y)
d = model(train_set_X, train_set_Y, test_set_X, test_set_Y, num_iterations = 2000, learning_rate = 0.005,reg = reg_param, print_cost = True)

0.3
Accuracy: 0.9337016574585635
Precision: 0.9156908665105387
Recall: 1.0
F1 Score: 0.9559902200488998


In [None]:
# Enron1 Dataset Bag of Words
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron4train.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron4test.csv', header=None, names=['Label', 'SMS'])
sms_spam_clean, train_set_X, train_set_Y, test_set_X, test_set_Y = data_prep_bow(sms_spam, sms_spam_test)
reg_param = cross_valdation(sms_spam_clean, test_set_X, test_set_Y)
d = model(train_set_X, train_set_Y, test_set_X, test_set_Y, num_iterations = 2000, learning_rate = 0.005,reg = reg_param, print_cost = True)

0.001




Accuracy: 0.9429097605893186
Precision: 0.9285714285714286
Recall: 0.9974424552429667
F1 Score: 0.9617755856966709
