In [None]:
from google.colab import drive
# force_remount takes a boolean; the string 'True' only worked because any
# non-empty string (including 'False') is truthy.
drive.mount('/content/drive', force_remount=True)
import pandas as pd
import math
import re
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from time import time
import scipy.stats as stats
from sklearn.utils.fixes import loguniform
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

Mounted at /content/drive


In [None]:
def datasetprep(sms_spam, sms_spam_test):
  """Clean both datasets in place and build per-word count tables for the training set.

  Args:
    sms_spam: training DataFrame with a string column 'SMS'. Modified in place:
      non-word characters are replaced by spaces and the text is lowercased.
    sms_spam_test: test DataFrame with a string column 'SMS'. Cleaned in place
      the same way; callers rely on this side effect.

  Returns:
    (bernoulli_clean, bow_clean): two DataFrames, each the cleaned training
    frame with one extra column per vocabulary word. In bernoulli_clean a cell
    is 1 if the word occurs in that message and 0 otherwise; in bow_clean it is
    the number of occurrences.
  """
  for frame in (sms_spam, sms_spam_test):
    # Raw string keeps '\W' a regex escape instead of an invalid str escape.
    cleaned = frame['SMS'].str.replace(r'\W', ' ', regex=True)
    frame['SMS'] = cleaned.str.lower()

  # Split on whitespace: iterating over the string itself would yield single
  # characters rather than words.
  tokenized = [sms.split() for sms in sms_spam['SMS']]
  # Sorted so that the column order is deterministic across runs.
  vocabulary = sorted({word for words in tokenized for word in words})
  n_messages = len(tokenized)

  bernoulli = {word: [0] * n_messages for word in vocabulary}
  bow = {word: [0] * n_messages for word in vocabulary}
  for row, words in enumerate(tokenized):
    for word in words:
      bow[word][row] += 1
      bernoulli[word][row] = 1

  # Reuse the input's index so concat lines the counts up row by row even when
  # the training frame does not carry a default RangeIndex.
  presence = pd.DataFrame(bernoulli, index=sms_spam.index)
  counts = pd.DataFrame(bow, index=sms_spam.index)
  bernoulli_clean = pd.concat([sms_spam, presence], axis=1)
  bow_clean = pd.concat([sms_spam, counts], axis=1)
  return bernoulli_clean, bow_clean

def classifier(sms_spam, sms_spam_test, model_type):
  """Vectorize the train/test messages for the requested feature model.

  Args:
    sms_spam: training DataFrame with 'SMS' and 'Label' columns.
    sms_spam_test: test DataFrame with 'SMS' and 'Label' columns.
    model_type: 'bernoulli' for binary word-presence features, or 'bow' for
      word-count features.

  Returns:
    (X_train, y_train, X_test, y_test): the vectorized feature matrices and
    the corresponding 'Label' columns.

  Raises:
    ValueError: if model_type is neither 'bernoulli' nor 'bow'.
  """
  if model_type not in ('bernoulli', 'bow'):
    raise ValueError(
        "model_type must be 'bernoulli' or 'bow', got {!r}".format(model_type))

  # datasetprep also cleans both input frames in place, which is why the test
  # messages can be read straight from sms_spam_test below.
  bernoulli, bow = datasetprep(sms_spam, sms_spam_test)
  train = bernoulli if model_type == 'bernoulli' else bow

  # binary=True clips every non-zero count to 1, giving presence/absence
  # features; without it both model types would yield identical matrices.
  vec = CountVectorizer(stop_words='english', binary=(model_type == 'bernoulli'))
  X_train = vec.fit_transform(train['SMS'])
  y_train = train['Label']
  X_test = vec.transform(sms_spam_test['SMS'])
  y_test = sms_spam_test['Label']
  return X_train, y_train, X_test, y_test

def model(alpha, eta0, learning_rate, loss, max_iter,penalty, tol, sms_spam, sms_spam_test, model_type, random_state=None):
  """Train an SGDClassifier with the given hyper-parameters and print test metrics.

  Args:
    alpha, eta0, learning_rate, loss, max_iter, penalty, tol: forwarded
      unchanged to SGDClassifier.
    sms_spam: training DataFrame with 'SMS' and 'Label' columns.
    sms_spam_test: test DataFrame with 'SMS' and 'Label' columns.
    model_type: feature model passed on to classifier ('bernoulli' or 'bow').
    random_state: optional seed forwarded to SGDClassifier. The default of
      None keeps the previous unseeded behaviour; pass an int to make the
      printed metrics repeatable.

  Prints accuracy, precision, recall and F1 on the test split, treating
  'spam' as the positive label. Returns None.
  """
  X_train, y_train, X_test, y_test = classifier(sms_spam, sms_spam_test, model_type)
  sgd = SGDClassifier(
      alpha=alpha,
      eta0=eta0,
      learning_rate=learning_rate,
      loss=loss,
      max_iter=max_iter,
      penalty=penalty,
      tol=tol,
      random_state=random_state,
  )
  sgd.fit(X_train, y_train)
  prediction = sgd.predict(X_test)
  print('Accuracy:', accuracy_score(y_test, prediction))
  print('Precision:', precision_score(y_test, prediction, pos_label='spam'))
  print('Recall:', recall_score(y_test, prediction, pos_label='spam'))
  print('F1 score:', f1_score(y_test, prediction, pos_label='spam'))

def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Args:
        results: mapping with the keys "rank_test_score", "mean_test_score",
            "std_test_score" and "params", indexed by candidate position.
        n_top: highest rank to print; ranks 1..n_top are listed, and every
            candidate tied on a rank is printed.
    """
    ranks = results["rank_test_score"]
    score_line = "Mean validation score: {0:.3f} (std: {1:.3f})"
    for rank in range(1, n_top + 1):
        # All candidate positions that share this rank.
        for position in np.flatnonzero(ranks == rank):
            mean_score = results["mean_test_score"][position]
            std_score = results["std_test_score"][position]
            print("Model with rank: {0}".format(rank))
            print(score_line.format(mean_score, std_score))
            print("Parameters: {0}".format(results["params"][position]))
            print("")


def parameter_tuning(sms_spam, sms_spam_test, model_type):
  """Grid-search SGDClassifier hyper-parameters and print the top candidates.

  Args:
    sms_spam: training DataFrame with 'SMS' and 'Label' columns.
    sms_spam_test: test DataFrame with 'SMS' and 'Label' columns; it is only
      passed through to classifier, the search itself cross-validates on the
      training split.
    model_type: feature model passed on to classifier ('bernoulli' or 'bow').

  Prints the elapsed time, the number of candidate settings and the
  best-ranked candidates. Returns None.
  """
  X_train, y_train, _, _ = classifier(sms_spam, sms_spam_test, model_type)
  param_grid = {
      'alpha': [0.01, 0.02, 0.03, 0.04, 0.05],
      'max_iter': list(range(500, 3000, 1000)),
      'learning_rate': ['optimal', 'invscaling', 'adaptive'],
      'eta0': [0.3, 0.7],
      'tol': [0.001, 0.005],
      'penalty': ['l1', 'l2'],
      # 'ridge' is not a loss SGDClassifier accepts; it made one third of
      # all fits fail during parameter validation, so it is left out.
      # NOTE(review): newer scikit-learn releases spell 'log' as 'log_loss' -
      # confirm against the installed version.
      'loss': ['log', 'hinge'],
  }
  grid_search = GridSearchCV(SGDClassifier(), param_grid=param_grid)
  start = time()
  grid_search.fit(X_train, y_train)
  print(
      "GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_["params"]))
  )

  report(grid_search.cv_results_)

In [None]:
# Bernoulli model on the Enron1 dataset.
# The CSV files are read with header=None, so the two columns are named
# 'Label' and 'SMS' explicitly.
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron1train.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron1test.csv', header=None, names=['Label', 'SMS'])

# Before tuning:
#   Accuracy = 94.298
# After parameter tuning, the call below uses these parameters:
#   {'alpha': 0.01, 'eta0': 0.3, 'learning_rate': 'optimal', 'loss': 'log', 'max_iter': 500, 'penalty': 'l2', 'tol': 0.005}
model(alpha = 0.01, eta0 = 0.3, learning_rate = 'optimal', loss="log", max_iter = 500, penalty = 'l2', tol = 0.005,
      sms_spam = sms_spam, sms_spam_test = sms_spam_test, model_type = 'bernoulli')

Accuracy: 0.9539473684210527
Precision: 0.9
Recall: 0.9664429530201343
F1 score: 0.9320388349514563


In [None]:
# Bag-of-words model on the Enron1 dataset.
# The CSV files are read with header=None, so the two columns are named
# 'Label' and 'SMS' explicitly.
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron1train.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron1test.csv', header=None, names=['Label', 'SMS'])

# Before tuning:
#   Accuracy = 93.640
# After parameter tuning, the call below uses these parameters:
#   {'alpha': 0.03, 'eta0': 0.7, 'learning_rate': 'optimal', 'loss': 'hinge', 'max_iter': 1500, 'penalty': 'l2', 'tol': 0.001}
model(alpha = 0.03, eta0 = 0.7, learning_rate = 'optimal', loss="hinge", max_iter = 1500, penalty = 'l2', tol = 0.001,
      sms_spam = sms_spam, sms_spam_test = sms_spam_test, model_type = 'bow')

Accuracy: 0.9495614035087719
Precision: 0.8841463414634146
Recall: 0.9731543624161074
F1 score: 0.926517571884984


In [None]:
# Bernoulli model on the Enron4 dataset.
# The CSV files are read with header=None, so the two columns are named
# 'Label' and 'SMS' explicitly.
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron4train.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron4test.csv', header=None, names=['Label', 'SMS'])

# Before tuning:
#   Accuracy = 93.186
# After parameter tuning, the call below uses these parameters:
#   {'alpha': 0.01, 'eta0': 0.3, 'learning_rate': 'adaptive', 'loss': 'hinge', 'max_iter': 1500, 'penalty': 'l2', 'tol': 0.005}
# model_type is 'bernoulli' here: this is the Bernoulli run, and it previously
# passed 'bow' by mistake.
model(alpha = 0.01, eta0 = 0.3, learning_rate = 'adaptive', loss="hinge", max_iter = 1500, penalty = 'l2', tol = 0.005,
      sms_spam = sms_spam, sms_spam_test = sms_spam_test, model_type = 'bernoulli')

Accuracy: 0.9576427255985267
Precision: 0.9444444444444444
Recall: 1.0
F1 score: 0.9714285714285714


In [None]:
# Bag-of-words model on the Enron4 dataset.
# The CSV files are read with header=None, so the two columns are named
# 'Label' and 'SMS' explicitly.
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron4train.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/enron4test.csv', header=None, names=['Label', 'SMS'])

# Before tuning:
#   Accuracy = 94.475
# After parameter tuning, the call below uses these parameters:
#   {'alpha': 0.02, 'eta0': 0.7, 'learning_rate': 'adaptive', 'loss': 'hinge', 'max_iter': 1500, 'penalty': 'l2', 'tol': 0.001}
model(alpha = 0.02, eta0 = 0.7, learning_rate = 'adaptive', loss="hinge", max_iter = 1500, penalty = 'l2', tol = 0.001,
      sms_spam = sms_spam, sms_spam_test = sms_spam_test, model_type = 'bow')

Accuracy: 0.9558011049723757
Precision: 0.9421686746987952
Recall: 1.0
F1 score: 0.9702233250620347


In [None]:
# Bernoulli model on the HW dataset.
# The CSV files are read with header=None, so the two columns are named
# 'Label' and 'SMS' explicitly.
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/hwtrain.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/hwtest.csv', header=None, names=['Label', 'SMS'])

# Before tuning:
#   Accuracy = 93.933
# After parameter tuning, the call below uses these parameters:
#   {'alpha': 0.01, 'eta0': 0.3, 'learning_rate': 'adaptive', 'loss': 'hinge', 'max_iter': 500, 'penalty': 'l2', 'tol': 0.001}
model(alpha = 0.01, eta0 = 0.3, learning_rate = 'adaptive', loss="hinge", max_iter = 500, penalty = 'l2', tol = 0.001,
      sms_spam = sms_spam, sms_spam_test = sms_spam_test, model_type = 'bernoulli')

Accuracy: 0.9497907949790795
Precision: 0.9015151515151515
Recall: 0.9153846153846154
F1 score: 0.9083969465648855


In [None]:
# Bag-of-words model on the HW dataset.
# The CSV files are read with header=None, so the two columns are named
# 'Label' and 'SMS' explicitly.
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/hwtrain.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/hwtest.csv', header=None, names=['Label', 'SMS'])

# Before tuning:
#   Accuracy = 93.514
# After parameter tuning, the call below uses these parameters:
#   {'alpha': 0.01, 'eta0': 0.3, 'learning_rate': 'adaptive', 'loss': 'hinge', 'max_iter': 2500, 'penalty': 'l2', 'tol': 0.005}
model(alpha = 0.01, eta0 = 0.3, learning_rate = 'adaptive', loss="hinge", max_iter = 2500, penalty = 'l2', tol = 0.005,
      sms_spam = sms_spam, sms_spam_test = sms_spam_test, model_type = 'bow')

Accuracy: 0.9497907949790795
Precision: 0.9076923076923077
Recall: 0.9076923076923077
F1 score: 0.9076923076923076


In [None]:
# Parameter tuning: run the grid search for one dataset / feature model.
# Load the HW train and test CSVs; they are read with header=None, so the two
# columns are named 'Label' and 'SMS' explicitly.
sms_spam = pd.read_csv('/content/drive/MyDrive/Datasets/ML/hwtrain.csv', header=None, names=['Label', 'SMS'])
sms_spam_test = pd.read_csv('/content/drive/MyDrive/Datasets/ML/hwtest.csv', header=None, names=['Label', 'SMS'])
model_type = 'bernoulli'  # Can be 'bernoulli' or 'bow'
parameter_tuning(sms_spam, sms_spam_test, model_type)

GridSearchCV took 34.59 seconds for 1080 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.950 (std: 0.021)
Parameters: {'alpha': 0.01, 'eta0': 0.3, 'learning_rate': 'adaptive', 'loss': 'hinge', 'max_iter': 2500, 'penalty': 'l2', 'tol': 0.005}

Model with rank: 2
Mean validation score: 0.948 (std: 0.017)
Parameters: {'alpha': 0.01, 'eta0': 0.3, 'learning_rate': 'adaptive', 'loss': 'hinge', 'max_iter': 500, 'penalty': 'l2', 'tol': 0.001}

Model with rank: 3
Mean validation score: 0.948 (std: 0.014)
Parameters: {'alpha': 0.01, 'eta0': 0.3, 'learning_rate': 'optimal', 'loss': 'hinge', 'max_iter': 1500, 'penalty': 'l2', 'tol': 0.001}

Model with rank: 3
Mean validation score: 0.948 (std: 0.016)
Parameters: {'alpha': 0.01, 'eta0': 0.3, 'learning_rate': 'adaptive', 'loss': 'hinge', 'max_iter': 2500, 'penalty': 'l2', 'tol': 0.001}

Model with rank: 3
Mean validation score: 0.948 (std: 0.011)
Parameters: {'alpha': 0.03, 'eta0': 0.7, 'learning_rate': 'adaptive', 'loss': 

1800 fits failed out of a total of 5400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1800 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_stochastic_gradient.py", line 892, in fit
    sample_weight=sample_weight,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_stochastic_gradient.py", line 649, in _fit
    self._validate_params()
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_stochastic_gradient.py", line 162, in _validate_params
    ra