# Google Colab Advice
## If you are loading the notifications csv from Google Drive, then you first need to mount your Drive to give access to your files. If you are using the csv locally, ignore this cell.

In [None]:
# Google Colab only: mount Google Drive so that files stored on it can be read.
#   Skip this cell when the csv is read from a local path instead.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# Load the notifications csv into Python

In [None]:
import pandas as pd

# Path to the notifications csv file; change this to the path of your own file
#   The r prefix makes this a raw string, so backslashes (e.g. in Windows
#   paths) are kept literally instead of being read as escape sequences
file_path = r'/content/drive/MyDrive/4th Year Project/Datasets/Daisy Notifcations.csv'
# Load the csv into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preprocessing the data
## Encode class labels
### 0 = Unimportant
### 1 = Important

In [None]:
from sklearn import preprocessing

# Build a label encoder and fit it on the 'Important' column
#   (intended to encode the T/F labels as 1/0)
enc = preprocessing.LabelEncoder()
enc.fit(df['Important'])
# Encode the labels column with the fitted encoder, and store it as 'y'
y = enc.transform(df['Important'])

# Set up the feature vectors

## Handle the Categorical Data
### Rearrange the Names column as separate binary columns

E.g.

Row  | Names
------------- | -------------
1  | Bob Bobbington. Charles Charlington
2  | Emma Emmington. Bob Bobbington
3  | Charles Charlington

 to 

Row  |Bob Bobbington  | Charles Charlington | Emma Emmington
------------- |------------- | ------------- | -------------
1  |1  | 1  | 0
2  |1  | 0  | 1
3  |0  | 1  | 0

In [None]:
# Turn every Names cell into the list of names in that notification
#   Each cell is converted to a string first, then split on ". "
names_lists = df['Names'].map(lambda cell: str(cell).split(". ")).tolist()
# Overwrite the original column with the lists
df['Names'] = names_lists

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the set of distinct names, then build one binary column per name
#   (rearranges the Names column as shown in the example above)
mlb = MultiLabelBinarizer().fit(df['Names'])
# Keep the original row index so the result lines up with df
df_names = pd.DataFrame(data=mlb.transform(df['Names']), index=df.index, columns=mlb.classes_)

### Rearrange the Channels column as separate one-hot binary columns

E.g.

Row  | Notification Channel
------------- | -------------
1  | Comments
2  | Tags
3  | Tags

 to 

Row  |Comments  | Tags 
------------- |------------- | -------------
1  |1  | 0 
2  |0  | 1
3  |0  | 1

In [None]:
# One-hot encode the 'Notification Channel' column:
#   one indicator column per distinct channel value
df_channels = pd.get_dummies(data=df['Notification Channel'])

## Combine the feature parts to produce the categorical feature vectors

In [None]:
# Build the feature matrix X by placing the Channel binary columns
#   to the right of the Names binary columns (rows are aligned on the index)
X = pd.concat(objs=[df_names, df_channels], axis="columns")

## Split the data into training and testing data

In [None]:
from sklearn.model_selection import train_test_split

def split_data(X, y, test_size=0.2, random_state=1):
  """Split the features and labels into training and testing sets.

  Parameters:
    X: feature vectors, one row per sample
    y: class labels, one per row of X
    test_size: share of the data held out for testing. Defaults to 0.2
      (80% train / 20% test) because the data is limited; pass a different
      value here instead of editing the function.
    random_state: seed for the shuffle, fixed by default so that the split
      is the same on every run

  Returns the data in the format: X_train, X_test, y_train, y_test
  """
  return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [None]:
# Training features/labels (X_tr, y_tr) and held-out test features/labels (X_te, y_te)
X_tr, X_te, y_tr, y_te = split_data(X=X, y=y)

# Model Scorer
Uses score = (2\*TPR + 3\*TNR) for a score out of 5

In [None]:
import sklearn
from sklearn.metrics import recall_score as TPR, confusion_matrix as cm, make_scorer

def rate_model(test_labels, predicted_labels):
  """Score predictions as 2*TPR + 3*TNR, for a score out of 5.

  The true negative rate is weighted higher than the true positive rate.

  Parameters:
    test_labels: the true labels, encoded as 0 / 1
    predicted_labels: the labels predicted by a model, encoded as 0 / 1

  Returns the score, between 0 and 5.
  """
  # Use sklearn's recall_score function to produce the TPR
  tpr = TPR(test_labels, predicted_labels)
  # Use sklearn's confusion matrix function to get the number of true negatives and false positives
  #  Returns data in the form: tn, fp, fn, tp
  #  The labels are given explicitly so the matrix is always 2x2, even when
  #  only one class is present (e.g. in a small cross-validation fold)
  tn, fp, _, _ = cm(test_labels, predicted_labels, labels=[0, 1]).ravel()
  negatives = tn + fp
  # With no actual negatives the TNR is undefined; count it as 0 instead of
  #  dividing by zero and returning nan
  tnr = tn / negatives if negatives else 0.0
  return (2 * tpr) + (3 * tnr)

# Wrap rate_model as a scorer object so a grid search can use it via `scoring=`
#  A higher rate_model score means a better model
scorer = make_scorer(rate_model, greater_is_better=True)

# Get Best Model
First, each classifier (except for HistGradientBoostingClassifier) undergoes a grid search to find the best parameters for that classifier. Once the best model is made for each classifier, they are compared based on the scorer above, to find the overall best model for this data.

### Trainer function for HistGradientBoostingClassifier since it does not use a grid search

In [None]:
def train_model(clf, X, y):
  """Fit the classifier on the given features and labels.

  Returns whatever clf.fit(X, y) returns.
  """
  return clf.fit(X, y)

### K-NN Grid Search

In [None]:
from sklearn.neighbors import KNeighborsClassifier as KNN

def grid_search_knn(X, y):
  """Grid search over KNN models using different numbers of neighbours.

  Candidates are compared with 3-fold cross-validation using the custom scorer.
  Prints the best parameters found and returns the best KNN model.
  """
  neighbour_options = {'n_neighbors': [3, 5, 10]}
  search = GridSearch(estimator=KNN(), param_grid=neighbour_options, scoring=scorer, cv=3, n_jobs=-1)
  search.fit(X, y)
  print("KNN Grid Search Results: ", search.best_params_)
  return search.best_estimator_

### Decision Tree Grid Search

In [None]:
from sklearn.tree import DecisionTreeClassifier as DecTree

def grid_search_dec_tree(X, y):
  """Grid search over Decision Tree models using different maximum tree depths.

  Candidates are compared with 3-fold cross-validation using the custom scorer.
  Prints the best parameters found and returns the best Dec Tree model.
  """
  depth_options = {'max_depth': [3, 4, 5, 6]}
  search = GridSearch(estimator=DecTree(), param_grid=depth_options, scoring=scorer, cv=3, n_jobs=-1)
  search.fit(X, y)
  print("Dec Tree Grid Search Results: ", search.best_params_)
  return search.best_estimator_

### SVM Grid Search


In [None]:
from sklearn.svm import SVC

def grid_search_svm(X, y):
  """Grid search over SVM models using different kernels and regularization values.

  Candidates are compared with 3-fold cross-validation using the custom
  scorer, the same settings as the other grid searches, so that the SVM is
  tuned for the same score that is later used to compare the models.
  Prints the best parameters found and returns the best SVM model.
  """
  param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.5, 1.0, 1.5, 2.0, 2.5],
  }
  grid = GridSearch(SVC(), param_grid=param_grid, cv=3, scoring=scorer, n_jobs=-1)
  grid = grid.fit(X, y)
  print("SVM Grid Search Results: ",grid.best_params_)
  return grid.best_estimator_

### Bernoulli Naive Bayes Grid Search
Using Bernoulli since the vectors are binary

In [None]:
from sklearn.naive_bayes import BernoulliNB as NB

def grid_search_naive_bayes(X, y):
  """Grid search over Bernoulli Naive Bayes models using different alpha smoothing values.

  Candidates are compared with 3-fold cross-validation using the custom scorer.
  Prints the best parameters found and returns the best NB model.
  """
  # Alpha candidates are the integers 0 to 9
  #  NOTE(review): this includes alpha=0 - confirm that candidate is intended
  smoothing_options = {'alpha': range(10)}
  search = GridSearch(estimator=NB(), param_grid=smoothing_options, scoring=scorer, cv=3, n_jobs=-1)
  search.fit(X, y)
  print("Naive Bayes Grid Search Results: ", search.best_params_)
  return search.best_estimator_

### Multilayer Perceptron Grid Search

In [None]:
from sklearn.neural_network import MLPClassifier

def grid_search_mlp(X, y):
  """Grid search over MLP models using different alpha values for varying L2 penalties.

  Candidates are compared with 3-fold cross-validation using the custom scorer.
  Prints the best parameters found and returns the best MLP model.
  """
  param_grid = {'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1]}
  # Search over MLPClassifier itself, so the returned model really is a multilayer perceptron
  grid = GridSearch(MLPClassifier(), param_grid=param_grid, cv=3, scoring=scorer, n_jobs=-1)
  grid = grid.fit(X, y)
  print("Multilayer Perceptron Grid Search Results: ",grid.best_params_)
  return grid.best_estimator_

## Train the best model for each classifier type

In [None]:
from sklearn.model_selection import GridSearchCV as GridSearch
from sklearn.ensemble import HistGradientBoostingClassifier as HGB

# Keep track of the trained models in a dictionary, keyed by classifier name
#  Each value is a list whose first element is the fitted model
#  (entries are evaluated, and so trained, from top to bottom)
models = {
  # No grid search used for this classifier
  "Hist Gradient Boosting Classifier": [train_model(HGB(categorical_features=[0,1]), X_tr, y_tr)],
  # Grid search for the best tree depth, then fit the chosen tree on the training data
  "Decision Tree": [train_model(grid_search_dec_tree(X_tr, y_tr), X_tr, y_tr)],
  # Grid search for the best number of neighbours
  "KNN": [grid_search_knn(X_tr, y_tr)],
  # Grid search for the best kernel and C values
  "SVM": [grid_search_svm(X_tr, y_tr)],
  # Grid search for the best alpha smoothing value
  "Naive Bayes": [grid_search_naive_bayes(X_tr, y_tr)],
  # Grid search for the best alpha L2 penalty value
  "Multilayer Perceptron": [grid_search_mlp(X_tr, y_tr)],
}


Dec Tree Grid Search Results:  {'max_depth': 3}
KNN Grid Search Results:  {'n_neighbors': 3}
SVM Grid Search Results:  {'C': 0.5, 'kernel': 'poly'}
Naive Bayes Grid Search Results:  {'alpha': 2}
Multilayer Perceptron Grid Search Results:  {'alpha': 1e-05}


## Compare the best of the trained models

In [None]:
# The models have already been trained by this point

def eval_models(models):
  """Score every model on the test data and return the best one.

  Parameters:
    models: dictionary of name -> list whose first element is a fitted model.
      The score of each model is appended to its list.

  Uses the test data X_te / y_te and the rate_model function.
  Returns the list of the highest scoring model; on a tie, the model that
  comes first in the dictionary is kept.
  """
  # Start with the first model as the best one
  leader = next(iter(models))
  for name, entry in models.items():
    # Test the model, then rate its predictions
    predictions = entry[0].predict(X_te)
    entry.append(rate_model(y_te, predictions))
    # Only a strictly better score replaces the current best
    if entry[1] > models[leader][1]:
      leader = name
  return models[leader]

# Entry of the highest scoring model; the fitted model itself is best_model[0]
best_model = eval_models(models=models)

## Save the Best Model
If using Google Colab, the model is only saved to the temporary storage of the current session. To use the model later, download the file and save it in the directory it will be loaded from.

In [None]:
#!pip install joblib
import joblib
def save_model(clf, filename="Spam Filter Model.joblib"):
  """Save a model to disk using joblib.

  Parameters:
    clf: the model to save
    filename: path of the file to write; defaults to "Spam Filter Model.joblib"
  """
  joblib.dump(clf, filename)

# Save only the fitted model (the first element of best_model)
save_model(clf=best_model[0])