In [1]:
import pandas as pd
pd.set_option('display.max_rows', 100)
import numpy as np
import utils

from sklearn.feature_extraction.text import CountVectorizer 
import re
from tqdm import tqdm

from sklearn.model_selection import ParameterGrid

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

Import Datasets

In [2]:
# Splits produced with the old preprocessing rules (train / validation / test).
train_old, val_old, test_old = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/labelled_data/train_oldpreproc.csv",
        "../data/labelled_data/val_oldpreproc.csv",
        "../data/labelled_data/test_oldpreproc.csv",
    )
)
# Train and validation rows stacked together; each frame keeps its own row index.
trainval_old = pd.concat([train_old, val_old])

# Splits produced with the new preprocessing rules (train / validation / test).
train_new, val_new, test_new = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/labelled_data/train_newpreproc.csv",
        "../data/labelled_data/val_newpreproc.csv",
        "../data/labelled_data/test_newpreproc.csv",
    )
)
# Train and validation rows stacked together; each frame keeps its own row index.
trainval_new = pd.concat([train_new, val_new])

In [3]:
# Preview the first rows of the old-rules training split to sanity-check the loaded columns.
train_old.head()

Unnamed: 0.1,Unnamed: 0,restaurant_code,review_title,account_name,new_aspect_1,phrase,phrase_lemma,phrase_stem,label
0,0,2lhaZp7B,03 Jan 2018 my comfort food..,Bobcatsysop YK Chan,time,sold lunch time noon,sold lunch time noon,sold lunch time noon,0
1,1,2lhaZp7B,How do people take clean #Instafood shots at a...,Lynn Kwek,food,went pig congee lo mai kai crystal dumplings t...,went pig congee lo mai kai crystal dumpling to...,went pig conge lo mai kai crystal dumpl total ...,1
2,2,2lhaZp7B,How do people take clean #Instafood shots at a...,Lynn Kwek,service,friendly owner,friendly owner,friendli owner,1
3,3,2lhaZp7B,My Saturday morning started with breakfast at ...,Maureen Ow,food,cantonese style congee,cantonese style congee,cantones style conge,0
4,4,2lhaZp7B,So glad the weekend is here and I can finally ...,Maureen Ow,food,gen shu glutinous rice,gen shu glutinous rice,gen shu glutin rice,0


# Modelling

Initialise Parameters

In [18]:
# Search space for the CountVectorizer (bag-of-words) settings.
bow_params = dict(
    analyzer=["word"],
    lowercase=[True],
    ngram_range=[(1, 1), (1, 2), (1, 3)],
    max_df=[0.25, 0.5, 1.0],
    min_df=[1, 10, 20],
)
bow_paramgrid = list(ParameterGrid(bow_params))

# Which text column to vectorise ("type") and which preprocessing rules the
# split was produced with ("processing").
type_proc_params = dict(
    type=["original", "lemma", "stem"],  # raw phrase, lemmatised, stemmed
    processing=["old", "new"],  # old vs. new phrase splitting
)
type_proc_paramgrid = list(ParameterGrid(type_proc_params))

In [5]:
# Logistic regression hyperparameter search space.
# NOTE(review): the "none" penalty string is only accepted by some scikit-learn
# releases -- confirm against the version this notebook is pinned to.
logreg_params = dict(
    C=[0.5, 1.0, 1.5],
    solver=["lbfgs", "newton-cg"],
    penalty=["l2", "none"],
    class_weight=["balanced", None],
)
logreg_paramgrid = list(ParameterGrid(logreg_params))

In [6]:
# Multinomial naive Bayes search space: only the smoothing strength is tuned.
# NOTE(review): alpha=0 requests no smoothing; check how the installed
# scikit-learn version treats that value.
nb_params = dict(alpha=[0, 0.01, 0.1, 0.25, 0.5, 1])
nb_paramgrid = list(ParameterGrid(nb_params))

In [7]:
# Support vector classifier hyperparameter search space.
svm_params = dict(
    C=[0.5, 1.0, 1.5, 10, 100],
    kernel=["poly", "rbf", "sigmoid"],
    gamma=["scale", "auto"],
    class_weight=["balanced", None],
)

svm_paramgrid = list(ParameterGrid(svm_params))

In [8]:
# Candidate tree depths: 10, 20, ..., 110, plus None (scikit-learn's
# "no explicit depth limit" setting).
depth_list = [int(x) for x in np.linspace(10, 110, num = 11)]
depth_list.append(None)

# Random forest hyperparameter search space.
rf_params = [
    {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 5, 10],
        "class_weight": ["balanced", "balanced_subsample", None],
        "bootstrap": [True, False],
        # "auto" was dropped from this list: for RandomForestClassifier it is an
        # alias of "sqrt" (so half of the grid duplicated the other half), and
        # newer scikit-learn releases reject it. "log2" keeps a second, genuinely
        # different feature-subsampling strategy in the search.
        "max_features": ["sqrt", "log2"],
        # 200, 400, ..., 2000 trees
        "n_estimators": [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
        "max_depth": depth_list,
        "min_samples_leaf": [1, 2, 4]
    }
]

rf_paramgrid = list(ParameterGrid(rf_params))

In [9]:
# Baseline classifier: a single configuration using the "prior" strategy.
dummy_params = dict(strategy=["prior"])
dummy_paramgrid = list(ParameterGrid(dummy_params))

# Grid Search 

In [15]:
def prepare_datasets(bow_param):
    """Vectorise every text variant of every split with one bag-of-words setup.

    For each text column (raw phrase, lemmatised, stemmed) and each
    preprocessing variant ("old" / "new"), a fresh ``CountVectorizer`` is
    built from ``bow_param``, fitted on the training split only, and then
    used to transform the validation, test and train+validation splits.

    Parameters
    ----------
    bow_param : dict
        Keyword arguments forwarded to ``CountVectorizer``.

    Returns
    -------
    dict
        ``result[type][processing]`` is the list
        ``[train, val, test, trainval]`` of vectorised matrices, where
        ``type`` is "original", "lemma" or "stem" and ``processing`` is
        "old" or "new".
    """
    # Text column to vectorise for each "type" key.
    column_by_type = {
        "original": "phrase",
        "lemma": "phrase_lemma",
        "stem": "phrase_stem",
    }
    # Frames per preprocessing variant, ordered train, val, test, trainval.
    frames_by_proc = {
        "old": (train_old, val_old, test_old, trainval_old),
        "new": (train_new, val_new, test_new, trainval_new),
    }

    datasets = {}
    for type_name, column in column_by_type.items():
        per_proc = {}
        for proc_name, (fit_frame, *remaining_frames) in frames_by_proc.items():
            vectorizer = CountVectorizer(**bow_param)
            # The vocabulary is learned from the training split alone.
            matrices = [vectorizer.fit_transform(fit_frame[column])]
            for frame in remaining_frames:
                matrices.append(vectorizer.transform(frame[column]))
            per_proc[proc_name] = matrices
        datasets[type_name] = per_proc
    return datasets

In [16]:
# Target labels per preprocessing variant, ordered train, val, test, trainval
# (the same order as the matrix lists returned by prepare_datasets).
labels = {
    proc_name: [frame.label for frame in frames]
    for proc_name, frames in (
        ("old", (train_old, val_old, test_old, trainval_old)),
        ("new", (train_new, val_new, test_new, trainval_new)),
    )
}

## Logistic Regression

In [None]:
# Exhaustive grid search for logistic regression: every bag-of-words setting x
# every (text type, preprocessing) pair x every model hyperparameter combination.
# Each combination is scored twice: fitted on train and evaluated on validation,
# then re-fitted on train+validation and evaluated on test.
model_name = "logreg"
model_fn = LogisticRegression
model_paramgrid = logreg_paramgrid

ind = 0  # number of hyperparameter combinations evaluated so far
gridsearch_results = []  # one record (dict) per evaluated combination
for bow_param in bow_paramgrid:
    # vectorise all text variants once per bag-of-words configuration
    datasets = prepare_datasets(bow_param)
    
    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        data_proc = type_proc_param["processing"]
        
        # extract datasets (list order: train, val, test, trainval)
        train_set = datasets[data_type][data_proc][0]
        val_set = datasets[data_type][data_proc][1]
        test_set = datasets[data_type][data_proc][2]
        trainval_set = datasets[data_type][data_proc][3]
        
        # matching labels, same order as the datasets above
        train_label = labels[data_proc][0]
        val_label = labels[data_proc][1]
        test_label = labels[data_proc][2]
        trainval_label = labels[data_proc][3]
        
        # train models
        for model_param in model_paramgrid:
            # fit on train, predict on validation
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_pred = model.predict(val_set)
            # validation metrics (weighted averages across classes)
            val_f1 = f1_score(val_label, val_pred, average="weighted")
            val_recall = recall_score(val_label, val_pred, average="weighted")
            val_precision = precision_score(val_label, val_pred, average="weighted")
            val_accuracy = accuracy_score(val_label, val_pred)
            
            # fit a fresh model on train+validation, predict on test
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_pred = model.predict(test_set)
            # test metrics (weighted averages across classes)
            test_f1 = f1_score(test_label, test_pred, average="weighted")
            test_recall = recall_score(test_label, test_pred, average="weighted")
            test_precision = precision_score(test_label, test_pred, average="weighted")
            test_accuracy = accuracy_score(test_label, test_pred)
            
            # flatten the whole configuration and its scores into a single record
            results = { "model": model_name }
            results.update(bow_param)
            results.update(type_proc_param)
            results.update(model_param)
            results.update({"val_f1": val_f1, "val_recall": val_recall, "val_precision": val_precision, 
                            "val_accuracy": val_accuracy})
            results.update({"test_f1": test_f1, "test_recall": test_recall, "test_precision": test_precision, 
                            "test_accuracy": test_accuracy})
            gridsearch_results.append(results)
            ind += 1
            # progress indicator: print the counter every 30 combinations
            if (ind % 30 == 0):
                print(ind)
            
# Rank by validation F1 (test F1 as tie-breaker), best first, and save to CSV.
# NOTE(review): presumably the model_results/bow/ directory must already exist -- confirm.
final_logreg_results = pd.DataFrame.from_records(gridsearch_results)
final_logreg_results = final_logreg_results.sort_values(by=["val_f1", "test_f1"], ascending=False)
final_logreg_results.to_csv("model_results/bow/logreg.csv", index=False)

30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930


## Naive Bayes

In [None]:
# Exhaustive grid search for multinomial naive Bayes: every bag-of-words setting x
# every (text type, preprocessing) pair x every model hyperparameter combination.
# Each combination is scored twice: fitted on train and evaluated on validation,
# then re-fitted on train+validation and evaluated on test.
model_name = "nb"
model_fn = MultinomialNB
model_paramgrid = nb_paramgrid

ind = 0  # number of hyperparameter combinations evaluated so far
gridsearch_results = []  # one record (dict) per evaluated combination
for bow_param in bow_paramgrid:
    # vectorise all text variants once per bag-of-words configuration
    datasets = prepare_datasets(bow_param)
    
    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        data_proc = type_proc_param["processing"]
        
        # extract datasets (list order: train, val, test, trainval)
        train_set = datasets[data_type][data_proc][0]
        val_set = datasets[data_type][data_proc][1]
        test_set = datasets[data_type][data_proc][2]
        trainval_set = datasets[data_type][data_proc][3]
        
        # matching labels, same order as the datasets above
        train_label = labels[data_proc][0]
        val_label = labels[data_proc][1]
        test_label = labels[data_proc][2]
        trainval_label = labels[data_proc][3]
        
        # train models
        for model_param in model_paramgrid:
            # fit on train, predict on validation
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_pred = model.predict(val_set)
            # validation metrics (weighted averages across classes)
            val_f1 = f1_score(val_label, val_pred, average="weighted")
            val_recall = recall_score(val_label, val_pred, average="weighted")
            val_precision = precision_score(val_label, val_pred, average="weighted")
            val_accuracy = accuracy_score(val_label, val_pred)
            
            # fit a fresh model on train+validation, predict on test
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_pred = model.predict(test_set)
            # test metrics (weighted averages across classes)
            test_f1 = f1_score(test_label, test_pred, average="weighted")
            test_recall = recall_score(test_label, test_pred, average="weighted")
            test_precision = precision_score(test_label, test_pred, average="weighted")
            test_accuracy = accuracy_score(test_label, test_pred)
            
            # flatten the whole configuration and its scores into a single record
            results = { "model": model_name }
            results.update(bow_param)
            results.update(type_proc_param)
            results.update(model_param)
            results.update({"val_f1": val_f1, "val_recall": val_recall, "val_precision": val_precision, 
                            "val_accuracy": val_accuracy})
            results.update({"test_f1": test_f1, "test_recall": test_recall, "test_precision": test_precision, 
                            "test_accuracy": test_accuracy})
            gridsearch_results.append(results)
            ind += 1
            # progress indicator: print the counter every 30 combinations, as the
            # logistic regression search does, instead of one line per model fit
            if (ind % 30 == 0):
                print(ind)
            
            
# Rank by validation F1 (test F1 as tie-breaker), best first, and save to CSV.
# NOTE(review): presumably the model_results/bow/ directory must already exist -- confirm.
final_nb_results = pd.DataFrame.from_records(gridsearch_results)
final_nb_results = final_nb_results.sort_values(by=["val_f1", "test_f1"], ascending=False)
final_nb_results.to_csv("model_results/bow/nb.csv", index=False)