In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
import os
import sys
import matplotlib.image as img
import scipy.io
import pickle
from sklearn.metrics import pairwise_distances, classification_report, confusion_matrix, roc_auc_score
import time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RepeatedStratifiedKFold

%matplotlib inline

If the following code doesn't run, then do 'pip install ipynb' in the command line. This code lets us import functions from notebooks in the lib folder. Lib is supposed to have all the model training/predicting functions and the doc folder is only supposed to have report/presentation files like main.ipynb.

In [2]:
import ipynb
sys.path.append('../lib/')

If the following code doesn't run, then do 'pip install imblearn' in the command line. This code lets us do SMOTE (synthetic minority oversampling technique) and random undersampling to help deal with the imbalanced data.

In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

### Step 0: set work directories

In [4]:
np.random.seed(2020)

Provide directories for training images. Training images and Training fiducial points will be in different subfolders.

In [5]:
#Change train_dir to your own path

#root=sys.path[0]
#train_dir = os.path.join(root,"../data/train_set/")
#train_dir = os.path.join(root,"../data/train_set/")
train_dir = '/Users/rohan/Desktop/train_set/'
train_image_dir = train_dir+"images/"
train_pt_dir = train_dir+"points/"
train_label_path = train_dir+"label.csv"

### Step 1: set up controls for evaluation experiments.

In this chunk, we have a set of controls for the evaluation experiments. 

+ (T/F) cross-validation on the training set
+ (T/F) reweighting the samples for training set 
+ (number) K, the number of CV folds
+ (T/F) process features for training set
+ (T/F) run evaluation on an independent test set
+ (T/F) process features for test set

In [6]:
run_feature_train = True # process features for training set
run_test = True # run evaluation on an independent test set
run_feature_test = True # process features for test set
run_feature_train_initial = True
run_feature_test_initial = True

Set these to be False if you don't want to go through training certain models when running the notebook.

In [7]:
run_baseline = True
run_baseline_improved = True
run_baseline_pca = True
run_knn = True
run_knn_smote = True
run_xgboost=True
run_xgboost_smote=True
feature_initial=True
run_randonforest=True
run_LDA=True
run_logistic=True
run_svm = True
run_bagging_smote = True
run_naivebayes = True

### Step 2: import data and train-test split 

In [8]:
info = pd.read_csv(train_label_path)
n = info.shape[0]

#Data is imbalanced 
print('Number of records with label 0 (basic emotion):   {:4d} '.format(info.loc[info['label']==0].shape[0]))
print('Number of records with label 1 (complex emotion): {:2d} '.format(info.loc[info['label']==1].shape[0]))

Number of records with label 0 (basic emotion):   2402 
Number of records with label 1 (complex emotion): 598 


In [9]:
#we could use sklearn train_test_split here instead of doing it
#manually like in the starter code
n_train = int(round(n*(4/5),0))
train_idx = np.random.choice(list(info.index),size=n_train,replace=False)
test_idx = list(set(list(info.index))-set(train_idx)) #set difference

Reading images (we never use the image list later since we're just using the fiducial points for the features)

In [10]:
n_files = len(os.listdir(train_image_dir))

# image_list = []
# for i in range(1,101): # 1 to 100
#     image = img.imread(train_image_dir+'{:04d}'.format(i)+'.jpg')
#     image_list.append(image)

Fiducial points are stored in matlab format. In this step, we read them and store them in a list.

In [11]:
#function to read fiducial points
#input: index
#output: matrix of fiducial points corresponding to the index

def readMat_matrix(index):
    try:
        mat_data = scipy.io.loadmat(train_pt_dir+'{:04d}'.format(index)+'.mat')['faceCoordinatesUnwarped']
    except KeyError:
        mat_data = scipy.io.loadmat(train_pt_dir+'{:04d}'.format(index)+'.mat')['faceCoordinates2']
    return np.matrix.round(mat_data,0)

#load fiducial points
#pickle is the closest equivalent to .RData that I could find in Python
fiducial_pt_list = list(map(readMat_matrix,list(range(1,n_files+1))))
pickle.dump(fiducial_pt_list, open( "../output/fiducial_pt_list.p", "wb" ) )

### Step 3: construct features and responses
`feature.R` should be the wrapper for all your feature engineering functions and options. The function `feature( )` should have options that correspond to different scenarios for your project and produces an R object that contains features and responses that are required by all the models you are going to evaluate later. 
  
  + `feature.R`
  + Input: list of images or fiducial point
  + Output: an RData file that contains extracted features and corresponding responses

Use feature_initial.ipynb's feature_initial function to generate pairwise distance features for the baseline model. This is the same feature extraction method as that of the starter code. Note that this method double counts distances between points i.e. there are separate entries for the distance from point A to point B and the distance from point B to point A even though the distances are the same.

Feature extraction times exclude the time it takes to write to a csv file.

In [12]:
from ipynb.fs.full.feature_initial import feature_initial

tm_feature_train = np.nan
if run_feature_train_initial == True:
    start = time.time()
    dat_train_initial = feature_initial(fiducial_pt_list, train_idx, info)
    end = time.time()
    tm_feature_train = end-start
    dat_train_initial.to_csv("../output/train_data_initial.csv",index=False)
    print('Initial feature extraction time for train: {:4f}'.format(tm_feature_train))
else:
    dat_train_initial = pd.read_csv("../output/train_data_initial.csv")
        
        
tm_feature_test = np.nan
if run_feature_test_initial == True:
    start = time.time()
    dat_test_initial = feature_initial(fiducial_pt_list, test_idx, info)
    end = time.time()
    tm_feature_test = end-start
    dat_test_initial.to_csv("../output/test_data_initial.csv",index=False)
    print('Initial feature extraction time for test:  {:4f}'.format(tm_feature_test))
else:
    dat_test_initial = pd.read_csv("../output/test_data_initial.csv")  

Initial feature extraction time for train: 5.407898
Initial feature extraction time for test:  1.262230


Use feature_improved.ipynb's feature_improved function to generate pairwise distance features to be used by all of the models other than the baseline. Unlike feature_initial, feature_improved does not double count distances. Hence, feature_improved produces exactly half as many features as feature_initial while keeping the same information.

In [13]:
from ipynb.fs.full.feature_improved import feature_improved

tm_feature_train = np.nan
if run_feature_train == True:
    start = time.time()
    dat_train = feature_improved(fiducial_pt_list, train_idx, info)
    end = time.time()
    tm_feature_train = end-start
    dat_train.to_csv("../output/train_data.csv",index=False)
    print('Improved feature extraction time for train: {:4f}'.format(tm_feature_train))
else:
    dat_train = pd.read_csv("../output/train_data.csv")


tm_feature_test = np.nan
if run_feature_test == True:
    start = time.time()
    dat_test = feature_improved(fiducial_pt_list, test_idx, info)
    end = time.time()
    tm_feature_test = end-start
    dat_test.to_csv("../output/test_data.csv",index=False)
    print('Improved feature extraction time for test:  {:4f}'.format(tm_feature_test))
else:
    dat_test = pd.read_csv("../output/test_data.csv")

Improved feature extraction time for train: 0.182291
Improved feature extraction time for test:  0.026276


In [14]:
#Get the traning/test features and labels

feature_train_initial = dat_train_initial.loc[:, dat_train_initial.columns != 'labels']
label_train_initial = dat_train_initial['labels'] 

feature_test_initial = dat_test_initial.loc[:, dat_test_initial.columns != 'labels']
label_test_initial = dat_test_initial['labels'] 

feature_train = dat_train.loc[:, dat_train.columns != 'labels']
label_train = dat_train['labels'] #same values as label_train_initial

feature_test = dat_test.loc[:, dat_test.columns != 'labels']
label_test = dat_test['labels'] #same values as label_test_initial

In [15]:
#PCA (made some changes)
scaler = MinMaxScaler()
feature_train_scaled = scaler.fit_transform(feature_train)
feature_test_scaled = scaler.fit_transform(feature_test)

#pick the number of components that captures 95% of the variance
pca = PCA(n_components = 0.95, svd_solver='full').fit(feature_train_scaled)
feature_train_PCA = pca.transform(feature_train_scaled)
feature_test_PCA = pca.transform(feature_test_scaled)

#print how many components after pca
pca.n_components_

24

### Step 4: Train a classification model with training features and responses
Call the train model and test model from library. 

`train.R` and `test.R` should be wrappers for all your model training steps and your classification/prediction steps. 

+ `train.R`
  + Input: a data frame containing features and labels and a parameter list.
  + Output:a trained model
+ `test.R`
  + Input: the fitted classification model using training data and processed features from testing images 
  + Input: an R object that contains a trained classifier.
  + Output: training model specification

+ In this Starter Code, we use logistic regression with LASSO penalty to do classification. 

* Model selection with cross-validation
* Do model selection by choosing among different values of training model parameters.

### Baseline Model

In [19]:
if run_baseline == True:
    
    #grid search for optimal parameters
    #params = {'learning_rate':[0.01,0.05,0.1,0.5], 'max_depth': [1,2,3], 'n_estimators':[50,100,150]}
    #gscv = GridSearchCV(GradientBoostingClassifier(),params,cv=3,scoring='roc_auc').fit(feature_train_initial,label_train_initial)
    #gscv.best_params_
    # output: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150}
    
    start = time.time()
    baseline = GradientBoostingClassifier(learning_rate=0.1,max_depth=3,n_estimators=150)
    baseline.fit(feature_train_initial,label_train_initial)
    end = time.time()
    print('Training time: {:4f} seconds'.format(end-start))
    
    start = time.time()
    test_preds = baseline.predict(feature_test_initial)
    end = time.time()
    print('Prediction time: {:4f} seconds'.format(end-start))
    
    classification_error = np.mean(np.array(test_preds) != np.array(label_test_initial))
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test_initial,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test_initial,test_preds))
    
    test_probs = baseline.predict_proba(feature_test_initial)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))
    
    #save baseline model
    pickle.dump(baseline,open("../output/baseline.p", "wb"))
    
    #load baseline model
    #baseline = pickle.load(open("../output/baseline.p", "rb"))

Training time: 178.450470 seconds
Prediction time: 0.044135 seconds

Classification Error: 0.190000
Accuracy: 0.810000
Classification Report:

              precision    recall  f1-score   support

           0       0.82      0.97      0.89       473
           1       0.66      0.21      0.32       127

    accuracy                           0.81       600
   macro avg       0.74      0.59      0.61       600
weighted avg       0.79      0.81      0.77       600

Confusion Matrix:

[[459  14]
 [100  27]]

AUC: 0.786336


### Baseline Model with Improved Features

In [20]:
if run_baseline_improved == True:
    
    #grid search for optimal parameters
    #params = {'learning_rate':[0.01,0.05,0.1,0.5], 'max_depth': [1,2,3], 'n_estimators':[50,100,150]}
    #gscv = GridSearchCV(GradientBoostingClassifier(),params,cv=3,scoring='roc_auc').fit(feature_train,label_train)
    #gscv.best_params_
    # output: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150}
    
    start = time.time()
    baseline = GradientBoostingClassifier(learning_rate=0.1,max_depth=3,n_estimators=150)
    baseline.fit(feature_train,label_train)
    end = time.time()
    print('Training time: {:4f} seconds'.format(end-start))
    
    start = time.time()
    test_preds = baseline.predict(feature_test)
    end = time.time()
    print('Prediction time: {:4f} seconds'.format(end-start))
    
    classification_error = np.mean(np.array(test_preds) != np.array(label_test))
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = baseline.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 177.320505 seconds
Prediction time: 0.023891 seconds

Classification Error: 0.185000
Accuracy: 0.815000
Classification Report:

              precision    recall  f1-score   support

           0       0.83      0.97      0.89       473
           1       0.67      0.24      0.36       127

    accuracy                           0.81       600
   macro avg       0.75      0.61      0.63       600
weighted avg       0.79      0.81      0.78       600

Confusion Matrix:

[[458  15]
 [ 96  31]]

AUC: 0.799421


### Baseline Model With PCA

In [21]:
if run_baseline_pca == True:
    
    #params = {'learning_rate':[0.01,0.05,0.1,0.5], 'max_depth': [1,2,3], 'n_estimators':[50,100,150]}
    #gscv = GridSearchCV(GradientBoostingClassifier(),params,cv=3).fit(feature_train_PCA,label_train)
    #gscv.best_params_
    #output: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}

    #Baseline model with PCA
    #need to do grid search to get optimal parameters though
    start = time.time()
    gbm_pca=GradientBoostingClassifier(learning_rate=0.1,max_depth=2,n_estimators=150)
    gbm_pca.fit(feature_train_PCA,label_train)
    end = time.time()
    print('Training time: {:4f} seconds'.format(end-start))
    
    start = time.time()
    test_preds = gbm_pca.predict(feature_test_PCA)
    end = time.time()
    print('Prediction time: {:4f} seconds'.format(end-start))
    
    classification_error = np.mean(np.array(test_preds) != np.array(label_test))
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = gbm_pca.predict_proba(feature_test_PCA)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 1.185983 seconds
Prediction time: 0.001168 seconds

Classification Error: 0.210000
Accuracy: 0.790000

Classification Report:

              precision    recall  f1-score   support

           0       0.80      0.98      0.88       473
           1       0.52      0.09      0.16       127

    accuracy                           0.79       600
   macro avg       0.66      0.54      0.52       600
weighted avg       0.74      0.79      0.73       600

Confusion Matrix:

[[462  11]
 [115  12]]

AUC: 0.691598


### KNN Model

In [22]:
if run_knn == True:
    
    #params = {'n_neighbors':list(range(5,55,5))}
    #gscv = GridSearchCV(KNeighborsClassifier(),params,cv=5).fit(feature_train,label_train)
    #gscv.best_params_
    #output: {'n_neighbors': 25}
    
    start = time.time()
    #need to cross validate to pick best value (after finalizing features)
    knn = KNeighborsClassifier(n_neighbors = 25) 
    knn.fit(feature_train,label_train)
    end = time.time()
    print('Training time: {:4f} seconds'.format(end-start))
    
    start = time.time()
    test_preds = knn.predict(feature_test)
    end = time.time()
    print('Prediction time: {:4f} seconds'.format(end-start))
    
    classification_error = np.mean(np.array(test_preds) != np.array(label_test)) #Classification Error
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = knn.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 0.377309 seconds
Prediction time: 5.328446 seconds

Classification Error: 0.208333
Accuracy: 0.791667

Classification Report:

              precision    recall  f1-score   support

           0       0.79      0.99      0.88       473
           1       0.62      0.04      0.07       127

    accuracy                           0.79       600
   macro avg       0.71      0.52      0.48       600
weighted avg       0.76      0.79      0.71       600

Confusion Matrix:

[[470   3]
 [122   5]]

AUC: 0.674527


### KNN With SMOTE and Undersampling

In [23]:
if run_knn_smote == True:
    
    over = SMOTE(sampling_strategy='auto')
    under = RandomUnderSampler(sampling_strategy='auto')
    sm = Pipeline(steps = [('o', over), ('u', under)])
    feature_train_sm, label_train_sm = sm.fit_resample(feature_train,label_train)
    
    #params = {'n_neighbors':list(range(5,55,5))}
    #gscv = GridSearchCV(KNeighborsClassifier(),params,cv=5).fit(feature_train_sm,label_train_sm)
    #gscv.best_params_
    #output: {'n_neighbors': 5}
    
    start = time.time()
    #need to cross validate to pick best value (after finalizing features)
    knn = KNeighborsClassifier(n_neighbors = 5)
    knn.fit(feature_train_sm,label_train_sm)
    end = time.time()
    print('Training time: {:4f} seconds'.format(end-start))
    
    start = time.time()
    test_preds = knn.predict(feature_test)
    end = time.time()
    print('Prediction time: {:4f} seconds'.format(end-start))
    
    classification_error = np.mean(np.array(test_preds) != np.array(label_test)) #Classification Error
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = knn.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 0.717526 seconds
Prediction time: 7.323308 seconds

Classification Error: 0.408333
Accuracy: 0.591667

Classification Report:

              precision    recall  f1-score   support

           0       0.86      0.57      0.69       473
           1       0.29      0.66      0.41       127

    accuracy                           0.59       600
   macro avg       0.58      0.62      0.55       600
weighted avg       0.74      0.59      0.63       600

Confusion Matrix:

[[271 202]
 [ 43  84]]

AUC: 0.682293


### XGBoost Model

In [24]:
if run_xgboost == True:
    
    # train_labels_xgb = [ x  - 1 for x in label_train ] 
    # test_labels_xgb = [ x  - 1 for x in label_test ] 
    start_time=time.time()

    xgb = XGBClassifier(
     learning_rate =0.1,
     n_estimators= 200,
     max_depth=3,
     min_child_weight=1,
     objective= 'binary:logistic',
     scale_pos_weight=4
    )
    
    xgb.fit(feature_train, label_train ,eval_metric='auc')
    print('Training time: {:4f} seconds'.format(time.time()-start_time))

    start_time = time.time()
    test_preds = xgb.predict(feature_test)
    print('Prediction time: {:4f} seconds'.format(time.time()-start_time))

    
    classification_error = np.mean(np.array(test_preds) != np.array(label_test))
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = xgb.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 83.955818 seconds
Prediction time: 0.069285 seconds

Classification Error: 0.186667
Accuracy: 0.813333

Classification Report:

              precision    recall  f1-score   support

           0       0.86      0.90      0.88       473
           1       0.57      0.47      0.52       127

    accuracy                           0.81       600
   macro avg       0.72      0.69      0.70       600
weighted avg       0.80      0.81      0.81       600

Confusion Matrix:

[[428  45]
 [ 67  60]]

AUC: 0.808726


### XGBoost Model With SMOTE

In [25]:
#params = {'learning_rate':[0.1,0.25,0.5],'n_estimators':[100,200,300]}
#cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
#gscv = GridSearchCV(XGBClassifier(min_child_weight=1,max_depth=3,objective= 'binary:logistic',scale_pos_weight=4),params,cv=cv,scoring='roc_auc',verbose=True).fit(feature_train_sm,label_train_sm)
#gscv.best_params_
#output: {'learning_rate': 0.25, 'n_estimators': 300}

In [26]:
# train_labels_xgb = [ x  - 1 for x in label_train ] 
# test_labels_xgb = [ x  - 1 for x in label_test ] 

if run_xgboost_smote == True:
    #sm = SMOTE(sampling_strategy='auto',k_neighbors=20,random_state=42)
    #feature_train_sm, label_train_sm = sm.fit_resample(feature_train,label_train)

    over = SMOTE(sampling_strategy='auto')
    under = RandomUnderSampler(sampling_strategy='auto')
    sm = Pipeline(steps = [('o', over), ('u', under)])
    feature_train_sm, label_train_sm = sm.fit_resample(feature_train,label_train)
    
    start_time=time.time()

    xgb = XGBClassifier(
     learning_rate =0.25,
     n_estimators= 300,
     max_depth=3,
     min_child_weight=1,
     objective= 'binary:logistic',
     scale_pos_weight=4
    )

    xgb.fit(feature_train_sm, label_train_sm ,eval_metric='auc')
    print('Training time: {:4f} seconds'.format(time.time()-start_time))

    start_time = time.time()
    test_preds = xgb.predict(feature_test)
    print('Prediction time: {:4f} seconds'.format(time.time()-start_time))


    classification_error = np.mean(np.array(test_preds) != np.array(label_test))

    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))

    test_probs = xgb.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 197.353105 seconds
Prediction time: 0.076977 seconds

Classification Error: 0.173333
Accuracy: 0.826667

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.90      0.89       473
           1       0.60      0.54      0.57       127

    accuracy                           0.83       600
   macro avg       0.74      0.72      0.73       600
weighted avg       0.82      0.83      0.82       600

Confusion Matrix:

[[427  46]
 [ 58  69]]

AUC: 0.810441


### RandonForest Model

In [27]:
if run_randonforest==True:
    start_time = time.time()
    rf = RandomForestClassifier(n_estimators = 100, criterion = 'gini', min_samples_leaf=1, max_features='sqrt')
    rf_model = rf.fit(feature_train, label_train)
    print('Training time: {:4f} seconds'.format(time.time()-start_time))
    
    start_time = time.time()
    test_preds = rf_model.predict(feature_test)
    print('Prediction time: {:4f} seconds'.format(time.time()-start_time))
    
    classification_error = np.mean(np.array(test_preds) != np.array(label_test))
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = rf_model.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 6.330855 seconds
Prediction time: 0.030510 seconds

Classification Error: 0.190000
Accuracy: 0.810000

Classification Report:

              precision    recall  f1-score   support

           0       0.81      0.99      0.89       473
           1       0.78      0.14      0.24       127

    accuracy                           0.81       600
   macro avg       0.80      0.57      0.57       600
weighted avg       0.81      0.81      0.75       600

Confusion Matrix:

[[468   5]
 [109  18]]

AUC: 0.760483


### LDA Model

In [28]:
if run_LDA==True:    
    start_time = time.time()
    lda = LDA(solver='eigen', shrinkage=.1, n_components=1)
    lda_model = lda.fit(feature_train,label_train)
    print('Training time: {:4f} seconds'.format(time.time()-start_time))
    start_time = time.time()
    test_preds=lda_model.predict(feature_test)
    print('Prediction time: {:4f} seconds'.format(time.time()-start_time))
    
    classification_error = np.mean(np.array(test_preds) != np.array(label_test))
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = lda_model.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 6.302199 seconds
Prediction time: 0.021439 seconds

Classification Error: 0.178333
Accuracy: 0.821667

Classification Report:

              precision    recall  f1-score   support

           0       0.84      0.96      0.89       473
           1       0.67      0.31      0.43       127

    accuracy                           0.82       600
   macro avg       0.75      0.64      0.66       600
weighted avg       0.80      0.82      0.80       600

Confusion Matrix:

[[453  20]
 [ 87  40]]

AUC: 0.786203


### Logistic Model

In [29]:
if run_logistic==True:    
    start_time = time.time() 
    lr = LogisticRegression(C=0.01, dual=False, fit_intercept=True,
                   intercept_scaling=1, max_iter=1200000,
                   multi_class='multinomial', penalty='l2',
                   solver='lbfgs', tol=0.0001)
    lr_model = lr.fit(feature_train,label_train)
    print('Training time: {:4f} seconds'.format(time.time()-start_time))
    start_time = time.time()
    test_preds = lr_model.predict(feature_test)
    print('Prediction time: {:4f} seconds'.format(time.time()-start_time))
    classification_error = np.mean(np.array(test_preds) != np.array(label_test))
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = lr_model.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 106.160704 seconds
Prediction time: 0.023070 seconds

Classification Error: 0.178333
Accuracy: 0.821667

Classification Report:

              precision    recall  f1-score   support

           0       0.87      0.91      0.89       473
           1       0.60      0.49      0.54       127

    accuracy                           0.82       600
   macro avg       0.73      0.70      0.71       600
weighted avg       0.81      0.82      0.81       600

Confusion Matrix:

[[431  42]
 [ 65  62]]

AUC: 0.822310


### Weighted Logistic Model

In [30]:
#params = {'C':[0.1,0.25,0.5,1]}
#cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
#gscv = GridSearchCV(LogisticRegression(dual=False, fit_intercept=True,
#                   intercept_scaling=1, max_iter=1200000,
#                   multi_class='multinomial', penalty='l2',
#                   solver='lbfgs', tol=0.1),params,cv=cv,scoring='roc_auc',verbose=True).fit(feature_train,label_train)
#gscv.best_params_

In [31]:
if run_logistic==True:    
    
    weights = {0:80.0, 1:20.0}
    start_time = time.time() 
    lr = LogisticRegression(C=0.1, dual=False, fit_intercept=True,
                   intercept_scaling=1, max_iter=12000000,
                   multi_class='ovr', penalty='l2',
                   solver='lbfgs', tol=0.1,class_weight=weights)
    lr_model = lr.fit(feature_train_sm,label_train_sm)
    print('Training time: {:4f} seconds'.format(time.time()-start_time))
    start_time = time.time()
    test_preds = lr_model.predict(feature_test)
    print('Prediction time: {:4f} seconds'.format(time.time()-start_time))
    classification_error = np.mean(np.array(test_preds) != np.array(label_test))
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = lr_model.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 190.796331 seconds
Prediction time: 0.030899 seconds

Classification Error: 0.205000
Accuracy: 0.795000

Classification Report:

              precision    recall  f1-score   support

           0       0.87      0.86      0.87       473
           1       0.52      0.54      0.53       127

    accuracy                           0.80       600
   macro avg       0.69      0.70      0.70       600
weighted avg       0.80      0.80      0.80       600

Confusion Matrix:

[[409  64]
 [ 59  68]]

AUC: 0.812106


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### SVM

In [37]:
if run_svm==True:    
    start_time = time.time()
    svm2 = SVC(C=0.00001, kernel='linear', random_state=2020,probability=True) 
    svm2_model = svm2.fit(feature_train,label_train)
    print('Training time: {:4f} seconds'.format(time.time()-start_time))
    
    start_time = time.time()
    test_preds = svm2_model.predict(feature_test)
    print('Prediction time: {:4f} seconds'.format(time.time()-start_time))
    classification_error = np.mean(np.array(test_preds) != np.array(label_test))
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = svm2_model.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 51.635884 seconds
Prediction time: 1.817035 seconds

Classification Error: 0.188333
Accuracy: 0.811667

Classification Report:

              precision    recall  f1-score   support

           0       0.81      0.99      0.89       473
           1       0.85      0.13      0.23       127

    accuracy                           0.81       600
   macro avg       0.83      0.56      0.56       600
weighted avg       0.82      0.81      0.75       600

Confusion Matrix:

[[470   3]
 [110  17]]

AUC: 0.758053


In [38]:
# #grid search with cv 3 to find the best performed parameters
# param= {'C': [0.00001,0.0001,0.001,0.01,1,10],
#        'kernel':['linear', 'rbf', 'poly'],
#        'degree':[2,3,4]}

# gscv = GridSearchCV(SVC(random_state = 2020), param, cv=3, return_train_score=True)
# gscv.fit(feature_train,label_train)
# gscv.best_params_
# #output: {'C': 10, 'degree': 4, 'kernel': 'poly'}

In [39]:
#improved svm using parameters from grid search
if run_svm==True:    
    start_time = time.time() 
    svm = SVC(C=10, kernel='poly', degree=4, random_state=2020,probability=True)
    svm_model = svm.fit(feature_train,label_train)
    print('Training time: {:4f} seconds'.format(time.time()-start_time))
    
    start_time = time.time()
    test_preds = svm_model.predict(feature_test)
    print('Prediction time: {:4f} seconds'.format(time.time()-start_time))
    classification_error = np.mean(np.array(test_preds) != np.array(label_test))
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = svm_model.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 79.167671 seconds
Prediction time: 1.844987 seconds

Classification Error: 0.145000
Accuracy: 0.855000

Classification Report:

              precision    recall  f1-score   support

           0       0.86      0.98      0.91       473
           1       0.84      0.39      0.53       127

    accuracy                           0.85       600
   macro avg       0.85      0.68      0.72       600
weighted avg       0.85      0.85      0.83       600

Confusion Matrix:

[[464   9]
 [ 78  49]]

AUC: 0.828137


In [40]:
# #grid search with cv 3 to find the best performed parameters
# param= {'C': [0.001,0.01,1,10,15,20],
#        'kernel':['linear', 'rbf', 'poly'],
#        'degree':[2,3,4]}

# gscv = GridSearchCV(SVC(random_state = 2020), param, cv=3, return_train_score=True)
# gscv.fit(feature_train_PCA,label_train)
# gscv.best_params_
# #output: {'C': 10, 'degree': 2, 'kernel': 'rbf'}

In [42]:
#improved svm with PCA
if run_svm==True:    
    start_time = time.time() 
    svm = SVC(C=10, kernel='rbf', degree=2, random_state=2020,probability=True)
    svm_model = svm.fit(feature_train_PCA,label_train)
    print('Training time: {:4f} seconds'.format(time.time()-start_time))
    
    start_time = time.time()
    test_preds = svm_model.predict(feature_test_PCA)
    print('Prediction time: {:4f} seconds'.format(time.time()-start_time))
    classification_error = np.mean(np.array(test_preds) != np.array(label_test))
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = svm_model.predict_proba(feature_test_PCA)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 0.746107 seconds
Prediction time: 0.015865 seconds

Classification Error: 0.213333
Accuracy: 0.786667

Classification Report:

              precision    recall  f1-score   support

           0       0.82      0.93      0.87       473
           1       0.49      0.24      0.32       127

    accuracy                           0.79       600
   macro avg       0.66      0.59      0.60       600
weighted avg       0.75      0.79      0.76       600

Confusion Matrix:

[[442  31]
 [ 97  30]]

AUC: 0.745093


### Weighted SVM

In [43]:
#weights = {0:80.0, 1:20.0}
#params= {'C': [1,10,15,20],
#        'kernel':['linear', 'rbf', 'poly'],
#        'degree':[2,3,4]}
#cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
#gscv = GridSearchCV(SVC(class_weight=weights,random_state = 2020,probability=True), params, cv=3, scoring='roc_auc',verbose=True)
#gscv.fit(feature_train,label_train)
#gscv.best_params_

In [45]:
#improved svm using parameters from grid search
if run_svm==True:    
    weights = {0:80.0, 1:20.0}
    start_time = time.time() 
    svm = SVC(C=10, kernel='poly', degree=4, random_state=2020,class_weight=weights,probability=True)
    svm_model = svm.fit(feature_train,label_train)
    print('Training time: {:4f} seconds'.format(time.time()-start_time))
    
    start_time = time.time()
    test_preds = svm_model.predict(feature_test)
    print('Prediction time: {:4f} seconds'.format(time.time()-start_time))
    classification_error = np.mean(np.array(test_preds) != np.array(label_test))
    
    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))
    
    test_probs = svm_model.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 382.177514 seconds
Prediction time: 1.555441 seconds

Classification Error: 0.163333
Accuracy: 0.836667

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       473
           1       0.64      0.51      0.57       127

    accuracy                           0.84       600
   macro avg       0.76      0.72      0.73       600
weighted avg       0.83      0.84      0.83       600

Confusion Matrix:

[[437  36]
 [ 62  65]]

AUC: 0.837209


### Naive Bayes

In [34]:
if run_naivebayes == True:
    start = time.time()
    gnb = GaussianNB()
    gaussian = gnb.fit(feature_train, label_train)
    end = time.time()
    print('Training time: {:4f} seconds'.format(end-start))
    
    start = time.time()
    test_pred = gaussian.predict(feature_test)
    end = time.time()
    print('Prediction time: {:4f} seconds'.format(end-start))
    
    classification_error = np.mean(np.array(test_pred) != np.array(label_test)) #Classification Error
    
    print('\\nClassification Error: {:4f}'.format(classification_error))
    print('Accuracy: {:4f}\\n'.format(1-classification_error))
    print('Classification Report:\\n')
    print(classification_report(label_test,test_pred))
    print('Confusion Matrix:\\n')
    print(confusion_matrix(label_test,test_pred))
    
    test_probs = gaussian.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 0.131302 seconds
Prediction time: 0.033022 seconds
\nClassification Error: 0.336667
Accuracy: 0.663333\n
Classification Report:\n
              precision    recall  f1-score   support

           0       0.86      0.69      0.76       473
           1       0.33      0.57      0.42       127

    accuracy                           0.66       600
   macro avg       0.59      0.63      0.59       600
weighted avg       0.74      0.66      0.69       600

Confusion Matrix:\n
[[326 147]
 [ 55  72]]

AUC: 0.660369


### Lasso

### SMOTE Bagging

In [None]:
#params = {'n_estimators':[25,50,75,100]}
#cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
#gscv = GridSearchCV(BaggingClassifier(),params,cv=cv,scoring='roc_auc').fit(feature_train_sm,label_train_sm)
#gscv.best_params_
#output: {{'n_estimators': 100}}

In [35]:
if run_bagging_smote == True:
    
    over = SMOTE(sampling_strategy='auto')
    under = RandomUnderSampler(sampling_strategy='auto')
    sm = Pipeline(steps = [('o', over), ('u', under)])
    feature_train_sm, label_train_sm = sm.fit_resample(feature_train,label_train)
    
    start_time=time.time()

    bagging_smote = BaggingClassifier(n_estimators = 100)
    bagging_smote.fit(feature_train_sm, label_train_sm)
    print('Training time: {:4f} seconds'.format(time.time()-start_time))

    start_time = time.time()
    test_preds = bagging_smote.predict(feature_test)
    print('Prediction time: {:4f} seconds'.format(time.time()-start_time))

    classification_error = np.mean(np.array(test_preds) != np.array(label_test))

    print('\nClassification Error: {:4f}'.format(classification_error)) 
    print('Accuracy: {:4f}\n'.format(1-classification_error))
    print('Classification Report:\n')
    print(classification_report(label_test,test_preds))
    print('Confusion Matrix:\n')
    print(confusion_matrix(label_test,test_preds))

    test_probs = bagging_smote.predict_proba(feature_test)[:,1]
    #Note: AUC is a better metric than accuracy because of imbalanced classes
    print('\nAUC: {:4f}'.format(roc_auc_score(label_test,test_probs)))

Training time: 660.855384 seconds
Prediction time: 0.435849 seconds

Classification Error: 0.198333
Accuracy: 0.801667

Classification Report:

              precision    recall  f1-score   support

           0       0.84      0.92      0.88       473
           1       0.55      0.35      0.43       127

    accuracy                           0.80       600
   macro avg       0.70      0.64      0.66       600
weighted avg       0.78      0.80      0.78       600

Confusion Matrix:

[[436  37]
 [ 82  45]]

AUC: 0.775174


After undersampling/oversampling, we now have equal number of members in each class

In [36]:
print('Number of records with label 0 (basic emotion):   {:4d} '.format(len(label_train_sm)-sum(label_train_sm)))
print('Number of records with label 1 (complex emotion): {:2d} '.format(sum(label_train_sm)))

Number of records with label 0 (basic emotion):   1929 
Number of records with label 1 (complex emotion): 1929 
