In [1]:
import numpy as np
import pandas as pd
import random as rnd
import pickle
import time

# machine learning
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE

from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score
# Relative directory prefix (note the trailing slash): model file names are
# appended to it by plain string concatenation when models are saved and loaded.
MODEL_PATH = 'Careless Response Detection/models/'


In [2]:

# Base response table (';'-separated). The training loop later concatenates
# sampled rows from the cr_* files onto it.
rr_data = pd.read_csv('data_mod_resp.csv', sep=';')

# Drop the '17_' marker from every column label.
rr_data.columns = rr_data.columns.str.replace('17_', '')

# Bookkeeping labels; the training loop overwrites them on every iteration.
current_rate = current_model = current_data_type = ''



In [3]:
# Model fitter
def modelFit(model, X_train, X_test, y_train, y_test):
    """Grid-search a SMOTE + classifier pipeline and return the fitted search.

    Parameters
    ----------
    model : str
        Classifier to tune: 'rf', 'gbm', 'svm', 'knn' or 'nnet'.
    X_train, y_train
        Training features and labels handed to ``GridSearchCV.fit``.
    X_test, y_test
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    The value returned by ``GridSearchCV.fit`` (10-fold CV, scored on recall,
    train scores recorded).

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    supported_models = ('rf', 'gbm', 'svm', 'knn', 'nnet')
    if model not in supported_models:
        # Without this guard an unknown name fell through every branch and
        # surfaced later as an UnboundLocalError on `imba_pipeline`.
        raise ValueError(
            "Unknown model %r; expected one of %s" % (model, ', '.join(supported_models))
        )

    # make_pipeline names each step after its lower-cased class name, so grid
    # keys take the form '<lowercased class name>__<parameter>'.
    if model == 'rf':
        param_prefix = 'RandomForestClassifier'.lower() + '__'
        new_params = {
            param_prefix + 'n_estimators': [200, 500],
            param_prefix + 'max_features': ['sqrt', 'log2'],
            param_prefix + 'max_depth': [8, 10, 12, 14, 16],
            param_prefix + 'criterion': ['gini', 'entropy'],
        }
        classifier = RandomForestClassifier(random_state=1132)

    elif model == 'gbm':
        param_prefix = 'GradientBoostingClassifier'.lower() + '__'
        new_params = {
            param_prefix + 'n_estimators': [200, 500],
            param_prefix + 'learning_rate': np.arange(0.5, 2.0, 0.5),
            param_prefix + 'max_depth': [8, 10, 12, 14, 16],
            # NOTE(review): np.arange with a float step can yield one more
            # element than expected; confirm the intended subsample values.
            param_prefix + 'subsample': np.arange(0.7, 1.0, 0.1),
        }
        classifier = GradientBoostingClassifier(random_state=1132)

    elif model == 'svm':
        # Empty grid: a single candidate with the estimator's defaults.
        new_params = {}
        classifier = SVC(random_state=1132)

    elif model == 'knn':
        param_prefix = 'KNeighborsClassifier'.lower() + '__'
        new_params = {
            param_prefix + 'n_neighbors': np.arange(5, 14, 2),
            param_prefix + 'weights': ['uniform', 'distance'],
        }
        classifier = KNeighborsClassifier()

    else:  # model == 'nnet'
        # Empty grid: a single candidate with the estimator's defaults.
        new_params = {}
        classifier = MLPClassifier(random_state=1132)

    # SMOTE sits inside the pipeline so oversampling is fitted per CV fold on
    # that fold's training part only.
    imba_pipeline = make_pipeline(SMOTE(random_state=1132), classifier)

    grid_imba = GridSearchCV(imba_pipeline, param_grid=new_params, cv=10, scoring='recall',
                             return_train_score=True)
    return grid_imba.fit(X_train, y_train)


In [None]:

import os

# R_HOME has to be in the environment before rpy2 is imported. An existing
# setting is respected; the original hardcoded install path is the fallback.
os.environ.setdefault('R_HOME', "C:/Program Files/R/R-4.2.2")

import rpy2.robjects as robjects
import rpy2.robjects.packages as rpackages
from rpy2.robjects import pandas2ri
from rpy2.robjects.vectors import StrVector

utils = rpackages.importr('utils')

# Install only the R packages that are missing, so re-running this cell does
# not download and reinstall everything each time.
packages = ('bartMachine', 'caret', 'reticulate', 'data.table', 'dplyr')
missing_packages = [name for name in packages if not rpackages.isinstalled(name)]
if missing_packages:
    # Choosing a CRAN mirror is only needed when something must be downloaded.
    utils.chooseCRANmirror(ind=1)
    utils.install_packages(StrVector(missing_packages))

# Turn on rpy2's pandas conversion; later cells pass pandas objects straight
# into R functions.
pandas2ri.activate()


In [5]:

def BARTCreator(X_train, y_train, file_name):
    """Fit a cross-validated bartMachine model through rpy2 and return it.

    The R source below defines a function ``f`` in R's global environment.
    ``f`` recodes ``y`` into a two-level factor ('zeros' where ``y`` is truthy,
    'ones' otherwise) with 'zeros' as the reference level, then runs
    ``bartMachine::bartMachineCV`` with 10 folds on ``X`` and that factor.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, forwarded to the R function unchanged.
    file_name
        Accepted but not used by this function.

    Returns
    -------
    The object returned by the R call.
    """
    bart_r_source = '''
options(java.parameters = "-Xmx2g")
            f <- function(X, y, verbose=TRUE) {

                if (verbose) {
                    cat("R code for BART started\n")
                }

        y <- ifelse(y, 'zeros', 'ones')
        y <- factor(y)
        y <- relevel(y, ref = 'zeros')

            bart_machine_cv = bartMachine::bartMachineCV(X, y, serialize = TRUE,  k_folds = 10,
                                                        num_tree_cvs = 150, k_cvs = c(1,2,3))
            }

            '''

    # Evaluate the definition, then fetch the freshly defined R function.
    robjects.r(bart_r_source)
    fit_bart = robjects.globalenv['f']

    return fit_bart(X_train, y_train)


In [None]:

# Train and pickle one model for every (cr_* source file, contamination rate) pair.

# Fixed seed so the sampling, shuffling and train/test split are repeatable.
SEED = 1132

# Create the output directory up front; otherwise a missing directory only
# surfaces after a long fit, when the model is written.
os.makedirs(MODEL_PATH, exist_ok=True)

for cr_type in ['cr_all', 'cr_human', 'cr_computer']:
    current_data_type = cr_type
    cr_data = pd.read_csv(cr_type + '.csv', sep=';')
    cr_data.columns = cr_data.columns.str.replace('17_', '')
    for rate in [5, 10, 15, 20]:
        current_rate = str(rate)
        rr_rate = 1 - (rate / 100)
        # Rows to draw from cr_data so they make up about `rate` percent of
        # the merged frame (truncated to an integer).
        cr_sample_num = int((rr_data.shape[0] / rr_rate) - rr_data.shape[0])
        cr_sample = cr_data.sample(n=cr_sample_num, replace=True, random_state=SEED)
        merged_data = pd.concat([rr_data, cr_sample], axis=0)
        merged_data = merged_data.sample(frac=1, random_state=SEED)
        y = merged_data["Careless"]
        X = merged_data.drop("Careless", axis=1)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, shuffle=True, random_state=SEED)
        for model in ['bart']:
            current_model = model
            file_name = MODEL_PATH + current_model + '_' + current_rate + '_' + current_data_type
            print('-----------------Running ' + file_name + '-----------------')
            start = time.time()
            if model == 'bart':
                fitted_model = BARTCreator(X_train, y_train, file_name)
            else:
                fitted_model = modelFit(model, X_train, X_test, y_train, y_test)
            end = time.time()

            print('Finished ' + file_name + ' - Time taken: ' + str(end - start))
            # Context manager so the file is flushed and closed even if
            # pickling raises.
            with open(file_name + '.pkl', 'wb') as model_file:
                pickle.dump(fitted_model, model_file)



In [None]:

# Evaluate a saved model: confusion matrix at a fixed cut-off plus AUROC.
EVAL_MODEL_NAME = 'bart_20_cr_all'
DECISION_THRESHOLD = 0.5

# X_test / y_test are whatever split the training loop produced last, which is
# not necessarily the split that belongs to the model loaded here.
last_trained = current_model + '_' + current_rate + '_' + current_data_type
if last_trained != EVAL_MODEL_NAME:
    print('WARNING: evaluating ' + EVAL_MODEL_NAME + ' on the test split left over from '
          + last_trained + '; the metrics below may not be a valid hold-out estimate.')

# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open(MODEL_PATH + EVAL_MODEL_NAME + '.pkl', 'rb') as file:
    model = pickle.load(file)

# Keep the raw predicted scores for the ROC analysis and threshold a separate
# copy for the confusion matrix.
y_prob = pd.DataFrame(robjects.r.predict(model, X_test))
y_pred = (y_prob >= DECISION_THRESHOLD).astype(int)

# ROC curve and AUROC need the continuous scores; computing them from 0/1
# labels collapses the curve to a single operating point.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
AUROC = np.round(roc_auc_score(y_test, y_prob), 2)

# Rates at the chosen cut-off come straight from the confusion matrix.
# Assumes y_test is coded 0/1 - TODO confirm.
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = conf_matrix.ravel()
fpr_at_threshold = fp / (fp + tn)
tpr_at_threshold = tp / (tp + fn)

print("false positive rate - ", round(fpr_at_threshold, 2))
print("true positive rate - ", round(tpr_at_threshold, 2))
print("AUROC - ", AUROC)
conf_matrix