# Machine Learning Analyses
### Import Libraries

In [1]:
# Basics
import numpy as np
import pandas as pd
from collections import Counter
# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
# Model evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix

### Import Data

In [2]:
# Load the rating tables. For the mimicry and gaze files every row is kept and
# the first column is dropped (presumably a saved index column — TODO confirm).
mimicry = pd.read_csv("data/Ratings/mimicry.csv").iloc[:, 1:]
coach = pd.read_csv("data/Ratings/coach.csv")
prob_inv = pd.read_csv("data/Ratings/prob_inv.csv")
# The remaining gaze columns are flattened in row-major order into a single
# long "gaze" column.
gaze = pd.DataFrame(
    {"gaze": pd.read_csv("data/Ratings/gaze.csv").iloc[:, 1:].to_numpy().flatten()}
)

### Data Manipulation

In [3]:
## Settings
y_choice = 1            # Variable to predict: 1 = prob_inv; 2 = coach
n_categories = 2        # prob_inv [2 or 3]; coach [2 or 3 or 4]

# y setting
# Bin edges and labels for every supported (y_choice, n_categories) pair.
# pd.cut builds right-closed intervals by default, so the lowest edge itself is
# excluded: the -1 edge keeps a prob_inv of 0 inside the first bin.
# NOTE(review): a coach score of exactly 2 would fall outside every bin and
# become NaN — confirm that the minimum coach score is above 2.
_binnings = {
    (1, 2): ([-1, 30, 100], ["low (0-30)", "high (30-100)"]),
    (1, 3): ([-1, 10, 45, 100], ["low (0-10)", "middle (10-45)", "high (45-100)"]),
    (2, 2): ([2, 3.556, 6], ["low", "high"]),
    (2, 3): ([2, 3.445, 3.778, 6], ["low", "middle", "high"]),
    (2, 4): ([2, 3.334, 3.556, 3.889, 6], ["low", "middle low", "middle high", "high"]),
}

# Fail loudly on unsupported settings: merely printing a message would leave
# `y` undefined (or stale from an earlier run) for every cell below.
if (y_choice, n_categories) not in _binnings:
    raise ValueError(
        f"No acceptable settings (y_choice={y_choice!r}, n_categories={n_categories!r}). "
        "See notes behind the settings."
    )

_bins, _labels = _binnings[(y_choice, n_categories)]
if y_choice == 1:
    # Store the categorised target next to the raw prob_inv scores.
    prob_inv["PI_cat"] = pd.cut(prob_inv.prob_inv, bins=_bins, labels=_labels)
    y = prob_inv.PI_cat
else:
    # Store the categorised target next to the raw coach scores.
    coach["coach_cat"] = pd.cut(coach.coach, bins=_bins, labels=_labels)
    y = coach.coach_cat

In [4]:
# Sanity check: number of samples that ended up in each target category.
Counter(y.tolist())

Counter({'high (30-100)': 33, 'low (0-30)': 42})

In [5]:
## Settings
use_gaze = 1                   # Use gaze proportion in predictor dataframe. 0 = no; 1 = yes
scaler_to_use = 1              # Which Scaler to use. 1 = MinMaxScaler(); 2 = StandardScaler()

# Include gaze setting
if use_gaze == 1:
    # assign() returns a new frame, so `mimicry` itself is never modified and
    # re-running this cell with use_gaze = 0 really drops the gaze column.
    # The gaze values are aligned to the mimicry rows by index.
    X = mimicry.assign(gaze=gaze["gaze"])
elif use_gaze == 0:
    X = mimicry.copy()
else:
    # Stop here instead of printing: continuing would use an undefined or stale X.
    raise ValueError(
        f"No acceptable gaze settings (use_gaze={use_gaze!r}). See notes behind the settings."
    )

# Scaler setting
if scaler_to_use == 1:
    scaler = MinMaxScaler()
elif scaler_to_use == 2:
    scaler = StandardScaler()
else:
    # A plain else catches every other value, so no invalid setting can slip
    # through to the fit_transform call below.
    raise ValueError(
        f"No acceptable scaler settings (scaler_to_use={scaler_to_use!r}). See notes behind the setting."
    )

# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling — consider fitting on the
# training rows only.
# After this line X is the array returned by fit_transform, not a DataFrame.
X = scaler.fit_transform(X)

In [6]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=333,
)

### Create Models and set parameters

In [7]:
# Candidate classifiers, in the same order as their display names below.
# Every estimator that accepts a seed gets random_state = 333.
classifiers = [
    DecisionTreeClassifier(random_state=333),
    RandomForestClassifier(random_state=333),
    KNeighborsClassifier(n_neighbors=4),
    SVC(kernel="poly", random_state=333),
    GaussianNB(),
    MLPClassifier(random_state=333, hidden_layer_sizes=[100, 200], max_iter=1500),
]
# Keep the individual model names available for direct access.
model1, model2, model3, model4, model5, model6 = classifiers

classifiers_names = [
    "Decision Tree",
    "Random Forest",
    "KNN",
    "SVM",
    "Gaussian Naive Bayes",
    "MLP",
]

In [8]:
# Compare the classifiers with 4-fold cross-validation on the training set only;
# for each model the mean and the best fold score are reported.
print("model", "\t", "\t", "Average Acc.", "\t", "Max. Acc.")
for name, clf in zip(classifiers_names, classifiers):
    scores = cross_val_score(clf, X_train, y_train, cv=4)
    print(name, scores.mean(), scores.max(), sep=" \t ")

model 	 	 Average Acc. 	 Max. Acc.
Decision Tree 	 0.4666666666666667 	 0.5333333333333333
Random Forest 	 0.55 	 0.6666666666666666
KNN 	 0.5833333333333334 	 0.6666666666666666
SVM 	 0.5833333333333333 	 0.6666666666666666
Gaussian Naive Bayes 	 0.5 	 0.6
MLP 	 0.5499999999999999 	 0.7333333333333333


In [9]:
# Fit every model on the full training set and report its accuracy on the
# held-out test set. The confusion matrix is printed as well, to reveal models
# that mostly predict a single class.
for name, clf in zip(classifiers_names, classifiers):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Score the predictions already computed instead of calling clf.score(),
    # which would run predict() on the test set a second time.
    acc = accuracy_score(y_test, y_pred)
    print(name, "\t", acc)
    # Rows are the true classes, columns the predicted classes.
    # NOTE(review): no `labels` argument is passed, so the row/column order is
    # chosen by confusion_matrix itself — presumably the sorted label values;
    # confirm before reading row 0 as the "low" class.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

Decision Tree 	 0.3333333333333333
[[4 5]
 [5 1]]
Random Forest 	 0.4666666666666667
[[4 5]
 [3 3]]
KNN 	 0.5333333333333333
[[7 2]
 [5 1]]
SVM 	 0.6
[[5 4]
 [2 4]]
Gaussian Naive Bayes 	 0.6666666666666666
[[6 3]
 [2 4]]
MLP 	 0.6
[[5 4]
 [2 4]]
