In [1]:
import pandas as pd
import numpy as np
import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier  
from sklearn.svm import SVC 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import decomposition
from sklearn import preprocessing




In [2]:
def load_1csv(csv):
    T = pd.read_csv("facial_expression2.csv", sep=",")
    return T

In [3]:
def load_2csv(face, emp):
    emp = pd.read_csv(emp, sep=",")
    face = pd.read_csv(face, sep=",")
    T = pd.merge(emp, face, left_index=True, right_index=True)
    return T

In [4]:
def clean_data(T, code):
    T = T.drop("participant", axis=1) 
    window = {'start': 1,'submit1': 2, 'submit2': 3}  #creating a dict file- label names to integers  
    T.label = [window[item] for item in T.label] 
    if code == 1:
        T = T.drop(T.columns[[0]], axis=1) #with one CSV, it gives an extra Unnamed:0 column on place 0. Delete it
    if code == 2:
        T.replace([np.inf, -np.inf], np.nan) #things that are Inf or NaN, change to NaN
        T.columns[T.isnull().any()].tolist()
        T = T.drop(T.columns[[25]], axis=1)
        T.mean()
        T.fillna(T.mean())
        T.dropna(axis=1, inplace=True) #drop rows that contain NaN
    return T

In [5]:
def getXy (T):
    return T.drop('label', axis=1), T['label'] #X and y

In [6]:
def StratSplit(X,y):
    X = np.array(X)
    y = np.array(y)

    stratSplit = StratifiedShuffleSplit(y, 1, test_size=0.3,random_state=42)
    StratifiedShuffleSplit(y, n_iter=1, test_size=0.3)
    for train_idx,test_idx in stratSplit:
        X_train=X[train_idx]
        y_train=y[train_idx]
        X_test=X[test_idx]
        y_test=y[test_idx]    
    return X_train, y_train, X_test, y_test

In [7]:
def balanced_accuracy_score(y_true, y_pred, sample_weight=None, adjusted=False):
    C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight)
    with np.errstate(divide='ignore', invalid='ignore'):
        per_class = np.diag(C) / C.sum(axis=1)
    if np.any(np.isnan(per_class)):
        warnings.warn('y_pred contains classes not in y_true')
        per_class = per_class[~np.isnan(per_class)]
    score = np.mean(per_class)
    if adjusted:
        n_classes = len(per_class)
        chance = 1 / n_classes
        score -= chance
        score /= 1 - chance
    return score

import sklearn.metrics as metr

def model_evaluation(pipeline, train_data, train_labels, test_data, test_labels):
    '''
    This function returns training and testing scores obtained from a pipeline trained and tested on 
    training and testing data+labels passed as input.
    
    =========================== ===============================================
    Parameter                   Description
    =========================== ===============================================
    "pipeline"                  The pipeline to use to obtain performance
                                metrics.
    "train_data"                Training data (pandas dataframe). 
    "train_labels"              Training labels.
    "test_data"                 Test data (pandas dataframe).
    "test_labels"               Testing labels.                                
    =========================== ===============================================
    '''
    
    train_score = pipeline.score(train_data,train_labels)
    train_predict = pipeline.predict(train_data)
    train_F1 = metr.f1_score(y_pred=train_predict,y_true=train_labels,average='weighted')
    train_bACC = balanced_accuracy_score(y_pred=train_predict,y_true=train_labels)
    train_recall_weighted = metr.recall_score(y_pred=train_predict,y_true=train_labels,average='weighted')
    
    y_pred = pipeline.predict(test_data)
    y_true = np.array(test_labels)
    test_score = metr.accuracy_score(y_true, y_pred)
    test_F1 = metr.f1_score(y_pred=y_pred,y_true=y_true,average='weighted')
    test_bACC = balanced_accuracy_score(y_pred=y_pred,y_true=y_true)
    
    print("Training accuracy: ", train_score)    
    print("Test accuracy: ", test_score)
    print("Training F1-score: ", train_F1)
    print("Test F1-score: ", test_F1)
    print("Training balanced accuracy: ", train_bACC)    
    print("Test balanced accuracy: ", test_bACC)

    return train_score, test_score, y_pred, train_predict

In [8]:
def PCA(components_nr, Data, T, array): #Data is X
    pca = decomposition.PCA(n_components=components_nr,svd_solver='full') #SVD solver is maybe something that needs to be adjusted
    pca.fit(Data)
    Data = pca.transform(Data)
    
    print(pca.explained_variance_ratio_)
    T = T.drop('label', axis=1)
    data_scaled = pd.DataFrame(preprocessing.scale(T),columns = T.columns) 
    table = pd.DataFrame(pca.components_,columns=data_scaled.columns,index = array)
    print(table.idxmax(axis=1))
    print(table.max(axis=1))
    
    return Data

In [33]:
T = load_1csv("facial_expression2.csv");
T = clean_data(T, 1)
X_train, y_train, X_test, y_test = StratSplit(X,y)

In [10]:
T = load_2csv("facial_expression.csv", "empaticaDataFinal.csv");
T = clean_data(T, 2)

In [32]:
X, y = getXy(T)
#OPTIONAL
array = ['PC-1','PC-2', 'PC-3','PC-4','PC-5','PC-6','PC-7','PC-8','PC-9','PC-10','PC-11','PC-12','PC-13','PC-14','PC-15','PC-16','PC-17','PC-18','PC-19','PC-20']
# array = ['PC-1','PC-2', 'PC-3','PC-4','PC-5']
X = PCA(20, X, T, array)
X_train, y_train, X_test, y_test = StratSplit(X,y)

[0.4238784  0.27143143 0.1098255  0.05398658 0.05185036 0.02955319
 0.02131619 0.01247655 0.01021891 0.00490226]


ValueError: Shape of passed values is (17, 10), indices imply (17, 20)

In [12]:
#TEST MODELS
logreg = LogisticRegression()
dtc = DecisionTreeClassifier()
svc = SVC(kernel='poly', degree=8)  

p = Pipeline([
            ('Dtc', dtc)
            #('Svc', svc)
            #('Logreg', logreg)
])

#print('Decision Tree')
p.fit(X_train,y_train)
train_score, test_score, test_predict, train_predict = model_evaluation(p, X_train, y_train,
                                                                  X_test, y_test)

Training accuracy:  1.0
Test accuracy:  0.29411764705882354
Training F1-score:  1.0
Test F1-score:  0.25565610859728505
Training balanced accuracy:  1.0
Test balanced accuracy:  0.3111111111111111


In [38]:
from lightgbm import LGBMClassifier as lgb
print("LightGBM results: ")

lgbm = lgb(max_depth=10, num_iterations=1000, num_leaves=100, min_data_in_leaf=10, min_child_samples=5, 
            num_class=3, learning_rate=0.0001,objective='multiclass', 
           boosting_type='gbdt', metric='multi_logloss', max_bin=100)
p = Pipeline([
                ('lgbm', lgbm)
            ])
p.fit(X_train,y_train)
train_score, test_score, test_predict, train_predict = model_evaluation(p, X_train, y_train,
                                                                  X_test, y_test)

LightGBM results: 




Training accuracy:  0.6486486486486487
Test accuracy:  0.47058823529411764
Training F1-score:  0.6102378876572426
Test F1-score:  0.38431372549019605
Training balanced accuracy:  0.6410256410256411
Test balanced accuracy:  0.48888888888888893


  if diff:
  if diff:
  if diff:


In [30]:
T

Unnamed: 0,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,AU09_r,AU10_r,AU12_r,AU14_r,AU15_r,AU17_r,AU20_r,AU23_r,AU25_r,AU26_r,AU45_r,label
0,0.357733,0.171867,0.822933,0.156933,0.1014,0.099933,0.0868,0.765733,0.6732,0.262467,0.149867,0.212133,0.143133,0.1646,0.325933,0.253133,0.213467,1
1,0.2608,0.1992,0.490933,0.057,0.863867,1.015533,0.144267,1.486667,1.2676,1.808933,0.9688,0.629867,0.730267,0.158533,0.815533,0.8502,0.371933,2
2,0.1296,0.116133,0.9402,0.032733,1.387267,0.0004,0.043933,1.593267,2.391133,1.8838,0.1264,0.901133,0.272267,0.159933,1.174267,0.957733,0.2682,3
3,0.059867,0.032333,0.0062,0.029867,0.0,0.0,0.025133,0.284667,0.2146,1.011667,0.048867,0.177867,0.027267,0.0546,0.123467,0.1424,0.064933,1
4,0.047267,0.032867,0.079667,0.028267,0.0,0.0,0.0416,0.991267,0.784333,1.133333,0.064667,0.258933,0.046133,0.122333,0.481667,0.2374,0.0512,2
5,0.0444,0.043333,0.000867,0.035067,0.0,0.0,0.0252,1.006267,0.505733,0.2492,0.103067,0.194867,0.075467,0.157933,0.205667,0.106467,0.054,3
6,0.056467,0.042333,2.912133,0.024067,0.0,0.341867,0.027267,0.884867,0.0,1.222067,0.076267,0.214067,0.071067,0.066,0.135733,0.240067,0.064467,1
7,0.0466,0.0324,3.482333,0.0354,0.0,0.110733,0.025667,0.933733,0.0,1.114867,0.046267,0.2138,0.050067,0.0446,0.1342,0.2248,0.113733,2
8,0.397667,0.3192,1.6076,0.178733,0.274667,0.449467,0.0492,1.287533,0.283667,1.2454,0.3332,0.904667,0.0826,0.5244,0.519933,0.5746,0.395133,3
9,0.035933,0.020533,0.7964,0.023733,0.0,0.006067,0.019933,0.0006,0.0,0.0,0.072,0.223667,0.031,0.047533,0.08,0.104733,0.116,1
