In [None]:
import os
import sys
from itertools import combinations

import numpy as np
import pandas as pd
from google.colab import drive
from google.colab import files
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC

In [None]:
# Mount Google Drive; the CSV inputs read below live under /content/drive/MyDrive.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the raw questionnaire matrix and append one column per Big Five trait.
# Every trait is the difference between two questionnaire items.
# NOTE(review): if Q1..Q10 follow the standard BFI-10 item order, Conscientiousness
# would usually be Q8 - Q3 rather than Q3 - Q8 — confirm against the questionnaire used.
df = pd.read_csv('/content/drive/MyDrive/rawPersonalityMatrix.csv').assign(
    Extraversion=lambda raw: raw['Q6'] - raw['Q1'],
    Agreeableness=lambda raw: raw['Q2'] - raw['Q7'],
    Conscientiousness=lambda raw: raw['Q3'] - raw['Q8'],
    Neuroticism=lambda raw: raw['Q9'] - raw['Q4'],
    Openness=lambda raw: raw['Q10'] - raw['Q5'],
)

In [None]:
# Sets of face identifiers rated by judge 2 and by judge 4 respectively.
judge2 = set(df.loc[df['Judge ID'] == 2, 'Face'].to_numpy())
judge4 = set(df.loc[df['Judge ID'] == 4, 'Face'].to_numpy())

# Missing Entries for Judge 4
{'00848_940307_fa', '00907_960530_fa'}

In [None]:
# Faces that judge 2 rated but judge 4 did not.
missingEntries = judge2 - judge4

In [None]:
# Display the faces missing from judge 4's ratings.
missingEntries

{'00848_940307_fa', '00907_960530_fa'}

## Removing missing entries from all the judgements

In [None]:
# Remove, for every judge, all rows whose face is listed in missingEntries.
# A single in-place drop replaces the side-effect-only list comprehension, which
# scanned the frame once per face and produced a throwaway list of None values.
df.drop(index=df.index[df['Face'].isin(missingEntries)], inplace=True)

[None, None]

# Declaring the Traits

In [None]:
# Names of the five derived trait columns.
# NOTE: this is a set, so its iteration order is not the order written here. Later cells
# iterate it (and use list(bigFiveTraits) as column labels), so it must not be modified
# between those cells or the labels would no longer match the values.
bigFiveTraits = {'Extraversion', 'Agreeableness', 'Conscientiousness', 'Neuroticism', 'Openness'}
# Distinct judge identifiers present in the data.
judgeIds = set(df['Judge ID'])

In [None]:
# Empty containers: the per-judge result mapping and a per-judge scratch mapping.
judgementsDemeanedScores = dict()
tmpTraitDict = dict()

In [None]:

# sum(judgeIds)

## Demeaned judge scores: subtract each judge's mean score per trait to remove judge-specific rating bias

In [None]:
# Build judge id -> {trait -> demeaned score array}.
# The result dict is created here so that re-running this cell never mixes in
# entries left over from an earlier run.
judgementsDemeanedScores = {}

for eachJudge in judgeIds:
    # Select the judge's rows once (not once per trait) and order them by face so the
    # arrays of different judges line up element-by-element when they are correlated.
    judgeRows = df[df['Judge ID'] == eachJudge].sort_values('Face', kind='stable')
    demeanedByTrait = {}
    for eachTrait in bigFiveTraits:
        judgeTraitArray = judgeRows[eachTrait].to_numpy()
        # Subtract this judge's own mean for the trait.
        demeanedByTrait[eachTrait] = judgeTraitArray - judgeTraitArray.mean()
    judgementsDemeanedScores[eachJudge] = demeanedByTrait


In [None]:
# Inspect the nested mapping: judge id -> trait name -> demeaned score array.
judgementsDemeanedScores

{2: {'Agreeableness': array([ 1.86819831,  1.86819831,  1.86819831, -0.13180169,  1.86819831,
          0.86819831, -0.13180169,  0.86819831, -0.13180169,  0.86819831,
          1.86819831,  1.86819831, -0.13180169,  0.86819831,  0.86819831,
          1.86819831,  1.86819831, -2.13180169,  1.86819831,  0.86819831,
         -1.13180169,  1.86819831,  0.86819831, -0.13180169,  0.86819831,
         -3.13180169, -0.13180169,  0.86819831, -1.13180169,  0.86819831,
          0.86819831,  2.86819831,  0.86819831, -0.13180169, -0.13180169,
         -1.13180169,  1.86819831,  0.86819831,  1.86819831, -2.13180169,
         -2.13180169,  1.86819831, -0.13180169, -0.13180169, -2.13180169,
          0.86819831,  1.86819831,  0.86819831, -1.13180169,  1.86819831,
         -2.13180169, -0.13180169, -0.13180169,  0.86819831,  0.86819831,
         -0.13180169, -2.13180169, -1.13180169, -0.13180169, -2.13180169,
         -0.13180169,  0.86819831, -1.13180169,  1.86819831, -0.13180169,
          1.868198

In [None]:
def correlationBetweenJudges(judge1, judge2, traits=None):
    """Correlate two judges' demeaned scores, one coefficient per trait.

    For each trait the value is sum(x * y) / sqrt(sum(x**2) * sum(y**2)), where x and y
    are the two judges' score arrays for that trait.

    Parameters
    ----------
    judge1, judge2 : mapping
        Trait name -> array of scores. Assumes 1-D arrays that are already demeaned and
        ordered identically for both judges — the function does not check this.
    traits : iterable of str, optional
        Traits to evaluate, in output order. Defaults to the module-level
        ``bigFiveTraits``.

    Returns
    -------
    list
        One coefficient per trait, in the iteration order of ``traits``. An entry is
        NaN when either array has zero sum of squares.

    Raises
    ------
    ValueError
        If the two arrays of a trait differ in shape.
    """
    if traits is None:
        traits = bigFiveTraits
    correlationList = []
    for eachTrait in traits:
        j_trait1 = np.asarray(judge1[eachTrait], dtype=float)
        j_trait2 = np.asarray(judge2[eachTrait], dtype=float)
        if j_trait1.shape != j_trait2.shape:
            # Guard against silent broadcasting when one judge rated a different number of faces.
            raise ValueError(
                "Score arrays for trait %r differ in shape: %s vs %s"
                % (eachTrait, j_trait1.shape, j_trait2.shape)
            )
        denominator = np.sqrt(np.dot(j_trait1, j_trait1) * np.dot(j_trait2, j_trait2))
        if denominator == 0:
            # Constant ratings: the coefficient is undefined.
            correlationList.append(np.nan)
        else:
            correlationList.append(np.dot(j_trait1, j_trait2) / denominator)
    return correlationList

# correlationBetweenJudges(judgementsDemeanedScores[2], judgementsDemeanedScores[3])

# Calculate the reliability scores for each judge pair - 11 judges makes it 55 pairs.

In [None]:
# Correlate every unordered pair of judges exactly once.
# combinations() leaves judgeIds untouched (the previous pop()-based loop emptied the
# set, so later cells and re-runs saw no judges), and sorting fixes the pair order.
correlationInPairs = [
    correlationBetweenJudges(
        judgementsDemeanedScores[judgeInFocus],
        judgementsDemeanedScores[withRespectTo],
    )
    for judgeInFocus, withRespectTo in combinations(sorted(judgeIds), 2)
]

In [None]:
# One row per judge pair, one column per trait. The column labels come from iterating
# bigFiveTraits, the same set correlationBetweenJudges iterates to order its values.
bigFiveDataframe = pd.DataFrame(correlationInPairs, columns= list(bigFiveTraits))

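# Average Correlation
$\bar{r} = \dfrac{2 \sum_{i<j} r_{ij}}{N(N-1)}$, where the sum runs over the 55 judge pairs and $N = 11$ is the number of judges.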


In [None]:
def avgCorrelation(listOfCorrs, numJudges=11):
    """Average pairwise correlation: 2 * sum(r) / (N * (N - 1)).

    Parameters
    ----------
    listOfCorrs : iterable of float
        Pairwise correlations to sum (e.g. one DataFrame column).
    numJudges : int, default 11
        Number of judges N. The default reproduces the previously hard-coded
        denominator 110 = 11 * 10.

    Returns
    -------
    float
        The summed correlations scaled by 2 / (N * (N - 1)).

    Raises
    ------
    ValueError
        If ``numJudges`` is smaller than 2 (no pairs exist, denominator would be 0).
    """
    if numJudges < 2:
        raise ValueError("numJudges must be at least 2")
    return ((2 * sum(listOfCorrs)) / (numJudges * (numJudges - 1)))


In [None]:
# Average pairwise correlation, computed column by column (one value per trait).
bigFiveDataframe.apply(avgCorrelation)

Openness             0.263784
Agreeableness        0.428455
Extraversion         0.450624
Conscientiousness    0.361540
Neuroticism          0.263342
dtype: float64

# Effective Spearman-Brown Coefficient: $R_{SB} = \dfrac{N\,\bar{r}}{1 + (N-1)\,\bar{r}}$


In [None]:
def effectiveSBCoeff(listOfCorrs, numJudges=11):
    """Spearman-Brown coefficient N * r / (1 + (N - 1) * r) for the average correlation r.

    Parameters
    ----------
    listOfCorrs : iterable of float
        Pairwise correlations (e.g. one DataFrame column).
    numJudges : int, default 11
        Number of judges N. The default reproduces the previously hard-coded 11 and 10.

    Returns
    -------
    float
        The coefficient computed from r = 2 * sum(listOfCorrs) / (N * (N - 1)).

    Raises
    ------
    ValueError
        If ``numJudges`` is smaller than 2.
    """
    if numJudges < 2:
        raise ValueError("numJudges must be at least 2")
    # Average pairwise correlation, computed with the same N as the formula below so
    # the two can never disagree about the number of judges.
    avgCorr = (2 * sum(listOfCorrs)) / (numJudges * (numJudges - 1))
    return (numJudges * avgCorr / (1 + ((numJudges - 1) * avgCorr)))

In [None]:
# Spearman-Brown coefficient, computed column by column (one value per trait).
bigFiveDataframe.apply(effectiveSBCoeff)

Openness             0.797622
Agreeableness        0.891846
Extraversion         0.900227
Conscientiousness    0.861667
Neuroticism          0.797255
dtype: float64

In [None]:
# Inspect the frame, including the derived trait columns.
df

Unnamed: 0,Judge ID,Face,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Extraversion,Agreeableness,Conscientiousness,Neuroticism,Openness
0,2,00002_940928_fa,3,4,3,3,3,4,2,3,3,3,1,2,0,0,0
1,2,00003_941121_fa,4,4,2,4,4,3,2,3,4,3,-1,2,-1,0,-1
2,2,00019_940422_fa,3,4,3,4,3,4,2,3,3,3,1,2,0,-1,0
3,2,00028_940128_fa,4,3,3,3,4,2,3,4,4,3,-2,0,-1,1,-1
4,2,00029_960627_fa,3,4,3,4,3,3,2,4,3,3,0,2,-1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9112,13,01009_960627_fa,4,2,2,2,4,2,2,2,4,2,-2,0,0,2,-2
9113,13,01010_960627_fa,4,2,2,2,5,2,2,3,2,1,-2,0,-1,0,-4
9114,13,01011_960627_fa,1,5,2,5,4,5,2,5,1,2,4,3,-3,-4,-2
9115,13,01012_960627_fa,1,2,2,4,5,5,2,2,1,2,4,0,0,-3,-3


# Drop the Questions and their scores

In [None]:
# df.drop(columns=['Q1','Q2','Q3','Q4','Q5','Q6','Q7','Q8','Q9','Q10'], inplace=True)

# Preparing the Data.

In [None]:
# Agreeableness scores of every row (one per judge) for a single face.
(df[df['Face'] == '01012_960627_fa' ]['Agreeableness'])

827     0
1656   -1
2483    0
3312    0
4141    2
4970   -2
5799    1
6628   -2
7457    2
8286   -1
9115    0
Name: Agreeableness, dtype: int64

In [None]:
# Mean score of one trait for one face across all its rows.
# The face and trait are spelled out (the same ones inspected in the previous cell):
# the loop variables eachFace / eachTrait are only bound by later cells, so referring
# to them here raised NameError on a fresh run.
np.mean(np.array(df[df['Face'] == '01012_960627_fa']['Agreeableness']))

NameError: ignored

In [None]:
# Build face id -> list of per-trait medians (list order = iteration order of bigFiveTraits).
eachFaceTraitMedian = {}
for eachFace in df.Face.unique():
    # Select the face's rows once instead of re-filtering the frame for every trait.
    faceRows = df[df['Face'] == eachFace]
    eachFaceTraitMedian[eachFace] = [
        np.median(np.array(faceRows[eachTrait])) for eachTrait in bigFiveTraits
    ]


In [None]:
# from scipy import stats
# stats.mode(np.array(df[df.Face == '00002_940928_fa']['Extraversion']))

In [None]:
# Names of the columns matching the AU??_r pattern, taken from one sample OpenFace file.
aUArrayColumnsList = (
    pd.read_csv("/content/drive/MyDrive/OpenFaceData/00002_940928_fa.csv")
    .filter(regex=("AU[0-9][0-9]_r"))
    .columns
    .to_numpy()
)

In [None]:
# os.listdir('/content/drive/MyDrive/OpenFaceData')
# Append the first row of AU??_r values from each face's OpenFace CSV to that face's
# trait medians. Each entry is rebuilt from its leading trait values rather than
# extended in place, so re-running the cell (for example after an interrupted run)
# cannot append the action-unit values twice.
numTraits = len(bigFiveTraits)
for eachFace in eachFaceTraitMedian.keys():
    actionUnitRow = (
        pd.read_csv("/content/drive/MyDrive/OpenFaceData/%s.csv"%eachFace)
        .filter(regex=("AU[0-9][0-9]_r"))
        .iloc[0]
    )
    eachFaceTraitMedian[eachFace] = (
        list(eachFaceTraitMedian[eachFace][:numTraits]) + list(actionUnitRow.to_numpy())
    )

# eachFaceTraitMedian['00002_940928_fa']

KeyboardInterrupt: ignored

In [None]:
# Show the list stored for one face.
print(eachFaceTraitMedian['00002_940928_fa'])

In [None]:
# One row per face (index = face id); trait columns first, then the action-unit columns.
faceData = pd.DataFrame.from_dict(
    eachFaceTraitMedian,
    orient='index',
    columns=[*bigFiveTraits, *aUArrayColumnsList],
)

In [None]:
cols = faceData.columns.tolist()
print(cols)
# Rotate the trailing action-unit columns to the front so the trait columns end up last.
# The count is taken from aUArrayColumnsList instead of the hard-coded 17, so the
# reordering stays correct if the set of action-unit columns changes.
numActionUnits = len(aUArrayColumnsList)
cols = cols[-numActionUnits:] + cols[:-numActionUnits]

In [None]:
# Reorder the columns of faceData according to cols.
faceData = faceData[cols]

In [None]:
# Write the assembled table to Drive.
faceData.to_csv("/content/drive/MyDrive/ActionUnitsWithLabels.csv")

NameError: ignored

In [None]:
import seaborn as sns
import yellowbrick

In [None]:
# faceData.drop(1, axis=0)
# Scratch list; the slice on the next line is evaluated and discarded.
p = [1,2,3,4,5,6,7,8,9]
p[:-3]
# Displays a copy without these four trait columns. The result is not assigned, so
# faceData itself keeps all of its columns.
faceData.drop(['Agreeableness', 'Conscientiousness', 'Openness', 'Neuroticism'], axis=1)

NameError: ignored

In [None]:
# Inspect the assembled per-face table.
faceData

NameError: ignored

In [None]:
from yellowbrick.target import FeatureCorrelation

In [None]:
# Create the yellowbrick FeatureCorrelation visualizer with its default arguments.
vis = FeatureCorrelation()

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import loguniform, ttest_ind, ttest_1samp
from xgboost import XGBClassifier

# Module-level placeholders for split arrays.
# NOTE(review): tuneFitPredict assigns its own local X_train / X_test / y_train / y_test,
# so these globals look unused in the visible code — confirm before removing.
X_train= X_test= y_train= y_test= X_val= y_val = None
# Filled by tuneFitPredict: trait name -> list of per-fold scores.
acc_p_value_dict = {}
# Trait name -> data frame used to model that trait.
# NOTE(review): the openFace* names are not defined in the visible part of this
# notebook — presumably created in cells that are missing here; verify before running.
traitDataDict = {'Agreeableness': openFaceAgreeableness, 'Conscientiousness':openFaceConscientious, 'Extraversion':openFaceExtraversion, 'Openness':openFaceOpenness, 'Neuroticism':openFaceNeuroticism}

# NOTE(review): not referenced anywhere in the visible code.
algo_dict = {"SVM": 1}

def _buildModelAndGrid(algo):
    """Return ``(estimator, param_grid)`` for ``algo``, or ``None`` when ``algo`` is unknown."""
    if algo == "XGBC":
        model = XGBClassifier(eval_metric='mlogloss', random_state = 1)
        param_grid = {
            "learning_rate"    : [0.1, 0.01, 0.05],
            "max_depth"        : [3, 4, 5, 6, 8],
            "reg_lambda"       : [0, 1.0, 3, 4, 10.0],
            "gamma"            : [0.0, 0.1, 0.25, 1],
            "colsample_bytree" : [0.3, 0.4, 0.5, 0.7],
            "scale_pos_weight" : [1, 3, 5],
            # subsample must lie in (0, 1]; the former candidate 0 is not a valid value.
            "subsample"        : [0.25, 0.5, 0.8, 1],
        }
    elif algo == "KNN":
        model = KNeighborsClassifier()
        param_grid = {
            'n_neighbors': [25, 27, 29, 31, 33, 35, 37],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan'],
        }
    elif algo == "SVM":
        model = SVC()
        # 'degree' is only used by the polynomial kernel; with kernel='rbf' it merely
        # multiplied the number of equivalent candidates, so it is no longer searched.
        param_grid = {
            'kernel': ['rbf'],
            'C': [1e-1, 1, 10, 100],
            'gamma': [0.1, 1, 10, 100, 1000],
        }
    else:
        return None
    return model, param_grid


def _buildSearch(model, param_grid, CV, kfold):
    """Return the hyper-parameter search object named by ``CV``, or ``None`` when unknown."""
    if CV == "RandomSearchCV":
        # Seeded so that repeated runs sample the same candidates.
        return RandomizedSearchCV(model, param_grid, cv=kfold, scoring= 'accuracy', refit=True, random_state=1)
    if CV == "GridSearchCV":
        return GridSearchCV(model, param_grid, cv=kfold, scoring= 'accuracy', refit=True)
    return None


def tuneFitPredict(binaryLabels, algo, CV = "RandomSearchCV"):
    """Tune and evaluate one classifier per trait with nested cross-validation.

    For every trait in the module-level ``traitDataDict`` the features are all columns
    of the trait's frame except the last one. The outer loop is a 10-fold stratified
    split repeated 20 times; inside each outer fold the hyper-parameters are tuned on
    the training part (3-fold stratified, repeated twice) and the refitted best
    estimator predicts the held-out part. Per-fold scores are the mean of
    ``judgeClassification(y_test, y_pred)``.

    Parameters
    ----------
    binaryLabels : bool
        If true, the label is 1 where the trait value exceeds ``bigFiveMedians[trait]``
        and 0 otherwise; if false, the raw trait values are used as labels.
    algo : str
        One of "XGBC", "KNN" or "SVM".
    CV : str, default "RandomSearchCV"
        "RandomSearchCV" or "GridSearchCV".

    Returns
    -------
    None
        Results are stored in the global ``acc_p_value_dict`` (trait name -> list of
        per-fold scores) and a summary is printed per trait. An unknown ``algo`` or
        ``CV`` prints a message and returns without doing any work.

    Notes
    -----
    ``judgeClassification`` and ``bigFiveMedians`` are not defined in this function;
    presumably they come from other cells — verify they exist before calling.
    """
    global acc_p_value_dict

    # Validate the arguments once, before any fitting starts.
    modelAndGrid = _buildModelAndGrid(algo)
    if modelAndGrid is None:
        print("Mentioned Algo is not defined")
        return
    model, param_grid = modelAndGrid
    if CV not in ("RandomSearchCV", "GridSearchCV"):
        print("Specified CV is not mentioned")
        return

    for eachTrait in traitDataDict.keys():
        traitFrame = traitDataDict[eachTrait]
        trainData = traitFrame.iloc[:, :-1].values
        # Labels are held as a NumPy array so that the fold indices below are applied
        # positionally, regardless of the frame's index labels.
        if binaryLabels:
            labels = (traitFrame[eachTrait] > bigFiveMedians[eachTrait]).astype(int).values
        else:
            labels = traitFrame[eachTrait].values

        # Outer evaluation loop: 10 folds x 20 repeats = 200 held-out evaluations.
        cv_outer = RepeatedStratifiedKFold(n_splits=10, random_state=1, n_repeats=20)
        inter_results = []
        for train_ix, test_ix in cv_outer.split(trainData, labels):
            X_train, X_test = trainData[train_ix, :], trainData[test_ix, :]
            y_train, y_test = labels[train_ix], labels[test_ix]

            # Inner tuning loop on the training part only.
            kfold = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=1)
            search = _buildSearch(model, param_grid, CV, kfold)
            result = search.fit(X_train, y_train)

            # refit=True: best_estimator_ is already retrained on the whole training part.
            modelFinal = result.best_estimator_
            y_pred_model = modelFinal.predict(X_test)
            modelCorrectness = judgeClassification(y_test ,y_pred_model)
            inter_results.append(np.array(modelCorrectness).mean())
        acc_p_value_dict[eachTrait] = inter_results

        # A priori baseline, evaluated on the last outer fold. It is fitted on that
        # fold's training labels so the baseline never sees the labels it is scored on.
        aprioriModel = DummyClassifier(strategy='prior')
        aprioriModel.fit(X_train, y_train)

        y_pred_apriori = aprioriModel.predict(X_test)
        modelCorrectness = judgeClassification(y_test ,y_pred_apriori)
        trait_mean = np.array(inter_results).mean()
        trait_std = np.array(inter_results).std()
        print("Mean accuracy in 200 repetitions is "+str(trait_mean)+ " with std "+str(trait_std))
        # One-sample t-test of the baseline's judgeClassification output against the
        # model's mean score.
        p_value = ttest_1samp(modelCorrectness, trait_mean)
        print( " For trait %s achieved p-value is %f"%(eachTrait, p_value.pvalue))



In [None]:
# NOTE(review): incomplete call — `label=` has no value, so this cell is a SyntaxError
# as written. TODO: supply the bar arguments and the label, or delete the cell.
vis.ax.barh(label=)