# Ajout des bibliothèques

In [107]:
import pandas as pd
from subprocess import call
import seaborn as sns
import numpy as np
from scipy import stats
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import graphviz
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import multilabel_confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import seaborn as sn
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel
from skater.core.local_interpretation.lime.lime_tabular import LimeTabularExplainer

import warnings
import random
from datetime import datetime
import time
import pickle
from IPython.display import HTML, display, Markdown, display_markdown, Pretty
import statsmodels.api as sa
import statsmodels.formula.api as sfa


warnings.filterwarnings('ignore')

# Fonctions utiles

Fonction de classification optimisé : ajoute une variance d'acceptabilité  
Exemple :  
> Si la réponse attentue est 1, alors on accepte aussi 0 et 2 comme réponse

In [93]:
def classification_report_opti(y_test, y_test_predict):
    classification = {}
    for cl in range(max(y_test) + 1):
        tp = 0
        fp = 0
        fn = 0
        tn = 0
        for i in range(len(y_test)):
            if cl > 0 and cl < max(y_test):
                if y_test[i] == cl and y_test_predict[i] in [
                        cl - 1, cl, cl + 1
                ]:
                    tp += 1
                elif y_test[i] == cl and y_test_predict[i] not in [
                        cl - 1, cl, cl + 1
                ]:
                    fn += 1
                elif y_test[i] != cl and y_test_predict[i] in [
                        cl - 1, cl, cl + 1
                ]:
                    fp += 1
                elif y_test[i] != cl and y_test_predict[i] not in [
                        cl - 1, cl, cl + 1
                ]:
                    tn += 1
            elif cl == 0:
                if y_test[i] == cl and y_test_predict[i] in [cl, cl + 1]:
                    tp += 1
                elif y_test[i] == cl and y_test_predict[i] not in [cl, cl + 1]:
                    fn += 1
                elif y_test[i] != cl and y_test_predict[i] in [cl, cl + 1]:
                    fp += 1
                elif y_test[i] != cl and y_test_predict[i] not in [cl, cl + 1]:
                    tn += 1
            elif cl == max(y_test):
                if y_test[i] == cl and y_test_predict[i] in [cl, cl - 1]:
                    tp += 1
                elif y_test[i] == cl and y_test_predict[i] not in [cl, cl - 1]:
                    fn += 1
                elif y_test[i] != cl and y_test_predict[i] in [cl, cl - 1]:
                    fp += 1
                elif y_test[i] != cl and y_test_predict[i] not in [cl, cl - 1]:
                    tn += 1
            classification[cl] = {'TN': tn, 'FN': fn, 'FP': fp, 'TP': tp}

    for i in classification:
        precision = (
            classification[i]['TP'] /
            (classification[i]['TP'] + classification[i]['FP'])
        ) if (classification[i]['TP'] + classification[i]['FP']) != 0 else 0
        recall = (
            classification[i]['TP'] /
            (classification[i]['TP'] + classification[i]['FN'])
        ) if (classification[i]['TP'] + classification[i]['FN']) != 0 else 0
        accuracy = (
            (classification[i]['TP'] + classification[i]['TN']) /
            (classification[i]['TP'] + classification[i]['TN'] +
             classification[i]['FP'] + classification[i]['FN'])
        ) if (classification[i]['TP'] + classification[i]['TN'] +
              classification[i]['FP'] + classification[i]['FN']) != 0 else 0
        classification[i].update({
            'precision': round(precision, 2),
            'recall': round(recall, 2),
            'accuracy': round(accuracy, 2)
        })
        f1_score = (
            2 * (classification[i]['precision'] * classification[i]['recall'])
        ) / (classification[i]['precision'] + classification[i]['recall']) if (
            classification[i]['precision'] +
            classification[i]['recall']) != 0 else 0
        classification[i].update({'f1-score': round(f1_score, 2)})

    num = 0
    denom = 0
    for i in classification:
        num += classification[i]['TP'] + classification[i]['TN']
        denom += classification[i]['TP'] + classification[i][
            'TN'] + classification[i]['FP'] + classification[i]['FN']

    accuracy = num / denom
    classification.update({'accuracy': round(accuracy, 2)})
    return classification
    


Fonction permettant de garder la valeur des répétitions précédentes 
> Si rep_2() est appeler, alors elle stocke sous "rep_1" la valeur choisi pour la 1ère apparition du scénario  
> Si rep_3() est appeler, alors elle stocke sous "rep_1" la valeur choisi pour la 1ère apparition du scénario, et sous "rep_2"   la valeur choisi pour la 2e apparition du scénario  

In [3]:
def rep_2(df):
    df["rep_1"] = 0
    for scen in range(1, 10):
        start_time = datetime.now()
        for i in df[df.scenarios == scen].index:
            if df.loc[i].repetition_question == 1:
                rep_1 = df.loc[i].task_1
            elif df.loc[i].repetition_question == 2:
                df.loc[i, 'rep_1'] = rep_1
    return df

def rep_3(df):
    df["rep_2"] = 0
    for scen in range(1, 10):
        start_time = datetime.now()
        for i in df[df.scenarios == scen].index:
            if df.loc[i].repetition_question == 2:
                rep_1 = df.loc[i].rep_1
                rep_2 = df.loc[i].task_1
            elif df.loc[i].repetition_question == 3:
                df.loc[i, 'rep_1'] = rep_1
                df.loc[i, 'rep_2'] = rep_2
    return df

Fonction qui prend 3 réponses consécutives aléatoires de chaque participants d'un dataframe et les retourne toutes dans un nouveau dataframe

In [4]:
def random_3consecutiveanswers(df):
    participants = df.groupby(['id_participant']).size()
    consecutive_answers = pd.DataFrame()

    for id in participants.keys():
        participant = df[df.id_participant == id].copy()
        participant = participant.head(int(participant.shape[0] / 2)).copy()

        line = participant.sample().index
        if ((line[0] - 1) in participant.index
                and (line[0] + 1) in participant.index):
            first = participant.loc[line - 1].copy()
            second = participant.loc[line].copy()
            third = participant.loc[line + 1].copy()

        elif ((line[0] - 1) in participant.index
              and not (line[0] + 1) in participant.index):
            first = participant.loc[line - 2].copy()
            second = participant.loc[line - 1].copy()
            third = participant.loc[line].copy()

        elif (not (line[0] - 1) in participant.index
              and (line[0] + 1) in participant.index):
            first = participant.loc[line].copy()
            second = participant.loc[line + 1].copy()
            third = participant.loc[line + 2].copy()

        consecutive_answers = pd.concat(
            [consecutive_answers, first, second, third])

    return consecutive_answers



# Importation des données

On importe les réponses des participants au jeu ainsi que leurs réponses aux questions de concentration.

In [5]:
#Import the data
data = pd.read_csv(
    "https://ethicallychoice.alwaysdata.net/wp-content/data.csv", sep=";")

#drop the empty line
data = data.dropna()

#convert to int all the datas
for i in ["1", "2"]:
    data["personnage_" + i] = data["personnage_" + i].astype(int)
    data["force_" + i] = data["force_" + i].astype(int)
data["scenarios"] = data["scenarios"].astype(int)

#Import the concentration question
conc = pd.read_csv(
    "https://ethicallychoice.alwaysdata.net/wp-content/conc.csv", sep=";")


On fait le tri parmis ceux qui ont mal répondu et on les enlève du jeu de données.

In [6]:
#collect people who are bad answering
out = {}
for i in range(len(conc)):
    for j in range(1, 4):
        if conc["q" + str(j)][i] != conc["q" + str(j) + "_rep"][i]:
            if conc.id[i] not in out:
                out[conc.id[i]] = 1
            else:
                out[conc.id[i]] = out[conc.id[i]] + 1
out = [k for (k, v) in out.items() if v > 1]

#collect lines corresponding in data
index = []
for i in range(len(out)):
    index.append(data.index[data["id_participant"] == out[i]].tolist())

#drop these lines
for i in range(len(index)):
    for j in range(len(index[i])):
        data.drop(index[i][j], inplace=True)

On associe un entier correspondant pour le champ enfant et le genre, qui sont des informations personnelles sur le participant.  
> Pour le champ "Enfant" : on attribue un 0 si la réponse est "non", et 1 si la réponse est "oui"  
> Pour le champ "Sexe" : on attribue un 0 si la réponse est "femme", 1 si la réponse est "homme" et 2 si la réponse est "autre".

In [7]:
#Convert to number "Enfant"
data["enfant"] = [0 if i == "Non" else 1 for i in data["enfant"]]

#Associate a number to each gender
data["sexe"] = [
    0 if i == "Femme" else 1 if i == "Homme" else 2 for i in data["sexe"]
]

On suppose que changer l'ordre des 2 personnages incluent dans le scénario, i.e le personnage de gauche devient celui de ddroite et inversement, n'influence pas la prise de décision. Pour cela, on copie le jeu de données en échangeant les personnages de droite et leurs valeurs attribuées avec les personnages de gauche et leurs valeurs attribuées pour chaque question.

In [8]:
#We make the assumption that changing the order does not influence people
data2 = data.copy(deep=True)

for s in ["personnage", "force", "task"]:
    data2[s + "_1"] = data[s + "_2"]
    data2[s + "_2"] = data[s + "_1"]

data = pd.concat([data, data2]).reset_index(drop=True)

data

Unnamed: 0,personnage_1,personnage_2,force_1,force_2,scenarios,task_1,task_2,repetition_question,id_participant,age,sexe,taille,enfant
0,1,5,7,3,4,8,2,1,d60c0832fc30e645ca04f074c44b49eb,57,0,169,1
1,5,9,3,6,2,1,9,1,d60c0832fc30e645ca04f074c44b49eb,57,0,169,1
2,5,8,3,8,3,1,9,1,d60c0832fc30e645ca04f074c44b49eb,57,0,169,1
3,3,5,10,3,9,0,10,1,d60c0832fc30e645ca04f074c44b49eb,57,0,169,1
4,8,9,8,6,7,5,5,1,d60c0832fc30e645ca04f074c44b49eb,57,0,169,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6745,9,5,6,4,9,6,4,3,bbae47f9f9a4f0cb81fac6cc22f59da6,22,1,165,0
6746,8,4,8,1,6,9,1,3,bbae47f9f9a4f0cb81fac6cc22f59da6,22,1,165,0
6747,8,3,8,10,8,10,0,3,bbae47f9f9a4f0cb81fac6cc22f59da6,22,1,165,0
6748,5,4,4,1,4,7,3,3,bbae47f9f9a4f0cb81fac6cc22f59da6,22,1,165,0


# Mise en place du jeu de données pour le test de Turing

On tire au hasard 10 id de participants dans le jeu de données. On récupère les réponses de ces 10 participants que l'on stocke dans un dataframe différent. Ensuite, on supprime ces 10 participants, et leurs valeurs, du jeu de données initial.

In [9]:
#jeu de données de test pour turing
turing=pd.DataFrame()
for id in data.id_participant.sample(n=10,random_state=1) :
    turing = pd.concat([turing, data[data.id_participant==id]])
    data.drop(data[data.id_participant==id].index, inplace=True)
    
data=data.reset_index(drop=True)

On vient récupérer les valeurs attribuées pour chaque répétition et les stocker sous de nouvelles valuers rep_2 et rep_3.

In [10]:
#préparation des jeu de données
data_sample = data.copy()

data_rep2 = rep_2(data)
data_rep3 = rep_3(data_rep2)

turing = rep_2(turing)
turing = rep_3(turing)

data_sample = data_rep3.sample(frac=1, random_state=1)
data_sample

Unnamed: 0,personnage_1,personnage_2,force_1,force_2,scenarios,task_1,task_2,repetition_question,id_participant,age,sexe,taille,enfant,rep_1,rep_2
3451,4,1,1,4,7,5,5,3,f2291a01f4e3b93dce34d53b8c9932b6,23,0,170,0,5,5
6046,9,3,1,10,6,0,10,3,b3170fb4ca4d3f95afda88ed4ab7ff0e,22,1,170,0,0,0
4859,5,3,3,10,8,9,1,3,e121ed68cf79f9a6e0c818a8e6729776,51,1,172,1,9,9
255,1,4,7,1,7,10,0,2,85d9e832c808ccb651266c1e1f6fa391,19,1,180,0,10,0
1437,2,4,3,1,3,8,2,1,ad7c4a73f9c5e2baaeb8b2ad0d71e855,36,0,170,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905,1,4,9,1,5,8,2,2,e1b54913810142fb6cee063bac256c50,13,0,163,0,9,0
5192,9,1,7,4,6,7,3,1,8c8ff6b41d2e0af18176298bb8c9eb3e,25,1,177,0,0,0
3980,2,1,8,9,7,5,5,2,e1b54913810142fb6cee063bac256c50,13,0,163,0,5,0
235,5,9,8,13,5,3,7,3,087ebbf1a86d8dbb6b7ef033ed2a188c,20,1,185,0,3,3


# Modèles Learning

## Random Forest pour la 1ère série de question

On récupère les données des neuf 1ers scénarios évalués par les participants. On sépare ce qu'on veut prédire des features. On divise ensuite le jeu de données en jeu de test et jeu d'entraînement.

In [11]:
feature_names = [
    "force_1", "force_2", "scenarios", "age", "sexe"
]
#We get X and y
Y = data_sample[data_sample.repetition_question == 1]["task_1"]
X = data_sample[data_sample.repetition_question == 1][feature_names]

#We obtain train and test datasets
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.20,
                                                    random_state=42)


In [12]:
X_train

Unnamed: 0,force_1,force_2,scenarios,age,sexe
3702,10,3,2,32,1
1701,10,13,9,20,0
4300,8,1,6,23,1
5864,10,5,7,23,1
2462,1,5,8,22,1
...,...,...,...,...,...
4915,12,7,8,25,1
5161,10,10,6,23,0
5514,10,9,8,45,1
2673,1,7,6,22,1


On génére un random forest avec les meilleures paramètres grâce à un grid search. On fait 5 cross validation. Puis on sauvegarde le modèle.

In [11]:
RFC = RandomForestClassifier(max_depth=10, max_leaf_nodes=95)

# Number of trees in random forest
n_estimators = [(i + 1) * 10 for i in range(10)]

# Method of selecting samples for training each tree
bootstrap = [True, False]

#We add the other parameters for Random Forest
params = {
    "criterion": ["gini", "entropy","log_loss"],
    'min_samples_split': [2, 3, 4, 8, 10, 16, 32],
    "n_estimators": n_estimators,
    "bootstrap": bootstrap
}

grid_search_cv = GridSearchCV(RFC, params, verbose=1, cv=5, scoring='accuracy', return_train_score=True)
grid_search_cv.fit(X_train, Y_train)
Best_RFC = grid_search_cv.best_estimator_

print("Best score:", grid_search_cv.best_score_)

#saving the model
filename = 'RF1F.pkl'
print("Sauvegarde du modèle dans ", filename)
pickle.dump(Best_RFC, open(filename, "wb"))

Fitting 5 folds for each of 420 candidates, totalling 2100 fits
Best score: 0.43508043591074214
Sauvegarde du modèle dans  RF1F.pkl


A partir du modèle chargé, on le teste sur notre jeu de test.

In [13]:
# loading the model
filename = 'RF1F.pkl'
RF1F = pickle.load(open(filename, 'rb'))

# printing the model
print(RF1F)

#Test on the test set

Y_test_predict_proba = RF1F.predict_proba(X_test)
Y_test_predict = RF1F.predict(X_test)

print("Confusion matrix\n",
      multilabel_confusion_matrix(Y_test, Y_test_predict)[0])

print(
    classification_report(Y_test,
                          Y_test_predict,
                          output_dict=False,
                          target_names=[str(i) for i in range(11)]))

Y_test = pd.concat([Y_test], ignore_index=True)

classification_report_opti(Y_test, Y_test_predict)

RandomForestClassifier(max_depth=10, max_leaf_nodes=95, min_samples_split=3,
                       n_estimators=80)
Confusion matrix
 [[341  22]
 [ 21  27]]
              precision    recall  f1-score   support

           0       0.55      0.56      0.56        48
           1       0.14      0.06      0.08        18
           2       0.34      0.30      0.32        33
           3       0.38      0.27      0.32        37
           4       0.27      0.20      0.23        35
           5       0.51      0.79      0.62        97
           6       0.21      0.16      0.18        31
           7       0.26      0.22      0.24        27
           8       0.26      0.18      0.21        28
           9       0.67      0.12      0.20        17
          10       0.39      0.53      0.45        40

    accuracy                           0.42       411
   macro avg       0.36      0.31      0.31       411
weighted avg       0.39      0.42      0.38       411

class | precision | recall | 

## Random Forest pour la 2nd série de question

On récupère les données des scénarios lors de la 1ère répétition et ce qui a été choisi lors la 1ère fois. 
On sépare ce qu'on veut prédire des features. 
On divise ensuite le jeu de données en jeu de test et jeu d'entraînement.

In [14]:

feature_names = [
    "force_1", "force_2", "scenarios", "age", "sexe","rep_1"
]
#We get X and y
Y = data_sample[data_sample.repetition_question == 2]["task_1"]
X = data_sample[data_sample.repetition_question == 2][feature_names]

#We obtain train and test datasets
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.20,
                                                    random_state=42)

On génére un random forest avec les meilleures paramètres grâce à un grid search. On fait 5 cross validation. Puis on sauvegarde le modèle.

In [14]:
#We get the best parameters for the DT
RFC = RandomForestClassifier(max_depth=10, max_leaf_nodes=95)

# Number of trees in random forest
n_estimators = [(i + 1) * 10 for i in range(10)]

# Method of selecting samples for training each tree
bootstrap = [True, False]

#We add the other parameters for Random Forest
params = {
    "criterion": ["gini", "entropy","log_loss"],
    'min_samples_split': [2, 3, 4, 8, 10, 16, 32],
    "n_estimators": n_estimators,
    "bootstrap": bootstrap
}

grid_search_cv = GridSearchCV(RFC, params, verbose=1, cv=5, scoring='accuracy', return_train_score=True)
grid_search_cv.fit(X_train, Y_train)
Best_RFC = grid_search_cv.best_estimator_

print("Best score:", grid_search_cv.best_score_)

#saving the model
filename = 'RF2F.pkl'
print("Sauvegarde du modèle dans ", filename)
pickle.dump(Best_RFC, open(filename, "wb"))

Fitting 5 folds for each of 420 candidates, totalling 2100 fits
Best score: 0.6703295277633627
Sauvegarde du modèle dans  RF2F.pkl


A partir du modèle chargé, on le teste sur notre jeu de test.

In [15]:
# loading the model
filename = 'RF2F.pkl'
RF2F = pickle.load(open(filename, 'rb'))

# printing the model
print(RF2F)

#Test on the test set

Y_test_predict_proba = RF2F.predict_proba(X_test)
Y_test_predict = RF2F.predict(X_test)

print("Confusion matrix\n",
      multilabel_confusion_matrix(Y_test, Y_test_predict)[0])

print(
    classification_report(Y_test,
                          Y_test_predict,
                          output_dict=False,
                          target_names=[str(i) for i in range(11)]))

Y_test = pd.concat([Y_test], ignore_index=True)

classification_report_opti(Y_test, Y_test_predict)

RandomForestClassifier(criterion='entropy', max_depth=10, max_leaf_nodes=95,
                       min_samples_split=3, n_estimators=60)
Confusion matrix
 [[370  11]
 [  6  24]]
              precision    recall  f1-score   support

           0       0.69      0.80      0.74        30
           1       0.21      0.21      0.21        14
           2       0.42      0.39      0.41        28
           3       0.68      0.57      0.62        49
           4       0.58      0.50      0.54        28
           5       0.80      0.89      0.84       107
           6       0.52      0.62      0.57        24
           7       0.58      0.78      0.67        37
           8       0.65      0.36      0.46        36
           9       0.47      0.39      0.42        18
          10       0.84      0.80      0.82        40

    accuracy                           0.66       411
   macro avg       0.59      0.58      0.57       411
weighted avg       0.66      0.66      0.65       411

class | 

## Random Forest pour la 3e série de question

On récupère les données des scénarios lors de la 2e répétition et ce qui a été choisi lors la 1ère et 2e fois. 
On sépare ce qu'on veut prédire des features. 
On divise ensuite le jeu de données en jeu de test et jeu d'entraînement.

In [16]:

feature_names = [
    "force_1", "force_2", "scenarios", "age", "sexe","rep_1","rep_2"
]
#We get X and y
Y = data_sample[data_sample.repetition_question == 3]["task_1"]
X = data_sample[data_sample.repetition_question == 3][feature_names]

#We obtain train and test datasets
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.20,
                                                    random_state=42)

On génére un random forest avec les meilleures paramètres grâce à un grid search. On fait 5 cross validation. Puis on sauvegarde le modèle.

In [17]:
#We get the best parameters for the DT
RFC = RandomForestClassifier(max_depth=10, max_leaf_nodes=95)

# Number of trees in random forest
n_estimators = [(i + 1) * 10 for i in range(10)]

# Method of selecting samples for training each tree
bootstrap = [True, False]

#We add the other parameters for Random Forest
params = {
    "criterion": ["gini", "entropy","log_loss"],
    'min_samples_split': [2, 3, 4, 8, 10, 16, 32],
    "n_estimators": n_estimators,
    "bootstrap": bootstrap
}

grid_search_cv = GridSearchCV(RFC, params, verbose=1, cv=5, scoring='accuracy', return_train_score=True)
grid_search_cv.fit(X_train, Y_train)
Best_RFC = grid_search_cv.best_estimator_

print("Best score:", grid_search_cv.best_score_)

#saving the model
filename = 'RF3F.pkl'
print("Sauvegarde du modèle dans ", filename)
pickle.dump(Best_RFC, open(filename, "wb"))

Fitting 5 folds for each of 420 candidates, totalling 2100 fits
Best score: 0.7532155830676848
Sauvegarde du modèle dans  RF3F.pkl


A partir du modèle chargé, on le teste sur notre jeu de test.

In [17]:
# loading the model
filename = 'RF3F.pkl'
RF3F = pickle.load(open(filename, 'rb'))

# printing the model
print(RF3F)

#Test on the test set

Y_test_predict_proba = RF3F.predict_proba(X_test)
Y_test_predict = RF3F.predict(X_test)

print("Confusion matrix\n",
      multilabel_confusion_matrix(Y_test, Y_test_predict)[0])

print(
    classification_report(Y_test,
                          Y_test_predict,
                          output_dict=False,
                          target_names=[str(i) for i in range(11)]))

Y_test = pd.concat([Y_test], ignore_index=True)

classification_report_opti(Y_test, Y_test_predict)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=10,
                       max_leaf_nodes=95, min_samples_split=3, n_estimators=50)
Confusion matrix
 [[361   6]
 [  3  41]]
              precision    recall  f1-score   support

           0       0.87      0.93      0.90        44
           1       0.70      0.58      0.64        12
           2       0.65      0.52      0.58        29
           3       0.67      0.80      0.73        40
           4       0.64      0.55      0.59        33
           5       0.79      0.89      0.84        99
           6       0.83      0.56      0.67        36
           7       0.55      0.77      0.64        30
           8       0.79      0.63      0.70        30
           9       0.88      0.74      0.80        19
          10       0.92      0.90      0.91        39

    accuracy                           0.76       411
   macro avg       0.75      0.71      0.73       411
weighted avg       0.77      0.76      0.76   

# Collecte des données utilisées pour le test de Turing

On récupère dans un premier temps les réponses consécutives pour un humain, humain inversé et un robot.

## Sets de réponses consécutives

### Réponses consécutives des participants

Pour chacun des 10 participants selectionnés au hasard, on récupère 3 réponses consécutives

In [18]:
consH_answers = random_3consecutiveanswers(turing)
consH_answers

Unnamed: 0,personnage_1,personnage_2,force_1,force_2,scenarios,task_1,task_2,repetition_question,id_participant,age,sexe,taille,enfant,rep_1,rep_2
1490,2,3,4,10,3,3,7,1,22d27f6173cf259249d7d5e672e5664f,52,0,168,1,0,0
1491,1,5,5,3,5,8,2,1,22d27f6173cf259249d7d5e672e5664f,52,0,168,1,0,0
1492,3,4,10,1,7,5,5,1,22d27f6173cf259249d7d5e672e5664f,52,0,168,1,0,0
335,4,9,1,6,3,2,8,2,490895544f1107af8fccf3d3930e60a8,21,0,154,0,2,0
336,2,8,4,6,4,4,6,2,490895544f1107af8fccf3d3930e60a8,21,0,154,0,4,0
337,2,5,4,2,8,5,5,2,490895544f1107af8fccf3d3930e60a8,21,0,154,0,6,0
2991,1,9,13,15,2,3,7,3,72a91dde69a4d8a39f5d5865ecc9dfbc,40,0,170,0,3,3
2992,4,9,1,15,3,1,9,3,72a91dde69a4d8a39f5d5865ecc9dfbc,40,0,170,0,2,1
2993,2,4,13,1,6,9,1,3,72a91dde69a4d8a39f5d5865ecc9dfbc,40,0,170,0,9,9
645,1,4,6,1,5,8,2,3,74b27da421004afac65ee8331b575c3b,24,1,185,0,9,8


### Réponses consécutives inversées

Maintenant, on souahite faire la même chose mais en inversant les réponses entre les 2 personnages de chaque questions :  
> On récupère le jeu de données de test de Turing intial et on stocke une copie.  
> On intervertie les valeurs de la colonne task_1 et celles de la colonne task_2.  
> On réindexe le DataFrame comme à l'initial
> On récupère aussi les valeurs pour rep_1 et rep_2.
> Puis on appelle la fonction qui ressort 3 réponses consécutives.

In [19]:
turing_inverted = turing.copy()

col = {'task_1': 'task_2', 'task_2': 'task_1'}
turing_inverted.rename(columns=col, inplace=True)

turing_inverted = turing_inverted.reindex(columns=turing.columns)
turing_inverted.drop(columns=['rep_1', 'rep_2'], inplace=True)

rep_2(turing_inverted)
rep_3(turing_inverted)

consH_inverted = random_3consecutiveanswers(turing_inverted)
consH_inverted

Unnamed: 0,personnage_1,personnage_2,force_1,force_2,scenarios,task_1,task_2,repetition_question,id_participant,age,sexe,taille,enfant,rep_1,rep_2
1500,2,4,4,1,4,1,9,2,22d27f6173cf259249d7d5e672e5664f,52,0,168,1,1,0
1501,2,4,4,1,8,5,5,2,22d27f6173cf259249d7d5e672e5664f,52,0,168,1,5,0
1502,3,5,10,3,2,1,9,2,22d27f6173cf259249d7d5e672e5664f,52,0,168,1,1,0
331,2,5,4,2,8,4,6,1,490895544f1107af8fccf3d3930e60a8,21,0,154,0,0,0
332,8,9,6,6,7,5,5,1,490895544f1107af8fccf3d3930e60a8,21,0,154,0,0,0
333,3,5,10,2,1,2,8,2,490895544f1107af8fccf3d3930e60a8,21,0,154,0,2,0
2987,3,5,10,11,4,1,9,2,72a91dde69a4d8a39f5d5865ecc9dfbc,40,0,170,0,0,0
2988,1,3,13,10,8,1,9,3,72a91dde69a4d8a39f5d5865ecc9dfbc,40,0,170,0,1,1
2989,1,2,13,13,1,5,5,3,72a91dde69a4d8a39f5d5865ecc9dfbc,40,0,170,0,5,5
645,1,4,6,1,5,2,8,3,74b27da421004afac65ee8331b575c3b,24,1,185,0,1,2



### Réponses consécutives du modèle

Il ne nous manque plus que les 3 réponses consécutives du modèles. Pour cela :  
> On prend le jeu de test de Turing initial  
> On teste les 3 modèles sur le jeu de données :  
>> RF1F sur la 1ère série de questions  
>> RF2F sur la 2e série de questions  
>> RF3F sur la 3e série de questions  

> On ajoute les réponses des modèles au jeu de données initial  
> On appelle random_3followedanswers()

In [28]:
#On prend le jeu de test de Turing initial
model_prediction = turing.copy()

In [29]:
#On définit les features à donner en entrée du modèle
feature_namesRF1 = ["force_1", "force_2", "scenarios", "age", "sexe"]
feature_namesRF2 = feature_namesRF1 + ["rep_1"]
feature_namesRF3 = feature_namesRF2 + ["rep_2"]

In [30]:
#On récupère les X et Y pour chacun des 3 modèles
Y_RF1F = model_prediction[model_prediction.repetition_question == 1]["task_1"]
X_RF1F = model_prediction[model_prediction.repetition_question == 1][feature_namesRF1]

Y_RF2F = model_prediction[model_prediction.repetition_question == 2]["task_1"]
X_RF2F = model_prediction[model_prediction.repetition_question == 2][feature_namesRF2]

Y_RF3F = model_prediction[model_prediction.repetition_question == 3]["task_1"]
X_RF3F = model_prediction[model_prediction.repetition_question == 3][feature_namesRF3]


In [31]:
#On teste chaque modèle
y_test_predict_probaRF1F = RF1F.predict_proba(X_RF1F)
y_test_predict_RF1F = RF1F.predict(X_RF1F)

y_test_predict_probaRF2F = RF2F.predict_proba(X_RF2F)
y_test_predict_RF2F = RF2F.predict(X_RF2F)

y_test_predict_probaFR3F = RF3F.predict_proba(X_RF3F)
y_test_predict_RF3F = RF3F.predict(X_RF3F)



In [80]:
#On affiche les matrices de confusion
RF1Ftn, RF1Ffp, RF1Ffn, RF1Ftp = multilabel_confusion_matrix(
    Y_RF1F, y_test_predict_RF1F)[0].ravel()
RF2Ftn, RF2Ffp, RF2Ffn, RF2Ftp = multilabel_confusion_matrix(
    Y_RF2F, y_test_predict_RF2F)[0].ravel()
RF3Ftn, RF3Ffp, RF3Ffn, RF3Ftp = multilabel_confusion_matrix(
    Y_RF3F, y_test_predict_RF3F)[0].ravel()

def md_conf_matrice(tn,tp,fn,fp):    
    table = "<table><tr><th></th><th>N</th><th>P</th></tr><tr><th>T</th><td> " + str(
        tn) + " </td><td> " + str(tp) + " </td></tr><tr><th>F</th><td> " + str(
            fn) + " </td><td> " + str(fp) + " </td></tr></table> "
    return table

display(Markdown("#### Matrice de confusion\n"))
display(
    Markdown("<table><tr><th>RF1F</th><th>RF2F</th><th>RF2F</th></tr><tr><td>" +
             md_conf_matrice(RF1Ftn, RF1Ftp, RF1Ffn, RF1Ffp) + "</td><td>" +
             md_conf_matrice(RF2Ftn, RF2Ftp, RF2Ffn, RF2Ffp) + "</td><td>" +
             md_conf_matrice(RF3Ftn, RF3Ftp, RF3Ffn, RF3Ffp) +
             "</tr></td></table>"))

#### Matrice de confusion


<table><tr><th>RF1F</th><th>RF2F</th><th>RF2F</th></tr><tr><td><table><tr><th></th><th>N</th><th>P</th></tr><tr><th>T</th><td> 170 </td><td> 6 </td></tr><tr><th>F</th><td> 5 </td><td> 17 </td></tr></table> </td><td><table><tr><th></th><th>N</th><th>P</th></tr><tr><th>T</th><td> 180 </td><td> 8 </td></tr><tr><th>F</th><td> 4 </td><td> 6 </td></tr></table> </td><td><table><tr><th></th><th>N</th><th>P</th></tr><tr><th>T</th><td> 179 </td><td> 12 </td></tr><tr><th>F</th><td> 6 </td><td> 1 </td></tr></table> </tr></td></table>

In [183]:
#On affiche les rapport de classification, dont l'optimisé
def md_classification_report(head_names, report) :
    
    md_table = "<table><tr>"

    for name in head_names :
        md_table += "<th>" + name + "</th>"

    md_table += "</tr>"

    for i in range(11) :
        md_table += "<tr><th>" + str(i) +"</th>"
        for metrics in report[str(i)] :
            md_table += "<td>" + str(round(report[str(i)][metrics], 2)) + "</td>"
        md_table += "</tr>"

    md_table += "<tr><th> Accuracy </th><td></td><td></td><td></td><th>" + str(round(report["accuracy"], 2)) + "</th></tr></table>"
    return md_table

report = classification_report(Y_RF1F,
                          y_test_predict_RF1F,
                          output_dict=True,
                          target_names=[str(i) for i in range(11)])
head_names = ['Class','Precision','Recall','F1-score','Support']

table = md_classification_report(head_names, report)
display(Markdown("#### Rapport de classification\n"))
display(
    Markdown("<table><tr><th>Classification</th><th>Classification optimisée</th></tr><tr><td>" + table + "</td><td>" + "</td><td></tr></td></table>"))


#### Rapport de classification


<table><tr><th>Classification</th><th>Classification optimisée</th></tr><tr><td><table><tr><th>Class</th><th>Precision</th><th>Recall</th><th>F1-score</th><th>Support</th></tr><tr><th>0</th><td>0.26</td><td>0.55</td><td>0.35</td><td>11</td></tr><tr><th>1</th><td>1.0</td><td>0.11</td><td>0.2</td><td>18</td></tr><tr><th>2</th><td>0.29</td><td>0.38</td><td>0.33</td><td>13</td></tr><tr><th>3</th><td>0.0</td><td>0.0</td><td>0.0</td><td>20</td></tr><tr><th>4</th><td>0.18</td><td>0.12</td><td>0.15</td><td>16</td></tr><tr><th>5</th><td>0.41</td><td>0.81</td><td>0.55</td><td>42</td></tr><tr><th>6</th><td>0.0</td><td>0.0</td><td>0.0</td><td>16</td></tr><tr><th>7</th><td>0.0</td><td>0.0</td><td>0.0</td><td>20</td></tr><tr><th>8</th><td>0.25</td><td>0.31</td><td>0.28</td><td>13</td></tr><tr><th>9</th><td>0.5</td><td>0.06</td><td>0.1</td><td>18</td></tr><tr><th>10</th><td>0.28</td><td>0.64</td><td>0.39</td><td>11</td></tr><tr><th> Accuracy </th><td></td><td></td><td></td><th>0.31</th></tr></table></td><td></td><td></tr></td></table>

In [184]:
y_RF1F = pd.concat([Y_RF1F], ignore_index=True)

classification = classification_report_opti(y_RF1F, y_test_predict_RF1F)
head_names = ['Class','Precision','Recall','F1-score','Accuracy']

table = md_classification_report(head_names, classification)

print('class | precision | recall | f1-score | accuracy')
for i in range(11):
    print(i, '    | ', classification[i]['precision'], '    | ',
          classification[i]['recall'], ' | ',
          classification[i]['f1-score'], '   | ',
          classification[i]['accuracy'])
print()
print('accuracy :    ', classification['accuracy'])


KeyError: '0'

In [None]:
y_pred = pd.Series(
    y_test_predict_RF1F,
    index=model_prediction[model_prediction.repetition_question == 1].index,
    name='task_1 prédite')
result_rep1 = pd.concat([
    y_pred, Y_RF1F, model_prediction[model_prediction.repetition_question == 1]
],
                        axis=1)
result_rep1