In [15]:
# Import des librairies
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the merged dataset from its CSV export.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a saved
# index - confirm whether it should be dropped or read with index_col=0.
DATA_CSV = "merged_data.csv"
df = pd.read_csv(DATA_CSV)

# Preview the first rows as a quick sanity check
df.head()


Unnamed: 0,_id_x,genre,age,nationalite,niveauEtudes,sessionId,__v_x,_id_y,textId,associationType,personAType,valueOneA,valueTwoA,forceA,personBType,valueOneB,valueTwoB,forceB,createdAt,__v_y
0,67e05b30dacaf24880d98fe8,homme,23,HRV,bac+4,420008aa-5136-48fb-9a6f-4347673fce87,0,67e05bb9dacaf24880d98feb,67a243ea6d8e29a001947946,risk-reward,Robot,8,5,8,Personne âgée,2,5,3,2025-03-23T19:06:33.515Z,0
1,67e05b30dacaf24880d98fe8,homme,23,HRV,bac+4,420008aa-5136-48fb-9a6f-4347673fce87,0,67e05c02dacaf24880d98fed,67a243ea6d8e29a001947945,risk-reward,Femme petite taille,4,7,5,Robot,6,3,8,2025-03-23T19:07:46.725Z,0
2,67e05b30dacaf24880d98fe8,homme,23,HRV,bac+4,420008aa-5136-48fb-9a6f-4347673fce87,0,67e05c25dacaf24880d98fef,67a51efa18a0dfe14aac016b,risk-effort,Enfant,5,3,3,Femme petite taille,5,7,5,2025-03-23T19:08:21.185Z,0
3,67e05b30dacaf24880d98fe8,homme,23,HRV,bac+4,420008aa-5136-48fb-9a6f-4347673fce87,0,67e05c33dacaf24880d98ff1,67a243ea6d8e29a001947947,effort-reward,Enfant,3,3,3,Femme petite taille,7,7,5,2025-03-23T19:08:35.846Z,0
4,67e05b30dacaf24880d98fe8,homme,23,HRV,bac+4,420008aa-5136-48fb-9a6f-4347673fce87,0,67e05c4bdacaf24880d98ff3,67a243ea6d8e29a001947949,effort-reward,Femme grande taille,5,5,7,Homme petite taille,5,5,6,2025-03-23T19:08:59.087Z,0


In [16]:

# Binariser les cibles effort / risque / reward
def binariser_targets(row):
    """Return a copy of ``row`` extended with binary effort / risque / reward flags.

    Each flag is 1 when the matching keyword is a substring of
    ``row["associationType"]`` and 0 otherwise:
    "effort" -> ``effort``, "risk" -> ``risque``, "reward" -> ``reward``.

    Parameters
    ----------
    row : mapping-like with a ``copy()`` method (e.g. ``pandas.Series`` or ``dict``)
        Must contain the key ``"associationType"``.

    Returns
    -------
    Same type as ``row``
        A shallow copy of ``row`` with the three extra keys set.

    The input is deliberately left untouched: pandas documents that functions
    mutating the object handed to ``DataFrame.apply`` are not supported.
    """
    association = row["associationType"]
    flagged = row.copy()
    flagged["effort"] = int("effort" in association)
    flagged["risque"] = int("risk" in association)
    flagged["reward"] = int("reward" in association)
    return flagged

# Derive the three binary targets from the association label
# (e.g. "risk-reward" -> effort=0, risque=1, reward=1).
# Done with the vectorised string accessor instead of a row-wise DataFrame.apply:
# no Python call per row, and column dtypes are not rebuilt from object rows.
# regex=False keeps plain substring matching. assign() returns a new frame, so df is untouched.
association = df["associationType"]
df_bin = df.assign(
    effort=association.str.contains("effort", regex=False).astype(int),
    risque=association.str.contains("risk", regex=False).astype(int),
    reward=association.str.contains("reward", regex=False).astype(int),
)

# Features (X): the columns the model learns from
feature_cols = ["genre", "age", "nationalite", "niveauEtudes", "personAType", "personBType"]
# Targets (y): the binarised values to predict
target_cols = ["effort", "risque", "reward"]

X = df_bin[feature_cols]
y = df_bin[target_cols].astype(int)

# Train / test split (80 % / 20 %, fixed seed for reproducibility)
# NOTE(review): the preview shows several rows sharing one sessionId, so a row-level
# random split may put the same participant in both train and test - confirm whether
# a grouped split (e.g. by sessionId) is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: the numeric feature is standardised and every categorical feature
# is one-hot encoded, i.e. turned into one 0/1 indicator column per category.
numeric_features = ["age"]
categorical_features = ["genre", "nationalite", "niveauEtudes", "personAType", "personBType"]
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# ------------------------------------------------------------------------------------------
# Model under test (a random forest here, but easy to swap).
# To try another classifier, replace the base_model line, for example with:
#     from sklearn.svm import SVC
#     base_model = SVC()
base_model = RandomForestClassifier(random_state=42, n_estimators=100)
# Wrap the base classifier so the three target columns can be fitted and predicted together
multioutput_model = MultiOutputClassifier(estimator=base_model)
# ------------------------------------------------------------------------------------------

# Full pipeline: preprocessing followed by the model
pipeline = Pipeline(
    [
        ("preprocessing", preprocessor),
        ("model", multioutput_model),
    ]
)

# Train the whole pipeline on the training split
pipeline.fit(X_train, y_train)

# Evaluate the fitted pipeline on the held-out split.
y_pred = pipeline.predict(X_test)

# Iterate over the columns of y_test itself rather than a second hard-coded list of
# target names, so each label always matches the prediction column it is scored against.
for i, col in enumerate(y_test.columns):
    print(f"\n📊 Évaluation du modèle pour la cible : {col.upper()}")

    y_true = y_test[col]
    y_hat = y_pred[:, i]  # predictions for the i-th target

    acc = accuracy_score(y_true, y_hat)
    # zero_division=0: score 0 instead of warning when the metric's denominator is zero
    prec = precision_score(y_true, y_hat, zero_division=0)
    rec = recall_score(y_true, y_hat, zero_division=0)
    f1 = f1_score(y_true, y_hat, zero_division=0)
    cm = confusion_matrix(y_true, y_hat)  # rows: true class, columns: predicted class

    print(f"🔹 Accuracy     : {acc:.2f}")
    print(f"🔹 Précision    : {prec:.2f}")
    print(f"🔹 Rappel       : {rec:.2f}")
    print(f"🔹 F1-score     : {f1:.2f}")
    print(f"🔹 Matrice de confusion :\n{cm}")


📊 Évaluation du modèle pour la cible : EFFORT
🔹 Accuracy     : 0.65
🔹 Précision    : 0.68
🔹 Rappel       : 0.84
🔹 F1-score     : 0.75
🔹 Matrice de confusion :
[[ 29  64]
 [ 25 134]]

📊 Évaluation du modèle pour la cible : RISQUE
🔹 Accuracy     : 0.66
🔹 Précision    : 0.72
🔹 Rappel       : 0.82
🔹 F1-score     : 0.77
🔹 Matrice de confusion :
[[ 27  54]
 [ 31 140]]

📊 Évaluation du modèle pour la cible : REWARD
🔹 Accuracy     : 0.62
🔹 Précision    : 0.71
🔹 Rappel       : 0.76
🔹 F1-score     : 0.73
🔹 Matrice de confusion :
[[ 23  55]
 [ 41 133]]
