In [1]:
from train_classifieur import train_classifier, pred_classifier
import utils 
import os
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix


In [2]:
# Run the project's env-file loader first (presumably it populates os.environ —
# confirm in utils), then resolve the data directory, falling back to
# "data/default/" when DATA_DIR is not set.
utils.load_env_file()
data_dir = os.environ.get("DATA_DIR", "data/default/")
print(data_dir)

./data/SAILLANT_ARTHUR/selected_data


In [3]:
import plotly.express as px

# Read the metadata table from the data directory and show the share of each
# value of the 'Patient Gender' column as a pie chart.
metadata_path = f"{data_dir}/metadata.csv"
df = pd.read_csv(metadata_path)
px.pie(df, names="Patient Gender")

In [4]:
# Call the project helper on the metadata frame; `vlines=[30,60]` presumably
# draws reference lines at ages 30 and 60 — TODO confirm in utils.plot_age_dist.
utils.plot_age_dist(df, vlines=[30,60])

In [5]:
# Same helper with `gender=True`; presumably splits the age distribution by
# patient gender — TODO confirm in utils.plot_age_dist.
utils.plot_age_dist(df, gender=True)

- Introduction (/3)
- Préparation et analyse des données (/3)
- Application des méthodes de pre-processing (/5)
- Application des méthodes de post-processing (/5)
- Analyse, compréhension (/3)
- Conclusion (/1)

In [6]:
# Load the logged predictions.
preddf = pd.read_csv("expe_log/preds.csv")

# Strip stray whitespace from the column names.
preddf.columns = preddf.columns.str.strip()

# Single source of truth for the label encoding used by both columns.
label_to_code = {"sain": 0, "malade": 1}

encoded = {}
for column in ("preds", "labels"):
    # The header needed stripping, so the cell values may carry the same
    # padding; normalise them before mapping.
    raw_values = preddf[column].astype(str).str.strip()
    # Series.map turns any unmapped value into NaN without complaint, which
    # would only surface later as an obscure metrics error. Fail here instead.
    unknown = sorted(set(raw_values) - set(label_to_code))
    if unknown:
        raise ValueError(f"Unexpected values in column {column!r}: {unknown}")
    encoded[column] = raw_values.map(label_to_code)

y_pred = encoded["preds"]
y_true = encoded["labels"]


In [7]:
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np

def display_error_rate(y_true, y_pred, df, group_columns):
    """Print the misclassification rate of each group of ``df``.

    Parameters
    ----------
    y_true, y_pred : indexable by ``df``'s index (e.g. pandas Series)
        True and predicted labels; ``y_true[idx]`` / ``y_pred[idx]`` must
        select the entries belonging to the rows ``idx`` of ``df``.
    df : pandas.DataFrame
        Frame holding the grouping columns.
    group_columns : list of str
        Columns passed to ``df.groupby``.

    Returns
    -------
    None
        One line per group is printed with the error rate in percent.
    """
    for group_value, group_df in df.groupby(group_columns):
        rows = group_df.index
        # The error rate is simply the share of positions where prediction and
        # truth differ; no confusion matrix is needed for that.
        mismatches = np.asarray(y_true[rows]) != np.asarray(y_pred[rows])
        # Guard the mean against an empty group (possible with categorical
        # groupers) so no division by zero occurs.
        error_rate = mismatches.mean() * 100 if mismatches.size else float("nan")

        print(f"Taux d'erreur pour le groupe {group_columns}: {group_value} = {error_rate:.2f}%")

# Error rate per gender, then per age bracket (under 40 vs. 40 and over).
display_error_rate(y_true, y_pred, preddf, group_columns=["Patient Gender"])
preddf["+40ans"] = preddf["Patient Age"].ge(40)
display_error_rate(y_true, y_pred, preddf, group_columns=["+40ans"])


Taux d'erreur pour le groupe ['Patient Gender']: ('F',) = 27.10%
Taux d'erreur pour le groupe ['Patient Gender']: ('M',) = 27.53%
Taux d'erreur pour le groupe ['+40ans']: (False,) = 22.95%
Taux d'erreur pour le groupe ['+40ans']: (True,) = 29.45%


In [13]:
import pandas as pd
import plotly.express as px
from sklearn.metrics import confusion_matrix
import numpy as np

def plot_confusion_matrix(y_true, y_pred, labels=["sain", "malade"], normalize=False, title="Matrice de Confusion"):
    """Show a confusion matrix as an annotated Plotly heatmap.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class values.
    labels : list of str or None
        Display names for the classes, in class order. When the observed
        values are integers within ``0 .. len(labels) - 1`` the matrix is
        always built for all ``len(labels)`` classes, so a subset that
        contains a single class still produces a full-size matrix.
        ``None`` keeps the classes found in the data and the default axes.
    normalize : bool
        If True, cells are percentages of the total number of samples.
    title : str
        Figure title.

    Raises
    ------
    ValueError
        If the number of classes does not match ``len(labels)``.
    """
    if labels is None:
        cm = confusion_matrix(y_true, y_pred)
    else:
        observed = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
        codes = np.arange(len(labels))
        # Without an explicit class list, sklearn only keeps the classes that
        # occur in the data; a one-class subset would then yield a 1x1 matrix
        # that cannot be labelled with `labels`.
        if observed.dtype.kind in "iuf" and np.isin(observed, codes).all():
            class_values = codes
        else:
            class_values = observed
        if len(class_values) != len(labels):
            raise ValueError(
                f"{len(class_values)} classes found in the data but {len(labels)} labels given"
            )
        cm = confusion_matrix(y_true, y_pred, labels=class_values)

    if normalize:
        total = len(y_true)
        # Avoid a division by zero on empty input.
        cm = cm.astype('float') / total * 100 if total else cm.astype('float')

    cm_df = pd.DataFrame(cm, index=labels, columns=labels)

    fig = px.imshow(cm_df,
                    labels=dict(x="Prédiction", y="Vérité", color="Fréquence (%)" if normalize else "Fréquence"),
                    x=labels,
                    y=labels,
                    color_continuous_scale='Blues',
                    range_color=[0, 100] if normalize else None)

    # Upper end of the colour scale, used to keep the annotations readable:
    # black text disappears on the darkest cells.
    if normalize:
        scale_max = 100
    else:
        scale_max = cm.max() if cm.size else 0

    for (i, j), value in np.ndenumerate(cm):
        fig.add_annotation(
            x=j,
            y=i,
            text=f'{value:.2f}%' if normalize else f'{value}',
            showarrow=False,
            font=dict(color="white" if value > scale_max / 2 else "black", size=14),
            align="center"
        )

    fig.update_layout(title=title, xaxis_title="Prédiction", yaxis_title="Vérité")
    fig.show()

In [14]:
# Show the matrix twice: first as percentages of all samples, then as raw counts.
for as_percent in (True, False):
    plot_confusion_matrix(y_true, y_pred, normalize=as_percent)


Attention : le modèle a tendance à se tromper en produisant des faux négatifs (des patients malades prédits sains), ce n'est pas ce qu'on veut !

In [10]:
def plot_confusion_matrix_by_group(y_true, y_pred, df, group_columns, labels=None, normalize=False):
    """
    Draw a separate confusion matrix for each group defined by group_columns.

    Parameters
    ----------
    y_true, y_pred : indexable by ``df``'s index (e.g. pandas Series)
        True and predicted labels for the rows of ``df``.
    df : pandas.DataFrame
        Frame holding the grouping columns.
    group_columns : list of str
        Columns passed to ``df.groupby``.
    labels : list of str or None
        Class display names forwarded to ``plot_confusion_matrix``. When
        None, the argument is not forwarded so that function's own default
        applies.
    normalize : bool
        Forwarded to ``plot_confusion_matrix``.
    """
    # Forwarding labels=None would override plot_confusion_matrix's default
    # class names, so only pass the argument when the caller supplied one.
    plot_kwargs = {"normalize": normalize}
    if labels is not None:
        plot_kwargs["labels"] = labels

    for group_value, group_df in df.groupby(group_columns):
        rows = group_df.index

        print(f"Matrice de confusion pour {group_columns}: {group_value}")
        plot_confusion_matrix(
            y_true[rows],
            y_pred[rows],
            title=f"Matrice de Confusion ({group_columns}={group_value})",
            **plot_kwargs,
        )


In [12]:
# Flag patients aged 40 or more, then draw one confusion matrix per age bracket.
preddf["+40ans"] = preddf["Patient Age"].ge(40)
plot_confusion_matrix_by_group(
    y_true,
    y_pred,
    preddf,
    group_columns=["+40ans"],
    labels=["sain", "malade"],
)

Matrice de confusion pour ['+40ans']: (False,)


Matrice de confusion pour ['+40ans']: (True,)
