In [49]:
from train_classifieur import train_classifier, pred_classifier
import utils 
import os
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
import plotly.express as px
import plotly.graph_objects as go


In [50]:
# Project helper; presumably loads variables from a local env file into the process
# environment (TODO confirm against utils).
utils.load_env_file()
# Dataset root: taken from the DATA_DIR environment variable, "data/default/" when unset.
data_dir = os.getenv("DATA_DIR", "data/default/")
print(data_dir)

../LALAOUI_RAYAN/selected_data


In [51]:
from sklearn.preprocessing import MultiLabelBinarizer

# Per-image metadata table stored next to the images.
# os.path.join avoids the doubled separator that string concatenation produces
# when data_dir already ends with "/".
df = pd.read_csv(os.path.join(data_dir, "metadata.csv"))
# NOTE(review): `mlb` is not used in the visible part of the notebook; kept in case
# a later cell relies on it.
mlb = MultiLabelBinarizer()
# "Finding Labels" holds "|"-separated labels: expand it into one 0/1 column per label.
one_hot = df["Finding Labels"].str.get_dummies(sep="|")
df = pd.concat([df.drop(columns="Finding Labels"), one_hot], axis=1)
# Remove age outliers (the shape of the removed rows is printed for the record).
print(df[df["Patient Age"] > utils.MAX_AGE].shape)
# .copy() detaches the filtered frame from its parent, so that later in-place
# column assignments on `df` cannot trigger SettingWithCopyWarning.
df = df[df["Patient Age"] <= utils.MAX_AGE].copy()
df


(0, 26)


Unnamed: 0,Image Index,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],...,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
0,00000001_000.png,0,1,58,M,PA,2682,2749,0.143000,0.143000,...,0,0,0,0,0,0,0,0,0,0
1,00000005_006.png,6,5,70,F,PA,2992,2991,0.143000,0.143000,...,0,0,0,1,0,0,0,0,0,0
2,00000014_000.png,0,14,61,F,PA,2048,2500,0.171000,0.171000,...,0,0,0,0,0,1,0,0,0,0
3,00000048_000.png,0,48,46,F,PA,2834,2641,0.143000,0.143000,...,0,0,0,0,0,1,0,0,0,0
4,00000051_000.png,0,51,55,M,PA,3056,2544,0.139000,0.139000,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,00030633_000.png,0,30633,50,F,PA,2021,2021,0.194311,0.194311,...,0,0,0,1,0,0,0,0,0,0
1496,00030681_000.png,0,30681,39,M,PA,2021,2021,0.194311,0.194311,...,0,0,0,0,0,1,0,0,0,0
1497,00030683_000.png,0,30683,43,M,AP,3056,2544,0.139000,0.139000,...,0,0,0,0,0,0,0,0,0,0
1498,00030759_000.png,0,30759,51,M,PA,2021,2021,0.194311,0.194311,...,0,0,0,0,0,0,0,0,0,0


# Analyse Descriptive

## Analyse univariée
On s'intéresse ici d'abord au sexe :

In [52]:

def print_sex_count(df):
    """Print the number and the share of male and female patients.

    Args:
        df: DataFrame with a "Patient Gender" column coded "M" / "F".

    Counts are looked up by label. Integer keys (0 / 1) only worked through the
    deprecated positional fallback of ``Series.get``, which returns the most
    frequent gender first, whatever it is. A gender that is absent from the
    data is reported as 0 instead of failing on ``None``.
    """
    genders = df["Patient Gender"]

    counts = genders.value_counts()
    print(f"Il y a : {counts.get('M', 0)} hommes et {counts.get('F', 0)} femmes")

    shares = genders.value_counts(normalize=True)
    print(f"Soit {shares.get('M', 0.0):.2%} d'hommes et {shares.get('F', 0.0):.2%} de femmes")
# Print the head-count and the share of each sex.
print_sex_count(df)

# Pie chart with one slice per distinct "Patient Gender" value (second positional
# argument of px.pie is `names`).
px.pie(df, 'Patient Gender')

Il y a : 802 hommes et 698 femmes
Soit 53.47% d'hommes et 46.53% de femmes



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



puis à l'âge:

In [53]:
# Split ages into three classes with cut points at 30 and 60 (upper bound
# utils.MAX_AGE), then print the size of each class.
age_class_edges = [0, 30, 60, utils.MAX_AGE]
age_class_names = ["jeunes", "adultes", "seniors"]
age_classes = pd.cut(df["Patient Age"], age_class_edges, labels=age_class_names)
print(age_classes.value_counts())
# Age distribution from the project helper, with the same two cut points passed as vlines.
utils.plot_age_dist(df, vlines=[30, 60])

Patient Age
adultes    894
jeunes     315
seniors    291
Name: count, dtype: int64


Enfin à la distribution des maladies:

In [54]:
# Project helper; presumably plots the head-count per disease, here without the
# gender split (TODO confirm against utils).
utils.trace_effectif_maladie(df, gender=False)

In [55]:
# Totals and means of the label columns listed in utils.maladies, largest first.
disease_flags = df[utils.maladies]
print("------- Count -------")
disease_totals = disease_flags.sum(axis=0)
print(disease_totals.sort_values(ascending=False))
print("------- Moyennes -------")
disease_rates = disease_flags.mean()
print(disease_rates.sort_values(ascending=False))

------- Count -------
No Finding            809
Infiltration          270
Atelectasis           126
Effusion              101
Nodule                 97
Mass                   80
Pleural_Thickening     48
Cardiomegaly           40
Fibrosis               36
Consolidation          34
Pneumothorax           30
Emphysema              21
Pneumonia              20
Edema                  14
Hernia                  8
dtype: int64
------- Moyennes -------
No Finding            0.539333
Infiltration          0.180000
Atelectasis           0.084000
Effusion              0.067333
Nodule                0.064667
Mass                  0.053333
Pleural_Thickening    0.032000
Cardiomegaly          0.026667
Fibrosis              0.024000
Consolidation         0.022667
Pneumothorax          0.020000
Emphysema             0.014000
Pneumonia             0.013333
Edema                 0.009333
Hernia                0.005333
dtype: float64


In [56]:

# Pairwise Jaccard similarity between disease labels: |A ∩ B| / |A ∪ B|, where A and B
# are the sets of rows flagged with each disease.
# Computed with one matrix product instead of a Python double loop over Series.
disease_flags = df[utils.maladies].to_numpy().astype(bool).astype(int)
# (k, k) co-occurrence counts; the diagonal holds each disease's own row count.
intersection = disease_flags.T @ disease_flags
disease_totals = np.diag(intersection)
# Inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|.
union = disease_totals[:, None] + disease_totals[None, :] - intersection
# A pair that never occurs (empty union) gets a similarity of 0 rather than NaN.
jaccard_matrix = np.divide(
    intersection,
    union,
    out=np.zeros(union.shape, dtype=float),
    where=union != 0,
)

jaccard_df = pd.DataFrame(jaccard_matrix, index=utils.maladies, columns=utils.maladies)

fig = go.Figure(data=go.Heatmap(
    z=jaccard_df.values,
    x=jaccard_df.columns,
    y=jaccard_df.index,
    colorscale='Blues'
))

# The plotted quantity is the Jaccard similarity (1 on the diagonal), not the
# Jaccard distance (1 - similarity), so the title says so.
fig.update_layout(title='Disease Co-Occurrence Matrix (Jaccard Similarity)',
                  xaxis_title='Disease',
                  yaxis_title='Disease',
                  xaxis=dict(tickangle=-45),
                  width=800, height=800)

fig.show()

## Analyse bivariée

On a vu durant le mi-projet que les colonnes `View Position`, `OriginalImage[Width, Height]` et `OriginalImagePixelSpacing[x, y]` ne contiennent pas de biais significatifs ; on se concentrera donc dans la suite sur l'âge et le sexe.

Essayons d'abord de visualiser la distribution d'un attribut par rapport à l'autre.

In [57]:
# Same project helper as for the univariate age plot, this time with gender=True
# (presumably one distribution per gender — TODO confirm against utils).
utils.plot_age_dist(df, gender=True)

## Analyse de l'influence du sexe sur les labels:

In [58]:
# Same project helper as in the univariate section, now with gender=True
# (presumably splits each disease count by gender — TODO confirm against utils).
utils.trace_effectif_maladie(df, gender=True)

## Passons à l'âge:

In [59]:
# Project helper called on the labels in utils.onlymaladies, with a marker at age 40.
# First call: window_size=5 (presumably a smoothing window — TODO confirm against utils).
utils.plot_avg_diseases_by_age(df, maladies=utils.onlymaladies, window_size=5, vlines=[40])
# Second call: helper's default window, with gender=True.
utils.plot_avg_diseases_by_age(df, maladies=utils.onlymaladies, vlines=[40], gender=True)

In [60]:

def plot_age_histogram(df):
    """Show a bar chart of the number of patients per 20-year age bracket.

    Args:
        df: DataFrame with a numeric "Patient Age" column.

    Side effect (kept so that existing cells behave the same): an "Age Group"
    categorical column holding each patient's bracket label is written into ``df``.
    """
    bin_width = 20
    oldest = int(df["Patient Age"].max())
    # Brackets are left-closed ([lo, hi)), so the last edge must be strictly greater
    # than the oldest age; otherwise a maximum that is a multiple of the bin width
    # would fall outside every bracket and become NaN.
    upper_edge = (oldest // bin_width + 1) * bin_width
    bins = list(range(0, upper_edge + 1, bin_width))
    labels = [f"{lo}-{hi - 1}" for lo, hi in zip(bins[:-1], bins[1:])]
    df["Age Group"] = pd.cut(df["Patient Age"], bins=bins, labels=labels, right=False)

    # Count once, in bracket order, and take both the labels and the heights from
    # that single Series so that each bar is guaranteed to carry its own label.
    counts = df["Age Group"].value_counts().sort_index()
    brackets = counts.index.astype(str)
    fig = px.bar(x=brackets,
                 y=counts.values,
                 title="Répartition des patients par tranche d'âge",
                 labels={"x": "Tranche d'âge", "y": "Nombre de patients"},
                 color=brackets,
                 color_discrete_sequence=px.colors.qualitative.Set3)

    fig.update_layout(xaxis_title="Tranche d'âge", yaxis_title="Nombre de patients")
    fig.show()


# Draw the histogram; note that the call also adds an "Age Group" column to `df`.
plot_age_histogram(df)


In [61]:
# Project helper, called first for the "No Finding" label alone, then for the labels in
# utils.onlymaladies (presumably prevalence trends — TODO confirm against utils).
utils.plot_disease_trends(df, "No Finding", gender=False)
utils.plot_disease_trends(df, utils.onlymaladies, gender=False)

In [62]:
# Project helper; presumably plots the distribution of patient ages (TODO confirm against utils).
utils.plot_patient_age_distribution(df)


In [63]:
# Same trend helper on every label in utils.maladies, with gender=True.
utils.plot_disease_trends(df, utils.maladies, gender=True)
# Project helper on the labels in utils.onlymaladies (presumably a stacked area
# chart, as the name suggests — TODO confirm against utils).
utils.stacked_area_chart(df, maladies=utils.onlymaladies, gender=True)

## Zoom sur les images:
La nouveauté par rapport au mi-projet, c'est l'ajout des images au dataset. Ces nouvelles données pourraient introduire de nouveaux biais, comme une faible luminosité, la présence de dispositifs intracorporels, etc.

In [64]:
from PIL import Image
import image_utils
from pathlib import Path



def load_images_from_directory(directory_path):
    """Collect every image file found under ``directory_path`` (recursively).

    Args:
        directory_path: Root directory to scan.

    Returns:
        DataFrame with one row per image file and two columns:
        'image' (the opened PIL image) and 'file_name' (base name of the file).
        Rows are sorted by path so that the order is the same on every run.

    NOTE(review): per the Pillow documentation ``Image.open`` is lazy and keeps the
    file open until the pixel data is loaded, so every returned image holds a file
    handle; pixel data is deliberately not loaded eagerly here to keep memory low.
    """
    image_extensions = {".png", ".jpg", ".jpeg", ".bmp", ".gif"}

    # rglob yields entries in arbitrary, filesystem-dependent order: sort them so the
    # row order (and therefore any "first N images" preview) is reproducible.
    # is_file() skips directories that happen to carry an image-like suffix.
    image_paths = sorted(
        path
        for path in Path(directory_path).rglob("*")
        if path.is_file() and path.suffix.lower() in image_extensions
    )

    return pd.DataFrame({
        'image': [Image.open(path) for path in image_paths],
        'file_name': [path.name for path in image_paths],
    })

# Open every image found under the dataset directory.
images_df = load_images_from_directory(data_dir)
images = images_df["image"]

# Preview the first N images. `.loc` slices by label and includes the end label,
# hence the N-1 on the default 0..len-1 index.
N=6
image_utils.display_images(images.loc[:N-1])

On remarque la présence de pictogrammes visuels liés aux métadonnées, qui se superposent parfois aux images.

In [65]:
# For each of the first N images, pass the complement of the foreground filter to
# transparent_background (presumably hides the non-foreground pixels — TODO confirm
# against image_utils), then display the results.
filtered_imgs = []
for img in images[:N]:
    background_mask = ~image_utils.foreground_filter(img)
    filtered_imgs.append(image_utils.transparent_background(img, background_mask))
image_utils.display_images(filtered_imgs)

In [66]:
def lightness(img: Image.Image) -> float:
    """Return the mean grey level of the foreground pixels of a PIL image.

    The grey level of a pixel is the plain average of its R, G and B channels
    (a single-channel image is used as is). The foreground mask comes from
    ``image_utils.foreground_filter`` applied, with 130 as second argument
    (presumably a threshold — TODO confirm against image_utils), to the grey image
    replicated on three channels. Higher values mean a lighter image.
    """
    pixels = np.array(img)
    if pixels.ndim == 2:
        # Single-channel image: the values already are grey levels.
        gris = pixels * 1.0
    else:
        R = pixels[:, :, 0] * 1.0
        G = pixels[:, :, 1] * 1.0
        B = pixels[:, :, 2] * 1.0
        gris = (R + G + B) / 3
    # (H, W, 3) grey image, built by NumPy instead of a per-pixel Python comprehension.
    # NOTE(review): the filter now receives an ndarray rather than nested lists; it is
    # also called with a PIL image elsewhere in this notebook, so it is assumed to
    # accept any array-like — confirm against image_utils.
    grey_rgb = np.repeat(gris[:, :, np.newaxis], 3, axis=2)
    F = image_utils.foreground_filter(grey_rgb, 130)
    return np.mean(gris[F])

# The per-image lightness is cached on disk: it is only recomputed (and the cache
# written) when the cache file is missing, so Restart & Run All works either way.
LIGHTNESS_CACHE = "./lightness.csv"
if not os.path.exists(LIGHTNESS_CACHE):
    pd.DataFrame({
        "file_name": images_df["file_name"],
        "lightness": [lightness(img) for img in images],
    }).to_csv(LIGHTNESS_CACHE, index=False)
lightness_metric = pd.read_csv(LIGHTNESS_CACHE)
# Align the key column name with the metadata table.
lightness_metric = lightness_metric.rename(columns={"file_name" : "Image Index"})
lightness_metric

Unnamed: 0,Image Index,lightness
0,00029210_000.png,77.073721
1,00029643_000.png,76.790874
2,00011500_000.png,66.018137
3,00002617_000.png,71.700541
4,00018772_000.png,64.387512
...,...,...
1495,00011810_000.png,70.876629
1496,00025124_003.png,69.104207
1497,00023629_004.png,59.802403
1498,00030078_000.png,88.036470


In [67]:
# Summary statistics of the lightness metric. No `include` argument: it is ignored
# for a Series, and the previous value was the builtin function `all`, not the
# string "all".
print(lightness_metric['lightness'].describe())
# Box plot of the same values, to make outliers visible.
fig = go.Figure(data=go.Box(y=lightness_metric["lightness"], name="Lightness"))
fig.update_layout(
    title="Lightness",
    yaxis_title="Values",
)
fig.show()

count    1500.000000
mean       72.938384
std        14.437157
min        15.075729
25%        63.479997
50%        70.741514
75%        83.213571
max       109.922028
Name: lightness, dtype: float64


In [70]:
# Attach the lightness of each image to its metadata row (inner join on the file name).
# Columns that a previous run of this cell already merged in are dropped first, so
# re-running the cell replaces them instead of producing lightness_x / lightness_y.
stale_columns = [
    column
    for column in lightness_metric.columns
    if column != "Image Index" and column in df.columns
]
df = pd.merge(df.drop(columns=stale_columns), lightness_metric, on="Image Index")
df

Unnamed: 0,Image Index,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],...,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,Age Group,lightness
0,00000001_000.png,0,1,58,M,PA,2682,2749,0.143000,0.143000,...,0,0,0,0,0,0,0,0,40-59,67.919649
1,00000005_006.png,6,5,70,F,PA,2992,2991,0.143000,0.143000,...,0,1,0,0,0,0,0,0,60-79,52.914027
2,00000014_000.png,0,14,61,F,PA,2048,2500,0.171000,0.171000,...,0,0,0,1,0,0,0,0,60-79,73.978926
3,00000048_000.png,0,48,46,F,PA,2834,2641,0.143000,0.143000,...,0,0,0,1,0,0,0,0,40-59,64.454487
4,00000051_000.png,0,51,55,M,PA,3056,2544,0.139000,0.139000,...,0,1,0,0,0,0,0,0,40-59,71.254098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,00030633_000.png,0,30633,50,F,PA,2021,2021,0.194311,0.194311,...,0,1,0,0,0,0,0,0,40-59,93.543533
1496,00030681_000.png,0,30681,39,M,PA,2021,2021,0.194311,0.194311,...,0,0,0,1,0,0,0,0,20-39,94.068207
1497,00030683_000.png,0,30683,43,M,AP,3056,2544,0.139000,0.139000,...,0,0,0,0,0,0,0,0,40-59,92.758716
1498,00030759_000.png,0,30759,51,M,PA,2021,2021,0.194311,0.194311,...,0,0,0,0,0,0,0,0,40-59,99.808464


- Introduction (/3)
- Preparation et analyse des données (/3)
- Application des méthodes de pre processing (/5)
- Application des méthodes de post processing (/5)
- Analyse, compréhension (/3)
- Conclusion (/1)

In [21]:
# Load the classifier's predictions
preddf = pd.read_csv("expe_log/preds.csv")

# Strip stray whitespace from the column names
preddf.columns = preddf.columns.str.strip()

# Encode both string label columns as integers: sain -> 0, malade -> 1
label_codes = {"sain": 0, "malade": 1}
y_pred = preddf["preds"].map(label_codes)
y_true = preddf["labels"].map(label_codes)


In [22]:
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np

def display_error_rate(y_true, y_pred, df, group_columns):
    """Print the classification error rate, in percent, of each group of ``df``.

    Args:
        y_true: True labels, indexable by the index labels of ``df``.
        y_pred: Predicted labels, indexed like ``y_true``.
        df: DataFrame whose rows are grouped by ``group_columns``.
        group_columns: Column name(s) handed to ``DataFrame.groupby``.
    """
    for group_key, members in df.groupby(group_columns):
        rows = members.index
        group_cm = confusion_matrix(y_true[rows], y_pred[rows])

        # Diagonal of the confusion matrix = correctly classified samples.
        n_samples = group_cm.sum()
        n_errors = n_samples - np.trace(group_cm)
        error_rate = n_errors / n_samples * 100

        print(f"Taux d'erreur pour le groupe {group_columns}: {group_key} = {error_rate:.2f}%")

# Error rate per gender.
display_error_rate(y_true, y_pred, preddf, group_columns=["Patient Gender"])
# Boolean flag: True for patients aged 40 or more.
preddf['+40ans']=preddf['Patient Age'] >= 40
# Error rate for the under-40 and the 40-and-over groups.
display_error_rate(y_true, y_pred, preddf, group_columns=["+40ans"])


Taux d'erreur pour le groupe ['Patient Gender']: ('F',) = 30.80%
Taux d'erreur pour le groupe ['Patient Gender']: ('M',) = 31.17%
Taux d'erreur pour le groupe ['+40ans']: (False,) = 24.52%
Taux d'erreur pour le groupe ['+40ans']: (True,) = 34.42%


In [23]:
import pandas as pd
import plotly.express as px
from sklearn.metrics import confusion_matrix
import numpy as np

def plot_confusion_matrix(y_true, y_pred, labels=["sain", "malade"], normalize=False, title="Matrice de Confusion"):
    """Display an annotated confusion-matrix heatmap.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        labels: Display names of the classes, in class order; ``None`` keeps the
            default numeric axes.
        normalize: When True, cells are percentages of the total sample count
            (colour scale fixed to 0-100) instead of raw counts.
        title: Figure title.

    When ``labels`` is given and both label arrays are integer-coded within
    ``0 .. len(labels) - 1``, the matrix is always ``len(labels)`` square, even if
    a class is absent from the data (e.g. a small sub-group). Without this, such a
    subset yields a smaller matrix that cannot be paired with ``labels``.
    """
    # Private copy: the default list is shared between calls.
    display_labels = None if labels is None else list(labels)

    class_ids = None
    if display_labels is not None:
        true_values = np.asarray(y_true)
        pred_values = np.asarray(y_pred)
        candidate_ids = list(range(len(display_labels)))
        integer_coded = true_values.dtype.kind in "iu" and pred_values.dtype.kind in "iu"
        if integer_coded:
            observed = set(np.unique(true_values).tolist()) | set(np.unique(pred_values).tolist())
            if observed <= set(candidate_ids):
                class_ids = candidate_ids
    # labels=None lets scikit-learn infer the classes from the data.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    if normalize:
        cm = cm.astype('float') / len(y_true) * 100

    cm_df = pd.DataFrame(cm, index=display_labels, columns=display_labels)

    fig = px.imshow(cm_df,
                    labels=dict(x="Prédiction", y="Vérité", color="Fréquence (%)" if normalize else "Fréquence"),
                    x=display_labels,
                    y=display_labels,
                    color_continuous_scale='Blues',
                    range_color=[0, 100] if normalize else None)

    # Write the value of each cell on top of the heatmap.
    for i in range(len(cm_df)):
        for j in range(len(cm_df.columns)):
            fig.add_annotation(
                x=j,
                y=i,
                text=f'{cm_df.iloc[i, j]:.2f}%' if normalize else f'{cm_df.iloc[i, j]}',
                showarrow=False,
                font=dict(color="black", size=14),
                align="center"
            )

    fig.update_layout(title=title, xaxis_title="Prédiction", yaxis_title="Vérité")
    fig.show()

In [24]:
# Overall confusion matrix: first as percentages of all samples, then as raw counts.
plot_confusion_matrix(y_true, y_pred, normalize=True)
plot_confusion_matrix(y_true, y_pred)


Attention : le modèle a tendance à produire des faux négatifs (des malades prédits sains), ce qui n'est pas ce que l'on veut !

In [25]:
def plot_confusion_matrix_by_group(y_true, y_pred, df, group_columns, labels=None, normalize=False):
    """Display a separate confusion matrix for each group defined by ``group_columns``.

    Args:
        y_true: True labels, indexable by the index labels of ``df``.
        y_pred: Predicted labels, indexed like ``y_true``.
        df: DataFrame whose rows are grouped by ``group_columns``.
        group_columns: Column name(s) handed to ``DataFrame.groupby``.
        labels: Display names of the classes; when omitted, the default class names
            of ``plot_confusion_matrix`` are used.
        normalize: Forwarded to ``plot_confusion_matrix``.
    """
    # Forward `labels` only when the caller supplied them: passing None explicitly
    # would override the default class names of plot_confusion_matrix.
    label_kwargs = {} if labels is None else {"labels": labels}

    for group_value, group_df in df.groupby(group_columns):
        y_true_group = y_true[group_df.index]
        y_pred_group = y_pred[group_df.index]

        print(f"Matrice de confusion pour {group_columns}: {group_value}")
        plot_confusion_matrix(
            y_true_group,
            y_pred_group,
            normalize=normalize,
            title=f"Matrice de Confusion ({group_columns}={group_value})",
            **label_kwargs,
        )


In [None]:
# Boolean flag: True for patients aged 40 or more (recomputed here so the cell runs on its own).
preddf['+40ans'] = preddf['Patient Age'] >= 40
# One confusion matrix for the under-40 group and one for the 40-and-over group.
plot_confusion_matrix_by_group(y_true, y_pred, preddf, group_columns=["+40ans"], labels=["sain", "malade"])

Matrice de confusion pour ['+40ans']: (False,)


Matrice de confusion pour ['+40ans']: (True,)
