# Analyse des outputs de CLIP

In [11]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt


def import_data(file_path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        file_path: Path of the CSV file, handed directly to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist. The message names
            the offending path.
    """
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError as err:
        # Raise instead of calling sys.exit(1): inside a notebook kernel
        # sys.exit only surfaces as a bare SystemExit and hides the traceback,
        # whereas a real exception stops "Run All" with a usable diagnostic.
        raise FileNotFoundError(f"File {file_path} not found.") from err

# Ground-truth labels: strip the 'val/' part from each file name and expose the
# column as 'image', the key later used to join with the prediction files.
df_real = (
    import_data('./scv_1/val_labels.csv')
    .assign(file=lambda labels: labels["file"].str.replace('val/', ''))
    .rename(columns={"file": "image"})
)
df_real

Unnamed: 0,image,age,gender,race,service_test
0,1.jpg,3-9,Male,East Asian,False
1,2.jpg,50-59,Female,East Asian,True
2,3.jpg,30-39,Male,White,True
3,4.jpg,20-29,Female,Latino_Hispanic,True
4,5.jpg,20-29,Male,Southeast Asian,False
...,...,...,...,...,...
10949,10950.jpg,30-39,Male,White,True
10950,10951.jpg,50-59,Male,White,False
10951,10952.jpg,60-69,Male,Latino_Hispanic,False
10952,10953.jpg,20-29,Female,East Asian,False


In [15]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

def plot_all_stats(df: pd.DataFrame,
                   base_column: str,
                   pred_suffix: str = '_pred',
                   true_suffix: str = '_true') -> go.Figure:
    """Build a 2x2 Plotly figure summarising predictions against true labels.

    The figure holds four panels: counts per predicted class, counts per true
    class, the error rate for each true class, and a confusion-matrix heatmap.

    Args:
        df: Frame holding both the predicted and the true label columns.
        base_column: Column stem; the columns actually read are
            ``base_column + pred_suffix`` and ``base_column + true_suffix``.
        pred_suffix: Suffix of the prediction column.
        true_suffix: Suffix of the ground-truth column.

    Returns:
        The assembled ``go.Figure``. It is not displayed here; the caller
        decides whether to call ``.show()``.

    Raises:
        ValueError: If either expected column is missing from ``df``.
    """
    pred_col = f"{base_column}{pred_suffix}"
    true_col = f"{base_column}{true_suffix}"

    if pred_col not in df.columns or true_col not in df.columns:
        raise ValueError(f"Columns '{pred_col}' and/or '{true_col}' not found in DataFrame.")

    predicted = df[pred_col]
    actual = df[true_col]

    # Value counts
    pred_counts = predicted.value_counts().sort_index()
    true_counts = actual.value_counts().sort_index()

    # Error rate per class (share of mismatches within each true class)
    error_rate = (predicted != actual).groupby(actual).mean()

    # Confusion matrix
    confusion = pd.crosstab(actual, predicted, rownames=['True'], colnames=['Predicted'], margins=False)
    # crosstab only keeps labels that occur on each side, so the matrix is not
    # square (and its diagonal is misaligned) when a class is never predicted
    # or a predicted label never appears in the truth. Put both axes on the
    # same label set, filling the missing cells with 0.
    labels = confusion.index.union(confusion.columns)
    confusion = confusion.reindex(index=labels, columns=labels, fill_value=0)
    annotations = [[f"{value}" for value in row] for row in confusion.values]

    # Make subplot layout
    title_stem = base_column.capitalize()
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            f"{title_stem} Prediction Count",
            f"{title_stem} True Count",
            f"Error Rate per {title_stem}",
            f"{title_stem} Confusion Matrix"
        ),
        specs=[
            [{"type": "bar"}, {"type": "bar"}],
            [{"type": "bar"}, {"type": "heatmap"}]
        ]
    )

    # Prediction count bar chart
    fig.add_trace(go.Bar(x=pred_counts.index.astype(str), y=pred_counts.values, name='Predicted'), row=1, col=1)

    # True count bar chart
    fig.add_trace(go.Bar(x=true_counts.index.astype(str), y=true_counts.values, name='True'), row=1, col=2)

    # Error rate bar chart
    fig.add_trace(go.Bar(
        x=error_rate.index.astype(str),
        y=error_rate.values,
        marker=dict(color='skyblue'),
        text=error_rate.values.round(2),
        textposition='auto',
        name='Error Rate'
    ), row=2, col=1)

    # Confusion matrix heatmap
    fig.add_trace(go.Heatmap(
        z=confusion.values,
        x=confusion.columns.astype(str),
        y=confusion.index.astype(str),
        colorscale='Blues',
        text=annotations,
        texttemplate="%{text}",
        hoverinfo="z"
    ), row=2, col=2)

    # Axis titles: without them the heatmap does not say which axis carries
    # the true labels and which the predictions.
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=2)
    fig.update_yaxes(title_text="Error rate", row=2, col=1)
    fig.update_xaxes(title_text="Predicted", row=2, col=2)
    # Reverse the y-axis so the first class is drawn at the top and the
    # diagonal runs top-left to bottom-right.
    fig.update_yaxes(title_text="True", autorange="reversed", row=2, col=2)

    # Final layout tweaks
    fig.update_layout(
        title_text=f"Classification Report for '{base_column}'",
        height=800,
        width=1000,
        showlegend=False
    )

    return fig

def run_stats(preds, cat):
    """Join predictions with the reference labels and display the report.

    ``preds`` is merged with the notebook-level ``df_real`` frame on the
    ``image`` column; columns present in both frames receive the ``_pred`` and
    ``_true`` suffixes. The report for ``cat`` is then built by
    ``plot_all_stats`` and shown. Nothing is returned.
    """
    merged = pd.merge(
        left=preds,
        right=df_real,
        on='image',
        suffixes=('_pred', '_true'),
    )
    report = plot_all_stats(merged, cat)
    report.show()


## Ethnie

#### Sans contexte

In [16]:
# Report on the 'race' predictions stored in r_sans_contexte.csv.
df_r_sans_contexte = import_data('./scv_1/r_sans_contexte.csv')
run_stats(preds=df_r_sans_contexte, cat='race')

In [17]:
# Report on the 'race' predictions stored in r_sans_contexte_caucasian.csv.
df_r_sans_contexte_caucasian = import_data('./scv_1/r_sans_contexte_caucasian.csv')
run_stats(preds=df_r_sans_contexte_caucasian, cat='race')

In [18]:
# Report on the 'race' predictions stored in r_labels_corrected.csv.
df_r_labels_corrected = import_data('./scv_1/r_labels_corrected.csv')
run_stats(preds=df_r_labels_corrected, cat='race')

On voit que, sans même rajouter de contexte, le simple fait de remplacer les labels par des noms ayant un rapport plus étroit avec des ethnies donne de meilleurs résultats.

#### C1

In [19]:
# Report on the 'race' predictions stored in r_cultural_heritage.csv.
df_r_cultural_heritage = import_data('./scv_1/r_cultural_heritage.csv')
run_stats(preds=df_r_cultural_heritage, cat='race')

#### C2

In [20]:
# Report on the 'race' predictions stored in r_portrait_individual.csv.
df_r_portrait_individual = import_data('./scv_1/r_portrait_individual.csv')
run_stats(preds=df_r_portrait_individual, cat='race')


Utiliser « Caucasian » à la place de « White » augmente la probabilité que ces derniers soient mieux prédits.

In [21]:
# Report on the 'race' predictions stored in r_feutures.csv.
df_r_feutures = import_data('./scv_1/r_feutures.csv')
run_stats(preds=df_r_feutures, cat='race')


In [22]:

# Report on the 'race' predictions stored in r_photo_of.csv.
df_r_photo_of = import_data('./scv_1/r_photo_of.csv')
run_stats(preds=df_r_photo_of, cat='race')


In [23]:
# Report on the 'gender' predictions stored in g_sans.csv.
df_g_sans = import_data('./scv_1/g_sans.csv')
run_stats(preds=df_g_sans, cat='gender')



In [24]:
# Report on the 'gender' predictions stored in g_formal.csv.
df_g_formal = import_data('./scv_1/g_formal.csv')
run_stats(preds=df_g_formal, cat='gender')

In [25]:

# Report on the 'gender' predictions stored in g_picture_of.csv.
df_g_picture_of = import_data('./scv_1/g_picture_of.csv')
run_stats(preds=df_g_picture_of, cat='gender')


In [26]:

# Report on the 'gender' predictions stored in g_wo_man.csv.
df_g_wo_man = import_data('./scv_1/g_wo_man.csv')
run_stats(preds=df_g_wo_man, cat='gender')
