In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

## Distribution des récidives réelles

Le score de récidive théorique

### Distribution globale

In [17]:
file_path = 'data/clean_data.csv' 
df = pd.read_csv(file_path)
column_names = df.columns.tolist()
column_names

['id',
 'first',
 'last',
 'sex',
 'dob',
 'age',
 'age_cat',
 'race',
 'juv_fel_count',
 'decile_score',
 'juv_misd_count',
 'juv_other_count',
 'priors_count',
 'days_b_screening_arrest',
 'c_jail_in',
 'c_jail_out',
 'c_days_from_compas',
 'c_charge_degree',
 'c_charge_desc',
 'is_recid',
 'r_charge_degree',
 'r_days_from_arrest',
 'r_offense_date',
 'r_charge_desc',
 'r_jail_in',
 'is_violent_recid',
 'vr_charge_degree',
 'vr_offense_date',
 'vr_charge_desc',
 'decile_score.1',
 'score_text',
 'screening_date',
 'v_decile_score',
 'v_score_text',
 'priors_count.1',
 'event']

In [18]:
df['c_jail_in'] = pd.to_datetime(df['c_jail_in'])
df['c_jail_out'] = pd.to_datetime(df['c_jail_out'])
df['duration'] = (df['c_jail_out'] - df['c_jail_in']).dt.days
df_glob = df.drop(['id', 'c_jail_in', 'c_jail_out', 'first', 'last', 'age', 'dob', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'decile_score', 'days_b_screening_arrest', 'c_days_from_compas', 'c_charge_desc', 'r_offense_date', 'r_charge_desc', 'r_jail_in','vr_offense_date','vr_charge_desc','decile_score.1','score_text','screening_date','v_decile_score','v_score_text','priors_count.1'], axis=1) 
df_glob







Unnamed: 0,sex,age_cat,race,priors_count,c_charge_degree,is_recid,r_charge_degree,r_days_from_arrest,is_violent_recid,vr_charge_degree,event,duration
0,Male,Greater than 45,Other,0,(F3),0,,,0,,0,0.0
1,Male,Greater than 45,Other,0,(F3),0,,,0,,0,0.0
2,Male,25 - 45,Caucasian,0,,-1,,,0,,0,
3,Male,25 - 45,African-American,0,(F3),1,(F3),,1,(F3),1,10.0
4,Male,Less than 25,African-American,4,(F3),1,(M1),0.0,0,,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
15508,Male,Less than 25,African-American,0,(F3),0,,,0,,0,1.0
15509,Female,25 - 45,African-American,5,(M1),0,,,0,,0,10.0
15510,Male,Greater than 45,Other,0,(F2),0,,,0,,0,1.0
15511,Female,25 - 45,African-American,3,(M1),0,,,0,,0,1.0


In [19]:
# One-hot encoding using pandas
sex_encoded = pd.get_dummies(df['sex'], prefix='sex')
age_encoded = pd.get_dummies(df['age_cat'], prefix='sex')
race_encoded = pd.get_dummies(df['race'], prefix='race')
c_degree_encoded = pd.get_dummies(df['c_charge_degree'], prefix='c_degree')
r_degree_encoded = pd.get_dummies(df['r_charge_degree'], prefix='r_degree')
vr_degree_encoded = pd.get_dummies(df['vr_charge_degree'], prefix='vr_degree')

# Concatenate the one-hot encoded columns with the original DataFrame
df_encoded = pd.concat([df_glob, sex_encoded, race_encoded, c_degree_encoded, r_degree_encoded, vr_degree_encoded], axis=1)
df_encoded = df_encoded.drop(['sex', 'age_cat', 'race', 'c_charge_degree', 'r_charge_degree', 'vr_charge_degree'], axis=1)
df_encoded

Unnamed: 0,priors_count,is_recid,r_days_from_arrest,is_violent_recid,event,duration,sex_Female,sex_Male,race_African-American,race_Asian,...,r_degree_(MO3),vr_degree_(F1),vr_degree_(F2),vr_degree_(F3),vr_degree_(F5),vr_degree_(F6),vr_degree_(F7),vr_degree_(M1),vr_degree_(M2),vr_degree_(MO3)
0,0,0,,0,0,0.0,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,0,,0,0,0.0,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,-1,,0,0,,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,1,,1,1,10.0,False,True,True,False,...,False,False,False,True,False,False,False,False,False,False
4,4,1,0.0,0,0,1.0,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15508,0,0,,0,0,1.0,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
15509,5,0,,0,0,10.0,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
15510,0,0,,0,0,1.0,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
15511,3,0,,0,0,1.0,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [20]:
def plot_correlation_matrix(df):
    """
    Plots a correlation matrix using Matplotlib.
    
    Args:
    df (pandas DataFrame): The DataFrame for which the correlation matrix will be plotted.
    """
    # Calculate the correlation matrix
    corr_matrix = df.corr()

    # Create heatmap with hover tooltips using Plotly Express
    fig = px.imshow(corr_matrix,
                    labels=dict(x="Features", y="Features", color="Correlation"),
                    x=corr_matrix.index,
                    y=corr_matrix.columns,
                    color_continuous_scale='Viridis')

    # Update hover template
    fig.update_traces(hovertemplate='Feature 1: %{y}<br>Feature 2: %{x}<br>Correlation: %{z}<extra></extra>')

    # Set layout
    fig.update_layout(title='Correlation Matrix')

    # Show plot
    fig.show()

plot_correlation_matrix(df_encoded)

In [21]:
df_noNan = df_encoded.dropna()
plot_correlation_matrix(df_noNan)