# Modélisation

Importation des modules

In [66]:
import pandas as pd  
import numpy as np, pandas as pd, matplotlib as plt
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, chi2_contingency
from scipy.stats.contingency import association
from scipy.stats import mannwhitneyu, norm
import chardet
import unicodedata
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler



## Importation des bases de données

In [82]:

# Load base_finale.csv from the current working directory.
# low_memory=False makes pandas parse the file in one pass, so each column's
# dtype is inferred from all of its values rather than chunk by chunk.
base_finale = pd.read_csv(
    "base_finale.csv",
    sep=",",
    low_memory=False,
)

In [83]:
# Print the frame's index range, column names, non-null counts and dtypes.
base_finale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126073 entries, 0 to 126072
Data columns (total 41 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   126073 non-null  int64  
 1   Num_Acc      126073 non-null  int64  
 2   id_vehicule  126073 non-null  object 
 3   num_veh      126073 non-null  object 
 4   place        126073 non-null  int64  
 5   catu         126073 non-null  int64  
 6   grav         126073 non-null  int64  
 7   sexe         126073 non-null  int64  
 8   an_nais      126073 non-null  float64
 9   trajet       126073 non-null  int64  
 10  secu1        126073 non-null  int64  
 11  secu2        126073 non-null  int64  
 12  secu3        126073 non-null  int64  
 13  locp         126073 non-null  int64  
 14  actp         126073 non-null  object 
 15  etatp        126073 non-null  int64  
 16  senc         126073 non-null  int64  
 17  catv         126073 non-null  int64  
 18  obs          126073 non-

In [75]:
# Lookup tables translating integer category codes into display labels.
# NOTE(review): verify every code/label pair against the dataset's official
# codebook before interpreting results; the pairs are reproduced as found.

# Vehicle category (column 'catv').
catv_labels = {
    1: "VP",
    2: "VU",
    3: "PL <= 3.5t",
    4: "PL > 3.5t",
    5: "PL + remorque",
    6: "Véhicule de transport en commun",
    7: "Taxi",
    8: "Deux-roues motorisé",
    9: "Cyclomoteur",
    10: "Bicyclette",
    11: "Engin spécial",
    12: "Tracteur agricole",
    13: "Autobus",
    14: "Tramway",
    15: "Autre",
}

# User category (column 'catu').
catu_labels = {
    1: "Conducteur",
    2: "Passager",
    3: "Piéton",
}

# Atmospheric conditions (column 'atm').
atm_labels = {
    1: "Normale",
    2: "Pluie légère",
    3: "Pluie forte",
    4: "Neige - grêle",
    5: "Brouillard - fumée",
    6: "Vent fort",
    7: "Temps éblouissant",
    8: "Temps couvert",
    9: "Autre",
}

# Lighting conditions (column 'lum').
lum_labels = {
    1: "Plein jour",
    2: "Crépuscule ou aube",
    3: "Nuit sans éclairage public",
    4: "Nuit avec éclairage non allumé",
    5: "Nuit avec éclairage allumé",
}

# Trip purpose (column 'trajet').
# The trailing space in label 3 is kept as found, because downstream dummy
# column names would change if it were stripped.
trajet_labels = {
    1: "Domicile - travail",
    2: "Domicile - école",
    3: "Professionnel ",
    4: "Personnel - loisirs",
    5: "Autre",
}


In [76]:
# Replace the integer codes of five categorical columns with readable labels.
# Codes that are missing from a dictionary become NaN (Series.map behaviour).
#
# The replacement overwrites the columns of base_finale, so running this cell
# a second time would look the labels up as if they were codes and turn every
# value into NaN. The dtype guard makes the cell safe to re-run: a column is
# mapped only while it still holds numeric codes.
label_maps = {
    'catu': catu_labels,
    'catv': catv_labels,
    'atm': atm_labels,
    'lum': lum_labels,
    'trajet': trajet_labels,
}
for col_name, col_labels in label_maps.items():
    if pd.api.types.is_numeric_dtype(base_finale[col_name]):
        base_finale[col_name] = base_finale[col_name].map(col_labels)


## Régression logistique : prédire la gravité
* Objectif : prédire si un usager est blessé (`grav` égal à 3 ou 4, codé 1) ; toutes les autres valeurs de `grav` sont codées 0.

In [84]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Binary logistic regression on an injury indicator, fitted with statsmodels.

# Target: 1 when grav is 3 or 4, 0 for every other value (vectorised test
# instead of a row-by-row lambda).
# NOTE(review): grav == 2 is coded 0 here, together with grav == 1 — confirm
# this grouping is intended.
base_finale['cible_blesse'] = base_finale['grav'].isin([3, 4]).astype(int)

# Selected explanatory variables.
variables = ['sexe', 'age', 'catv', 'catu', 'atm', 'lum', 'trajet', 'secu1', 'secu2', 'choc', 'manv']

# Modelling frame: keep only rows that are complete on predictors and target.
df_model = base_finale[variables + ['cible_blesse']].dropna()

# One-hot encode object/category columns, dropping the first level of each.
# dtype=float is required: recent pandas versions emit boolean dummy columns
# by default, and a frame mixing bool and numeric columns converts to an
# object array that statsmodels refuses to fit.
# NOTE(review): columns that still hold integer codes are left untouched by
# get_dummies and therefore enter the model as continuous predictors.
X = pd.get_dummies(df_model.drop(columns='cible_blesse'), drop_first=True, dtype=float)
X = sm.add_constant(X)

# Target as float.
y = df_model['cible_blesse'].astype(float)

# Treat infinite values as missing, then drop any incomplete row so that
# predictors and target stay aligned.
X = X.replace([np.inf, -np.inf], np.nan)
df_clean = pd.concat([X, y], axis=1).dropna()
X_clean = df_clean.drop(columns='cible_blesse')
y_clean = df_clean['cible_blesse']

# Fit the binary logit model by maximum likelihood.
logit_model = sm.Logit(y_clean, X_clean)
result = logit_model.fit()

# Summary
print("=== Modèle logistique binaire : Blessé vs Indemne ===")
print(result.summary2())


Optimization terminated successfully.
         Current function value: 0.628451
         Iterations 5
=== Modèle logistique binaire : Blessé vs Indemne ===
                          Results: Logit
Model:              Logit            Method:           MLE        
Dependent Variable: cible_blesse     Pseudo R-squared: 0.083      
Date:               2025-05-12 00:48 AIC:              158481.6408
No. Observations:   126070           BIC:              158598.5759
Df Model:           11               Log-Likelihood:   -79229.    
Df Residuals:       126058           LL-Null:          -86437.    
Converged:          1.0000           LLR p-value:      0.0000     
No. Iterations:     5.0000           Scale:            1.0000     
--------------------------------------------------------------------
           Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
--------------------------------------------------------------------
const     -1.5427     0.0326   -47.3002   0.0000   -1.6067  

In [85]:
# Coefficients et odds ratios
params = result.params
conf = result.conf_int()
conf.columns = ['2.5%', '97.5%']
odds = np.exp(params)
conf_exp = np.exp(conf)
summary_table = pd.concat([params, odds, conf_exp], axis=1)
summary_table.columns = ['Coef.', 'Odds Ratio', 'OR 2.5%', 'OR 97.5%']
print(summary_table.round(4))


         Coef.  Odds Ratio  OR 2.5%  OR 97.5%
const  -1.5427      0.2138   0.2006    0.2279
sexe    0.2783      1.3209   1.2865    1.3561
age    -0.0066      0.9934   0.9927    0.9940
catv    0.0371      1.0378   1.0366    1.0389
catu    0.7819      2.1856   2.1361    2.2362
atm    -0.0005      0.9995   0.9926    1.0064
lum     0.0396      1.0404   1.0317    1.0492
trajet -0.0132      0.9869   0.9826    0.9912
secu1   0.0829      1.0864   1.0802    1.0927
secu2   0.0459      1.0470   1.0437    1.0504
choc   -0.0163      0.9839   0.9790    0.9888
manv   -0.0180      0.9822   0.9807    0.9836


In [86]:
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

# Variables explicatives
variables = ['sexe', 'age', 'catv', 'catu', 'atm', 'lum', 'trajet', 'secu1', 'secu2', 'choc', 'manv']

# Création de la base avec la gravité comme cible multinomiale
df_multi = base_finale[variables + ['grav']].dropna()

# Encodage
X = pd.get_dummies(df_multi.drop(columns='grav'), drop_first=True)
X = sm.add_constant(X)
y = df_multi['grav'].astype(int)

# Nettoyage
X.replace([np.inf, -np.inf], np.nan, inplace=True)
df_clean = pd.concat([X, y], axis=1).dropna()
X_clean = df_clean.drop(columns='grav')
y_clean = df_clean['grav']

# Régression logistique multinomiale (MNLogit)
mnlogit_model = sm.MNLogit(y_clean, X_clean)
mnlogit_result = mnlogit_model.fit()

# Résumé
print("=== Modèle logistique multinomial : prédiction de la gravité exacte ===")
print(mnlogit_result.summary())


Optimization terminated successfully.
         Current function value: 1.016981
         Iterations 8
=== Modèle logistique multinomial : prédiction de la gravité exacte ===
                          MNLogit Regression Results                          
Dep. Variable:                   grav   No. Observations:               126070
Model:                        MNLogit   Df Residuals:                   126034
Method:                           MLE   Df Model:                           33
Date:                Mon, 12 May 2025   Pseudo R-squ.:                 0.08394
Time:                        00:48:13   Log-Likelihood:            -1.2821e+05
converged:                       True   LL-Null:                   -1.3996e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
    grav=2       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.1749      0.106    