In [43]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score,classification_report

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

## Import des données

In [44]:
df = pd.read_csv("dataset_final.csv")
df.head()

Unnamed: 0,place,catu,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,...,senc,catv,obs,obsm,choc,manv,motor,an_acc,heure,minute
0,1,1,3,1,2008.0,5,2,8,-1,-1,...,1,2,0,2,1,9,1,2022,16,15
1,1,1,1,1,1948.0,5,1,8,-1,-1,...,1,7,0,2,2,1,1,2022,16,15
2,1,1,4,1,1988.0,9,1,0,-1,0,...,2,7,0,2,8,15,1,2022,8,34
3,1,1,1,1,1970.0,4,1,0,-1,0,...,2,10,0,2,1,1,1,2022,8,34
4,1,1,1,1,2002.0,0,1,0,-1,-1,...,2,7,0,2,1,2,1,2022,17,15


## Préprocessing

In [45]:
print(df.grav.value_counts())

print("-1 : non concerné, 1 – Indemne, 2 – Tué, 3 – Blessé hospitalisé, 4 – Blessé léger")


grav
 1    207354
 4    197494
 3     75986
 2     13047
-1       301
Name: count, dtype: int64
-1 : non concerné, 1 – Indemne, 2 – Tué, 3 – Blessé hospitalisé, 4 – Blessé léger


Comme on a pu le voir précédemment, le jeu de données est déséquilibré sur la variable cible. Par exemple, la proportion d'accident mortel est plus faible que celle d'indemne.

### Combinaison de indemne et non concerné

In [46]:
# combiner les variables indemne et non concerné car si la personne 
# n'est pas concerné par la gravité c'est qu'elle est indemne

df.grav = df.grav.astype(str)
df.grav = [value.replace("-1", "1") for value in df["grav"]]
df.grav = df.grav.astype(int)

print(df.grav.value_counts())

print("1 – Indemne, 2 – Tué, 3 – Blessé hospitalisé, 4 – Blessé léger")

# Pour la suite, je souhaite conserver pour la variable gravité les catégories indemne, tué, blessé hospitalisé 
# et blessé léger qui me paraissent intéressantes pour traiter le problème.

grav
1    207655
4    197494
3     75986
2     13047
Name: count, dtype: int64
1 – Indemne, 2 – Tué, 3 – Blessé hospitalisé, 4 – Blessé léger


### Séparation du jeu de données en train set et test set

In [47]:
from sklearn.model_selection import train_test_split

df = df.drop(['Accident_Id', 
                      "an_acc",
                      "larrout"], # doublon avec la variable "an"
                     axis=1)

# Supression des lignes contenant des données manquantes
df = df.dropna()

X_train, X_test, y_train, y_test = train_test_split(df.drop(["grav"], axis=1), df.grav, test_size=0.25)
print("Séparation des labels et targets :", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Séparation des labels et targets : (347733, 43) (115912, 43) (347733,) (115912,)


df = df.drop(['Accident_Id', 
                      "an_acc"], # doublon avec la variable "an"
                     axis=1)

# Supression des lignes contenant des données manquantes
df = df.dropna()

# Séparation du jeu de données pour entraîner le modèle
#train = df.loc[df.an.between(2019, 2021)] 
df_bis = df.loc[df.an.between(2020, 2022)]

train = df_bis.loc[df_bis.an.between(2020, 2021)]
test = df_bis.loc[df_bis.an == 2022]

print("DF: ", df_bis.shape,"TRAIN :", train.shape,"TEST :", test.shape)
print(" ")
print("La proportion du dataset train est de", round((train.shape[0]/ df_bis.shape[0]*100)),
      "% et du dataset test est de", round((test.shape[0]/ df_bis.shape[0]*100)), "%.")
print(" ")
X_train = train.drop(["grav"], axis=1)
y_train = train.grav

X_test = test.drop(["grav"], axis=1)
y_test = test.grav

print("Séparation des labels et targets :", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

### Standardisation

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled_std = scaler.fit_transform(X_train)
X_test_scaled_std = scaler.transform(X_test)

from sklearn.preprocessing import RobustScaler

rbs = RobustScaler().fit(X_train)
X_train_scaled_rbs = rbs.transform(X_train)
X_test_scaled_rbs = rbs.transform(X_test)

### Ré-équilibrage de la variable cible

#### Undersampler avec std

from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced

undersampler = RandomUnderSampler(random_state=42)
X_train_resampled_std, y_train_resampled = undersampler.fit_resample(X_train_scaled_std, y_train)

model_baseline_res = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)

model_baseline_res.fit(X_train_resampled_std, y_train_resampled)
accuracy_res = model_baseline_res.score(X_test_scaled_std, y_test)
print("Accuracy :", round(accuracy_res,6))

y_pred = model_baseline_res.predict(X_test_scaled_std)

print(pd.crosstab(y_test, y_pred, colnames=['Predictions']))

print(classification_report_imbalanced(y_test, y_pred))

#### Undersampler avec robustscaler

from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced

undersampler = RandomUnderSampler(random_state=42)
X_train_resampled_rbs, y_train_resampled = undersampler.fit_resample(X_train_scaled_rbs, y_train)

model_baseline_res = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)

model_baseline_res.fit(X_train_resampled_rbs, y_train_resampled)
accuracy_res = model_baseline_res.score(X_test_scaled_rbs, y_test)
print("Accuracy :", round(accuracy_res,6))

y_pred = model_baseline_res.predict(X_test_scaled_rbs)

print(pd.crosstab(y_test, y_pred, colnames=['Predictions']))

print(classification_report_imbalanced(y_test, y_pred))

#### Oversampler avec std

from imblearn.over_sampling import RandomOverSampler
from imblearn.metrics import classification_report_imbalanced

oversampler = RandomOverSampler(random_state=42)
X_train_resampov_std, y_train_resampov = oversampler.fit_resample(X_train_scaled_std, y_train)

model_baseline_res = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)

model_baseline_res.fit(X_train_resampov_std, y_train_resampov)
accuracy_res = model_baseline_res.score(X_test_scaled_std, y_test)
print("Accuracy :", round(accuracy_res,6))

y_pred = model_baseline_res.predict(X_test_scaled_std)

print(pd.crosstab(y_test, y_pred, colnames=['Predictions']))

print(classification_report_imbalanced(y_test, y_pred))

#### Oversampler avec rbs

from imblearn.over_sampling import RandomOverSampler
from imblearn.metrics import classification_report_imbalanced

oversampler = RandomOverSampler(random_state=42)
X_train_resampov_rbs, y_train_resampov = oversampler.fit_resample(X_train_scaled_rbs, y_train)

model_baseline_res = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)

model_baseline_res.fit(X_train_resampov_rbs, y_train_resampov)
accuracy_res = model_baseline_res.score(X_test_scaled_rbs, y_test)
print("Accuracy :", round(accuracy_res,6))

y_pred = model_baseline_res.predict(X_test_scaled_rbs)

print(pd.crosstab(y_test, y_pred, colnames=['Predictions']))

print(classification_report_imbalanced(y_test, y_pred))

### Features select

###### Faire la standardisation avant de sélectionner les variables est moins coûteux en terme de temps de calculs.

# Recherche de la meilleure sélection de variables avec undersampling

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

best_accuracy = 0
best_k = 0

for k in range(1, X_train.shape[1] + 1):
    select_k_best = SelectKBest(score_func=f_classif, k=k)
    
    X_train_k_best = select_k_best.fit_transform(X_train, y_train)
    X_test_k_best = select_k_best.transform(X_test)
    
    model_2 = OneVsRestClassifier(LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000))
    model_2.fit(X_train_k_best, y_train)
    
    y_pred = model_2.predict(X_test_k_best)
    accuracy = accuracy_score(y_test, y_pred)
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k

print("Best number of features:", best_k)
print("Best accuracy:", best_accuracy)

# Recherche de la meilleure sélection de variables avec oversampling

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score

best_accuracy = 0
best_k = 0

for k in range(1, X_train.shape[1] + 1):
    select_k_best = SelectKBest(score_func=f_classif, k=k)
    
    X_train_k_best = select_k_best.fit_transform(X_train_resampov_, y_train_resampov)
    X_test_k_best = select_k_best.transform(X_test_scaled_rbs)
    
    model_2 = OneVsRestClassifier(LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000))
    model_2.fit(X_train_k_best, y_train_resampov)
    
    y_pred = model_2.predict(X_test_k_best)
    accuracy = accuracy_score(y_test, y_pred)
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k

print("Best number of features:", best_k)
print("Best accuracy:", best_accuracy)

# sélection des k des meilleures features définies précédemment
test_stat =  SelectKBest(f_classif, k=33)
test_stat.fit(X_train_scaled, y_train)

for col, score in zip(X_train.columns, test_stat.scores_):
    print(col, ":", score)

X_train_selected = test_stat.transform(X_train_scaled)
X_test_selected = test_stat.transform(X_test_scaled)

### Baseline model

#### Régression logistique : recherche de la meilleure "normalisation" des données

In [49]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression

scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}

model = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for scaler_name, scaler in scalers.items():
    pipeline = make_pipeline(scaler, model)
    scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')

    print(f"{scaler_name}: Accuracy moyenne = {scores.mean()}, Écart-type = {scores.std()}")
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    report = classification_report(y_test, y_pred)
    
    print(f"{scaler_name} Classification Report:")
    print(report)

StandardScaler: Accuracy moyenne = 0.5920519420436421, Écart-type = 0.0012786740888283547
StandardScaler Classification Report:
              precision    recall  f1-score   support

           1       0.61      0.77      0.68     48151
           2       0.42      0.00      0.01      2834
           3       0.46      0.19      0.27     17228
           4       0.58      0.59      0.59     47699

    accuracy                           0.59    115912
   macro avg       0.52      0.39      0.39    115912
weighted avg       0.57      0.59      0.56    115912

MinMaxScaler: Accuracy moyenne = 0.5920979551268069, Écart-type = 0.0011513296019927224
MinMaxScaler Classification Report:
              precision    recall  f1-score   support

           1       0.61      0.77      0.68     48151
           2       0.33      0.00      0.00      2834
           3       0.46      0.19      0.27     17228
           4       0.58      0.59      0.59     47699

    accuracy                           0.

#### Random forest : recherche de la meilleure "normalisation" des données

In [None]:
model_rf = RandomForestClassifier(random_state=0, class_weight='balanced')

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for scaler_name, scaler in scalers.items():
    pipeline = make_pipeline(scaler, model_rf)
    scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')

    print(f"{scaler_name}: Accuracy moyenne = {scores.mean()}, Écart-type = {scores.std()}")

StandardScaler: Accuracy moyenne = 0.6836480886355647, Écart-type = 0.002237072983282108
MinMaxScaler: Accuracy moyenne = 0.6841743512852378, Écart-type = 0.0019054375241854794
RobustScaler: Accuracy moyenne = 0.6847408803498939, Écart-type = 0.0017425095708648525


#### KNN : recherche de la meilleure "normalisation" des données

In [None]:
model_knn = KNeighborsClassifier()

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for scaler_name, scaler in scalers.items():
    pipeline = make_pipeline(scaler, model_knn)
    
    undersampler = RandomUnderSampler(random_state=42)
    X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
    
    scores = cross_val_score(pipeline, X_train_resampled, y_train_resampled, cv=cv, scoring='accuracy')
    
    print(f"{scaler_name}: Accuracy moyenne = {scores.mean()}, Écart-type = {scores.std()}")


StandardScaler: Accuracy moyenne = 0.45268491790183385, Écart-type = 0.002397667752622828
MinMaxScaler: Accuracy moyenne = 0.4256641721000968, Écart-type = 0.006467772030212853
RobustScaler: Accuracy moyenne = 0.4761988469142128, Écart-type = 0.0044322465651383085


#### XGBoost : recherche de la meilleure "normalisation" des données

In [None]:
model_xgb = GradientBoostingClassifier(random_state=0)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for scaler_name, scaler in scalers.items():
    pipeline = make_pipeline(scaler, model_xgb)
    
    undersampler = RandomUnderSampler(random_state=42)
    X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
    
    scores = cross_val_score(pipeline, X_train_resampled, y_train_resampled, cv=cv, scoring='accuracy')
    
    print(f"{scaler_name}: Accuracy moyenne = {scores.mean()}, Écart-type = {scores.std()}")

StandardScaler: Accuracy moyenne = 0.5778431280923595, Écart-type = 0.004693858083687763
MinMaxScaler: Accuracy moyenne = 0.5788491849679156, Écart-type = 0.004196471115224787
RobustScaler: Accuracy moyenne = 0.5779293638997789, Écart-type = 0.004602568547734059
