In [2]:
import pandas as pd
import numpy as np
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
import prince

from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV, RandomizedSearchCV, validation_curve
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder, PolynomialFeatures, StandardScaler, SplineTransformer, KBinsDiscretizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, roc_auc_score, f1_score, roc_curve, auc, recall_score, precision_recall_curve, make_scorer
from sklearn.kernel_approximation import Nystroem
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
from sklearn.svm import LinearSVC 


from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.metrics import classification_report_imbalanced

from prince import FAMD

from scipy.stats import chi2_contingency

from yellowbrick.classifier import DiscriminationThreshold
from sklego.meta import Thresholder

In [3]:
file = open('../../../conf/global_conf.yml', 'r') 
conf = yaml.safe_load(file)
df = pd.read_csv(conf['local_data_path'] + "data_cleaned_final_sans_dummies.csv",index_col=0,low_memory=False)

In [4]:
print(df.shape)

(447136, 41)


In [5]:
data = df.drop(['grav_rec', 'an', 'jour', 'date', 'dep', 'grav'], axis=1)
y = df.grav
numerical_features = ['age_usager', 'mois', 'heure', 'lat', 'long']
categorical_features = [col for col in data.columns if col not in numerical_features]
data[categorical_features] = data[categorical_features].astype('object')
#data.info()

In [6]:
data.head()

Unnamed: 0,mois,lum,agg,int,atm,col,lat,long,catr,circ,...,eq_ceinture,eq_casque,eq_siege,eq_gilet,eq_airbag,eq_gants,eq_indetermine,eq_autre,jour_chome,prox_pt_choc
0,11,2.0,0,0.0,0.0,2.0,48.89621,2.47012,1,0.0,...,1,0,0,0,0,0,0,0,1,0
1,11,2.0,0,0.0,0.0,2.0,48.89621,2.47012,1,0.0,...,1,0,0,0,0,0,0,0,1,0
2,11,2.0,0,0.0,0.0,2.0,48.89621,2.47012,1,0.0,...,1,0,0,0,0,0,0,0,1,1
3,11,2.0,0,0.0,0.0,6.0,48.9307,2.3688,1,0.0,...,1,0,0,0,0,0,0,0,1,1
4,11,0.0,0,0.0,0.0,4.0,48.935872,2.319174,1,0.0,...,1,0,0,0,0,0,0,0,1,1


In [7]:
data.shape

(447136, 35)

In [8]:
data_train, data_test, y_train, y_test = train_test_split(data, y, test_size=0.25, random_state=42, stratify=y)

In [9]:
class TransfoHour(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_transfo = X.copy()
        X_transfo[f'{self.column_name}_sin'] = np.sin(X_transfo[self.column_name]*(2.*np.pi/24))
        X_transfo[f'{self.column_name}_cos'] = np.cos(X_transfo[self.column_name]*(2.*np.pi/24))
        X_transfo = X_transfo.drop(self.column_name, axis=1)
        return X_transfo
    
    def get_feature_names_out(self):
        pass
    
class TransfoMonth(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_transfo = X.copy()
        X_transfo[f'{self.column_name}_sin'] = np.sin((X_transfo[self.column_name]-1)*(2.*np.pi/12))
        X_transfo[f'{self.column_name}_cos'] = np.cos((X_transfo[self.column_name]-1)*(2.*np.pi/12))
        X_transfo = X_transfo.drop(self.column_name, axis=1)
        return X_transfo

    def get_feature_names_out(self):
        pass
    

**Suppport Vector Classifier**

Complexité des SVM entre O(n_features x n_samples**2) et O(n_features x n_samples**3)
==> Difficilement envisageable compte-tenu de notre base de données

Il est alors recommandé dans la documentation de considérer plutôt LinearSVC ou SGDClassifier, après éventuellement une transformation de type Nystroem (ou autre Kernel Approx)

In [8]:
preprocessor = ColumnTransformer(transformers=[
    ("transfo_heure", TransfoHour('heure'), ['heure']),
    ("transfo_mois", TransfoMonth('mois'), ['mois']),
    ("transfo_lat_long", RobustScaler(), ['lat', 'long']),
    ('transfo_age_usager', StandardScaler(), ['age_usager']),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ['place_rec','catr', 'surf', 'infra', 'situ', 'motor', 'manv', 'obsm', 'catv', 'col', 'lum'])
    ],
    remainder="passthrough"
).set_output(transform="pandas")

svm = Pipeline(steps=[("preprocessor", preprocessor),
                      ("nystroem", Nystroem(degree=2, n_components=50)),
                      ("svm", SGDClassifier(class_weight='balanced', random_state=42))])
svm

In [10]:
f1 = make_scorer(f1_score, average='micro')
cv_results = cross_validate(svm, data, y, cv=3, 
                            return_estimator=True, return_train_score=True, scoring=f1,
                            n_jobs=-1, verbose=10)
cv_results = pd.DataFrame(cv_results)
cv_results

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   29.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   29.5s finished


Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,16.136083,3.16034,"(ColumnTransformer(remainder='passthrough',\n ...",0.501557,0.503435
1,15.320088,3.142998,"(ColumnTransformer(remainder='passthrough',\n ...",0.53031,0.534662
2,15.593114,2.765673,"(ColumnTransformer(remainder='passthrough',\n ...",0.487168,0.492232


**Hyperparameter tuning**

In [28]:
svm = Pipeline(steps=[("preprocessor", preprocessor),
                      ("nystroem", Nystroem()),
                      ("svm", SGDClassifier(class_weight='balanced', penalty='elasticnet'))])

param_grid ={"nystroem__n_components": [50, 100, 150],"nystroem__degree": [2, 3], "svm__l1_ratio": [0.25, 0.5, 0.75], "svm__loss": ['hinge', 'squared_hinge', 'perceptron'] }

f1 = make_scorer(f1_score, average='micro')
model_gds = GridSearchCV(svm,
                         param_grid=param_grid,
                         scoring=f1,  
                         cv=2,
                         return_train_score=True,
                         n_jobs=-1)
model_gds.fit(data_train, y_train)

In [33]:
cv_results = pd.DataFrame(model_gds.cv_results_)
cv_results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_nystroem__degree,param_nystroem__n_components,param_svm__l1_ratio,param_svm__loss,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
18,25.519807,1.264855,6.052255,0.111375,2,150,0.25,hinge,"{'nystroem__degree': 2, 'nystroem__n_component...",0.57663,0.586268,0.581449,0.004819,1,0.577006,0.586339,0.581672,0.004667
42,18.469924,1.376257,5.145642,0.181162,3,100,0.75,hinge,"{'nystroem__degree': 3, 'nystroem__n_component...",0.574549,0.577292,0.57592,0.001372,2,0.575509,0.578443,0.576976,0.001467
24,27.38385,1.183665,6.264781,0.039962,2,150,0.75,hinge,"{'nystroem__degree': 2, 'nystroem__n_component...",0.587073,0.563867,0.57547,0.011603,3,0.588546,0.564875,0.57671,0.011835
9,16.889954,0.460381,4.858773,0.072728,2,100,0.25,hinge,"{'nystroem__degree': 2, 'nystroem__n_component...",0.572849,0.575819,0.574334,0.001485,4,0.57227,0.576928,0.574599,0.002329
51,34.37286,4.763933,7.088606,0.460832,3,150,0.75,hinge,"{'nystroem__degree': 3, 'nystroem__n_component...",0.57066,0.566694,0.568677,0.001983,5,0.571746,0.567183,0.569464,0.002281
15,25.458669,0.240117,6.074267,0.486528,2,100,0.75,hinge,"{'nystroem__degree': 2, 'nystroem__n_component...",0.562567,0.564744,0.563656,0.001088,6,0.561971,0.566312,0.564142,0.002171
21,26.145347,2.08504,6.337249,0.366278,2,150,0.5,hinge,"{'nystroem__degree': 2, 'nystroem__n_component...",0.569694,0.555649,0.562671,0.007022,7,0.570392,0.557217,0.563805,0.006587
45,28.876656,1.113448,6.152997,0.179766,3,150,0.25,hinge,"{'nystroem__degree': 3, 'nystroem__n_component...",0.530016,0.578759,0.554388,0.024371,8,0.530726,0.57994,0.555333,0.024607
36,24.011825,0.391377,6.267576,0.166827,3,100,0.25,hinge,"{'nystroem__degree': 3, 'nystroem__n_component...",0.560384,0.545904,0.553144,0.00724,9,0.559919,0.54721,0.553565,0.006355
48,25.968691,0.142175,6.229208,0.17299,3,150,0.5,hinge,"{'nystroem__degree': 3, 'nystroem__n_component...",0.540155,0.562537,0.551346,0.011191,10,0.540042,0.563891,0.551966,0.011925


In [32]:
model_gds.best_params_

{'nystroem__degree': 2,
 'nystroem__n_components': 150,
 'svm__l1_ratio': 0.25,
 'svm__loss': 'hinge'}

In [37]:
svm = Pipeline(steps=[("preprocessor", preprocessor),
                      ("nystroem", Nystroem(degree=2)),
                      ("svm", SGDClassifier(class_weight='balanced', penalty='elasticnet', loss="hinge", learning_rate='optimal'))])

param_grid ={"nystroem__n_components":[100, 200, 300], "svm__l1_ratio": [0, 0.05, 0.1] }

f1 = make_scorer(f1_score, average='micro')
model_gds = GridSearchCV(svm,
                         param_grid=param_grid,
                         scoring=f1,  
                         cv=2,
                         return_train_score=True,
                         n_jobs=-1)
model_gds.fit(data_train, y_train)

In [38]:
cv_results = pd.DataFrame(model_gds.cv_results_)
cv_results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_nystroem__n_components,param_svm__l1_ratio,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
5,35.20693,1.702688,7.301023,0.328096,200,0.1,"{'nystroem__n_components': 200, 'svm__l1_ratio...",0.59312,0.583196,0.588158,0.004962,1,0.594575,0.584443,0.589509,0.005066
8,29.858306,1.811267,4.720834,0.120876,300,0.1,"{'nystroem__n_components': 300, 'svm__l1_ratio...",0.58579,0.585761,0.585776,1.5e-05,2,0.587609,0.586733,0.587171,0.000438
4,35.929936,1.503262,8.163847,0.052063,200,0.05,"{'nystroem__n_components': 200, 'svm__l1_ratio...",0.586464,0.582498,0.584481,0.001983,3,0.587615,0.583089,0.585352,0.002263
6,43.31224,0.460773,6.828526,0.664712,300,0.0,"{'nystroem__n_components': 300, 'svm__l1_ratio...",0.582385,0.574447,0.578416,0.003969,4,0.583471,0.575181,0.579326,0.004145
1,22.866445,0.806484,6.313234,0.375257,100,0.05,"{'nystroem__n_components': 100, 'svm__l1_ratio...",0.588051,0.567368,0.577709,0.010341,5,0.589041,0.568406,0.578723,0.010318
0,20.443409,0.547311,5.364825,0.092179,100,0.0,"{'nystroem__n_components': 100, 'svm__l1_ratio...",0.576701,0.573356,0.575029,0.001673,6,0.57802,0.574608,0.576314,0.001706
7,39.412665,0.04861,5.747487,0.343825,300,0.05,"{'nystroem__n_components': 300, 'svm__l1_ratio...",0.577274,0.568966,0.57312,0.004154,7,0.577787,0.569276,0.573532,0.004255
3,34.962783,0.455129,7.715434,0.027442,200,0.0,"{'nystroem__n_components': 200, 'svm__l1_ratio...",0.583071,0.55785,0.57046,0.012611,8,0.584037,0.558488,0.571262,0.012775
2,25.212926,3.166011,6.513477,0.133649,100,0.1,"{'nystroem__n_components': 100, 'svm__l1_ratio...",0.575586,0.521386,0.548486,0.0271,9,0.575467,0.522066,0.548767,0.0267


In [39]:
y_pred_best_est = model_gds.best_estimator_.predict(data_test)
cr = classification_report(y_test, y_pred_best_est)
print(cr)
pd.crosstab(y_test, y_pred_best_est)

              precision    recall  f1-score   support

         1.0       0.68      0.80      0.74     46137
         2.0       0.10      0.62      0.18      3050
         3.0       0.38      0.25      0.30     17500
         4.0       0.68      0.42      0.52     45097

    accuracy                           0.56    111784
   macro avg       0.46      0.52      0.43    111784
weighted avg       0.62      0.56      0.57    111784



col_0,1.0,2.0,3.0,4.0
grav,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,36940,3648,1684,3865
2.0,322,1890,460,378
3.0,2233,6056,4369,4842
4.0,14759,6431,4853,19054


**LinearSVC**

In [11]:
preprocessor = ColumnTransformer(transformers=[
    ("transfo_heure", TransfoHour('heure'), ['heure']),
    ("transfo_mois", TransfoMonth('mois'), ['mois']),
    ("transfo_lat_long", RobustScaler(), ['lat', 'long']),
    ('transfo_age_usager', StandardScaler(), ['age_usager']),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ['place_rec','catr', 'surf', 'infra', 'situ', 'motor', 'manv', 'obsm', 'catv', 'col', 'lum'])
    ],
    remainder="passthrough"
).set_output(transform="pandas")

svm = Pipeline(steps=[("preprocessor", preprocessor),
                      ("nystroem", Nystroem(degree=2, n_components=300)),
                      ("svm", LinearSVC(class_weight='balanced', loss="hinge", random_state=42))])

In [42]:
f1 = make_scorer(f1_score, average='micro')
cv_results = cross_validate(svm, data, y, cv=3, 
                            return_estimator=True, return_train_score=True, scoring=f1,
                            n_jobs=-1, verbose=10)
cv_results = pd.DataFrame(cv_results)
cv_results

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.2min finished


Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,108.942628,5.208589,"(ColumnTransformer(remainder='passthrough',\n ...",0.602344,0.601416
1,113.88791,5.932202,"(ColumnTransformer(remainder='passthrough',\n ...",0.598309,0.602121
2,103.371919,4.895887,"(ColumnTransformer(remainder='passthrough',\n ...",0.59586,0.602715


In [43]:
svm = Pipeline(steps=[("preprocessor", preprocessor),
                      ("nystroem", Nystroem(degree=2, n_components=300)),
                      ("svm", LinearSVC(class_weight='balanced', loss="hinge", random_state=42))])

param_grid ={"svm__penalty": ['l1', 'l2'], "svm__loss": ['hinge', 'squared_hinge'], "svm__C": [0.1, 1, 10], "svm__multi_class": ['ovr', 'crammer_singer'] }

f1 = make_scorer(f1_score, average='micro')
model_gds = GridSearchCV(svm,
                         param_grid=param_grid,
                         scoring=f1,  
                         cv=2,
                         return_train_score=True,
                         n_jobs=-1)
model_gds.fit(data_train, y_train)



12 fits failed out of a total of 48.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dcons\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dcons\anaconda3\Lib\site-packages\imblearn\pipeline.py", line 297, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "c:\Users\dcons\anaconda3\Lib\site-packages\sklearn\svm\_classes.py", line 274, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
                                           ^^^^^^^^^^^^^^^
  File "c:\Users\dcons\anaconda3\Lib\site-packages

In [44]:
cv_results = pd.DataFrame(model_gds.cv_results_)
cv_results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svm__C,param_svm__loss,param_svm__multi_class,param_svm__penalty,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
13,194.513765,3.059416,11.66086,0.269053,1.0,squared_hinge,ovr,l2,"{'svm__C': 1, 'svm__loss': 'squared_hinge', 's...",0.618174,0.620989,0.619582,0.001407,1,0.624114,0.619797,0.621955,0.002159
21,1482.742662,16.583725,16.998456,1.464546,10.0,squared_hinge,ovr,l2,"{'svm__C': 10, 'svm__loss': 'squared_hinge', '...",0.619188,0.619254,0.619221,3.3e-05,2,0.625456,0.618514,0.621985,0.003471
5,53.1525,0.041097,11.611428,1.109172,0.1,squared_hinge,ovr,l2,"{'svm__C': 0.1, 'svm__loss': 'squared_hinge', ...",0.611179,0.615139,0.613159,0.00198,3,0.616582,0.614328,0.615455,0.001127
17,442.206008,11.418085,15.04621,0.164176,10.0,hinge,ovr,l2,"{'svm__C': 10, 'svm__loss': 'hinge', 'svm__mul...",0.598171,0.6024,0.600286,0.002114,4,0.60364,0.603253,0.603447,0.000194
9,102.118411,2.047174,12.74716,0.418798,1.0,hinge,ovr,l2,"{'svm__C': 1, 'svm__loss': 'hinge', 'svm__mult...",0.600181,0.598494,0.599337,0.000844,5,0.603408,0.597629,0.600518,0.00289
1,50.426952,3.478836,12.25404,1.094324,0.1,hinge,ovr,l2,"{'svm__C': 0.1, 'svm__loss': 'hinge', 'svm__mu...",0.586924,0.583906,0.585415,0.001509,6,0.588051,0.584878,0.586464,0.001586
22,1964.823188,97.996867,9.990258,0.644712,10.0,squared_hinge,crammer_singer,l1,"{'svm__C': 10, 'svm__loss': 'squared_hinge', '...",0.563193,0.558637,0.560915,0.002278,7,0.565316,0.560456,0.562886,0.00243
19,2029.753111,179.441696,11.431055,0.406208,10.0,hinge,crammer_singer,l2,"{'svm__C': 10, 'svm__loss': 'hinge', 'svm__mul...",0.563796,0.557206,0.560501,0.003295,8,0.566438,0.55949,0.562964,0.003474
18,2425.241696,625.227105,10.563661,5.131244,10.0,hinge,crammer_singer,l1,"{'svm__C': 10, 'svm__loss': 'hinge', 'svm__mul...",0.55872,0.559448,0.559084,0.000364,9,0.561207,0.561511,0.561359,0.000152
23,1276.179487,1.770386,3.98132,0.364202,10.0,squared_hinge,crammer_singer,l2,"{'svm__C': 10, 'svm__loss': 'squared_hinge', '...",0.557546,0.560223,0.558884,0.001339,10,0.562209,0.56172,0.561965,0.000245


In [45]:
y_pred_best_est = model_gds.best_estimator_.predict(data_test)
cr = classification_report(y_test, y_pred_best_est)
print(cr)
pd.crosstab(y_test, y_pred_best_est)

              precision    recall  f1-score   support

         1.0       0.71      0.80      0.75     46137
         2.0       0.15      0.39      0.22      3050
         3.0       0.45      0.39      0.42     17500
         4.0       0.67      0.54      0.60     45097

    accuracy                           0.62    111784
   macro avg       0.49      0.53      0.49    111784
weighted avg       0.64      0.62      0.62    111784



col_0,1.0,2.0,3.0,4.0
grav,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,37054,1638,1715,5730
2.0,325,1198,1026,501
3.0,2176,2744,6759,5821
4.0,12971,2405,5491,24230


In [23]:
svm = Pipeline(steps=[("preprocessor", preprocessor),
                      ("nystroem", Nystroem(degree=2, n_components=300)),
                      ("svm", LinearSVC(loss="hinge", random_state=42, max_iter = 15000))])
dic1 = {1.0:0.606, 2.0:12, 3.0:1.597, 4.0:0.620}
dic2 = {1.0:0.2, 2.0:9.162, 3.0:1.597, 4.0:0.620}
dic3 = {1.0:0.6, 2.0:9.162, 3.0:3., 4.0:0.620}

param_grid ={"svm__class_weight": ['balanced', dic1, dic2, dic3]}

f1 = make_scorer(f1_score, average='micro')
model_gds = GridSearchCV(svm,
                         param_grid=param_grid,
                         scoring=f1,  
                         cv=2,
                         return_train_score=True,
                         n_jobs=-1)
model_gds.fit(data_train, y_train)



In [24]:
cv_results = pd.DataFrame(model_gds.cv_results_)
cv_results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svm__class_weight,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
0,251.241895,26.240875,6.054982,1.601264,balanced,{'svm__class_weight': 'balanced'},0.599024,0.597665,0.598344,0.00068,1,0.602776,0.598171,0.600474,0.002302
3,216.691419,8.194533,7.004648,1.038653,"{1.0: 0.6, 2.0: 9.162, 3.0: 3.0, 4.0: 0.62}","{'svm__class_weight': {1.0: 0.6, 2.0: 9.162, 3...",0.589124,0.59086,0.589992,0.000868,2,0.594062,0.589309,0.591686,0.002377
1,176.900285,47.95382,9.120908,0.173061,"{1.0: 0.606, 2.0: 12, 3.0: 1.597, 4.0: 0.62}","{'svm__class_weight': {1.0: 0.606, 2.0: 12, 3....",0.585796,0.578795,0.582296,0.003501,3,0.588969,0.57892,0.583945,0.005025
2,280.167841,5.845052,4.770036,0.214188,"{1.0: 0.2, 2.0: 9.162, 3.0: 1.597, 4.0: 0.62}","{'svm__class_weight': {1.0: 0.2, 2.0: 9.162, 3...",0.559156,0.557838,0.558497,0.000659,4,0.563217,0.557551,0.560384,0.002833


"balanced" conduit aux meilleurs résultats

In [13]:
svm = Pipeline(steps=[("preprocessor", preprocessor),
                      ("nystroem", Nystroem(degree=2, n_components=300)),
                      ("svm", LinearSVC(loss="hinge", random_state=42, max_iter = 15000, class_weight='balanced'))])
svm.fit(data_train, y_train)
y_pred = svm.predict(data_test)
cr = classification_report_imbalanced(y_test, y_pred)
print(cr)
pd.crosstab(y_test, y_pred)



                   pre       rec       spe        f1       geo       iba       sup

        1.0       0.69      0.84      0.74      0.76      0.78      0.62     46137
        2.0       0.13      0.48      0.91      0.21      0.66      0.42      3050
        3.0       0.43      0.32      0.92      0.36      0.54      0.27     17500
        4.0       0.68      0.49      0.85      0.57      0.64      0.40     45097

avg / total       0.63      0.60      0.82      0.60      0.68      0.47    111784



col_0,1.0,2.0,3.0,4.0
grav,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,38531,1989,1705,3912
2.0,321,1470,811,448
3.0,2323,3887,5513,5777
4.0,14691,3595,4895,21916
