___
___
# **Important Note**
**READ ME :**

The final solution has been split into several notebooks.
- 1. Create features
- 2. Train LGBM (8 CPU) + inference on test data
- 3. Train XGBOOST and CATBOOST (GPU) + inference on test data
- 4. Final ensemble and submission (this notebook)
    
I will provide all notebooks if needed at the end of the competition.


___
___
# **Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.ensemble import VotingRegressor

import xgboost as xgb
import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier
from tqdm import tqdm

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

import warnings
# Silence every warning for the rest of the notebook.
warnings.simplefilter("ignore")

# Show all columns when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

___
___
# **Open Data**

In [None]:
# Directory that holds train.csv / test.csv.
# Use the Kaggle dataset mount when it exists, otherwise fall back to the local
# "Data/" folder, so switching environments needs no manual edit of this cell.
import os

KAGGLE_DATA_PATH = "/kaggle/input/trustii-processed-data/"
LOCAL_DATA_PATH = "Data/"

data_path = KAGGLE_DATA_PATH if os.path.isdir(KAGGLE_DATA_PATH) else LOCAL_DATA_PATH

In [None]:
# Load the train and test tables from `data_path`.
# Alternative Kaggle location: /kaggle/input/trustii-aim/{train,test}.csv
df = pd.read_csv(f"{data_path}train.csv")
test_df = pd.read_csv(f"{data_path}test.csv")

# Quick look at the test table: its dimensions, then its first rows.
print(test_df.shape)
display(test_df.head())

In [None]:
# Label -> integer code (0..3), assigned in the order the labels are listed below.
dico_y = {
    label: code
    for code, label in enumerate((
        'A corriger de manière urgente',
        'A améliorer',
        'Satisfaisant',
        'Très satisfaisant',
    ))
}

# Reverse lookup: integer code -> label.
dico_y_INVERSE = dict(zip(dico_y.values(), dico_y.keys()))

In [None]:
# Target labels, listed in the same order as their integer codes in `dico_y`.
# Per-class prediction columns are named "<column prefix>_<label>".
target_classes = [
    'A corriger de manière urgente',
    'A améliorer',
    'Satisfaisant',
    'Très satisfaisant',
]

___
___
# **Combine oof**

In [None]:
# Attach the columns of each OOF file to the training frame.
# A left join on 'Numero_inspection' keeps every row of `df`.
for oof_csv in ('OOF/train_oof_lgbm.csv', 'OOF/train_oof_xgb_catboost.csv'):
    df = df.merge(pd.read_csv(oof_csv), how='left', on='Numero_inspection')

print(df.shape)
display(df.head(3))

In [None]:
# Attach the columns of each test-prediction file to the test frame.
# 'trustii_id' is dropped from the incoming file before the join, so the merge
# adds no second copy of that column; the join key is 'Numero_inspection'.
for pred_csv in ('OOF/test_oof_lgbm.csv', 'OOF/test_oof_xgb_catboost.csv'):
    test_df = test_df.merge(
        pd.read_csv(pred_csv).drop(columns=['trustii_id']),
        how='left',
        on='Numero_inspection',
    )

print(test_df.shape)
display(test_df.head(3))

___
___
# **Ensemble**

In [None]:
# Collect the base name of every model whose per-class OOF columns are in `df`.
# Per-class columns are named "<base>_<class label>"; `cols_oof` holds each
# distinct "<base>" in order of first appearance, and `cols_pred` holds the
# matching test-side names ("oof_" replaced by "pred_").

# Candidate columns: numeric, prefixed "oof_" (but not "oof_class"), and
# containing "_param". `is_numeric_dtype` is used because comparing a dtype to
# the string 'str' does not exclude object-dtype text columns.
cols_oof_ = [
    x for x in df.columns
    if x.startswith('oof_')
    and not x.startswith('oof_class')
    and '_param' in x
    and pd.api.types.is_numeric_dtype(df[x])
]

class_suffixes = ['_' + label for label in target_classes]

cols_oof = []
for x in cols_oof_:
    for suffix in class_suffixes:
        if not x.endswith(suffix):
            continue
        # Strip only the trailing suffix so that base + suffix rebuilds the
        # exact column name (str.replace would also remove inner occurrences).
        new_x = x[:-len(suffix)]
        if new_x not in cols_oof:
            cols_oof.append(new_x)

cols_pred = [x.replace('oof_', 'pred_') for x in cols_oof]
print(cols_oof)
print(cols_pred)

In [None]:
# Blending weights: 'w<i>' is applied to the i-th entry of cols_oof / cols_pred
# (the ensemble cells pass list(weights.values()) positionally).
weights = {
    f'w{position}': value
    for position, value in enumerate([
        0.080504,
        0.054101,
        0.070493,
        0.097491,
        0.001788,
        0.082893,
        0.039908,
        0.072237,
        0.073922,
        0.005372,
        0.100776,
        0.001794,
        0.003368,
        0.0531,
        0.044548,
        0.045245,
        0.019479,
        0.085234,
        0.067748,
    ])
}

# Rescale the weights so that they sum to 1.
S = np.sum(list(weights.values()))
weights = {name: value / S for name, value in weights.items()}

weights

In [None]:
%%time

# Blend the base models on the test set: for each class, the ensemble score is
# the weighted average of the per-model columns "<pred col>_<class>".
blend_weights = list(weights.values())
ensemble_cols = []
for t_class in target_classes:
    model_cols = [f"{col}_{t_class}" for col in cols_pred]
    ensemble_col = f'ensemble_pred_{t_class}'
    test_df[ensemble_col] = np.average(test_df[model_cols].values, weights=blend_weights, axis=1)
    ensemble_cols.append(ensemble_col)

# Predicted label = class with the highest blended score (column prefix removed).
test_df['Synthese_eval_sanit'] = (
    test_df[ensemble_cols]
    .idxmax(axis=1)
    .apply(lambda col_name: col_name.replace('ensemble_pred_', ''))
)

test_df['Synthese_eval_sanit'].value_counts()

In [None]:
# Apply the same weighted blend to the OOF columns of the training frame, then
# score the resulting labels against the true 'Synthese_eval_sanit' column.
blend_weights = list(weights.values())
ensemble_cols = []
for t_class in target_classes:
    model_cols = [f"{col}_{t_class}" for col in cols_oof]
    ensemble_col = f'ensemble_pred_{t_class}'
    df[ensemble_col] = np.average(df[model_cols].values, weights=blend_weights, axis=1)
    ensemble_cols.append(ensemble_col)

# Predicted label = class with the highest blended score (column prefix removed).
predictions = (
    df[ensemble_cols]
    .idxmax(axis=1)
    .apply(lambda col_name: col_name.replace('ensemble_pred_', ''))
)

print("CV :", accuracy_score(df['Synthese_eval_sanit'], predictions))

___
___
# **Submission**

In [None]:
# Build the submission table: the id (cast to int) and the predicted label.
sub = (
    test_df[['trustii_id', 'Synthese_eval_sanit']]
    .reset_index(drop=True)
    .assign(trustii_id=lambda frame: frame['trustii_id'].astype(int))
)

# Write the file without the index column.
sub.to_csv('submission.csv', index=False)

print(sub.shape)
sub

In [None]:
# Sanity check: number of rows per predicted label in the submission.
submitted_labels = sub['Synthese_eval_sanit']
submitted_labels.value_counts()