<a href="https://colab.research.google.com/github/Pascalinooo/Info/blob/master/catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost
!pip install logitboost

Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4
Collecting logitboost
  Downloading logitboost-0.7-py3-none-any.whl (9.1 kB)
Installing collected packages: logitboost
Successfully installed logitboost-0.7


In [None]:
import pandas as pd
import numpy as np

import os
# MODELS
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import MultinomialNB, CategoricalNB, GaussianNB
from catboost import CatBoostClassifier
import logitboost
# TUNING
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# SCORES
from sklearn.metrics import matthews_corrcoef, make_scorer, balanced_accuracy_score
# PREPROCESSING
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from scipy.stats import uniform, truncnorm, randint

# DATA
# Read the train and test splits from paths relative to the notebook's working
# directory.
# NOTE(review): a column named "Unnamed:0" is removed further down, which
# suggests the CSV files carry a saved row index; presumably `index_col=0`
# would avoid loading it as a feature -- confirm against the files.
df_train = pd.read_csv('sample_data/train_complete.csv')
df_test = pd.read_csv('sample_data/test_complete.csv')

In [None]:
###################
#### DATA SETS ####
###################

# Targets: the 'cps19_y' column of each split.
y_train, y_test = df_train['cps19_y'], df_test['cps19_y']

# Features: every column except the target and 'cps19_prov_id'.
X_train, X_test = (
    frame.drop(columns=['cps19_y', 'cps19_prov_id'])
    for frame in (df_train, df_test)
)

In [None]:
#######################
### LABEL ENCODING ####
#######################

# SOL: LabelEncoder is meant for the targets (y), not for the features (X)!
# Since fit() does not accept strings, every string column has to be encoded
# first, either one-hot or ordinal:
# OneHotEncoder : https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder
# OrdinalEncoder : https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html#sklearn.preprocessing.OrdinalEncoder

def hot_encoder(data):
    """One-hot encode every object-dtype column of a DataFrame.

    Each column whose dtype is ``object`` is replaced by the indicator columns
    of a ``OneHotEncoder`` fitted on that column alone. The indicator columns
    are named with ``get_feature_names_out`` and placed after the untouched
    columns, in the order in which the encoded columns appeared.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself when it has no object-dtype column, otherwise a new
        frame that keeps the row index of ``data``.

    Notes
    -----
    A new encoder is fitted on every call, so two frames encoded separately
    only end up with the same columns if they contain the same categories.
    """
    object_columns = [column for column in data.columns if data[column].dtype == object]
    if not object_columns:
        return data

    pieces = [data.drop(columns=object_columns)]
    for column in object_columns:
        encoder = preprocessing.OneHotEncoder()
        encoded = encoder.fit_transform(data[[column]])
        # Densify here rather than asking the encoder for a dense result: the
        # keyword for that was renamed between scikit-learn releases
        # (`sparse` -> `sparse_output`), whereas `.toarray()` works with both.
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        # Reuse the index of `data`: a frame built with a fresh RangeIndex
        # would be misaligned (NaN rows) whenever `data` is indexed otherwise.
        pieces.append(
            pd.DataFrame(
                encoded,
                index=data.index,
                columns=encoder.get_feature_names_out([column]),
            )
        )
    # A single concat instead of one join per column.
    return pd.concat(pieces, axis=1)

X_train_encoded = hot_encoder(X_train)
# hot_encoder fits a new encoder on every call, so the test frame can come out
# with a different set of indicator columns than the training frame. Align the
# test columns on the training ones: categories never seen in training are
# dropped and categories missing from the test split become all-zero columns.
X_test_encoded = hot_encoder(X_test).reindex(columns=X_train_encoded.columns, fill_value=0.0)

def label_encoder(data):
    """Encode ``data`` with a ``LabelEncoder`` fitted on ``data`` itself.

    A new encoder is created on every call; the result of its
    ``fit_transform`` is returned as is.
    """
    return preprocessing.LabelEncoder().fit_transform(data)

# Fit one encoder on the training labels and reuse it for the test labels, so
# that a class maps to the same code in both splits. Encoding each split with
# its own encoder shifts the codes as soon as one split lacks a class; with a
# shared encoder, a test label unseen in training raises instead.
_y_label_encoder = preprocessing.LabelEncoder().fit(y_train)
y_train_encoded = _y_label_encoder.transform(y_train)
y_test_encoded = _y_label_encoder.transform(y_test)

# Clean the column names so that they are accepted by XGBoost.
def _sanitize_feature_names(columns):
    """Return ``columns`` with the problematic characters removed or replaced.

    Spaces, single quotes, angle brackets, parentheses, commas and slashes are
    removed; dots and hyphens are replaced by underscores. All substitutions
    are literal, character by character.
    """
    translation = str.maketrans({
        " ": None, "'": None, "<": None, ">": None,
        "(": None, ")": None, ",": None, "/": None,
        ".": "_", "-": "_",
    })
    return columns.str.translate(translation)

X_train_encoded.columns = _sanitize_feature_names(X_train_encoded.columns)
X_test_encoded.columns = _sanitize_feature_names(X_test_encoded.columns)

# Remove the "Unnamed:0" column from both feature frames. Rebinding with
# `errors="ignore"` keeps the cell re-runnable: an in-place `pop` raises
# KeyError on the second execution because the column is already gone.
X_train_encoded = X_train_encoded.drop(columns="Unnamed:0", errors="ignore")
X_test_encoded = X_test_encoded.drop(columns="Unnamed:0", errors="ignore")

0          1
1          2
2          3
3          4
4          5
        ... 
3654    3655
3655    3656
3656    3657
3657    3658
3658    3659
Name: Unnamed:0, Length: 3659, dtype: int64

In [None]:
from itertools import combinations
from sklearn.metrics import confusion_matrix
from math import sqrt
from math import factorial
def number_combinations(n, k):
    """Return the number of ways to choose ``k`` items among ``n``.

    The result is an exact ``int``: integer division is used because the
    quotient of the factorials is always whole. A float result cannot be used
    as a repeat count or array size and loses precision for large values.

    Parameters
    ----------
    n, k : int
        Non-negative integers.

    Returns
    -------
    int
        ``n! / (k! * (n - k)!)``, or ``0`` when ``k > n``.

    Raises
    ------
    ValueError
        If an argument is negative (raised by ``factorial``).
    """
    if k > n:
        # No way to pick more items than are available.
        return 0
    return factorial(n) // (factorial(k) * factorial(n - k))

def custom_MMCC(y_true, y_pred):
    """Mean of the pairwise Matthews correlation coefficients (MCC).

    For every unordered pair of classes found in ``y_true``, a 2x2 table is
    read off the multiclass confusion matrix (first class of the pair as the
    "positive" one, second as the "negative" one). The MCC of that table is
    computed and the MCCs of all pairs are averaged.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    float
        ``sum(MCC) * 2 / (K * (K - 1))`` where ``K`` is the number of distinct
        classes in ``y_true``. ``0.0`` is returned when ``y_true`` holds fewer
        than two classes, and a pair whose MCC denominator is zero contributes
        ``0.0`` to the sum.
    """
    # Sorted distinct labels of y_true. The confusion matrix is built on
    # exactly these labels and indexed by position, so the labels themselves
    # do not have to be the integers 0..K-1.
    class_labels = np.unique(np.asarray(y_true))
    nbr_classes = len(class_labels)
    if nbr_classes < 2:
        # No pair of classes to compare.
        return 0.0

    cm = confusion_matrix(y_true, y_pred, labels=class_labels)

    MCC = []
    for idx_0, idx_1 in combinations(range(nbr_classes), r=2):
        # Only the observations whose true and predicted class both belong to
        # the pair are counted. Plain Python ints keep the product of the four
        # sums below from overflowing a fixed-width integer.
        TP = int(cm[idx_0, idx_0])
        TN = int(cm[idx_1, idx_1])
        FN = int(cm[idx_0, idx_1])
        FP = int(cm[idx_1, idx_0])

        denominator = sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
        if denominator == 0:
            # One row or column of the 2x2 table is empty: the MCC is
            # undefined, use 0 instead of propagating nan/inf.
            MCC.append(0.0)
        else:
            MCC.append((TP * TN - FP * FN) / denominator)

    return float(np.sum(MCC)) * 2 / (nbr_classes * (nbr_classes - 1))

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import fbeta_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score

# Scorers evaluated for every candidate of the hyper-parameter search. Each
# key is the metric name under which the results are reported in
# `cv_results_` (as "mean_test_<key>").
scores_dict = {
    # 'MMCC': make_scorer(custom_MMCC),  # custom scorer, currently disabled
    # matthews_corrcoef appears to accept multiclass targets, see
    # https://scikit-learn.org/stable/modules/model_evaluation.html#matthews-corrcoef
    "Multiclass MCC": make_scorer(matthews_corrcoef),
    "macro_f2_score": make_scorer(fbeta_score, beta=2, average='macro'),
    "balanced_accuracy": make_scorer(balanced_accuracy_score),  # "balanced_accuracy"
    'Average Recall (micro)': make_scorer(recall_score, average='micro'),
}

In [None]:
# Sampling space of the randomized search for the random forest.
model_params_random_rf = dict(
    min_samples_split=randint(2, 10),
    max_depth=[2, 5, 10, 30, 50, 100, None],
    n_estimators=[500],
    min_samples_leaf=randint(1, 10),
    class_weight=['balanced_subsample', 'balanced'],
)

# Sampling space of the randomized search for LogitBoost.
# Note that scipy's uniform(loc, scale) covers [loc, loc + scale], so the
# learning rate is drawn from [0.01, 1.01].
model_params_random_logitboost = dict(
    base_estimator=[DecisionTreeRegressor(max_depth=1), DecisionTreeRegressor(max_depth=3)],
    learning_rate=uniform(0.01, 1),
    n_estimators=randint(10, 200),
)

# Sampling space of the randomized search for XGBoost.
model_params_random_xgboost = {
    'n_estimators': [500],
    # scipy's uniform(loc, scale) covers [loc, loc + scale]: [0.01, 1.01] here.
    'learning_rate': uniform(0.01, 1),
    'max_depth': [2, 5, 10, 30, 50, 100, None],
    # Both sampling ratios must stay within (0, 1]; with scale=1 the upper
    # bound was 1.01, so some draws were invalid. scale=0.99 gives [0.01, 1].
    'colsample_bytree': uniform(0.01, 0.99),
    'subsample': uniform(0.01, 0.99)
}

# Sampling space of the randomized search for CatBoost.
model_params_random_catboost = dict(
    n_estimators=randint(100, 500),
    max_depth=[3, 4, 5],
    learning_rate=np.linspace(0.05, 0.2, 10),
)

# Exhaustive grids, one per model, for the "grid" search type.
grid_rf = dict(
    min_samples_split=[2, 5],
    max_depth=[2, 5, 10, 30, 50, 100, None],
    n_estimators=[10, 25, 50, 100, 250, 500],
    min_samples_leaf=[1, 2, 5, 8, 10],
    class_weight=['balanced_subsample', 'balanced'],
)

grid_logit = dict(
    base_estimator=[DecisionTreeRegressor(max_depth=1), DecisionTreeRegressor(max_depth=3)],
    learning_rate=[0.025, 0.1, 0.25, 0.75, 1],
    n_estimators=[10, 25, 50, 100],
)

grid_xgboost = dict(
    n_estimators=[10, 50, 100, 200, 500],
    learning_rate=np.linspace(0.05, 0.2, 10),
    max_depth=[2, 5, 10, 30, 50, 100, None],
)

grid_catboost = dict(
    max_depth=[3, 4, 5],
    n_estimators=[10, 25, 50, 100],
    learning_rate=np.linspace(0.05, 0.2, 10),
)

In [None]:
def eval(model,
         grid,
         search_type,
         n_iter=500,
         cv=7,
         scoring=scores_dict,
         refit="macro_f2_score",
         X_train=X_train,
         y_train=y_train,
         X_train_encoded=X_train_encoded,
         y_train_encoded=y_train_encoded,
         oversampling=False,
         ):
    """Run a cross-validated hyper-parameter search for one of the models.

    Note that this function shadows the built-in ``eval`` in the notebook.
    The data defaults are bound when the function is defined, so the cell
    defining it must be re-run after the data cells change.

    Parameters
    ----------
    model : str
        One of "RandomForestClassifier", "LogitBoost", "XGBoost", "CatBoost".
    grid : dict
        Parameter grid ("grid" search) or distributions ("random" search).
    search_type : str
        "grid" for ``GridSearchCV`` or "random" for ``RandomizedSearchCV``.
    n_iter : int
        Number of sampled candidates; only used by the random search.
    cv, scoring, refit
        Forwarded unchanged to the search object.
    X_train, y_train
        Raw features and labels; used for CatBoost only, whose categorical
        (object-dtype) columns are passed as ``cat_features``.
    X_train_encoded, y_train_encoded
        Encoded features and labels; used for every other model.
    oversampling : bool
        Currently unused.

    Returns
    -------
    GridSearchCV or RandomizedSearchCV
        The fitted search object.

    Raises
    ------
    ValueError
        If ``model`` or ``search_type`` is not one of the supported values.
    """
    # Keep the requested name: `model` used to be overwritten by the estimator,
    # which made the later `model == "CatBoost"` test always False, so CatBoost
    # was silently fitted on the one-hot encoded data.
    model_name = model

    # Leave two CPUs free, but never ask for fewer than one worker:
    # os.cpu_count() may return None and `cpus - 2` is 0 on a 2-CPU machine.
    nbr_cpus = os.cpu_count() or 1
    n_jobs = max(1, nbr_cpus - 2)

    if model_name == "RandomForestClassifier":
        estimator = RandomForestClassifier(random_state=0)

    elif model_name == "LogitBoost":
        estimator = logitboost.LogitBoost(random_state=0)

    elif model_name == "XGBoost":
        # Imported here because the notebook never imports xgboost at the top;
        # without it this branch fails with NameError on `xgb`.
        import xgboost as xgb
        estimator = xgb.XGBClassifier(objective="multi:softprob", random_state=0, use_label_encoder=False,
                                      verbosity=0)
    elif model_name == "CatBoost":
        estimator = CatBoostClassifier(random_seed=1)

    else:
        raise ValueError(
            "Unknown model %r; expected 'RandomForestClassifier', 'LogitBoost', "
            "'XGBoost' or 'CatBoost'" % (model_name,)
        )

    # Search for the best hyper-parameters by cross-validation.
    if search_type == "grid":
        search = GridSearchCV(estimator=estimator, cv=cv, scoring=scoring, refit=refit, param_grid=grid,
                              n_jobs=n_jobs)
    elif search_type == "random":
        search = RandomizedSearchCV(estimator=estimator, cv=cv, scoring=scoring, refit=refit,
                                    param_distributions=grid, n_iter=n_iter, random_state=1)
    else:
        raise ValueError("Unknown search_type %r; expected 'grid' or 'random'" % (search_type,))

    if model_name == "CatBoost":
        # Warning: do not use one-hot encoding during preprocessing for
        # CatBoost. This affects both the training speed and the resulting
        # quality, hence the raw frame and the categorical column positions.
        categorical_features_indices = np.where(X_train.dtypes == object)[0]
        search.fit(X_train, y_train, cat_features=categorical_features_indices)
    else:
        search.fit(X_train_encoded, y_train_encoded)

    # Report the selected candidate only, for whatever metrics were scored,
    # instead of whole per-candidate arrays under hard-coded metric names.
    # Nothing is selected (and nothing printed) when refit is disabled.
    if hasattr(search, "best_index_"):
        print(search.best_params_)
        prefix = "mean_test_"
        for key, values in search.cv_results_.items():
            if key.startswith(prefix):
                print(key[len(prefix):] + ":", values[search.best_index_])

    return search

def search_info(search):
    """Return the main results of a fitted search as a 4-tuple.

    The tuple holds, in order, the ``cv_results_``, ``best_estimator_``,
    ``best_score_`` and ``best_params_`` attributes of ``search``.
    """
    attribute_names = ("cv_results_", "best_estimator_", "best_score_", "best_params_")
    return tuple(getattr(search, name) for name in attribute_names)

In [None]:
# Randomized search for CatBoost. Everything not given here (n_iter, cv,
# scoring, refit and the data) falls back to the defaults of eval().
# Despite its name, the variable holds the search object returned by eval().
random_search_params_catboost = eval(
    model="CatBoost",
    grid=model_params_random_catboost,
    search_type="random",
)

In [None]:
import joblib
import pickle
# Persist the CatBoost search object to disk with joblib.
with open('catboost_random.pkl', 'wb') as pickle_file:
    joblib.dump(random_search_params_catboost, pickle_file)


NameError: ignored

In [None]:
# Display the CatBoost search object as the cell output.
random_search_params_catboost

NameError: ignored