In [2]:
!pip install imbalanced-learn



In [1]:
import pandas as pd
import numpy as np
from phik.report import plot_correlation_matrix
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, \
    classification_report, roc_auc_score
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
plt.style.use('dark_background')

In [2]:
df_raw = pd.read_csv('/kaggle/input/leopard-challenge-classification/train.csv')
test = pd.read_csv(r'/kaggle/input/leopard-challenge-classification/test.csv')
df_raw.shape, test.shape

((13863, 26), (5942, 25))

In [3]:
test_cv = test.drop(['oral', 'ID'], axis = 1).copy()
test_cv['tartar'] = test_cv['tartar'].apply(lambda x: 1 if x == 'Y' else 0)

In [4]:
merged_data = df_raw.drop(['oral', 'ID'], axis=1).copy()
merged_data['tartar'] = merged_data['tartar'].apply(lambda x: 1 if x == 'Y' else 0)

In [5]:
# Создание бинарного признака на основе BMI
merged_data['BMI'] = merged_data['weight(kg)'] / ((merged_data['height(cm)'] / 100) ** 2)
merged_data['BMI_status'] = (merged_data['BMI'] < 18.5) | (merged_data['BMI'] >= 24.9)
merged_data['BMI_status'] = merged_data['BMI_status'].astype(int)
merged_data['log_tr'] = merged_data['triglyceride'].apply(np.log)
merged_data['log_gtp'] = merged_data['Gtp'].apply(np.log)
merged_data['log_log_alt'] = np.log(np.log(merged_data['ALT']))
merged_data['BMI_log_gtp'] = merged_data['BMI'].apply(np.log)
merged_data = merged_data.drop(['triglyceride', 'Gtp', 'ALT', 'BMI'], axis=1)

test_cv['BMI'] = test_cv['weight(kg)'] / ((test_cv['height(cm)'] / 100) ** 2)
test_cv['BMI_status'] = (test_cv['BMI'] < 18.5) | (test_cv['BMI'] >= 24.9)
test_cv['BMI_status'] = test_cv['BMI_status'].astype(int)
test_cv['log_tr'] = test_cv['triglyceride'].apply(np.log)
test_cv['log_gtp'] = test_cv['Gtp'].apply(np.log)
test_cv['log_log_alt'] = np.log(np.log(test_cv['ALT']))
test_cv['BMI_log_gtp'] = test_cv['BMI'].apply(np.log)
test_cv = test_cv.drop(['triglyceride', 'Gtp', 'ALT', 'BMI'], axis=1)

In [6]:
from sklearn.model_selection import train_test_split
# polynom = PolynomialFeatures(interaction_only=True, include_bias=False)
# from imblearn.over_sampling import SMOTE

X = merged_data.drop(['smoking'], axis=1) 
Y = merged_data['smoking']

# X_poly = pd.DataFrame(polynom.fit_transform(X))
# test_cv_poly = pd.DataFrame(polynom.transform(test_cv))

# Применение SMOTE
# X_new_train, X_test_sim, Y_new_train, Y_test_sim = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)
# X_new_train.shape, X_test_sim.shape, Y_new_train.shape, Y_test_sim.shape

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Масштабирование признаков
scaler = MinMaxScaler()

X_scale = scaler.fit_transform(X)
test_cv = scaler.transform(test_cv)

X = pd.DataFrame(X_scale)
X.shape, test_cv.shape

((13863, 25), (5942, 25))

In [8]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, FunctionTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, cohen_kappa_score, log_loss, f1_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import CalibrationDisplay
from sklearn.inspection import PartialDependenceDisplay, permutation_importance
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.manifold import TSNE
import optuna

In [10]:
import json
import pickle

# Загрузка сохраненных параметров
with open('/kaggle/input/lgbm-optuna-v-1-best-params-no-exta-features/LGBM_Optuna_V_1_best_params_no_exta_features.json', 'r') as f:
    LGBM_best_params = json.load(f)

# Обучение модели с загруженными параметрами
# model_with_best_params = LGBMClassifier(**loaded_params)

# Загрузка гиперпараметров из файла
with open("/kaggle/input/best-xgb-params/best_xgb_params.pkl", "rb") as f:
    loaded_best_xgb_params = pickle.load(f)
    
with open("/kaggle/input/cb-best-parameters/best_parameters.json", "r") as f:
    CatBoost_loaded_best_params = json.load(f)


In [11]:
%%time
ens_cv_scores, ens_preds = list(), list()
hill_ens_cv_scores, hill_ens_preds =  list(), list()

results_df = pd.DataFrame(columns=['Fold', 'Model', 'ROC AUC', 'Optimal Threshold', 'F1 Score'])

ens_cv_scores = []
ens_preds = []

sk = RepeatedStratifiedKFold(n_splits = 7, n_repeats = 1, random_state = 42)
ratio = float(np.sum(Y == 0)) / np.sum(Y == 1)

for i, (train_idx, test_idx) in enumerate(sk.split(X, Y)):

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    
    X_train_sub, X_val_sub, Y_train_sub, Y_val_sub = train_test_split(X_train, Y_train, test_size=0.2, random_state=42, stratify=Y_train)
    
    print('----------------------------------------------------------')

    ##########
    ## LGBM ##
    ##########

    LGBM_md = LGBMClassifier(**LGBM_best_params
#                              class_weight='balanced',
#                              n_estimators = 3500,
#                              max_depth = 9,
#                              learning_rate = 0.03,
#                              num_leaves = 20,
#                              reg_alpha = 2,
#                              reg_lambda = 5,
#                              subsample = 0.7,
#                              colsample_bytree = 0.7
                                                    ).fit(X_train_sub, Y_train_sub, 
                                                            eval_set=[(X_val_sub, Y_val_sub)], 
                                                            early_stopping_rounds=100, 
                                                            verbose=100)

    lgb_pred = LGBM_md.predict_proba(X_test)[:, 1]
    lgb_score = roc_auc_score(Y_test, lgb_pred)

    print('Fold', i, '==> LGBM oof ROC-AUC score is ==>', lgb_score) 

    lgb_pred_test = LGBM_md.predict_proba(test_cv)[:, 1]

    #########
    ## XGB ##
    #########

    XGB_md = XGBClassifier(**loaded_best_xgb_params
#                             objective = 'binary:logistic',
#                            tree_method = 'hist',
#                            colsample_bytree = 0.7, 
#                            gamma = 2, 
#                            learning_rate = 0.03, 
#                            max_depth = 10, 
#                            min_child_weight = 10, 
#                            n_estimators = 2500, 
#                            reg_lambda=5,
#                            scale_pos_weight=ratio,
#                            subsample = 0.7
                                  ).fit(X_train_sub, Y_train_sub, 
                                                   eval_metric="auc", 
                                                   eval_set=[(X_val_sub, Y_val_sub)], 
                                                   early_stopping_rounds=100, 
                                                   verbose=100)

    xgb_pred = XGB_md.predict_proba(X_test)[:, 1]
    xgb_score = roc_auc_score(Y_test, xgb_pred)

    print('Fold', i, '==> XGB oof ROC-AUC score is ==>', xgb_score)

    xgb_pred_test = XGB_md.predict_proba(test_cv)[:, 1]

    ##############
    ## CatBoost ##
    ##############

    Cat_md = CatBoostClassifier(**CatBoost_loaded_best_params
#                                 auto_class_weights='Balanced',
#                                 eval_metric='AUC',
#                                 iterations = 3_000,
#                                 learning_rate = 0.05,
#                                 depth = 8,
#                                 random_strength = 0.5,
#                                 bagging_temperature = 0.7,
#                                 border_count = 30,
#                                 l2_leaf_reg = 7,
#                                 verbose = False, 
#                                 task_type = 'CPU'
                                                ).fit(X_train_sub, Y_train_sub, 
                                                           eval_set=(X_val_sub, Y_val_sub), 
                                                           early_stopping_rounds=100, 
                                                           verbose=100, 
                                                           use_best_model=True)

    cat_pred = Cat_md.predict_proba(X_test)[:, 1]
    cat_score = roc_auc_score(Y_test, cat_pred)

    print('Fold', i, '==> CatBoost oof ROC-AUC score is ==>', cat_score)

    cat_pred_test = Cat_md.predict_proba(test_cv)[:, 1]    
    
    ##############
    ## Ensemble ##
    ##############
    
    
    
    
    # Ансамбль моделей
    ens_pred_1 = ( lgb_pred + xgb_pred + cat_pred ) / 3
    ens_pred_2 = ( lgb_pred_test + xgb_pred_test + cat_pred_test ) / 3
    ens_score_fold = roc_auc_score(Y_test, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)

    print('Fold', i, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)
    
    # Поиск оптимального порога для F1
    thresholds = np.linspace(0, 1, 300)
    f1_scores = [f1_score(Y_test, ens_pred_1 > thresh) for thresh in thresholds]
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    ens_f1_score = max(f1_scores)
    
    results_df.loc[len(results_df)] = [i, 'LGBM', lgb_score, optimal_threshold, ens_f1_score]
    results_df.loc[len(results_df)] = [i, 'XGB', xgb_score, optimal_threshold, ens_f1_score]
    results_df.loc[len(results_df)] = [i, 'CatBoost', cat_score, optimal_threshold, ens_f1_score]
    

ens_mean_score = np.mean(ens_cv_scores)
optimal_threshold, ens_mean_score

----------------------------------------------------------




[100]	valid_0's binary_logloss: 0.468304
[200]	valid_0's binary_logloss: 0.459608
[300]	valid_0's binary_logloss: 0.457919
[400]	valid_0's binary_logloss: 0.457052
[500]	valid_0's binary_logloss: 0.455739
Fold 0 ==> LGBM oof ROC-AUC score is ==> 0.7093959519291587
[0]	validation_0-auc:0.62506




[100]	validation_0-auc:0.71015
[162]	validation_0-auc:0.70908
Fold 0 ==> XGB oof ROC-AUC score is ==> 0.7157827324478179
0:	learn: 0.6868449	test: 0.6869006	best: 0.6869006 (0)	total: 115ms	remaining: 2m 17s
100:	learn: 0.4476326	test: 0.4857462	best: 0.4857462 (100)	total: 4.1s	remaining: 44.4s
200:	learn: 0.3764441	test: 0.4636184	best: 0.4636184 (200)	total: 8.14s	remaining: 40.3s
300:	learn: 0.3294015	test: 0.4567743	best: 0.4567455 (299)	total: 12.2s	remaining: 36.2s
400:	learn: 0.2955431	test: 0.4537018	best: 0.4537018 (400)	total: 16.3s	remaining: 32.3s
500:	learn: 0.2669852	test: 0.4521899	best: 0.4521793 (493)	total: 20.3s	remaining: 28.2s
600:	learn: 0.2406800	test: 0.4511089	best: 0.4511089 (600)	total: 24.4s	remaining: 24.1s
700:	learn: 0.2168064	test: 0.4503129	best: 0.4502209 (690)	total: 28.7s	remaining: 20.3s
800:	learn: 0.1968009	test: 0.4501061	best: 0.4500347 (791)	total: 32.6s	remaining: 16.1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.4500



[100]	valid_0's binary_logloss: 0.462922
[200]	valid_0's binary_logloss: 0.452201
[300]	valid_0's binary_logloss: 0.447959
[400]	valid_0's binary_logloss: 0.446055
[500]	valid_0's binary_logloss: 0.445265
[600]	valid_0's binary_logloss: 0.444289
[700]	valid_0's binary_logloss: 0.443458
Fold 1 ==> LGBM oof ROC-AUC score is ==> 0.7004411764705882
[0]	validation_0-auc:0.63570




[100]	validation_0-auc:0.73597
[200]	validation_0-auc:0.73868
[228]	validation_0-auc:0.73947
Fold 1 ==> XGB oof ROC-AUC score is ==> 0.6972659709044908
0:	learn: 0.6867344	test: 0.6869135	best: 0.6869135 (0)	total: 42.2ms	remaining: 50.3s
100:	learn: 0.4449822	test: 0.4821244	best: 0.4821244 (100)	total: 3.98s	remaining: 43.1s
200:	learn: 0.3774917	test: 0.4568344	best: 0.4568344 (200)	total: 8s	remaining: 39.6s
300:	learn: 0.3350005	test: 0.4475254	best: 0.4475254 (300)	total: 12s	remaining: 35.8s
400:	learn: 0.3012027	test: 0.4422426	best: 0.4422407 (399)	total: 16s	remaining: 31.8s
500:	learn: 0.2705010	test: 0.4386552	best: 0.4386552 (500)	total: 20.1s	remaining: 27.9s
600:	learn: 0.2459627	test: 0.4362807	best: 0.4362807 (600)	total: 24.2s	remaining: 23.9s
700:	learn: 0.2231232	test: 0.4350780	best: 0.4350428 (698)	total: 28.5s	remaining: 20.1s
800:	learn: 0.2032213	test: 0.4335847	best: 0.4335609 (797)	total: 32.5s	remaining: 16s
900:	learn: 0.1841639	test: 0.4326636	best: 0.4326



[100]	valid_0's binary_logloss: 0.467022
[200]	valid_0's binary_logloss: 0.457514
[300]	valid_0's binary_logloss: 0.454234
[400]	valid_0's binary_logloss: 0.45372
[500]	valid_0's binary_logloss: 0.452465
[600]	valid_0's binary_logloss: 0.451698
[700]	valid_0's binary_logloss: 0.45204
Fold 2 ==> LGBM oof ROC-AUC score is ==> 0.7056625553447186
[0]	validation_0-auc:0.61860




[100]	validation_0-auc:0.71957
[200]	validation_0-auc:0.72350
[300]	validation_0-auc:0.72839
[400]	validation_0-auc:0.72900
[437]	validation_0-auc:0.72725
Fold 2 ==> XGB oof ROC-AUC score is ==> 0.7215812776723592
0:	learn: 0.6869987	test: 0.6870965	best: 0.6870965 (0)	total: 21.8ms	remaining: 26s
100:	learn: 0.4485392	test: 0.4825554	best: 0.4825554 (100)	total: 3.91s	remaining: 42.3s
200:	learn: 0.3789922	test: 0.4593068	best: 0.4593068 (200)	total: 7.95s	remaining: 39.3s
300:	learn: 0.3311194	test: 0.4517767	best: 0.4517767 (300)	total: 12.3s	remaining: 36.6s
400:	learn: 0.2944890	test: 0.4476399	best: 0.4476399 (400)	total: 16.4s	remaining: 32.4s
500:	learn: 0.2677140	test: 0.4449501	best: 0.4449501 (500)	total: 20.4s	remaining: 28.3s
600:	learn: 0.2419516	test: 0.4434749	best: 0.4434749 (600)	total: 24.5s	remaining: 24.2s
700:	learn: 0.2192758	test: 0.4422281	best: 0.4422281 (700)	total: 28.5s	remaining: 20.1s
800:	learn: 0.1974320	test: 0.4414699	best: 0.4414699 (800)	total: 32.5



[100]	valid_0's binary_logloss: 0.461885
[200]	valid_0's binary_logloss: 0.452664
[300]	valid_0's binary_logloss: 0.449148
[400]	valid_0's binary_logloss: 0.448947
Fold 3 ==> LGBM oof ROC-AUC score is ==> 0.7148180379746836
[0]	validation_0-auc:0.62096




[100]	validation_0-auc:0.70229
[200]	validation_0-auc:0.71295
[300]	validation_0-auc:0.71465
[400]	validation_0-auc:0.71537
[500]	validation_0-auc:0.71529
[533]	validation_0-auc:0.71468
Fold 3 ==> XGB oof ROC-AUC score is ==> 0.7251803797468355
0:	learn: 0.6868294	test: 0.6869985	best: 0.6869985 (0)	total: 38.8ms	remaining: 46.3s
100:	learn: 0.4475486	test: 0.4828829	best: 0.4828829 (100)	total: 3.97s	remaining: 43s
200:	learn: 0.3814047	test: 0.4590317	best: 0.4590317 (200)	total: 8.01s	remaining: 39.6s
300:	learn: 0.3335468	test: 0.4521232	best: 0.4521232 (300)	total: 12.1s	remaining: 35.9s
400:	learn: 0.2985942	test: 0.4481280	best: 0.4481280 (400)	total: 16.2s	remaining: 32s
500:	learn: 0.2716286	test: 0.4452090	best: 0.4452090 (500)	total: 20.2s	remaining: 28s
600:	learn: 0.2454775	test: 0.4431939	best: 0.4431680 (599)	total: 24.2s	remaining: 23.9s
700:	learn: 0.2234122	test: 0.4423268	best: 0.4422198 (684)	total: 28.6s	remaining: 20.2s
800:	learn: 0.2023360	test: 0.4417446	best: 



[100]	valid_0's binary_logloss: 0.468085
[200]	valid_0's binary_logloss: 0.461064
[300]	valid_0's binary_logloss: 0.458915
[400]	valid_0's binary_logloss: 0.459518
Fold 4 ==> LGBM oof ROC-AUC score is ==> 0.7155300632911392
[0]	validation_0-auc:0.60913




[100]	validation_0-auc:0.72816
[200]	validation_0-auc:0.72488
[225]	validation_0-auc:0.72406
Fold 4 ==> XGB oof ROC-AUC score is ==> 0.7000585443037974
0:	learn: 0.6870816	test: 0.6871871	best: 0.6871871 (0)	total: 41.5ms	remaining: 49.6s
100:	learn: 0.4518359	test: 0.4863430	best: 0.4863430 (100)	total: 3.91s	remaining: 42.4s
200:	learn: 0.3780845	test: 0.4632672	best: 0.4632672 (200)	total: 7.96s	remaining: 39.3s
300:	learn: 0.3302640	test: 0.4566884	best: 0.4566884 (300)	total: 12s	remaining: 35.6s
400:	learn: 0.2924516	test: 0.4533045	best: 0.4532947 (397)	total: 16s	remaining: 31.6s
500:	learn: 0.2607812	test: 0.4510175	best: 0.4510118 (499)	total: 20s	remaining: 27.7s
600:	learn: 0.2349001	test: 0.4493334	best: 0.4492123 (595)	total: 24s	remaining: 23.7s
700:	learn: 0.2109964	test: 0.4487695	best: 0.4486210 (694)	total: 28.3s	remaining: 19.9s
800:	learn: 0.1881472	test: 0.4487002	best: 0.4485139 (786)	total: 32.3s	remaining: 15.9s
Stopped by overfitting detector  (100 iterations 



[100]	valid_0's binary_logloss: 0.470588
[200]	valid_0's binary_logloss: 0.463815
[300]	valid_0's binary_logloss: 0.461693
[400]	valid_0's binary_logloss: 0.461197
[500]	valid_0's binary_logloss: 0.461131
Fold 5 ==> LGBM oof ROC-AUC score is ==> 0.7356107594936708
[0]	validation_0-auc:0.59945




[100]	validation_0-auc:0.69805
[200]	validation_0-auc:0.70013
[300]	validation_0-auc:0.70128
[400]	validation_0-auc:0.70159
[434]	validation_0-auc:0.70144
Fold 5 ==> XGB oof ROC-AUC score is ==> 0.7266360759493671
0:	learn: 0.6867374	test: 0.6869424	best: 0.6869424 (0)	total: 40.5ms	remaining: 48.3s
100:	learn: 0.4473322	test: 0.4894586	best: 0.4894586 (100)	total: 3.95s	remaining: 42.8s
200:	learn: 0.3790086	test: 0.4684789	best: 0.4684789 (200)	total: 7.97s	remaining: 39.4s
300:	learn: 0.3309766	test: 0.4617597	best: 0.4617597 (300)	total: 12s	remaining: 35.6s
400:	learn: 0.2966779	test: 0.4581618	best: 0.4581618 (400)	total: 16s	remaining: 31.7s
500:	learn: 0.2673025	test: 0.4561764	best: 0.4561764 (500)	total: 20s	remaining: 27.8s
600:	learn: 0.2406205	test: 0.4551096	best: 0.4550755 (590)	total: 24.3s	remaining: 24s
700:	learn: 0.2161709	test: 0.4540551	best: 0.4540551 (700)	total: 28.4s	remaining: 20s
800:	learn: 0.1950627	test: 0.4536990	best: 0.4536496 (798)	total: 32.4s	remain



[100]	valid_0's binary_logloss: 0.469825
[200]	valid_0's binary_logloss: 0.462244
[300]	valid_0's binary_logloss: 0.459729
[400]	valid_0's binary_logloss: 0.460093
Fold 6 ==> LGBM oof ROC-AUC score is ==> 0.7350316455696202
[0]	validation_0-auc:0.58728




[100]	validation_0-auc:0.70858
[200]	validation_0-auc:0.70956
[204]	validation_0-auc:0.70899
Fold 6 ==> XGB oof ROC-AUC score is ==> 0.7255712025316456
0:	learn: 0.6869543	test: 0.6871118	best: 0.6871118 (0)	total: 39.4ms	remaining: 47.1s
100:	learn: 0.4508187	test: 0.4883369	best: 0.4883369 (100)	total: 4.35s	remaining: 47.1s
200:	learn: 0.3789511	test: 0.4663792	best: 0.4663792 (200)	total: 8.38s	remaining: 41.4s
300:	learn: 0.3304091	test: 0.4592494	best: 0.4592494 (300)	total: 12.5s	remaining: 37.1s
400:	learn: 0.2942722	test: 0.4547976	best: 0.4547976 (400)	total: 16.5s	remaining: 32.7s
500:	learn: 0.2647090	test: 0.4524843	best: 0.4524357 (496)	total: 20.6s	remaining: 28.5s
600:	learn: 0.2388793	test: 0.4507755	best: 0.4507755 (600)	total: 24.7s	remaining: 24.4s
700:	learn: 0.2169251	test: 0.4496923	best: 0.4495754 (696)	total: 28.7s	remaining: 20.2s
800:	learn: 0.1954816	test: 0.4493669	best: 0.4493577 (798)	total: 33s	remaining: 16.2s
900:	learn: 0.1771844	test: 0.4500496	best:

(0.1906354515050167, 0.7288154115668318)

In [18]:
results_df

Unnamed: 0,Fold,Model,ROC AUC,Optimal Threshold,F1 Score
0,0,LGBM,0.703652,0.294314,0.434515
1,0,XGB,0.698606,0.294314,0.434515
2,0,CatBoost,0.702623,0.294314,0.434515
3,1,LGBM,0.676868,0.284281,0.411834
4,1,XGB,0.673497,0.284281,0.411834
5,1,CatBoost,0.681975,0.284281,0.411834
6,2,LGBM,0.716152,0.267559,0.454194
7,2,XGB,0.700759,0.267559,0.454194
8,2,CatBoost,0.715045,0.267559,0.454194
9,3,LGBM,0.717722,0.244147,0.465753


In [12]:
results_df # this is new

Unnamed: 0,Fold,Model,ROC AUC,Optimal Threshold,F1 Score
0,0,LGBM,0.709396,0.187291,0.452962
1,0,XGB,0.715783,0.187291,0.452962
2,0,CatBoost,0.730982,0.187291,0.452962
3,1,LGBM,0.700441,0.173913,0.440183
4,1,XGB,0.697266,0.173913,0.440183
5,1,CatBoost,0.716244,0.173913,0.440183
6,2,LGBM,0.705663,0.160535,0.446673
7,2,XGB,0.721581,0.160535,0.446673
8,2,CatBoost,0.721795,0.160535,0.446673
9,3,LGBM,0.714818,0.173913,0.456863


In [13]:
avg_ens_preds = np.mean(ens_preds, axis=0)
final_predictions = (avg_ens_preds > optimal_threshold).astype(int)
final_predictions.sum()

1810

In [14]:
final_predictions.shape

(5942,)

In [80]:
from sklearn.metrics import confusion_matrix

# Теперь вы можете сравнить прогнозы с реальными значениями
cm = confusion_matrix(Y_test_sim, final_predictions)

cm

ValueError: Found input variables with inconsistent numbers of samples: [4159, 5942]

In [73]:
# 1. Прогнозирование с помощью каждой отдельной модели на новых данных
lgb_new_pred = LGBM_md.predict_proba(X)[:, 1]
xgb_new_pred = XGB_md.predict_proba(X)[:, 1]
cat_new_pred = Cat_md.predict_proba(X)[:, 1]

# 2. Усреднение прогнозов от разных моделей
ens_new_pred = (lgb_new_pred + xgb_new_pred + cat_new_pred) / 3

# Если вам нужны бинарные прогнозы (0 или 1), вы можете применить порог, который был найден ранее
ens_new_binary_pred = (ens_new_pred > optimal_threshold).astype(int)
ens_new_binary_pred.shape

(13863,)

In [15]:
tester_df = pd.DataFrame()
tester_df['smoking'] = Y
tester_df['smoking_pred'] = ens_new_binary_pred

NameError: name 'ens_new_binary_pred' is not defined

In [16]:
tester_df_cm = confusion_matrix(tester_df['smoking'], tester_df['smoking_pred'])
tester_df_cm

KeyError: 'smoking_pred'

In [17]:
submission = pd.read_csv(r'/kaggle/input/leopard-challenge-classification/sample_submission.csv')

In [20]:
submission['smoking'] = final_predictions
submission.to_csv('Ensemble_submission_08.csv', index = False)

In [19]:
submission['smoking'].sum()

1810