In [3]:
import pandas as pd
import numpy as np
from phik.report import plot_correlation_matrix
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, \
    classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
import optuna
from sklearn.metrics import roc_auc_score, average_precision_score
from optuna.pruners import MedianPruner
from sklearn.metrics import f1_score
from tqdm import tqdm
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

In [240]:
df_raw = pd.read_csv('/kaggle/input/leopard-challenge-classification/train.csv')
test = pd.read_csv(r'/kaggle/input/leopard-challenge-classification/test.csv')
df_raw.shape, test.shape

((13863, 26), (5942, 25))

In [241]:
test_cv = test.drop(['oral', 'ID'], axis = 1).copy()
test_cv['tartar'] = test_cv['tartar'].apply(lambda x: 1 if x == 'Y' else 0)
merged_data = df_raw.drop(['oral', 'ID'], axis=1).copy()
merged_data['tartar'] = merged_data['tartar'].apply(lambda x: 1 if x == 'Y' else 0)
merged_data.shape, test_cv.shape

((13863, 24), (5942, 23))

In [263]:
merged_data['BMI'] = merged_data['weight(kg)'] / ((merged_data['height(cm)'] / 100) ** 2)
merged_data['Chol_HDL_ratio'] = merged_data['Cholesterol'] / merged_data['HDL']
merged_data['ldl_hdl'] = merged_data['LDL'] / merged_data['HDL']
merged_data['map'] = (merged_data['systolic'] + (2 * merged_data['relaxation'])) / 3
merged_data['Waist_Height_ratio'] = merged_data['waist(cm)'] / merged_data['height(cm)']
merged_data['height_nin_110'] = (merged_data['height(cm)'] - 110) / merged_data['weight(kg)']
merged_data['Average_hearing'] = (merged_data['hearing(left)'] + merged_data['hearing(right)']) / 2
merged_data['Average_eyesight'] = (merged_data['eyesight(left)'] + merged_data['eyesight(right)']) / 2
merged_data['ast_alt'] = merged_data['AST'] / merged_data['ALT']
merged_data['Systolic_Diastolic_ratio'] = merged_data['systolic'] / merged_data['relaxation']
merged_data['Atherogenic_coefficient'] = (merged_data['Cholesterol'] - merged_data['HDL']) / merged_data['HDL']
merged_data['BMI_to_age'] = merged_data['BMI'] / merged_data['age']
merged_data['Glucose_to_Cholesterol'] = merged_data['fasting blood sugar'] / merged_data['Cholesterol']
merged_data['Triglycerides_to_HDL'] = merged_data['triglyceride'] / merged_data['HDL']
merged_data['Systolic_to_age'] = merged_data['systolic'] / merged_data['age']
merged_data['Diastolic_to_age'] = merged_data['relaxation'] / merged_data['age']
merged_data['Hemoglobin_to_age'] = merged_data['hemoglobin'] / merged_data['age']
merged_data['GTP_to_age'] = merged_data['Gtp'] / merged_data['age']
merged_data['GTP_to_AST_ALT'] = merged_data['Gtp'] / (merged_data['AST'] + merged_data['ALT'])
merged_data['GTP_to_Cholesterol'] = merged_data['Gtp'] / merged_data['Cholesterol']
merged_data['GTP_index'] = merged_data['Gtp'] / (merged_data['Cholesterol'] + merged_data['triglyceride'])

merged_data['age_normal'] = (merged_data['age'] < 62.5).astype(int)
merged_data['weight(kg)_normal'] = (merged_data['weight(kg)'] < 52.5).astype(int)
merged_data['waist(cm)_normal'] = (merged_data['waist(cm)'] < 94.05).astype(int)
merged_data['eyesight(left)_normal'] = (merged_data['eyesight(left)'] < 0.35).astype(int)
merged_data['eyesight(right)_normal'] = (merged_data['eyesight(right)'] < 1.55).astype(int)
merged_data['systolic_normal'] = (merged_data['systolic'] < 140.5).astype(int)
merged_data['relaxation_normal'] = (merged_data['relaxation'] < 69.5).astype(int)
merged_data['fasting_blood_sugar_normal'] = (merged_data['fasting blood sugar'] < 238.5).astype(int)
merged_data['Cholesterol_normal'] = (merged_data['Cholesterol'] < 316.5).astype(int)
merged_data['triglyceride_normal'] = (merged_data['triglyceride'] < 114.5).astype(int)
merged_data['HDL_normal'] = (merged_data['HDL'] < 50.5).astype(int)
merged_data['LDL_normal'] = (merged_data['LDL'] < 88.5).astype(int)
merged_data['hemoglobin_normal'] = (merged_data['hemoglobin'] < 16.45).astype(int)
merged_data['Urine_protein_normal'] = (merged_data['Urine protein'] < 1.5).astype(int)
merged_data['serum_creatinine_normal'] = (merged_data['serum creatinine'] < 0.75).astype(int)
merged_data['AST_normal'] = (merged_data['AST'] < 45.5).astype(int)
merged_data['ALT_normal'] = (merged_data['ALT'] < 29.5).astype(int)
merged_data['Gtp_normal'] = (merged_data['Gtp'] < 43.5).astype(int)
merged_data['BMI_normal'] = (merged_data['BMI'] < 19.8).astype(int)
merged_data['Chol_HDL_ratio_normal'] = (merged_data['Chol_HDL_ratio'] < 4.46).astype(int)
merged_data['ldl_hdl_normal'] = (merged_data['ldl_hdl'] < 0.71).astype(int)
merged_data['map_normal'] = (merged_data['map'] < 134.3).astype(int)
merged_data['Waist_Height_ratio_normal'] = (merged_data['Waist_Height_ratio'] < 0.41).astype(int)
merged_data['height_nin_110_normal'] = (merged_data['height_nin_110'] < 1.08).astype(int)
merged_data['Average_hearing_normal'] = (merged_data['Average_hearing'] < 1.25).astype(int)
merged_data['Average_eyesight_normal'] = (merged_data['Average_eyesight'] < 0.325).astype(int)
merged_data['ast_alt_normal'] = (merged_data['ast_alt'] < 2.81).astype(int)
merged_data['Systolic_Diastolic_ratio_normal'] = (merged_data['Systolic_Diastolic_ratio'] < 1.67).astype(int) 
merged_data['Atherogenic_coefficient_normal'] = (merged_data['Atherogenic_coefficient'] < 3.456).astype(int)
merged_data['BMI_to_age_normal'] = (merged_data['BMI_to_age'] < 0.797).astype(int)
merged_data['Glucose_to_Cholesterol_normal'] = (merged_data['Glucose_to_Cholesterol'] < 0.64897).astype(int)
merged_data['Triglycerides_to_HDL_normal'] = (merged_data['Triglycerides_to_HDL'] < 2.85857).astype(int)
merged_data['Systolic_to_age_normal'] = (merged_data['Systolic_to_age'] < 1.986).astype(int)
merged_data['Diastolic_to_age_normal'] = (merged_data['Diastolic_to_age'] < 1.1679).astype(int)
merged_data['Hemoglobin_to_age_normal'] = (merged_data['Hemoglobin_to_age'] < 0.253).astype(int)
merged_data['GTP_to_age_normal'] = (merged_data['GTP_to_age'] < 1.15).astype(int)
merged_data['GTP_to_AST_ALT_normal'] = (merged_data['GTP_to_AST_ALT'] < 0.76).astype(int)
merged_data['GTP_to_Cholesterol_normal'] = (merged_data['GTP_to_Cholesterol'] < 0.19).astype(int)
merged_data['GTP_index_normal'] = (merged_data['GTP_index'] < 0.123).astype(int)

merged_data.shape

(13863, 84)

In [262]:
test_cv['BMI'] = test_cv['weight(kg)'] / ((test_cv['height(cm)'] / 100) ** 2)
test_cv['Chol_HDL_ratio'] = test_cv['Cholesterol'] / test_cv['HDL']
test_cv['ldl_hdl'] = test_cv['LDL'] / test_cv['HDL']
test_cv['map'] = (test_cv['systolic'] + (2 * test_cv['relaxation'])) / 3
test_cv['Waist_Height_ratio'] = test_cv['waist(cm)'] / test_cv['height(cm)']
test_cv['height_nin_110'] = (test_cv['height(cm)'] - 110) / test_cv['weight(kg)']
test_cv['Average_hearing'] = (test_cv['hearing(left)'] + test_cv['hearing(right)']) / 2
test_cv['Average_eyesight'] = (test_cv['eyesight(left)'] + test_cv['eyesight(right)']) / 2
test_cv['ast_alt'] = test_cv['AST'] / test_cv['ALT']
test_cv['Systolic_Diastolic_ratio'] = test_cv['systolic'] / test_cv['relaxation']
test_cv['Atherogenic_coefficient'] = (test_cv['Cholesterol'] - test_cv['HDL']) / test_cv['HDL']
test_cv['BMI_to_age'] = test_cv['BMI'] / test_cv['age']
test_cv['Glucose_to_Cholesterol'] = test_cv['fasting blood sugar'] / test_cv['Cholesterol']
test_cv['Triglycerides_to_HDL'] = test_cv['triglyceride'] / test_cv['HDL']
test_cv['Systolic_to_age'] = test_cv['systolic'] / test_cv['age']
test_cv['Diastolic_to_age'] = test_cv['relaxation'] / test_cv['age']
test_cv['Hemoglobin_to_age'] = test_cv['hemoglobin'] / test_cv['age']
test_cv['GTP_to_age'] = test_cv['Gtp'] / test_cv['age']
test_cv['GTP_to_AST_ALT'] = test_cv['Gtp'] / (test_cv['AST'] + test_cv['ALT'])
test_cv['GTP_to_Cholesterol'] = test_cv['Gtp'] / test_cv['Cholesterol']
test_cv['GTP_index'] = test_cv['Gtp'] / (test_cv['Cholesterol'] + test_cv['triglyceride'])

test_cv['age_normal'] = (test_cv['age'] < 62.5).astype(int)
test_cv['weight(kg)_normal'] = (test_cv['weight(kg)'] < 52.5).astype(int)
test_cv['waist(cm)_normal'] = (test_cv['waist(cm)'] < 94.05).astype(int)
test_cv['eyesight(left)_normal'] = (test_cv['eyesight(left)'] < 0.35).astype(int)
test_cv['eyesight(right)_normal'] = (test_cv['eyesight(right)'] < 1.55).astype(int)
test_cv['systolic_normal'] = (test_cv['systolic'] < 140.5).astype(int)
test_cv['relaxation_normal'] = (test_cv['relaxation'] < 69.5).astype(int)
test_cv['fasting_blood_sugar_normal'] = (test_cv['fasting blood sugar'] < 238.5).astype(int)
test_cv['Cholesterol_normal'] = (test_cv['Cholesterol'] < 316.5).astype(int)
test_cv['triglyceride_normal'] = (test_cv['triglyceride'] < 114.5).astype(int)
test_cv['HDL_normal'] = (test_cv['HDL'] < 50.5).astype(int)
test_cv['LDL_normal'] = (test_cv['LDL'] < 88.5).astype(int)
test_cv['hemoglobin_normal'] = (test_cv['hemoglobin'] < 16.45).astype(int)
test_cv['Urine_protein_normal'] = (test_cv['Urine protein'] < 1.5).astype(int)
test_cv['serum_creatinine_normal'] = (test_cv['serum creatinine'] < 0.75).astype(int)
test_cv['AST_normal'] = (test_cv['AST'] < 45.5).astype(int)
test_cv['ALT_normal'] = (test_cv['ALT'] < 29.5).astype(int)
test_cv['Gtp_normal'] = (test_cv['Gtp'] < 43.5).astype(int)
test_cv['BMI_normal'] = (test_cv['BMI'] < 19.8).astype(int)
test_cv['Chol_HDL_ratio_normal'] = (test_cv['Chol_HDL_ratio'] < 4.46).astype(int)
test_cv['ldl_hdl_normal'] = (test_cv['ldl_hdl'] < 0.71).astype(int)
test_cv['map_normal'] = (test_cv['map'] < 134.3).astype(int)
test_cv['Waist_Height_ratio_normal'] = (test_cv['Waist_Height_ratio'] < 0.41).astype(int)
test_cv['height_nin_110_normal'] = (test_cv['height_nin_110'] < 1.08).astype(int)
test_cv['Average_hearing_normal'] = (test_cv['Average_hearing'] < 1.25).astype(int)
test_cv['Average_eyesight_normal'] = (test_cv['Average_eyesight'] < 0.325).astype(int)
test_cv['ast_alt_normal'] = (test_cv['ast_alt'] < 2.81).astype(int)
test_cv['Systolic_Diastolic_ratio_normal'] = (test_cv['Systolic_Diastolic_ratio'] < 1.67).astype(int) 
test_cv['Atherogenic_coefficient_normal'] = (test_cv['Atherogenic_coefficient'] < 3.456).astype(int)
test_cv['BMI_to_age_normal'] = (test_cv['BMI_to_age'] < 0.797).astype(int)
test_cv['Glucose_to_Cholesterol_normal'] = (test_cv['Glucose_to_Cholesterol'] < 0.64897).astype(int)
test_cv['Triglycerides_to_HDL_normal'] = (test_cv['Triglycerides_to_HDL'] < 2.85857).astype(int)
test_cv['Systolic_to_age_normal'] = (test_cv['Systolic_to_age'] < 1.986).astype(int)
test_cv['Diastolic_to_age_normal'] = (test_cv['Diastolic_to_age'] < 1.1679).astype(int)
test_cv['Hemoglobin_to_age_normal'] = (test_cv['Hemoglobin_to_age'] < 0.253).astype(int)
test_cv['GTP_to_age_normal'] = (test_cv['GTP_to_age'] < 1.15).astype(int)
test_cv['GTP_to_AST_ALT_normal'] = (test_cv['GTP_to_AST_ALT'] < 0.76).astype(int)
test_cv['GTP_to_Cholesterol_normal'] = (test_cv['GTP_to_Cholesterol'] < 0.19).astype(int)
test_cv['GTP_index_normal'] = (test_cv['GTP_index'] < 0.123).astype(int)

test_cv.shape

(5942, 83)

In [243]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def evaluate_features_auc(data, target, features):
    """
    Оценивает разделяющую способность каждого признака с помощью ROC-AUC.
    
    :param data: DataFrame, содержащий признаки и целевую переменную.
    :param target: Имя столбца, содержащего целевую переменную.
    :param features: Список имен признаков для оценки.
    
    :return: Словарь, где ключи — названия признаков, а значения — ROC-AUC для каждого признака.
    """
    auc_scores = {}
    for feature in tqdm(features):
        # Разделение данных на обучающую и тестовую выборку
        X_train, X_test, y_train, y_test = train_test_split(data[[feature]], data[target], test_size=0.2, stratify=data[target], random_state=42)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
        
        # Обучение модели
        model = CatBoostClassifier(verbose=0, auto_class_weights='Balanced', random_state=42)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50)
        
        # Прогнозирование и вычисление ROC-AUC
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_pred_proba)
        auc_scores[feature] = auc
        
    auc_scores_df = pd.DataFrame(list(auc_scores.items()), columns=['Feature', 'auc_scores'])
    
    return auc_scores_df


In [244]:
best_infinity_features = [ 'GTP_index', 'GTP_to_Cholesterol', 'GTP_to_AST_ALT', 'GTP_to_age', 'Triglycerides_to_HDL', 'Gtp', 'triglyceride', 'tartar' ]
infinity_features = merged_data.nunique()[merged_data.nunique() >= 3].index.to_list()
infinity_features, len(infinity_features), len(merged_data.columns.to_list())

(['age',
  'height(cm)',
  'weight(kg)',
  'waist(cm)',
  'eyesight(left)',
  'eyesight(right)',
  'systolic',
  'relaxation',
  'fasting blood sugar',
  'Cholesterol',
  'triglyceride',
  'HDL',
  'LDL',
  'hemoglobin',
  'Urine protein',
  'serum creatinine',
  'AST',
  'ALT',
  'Gtp',
  'BMI',
  'Chol_HDL_ratio',
  'ldl_hdl',
  'map',
  'Waist_Height_ratio',
  'height_nin_110',
  'Average_hearing',
  'Average_eyesight',
  'ast_alt',
  'Systolic_Diastolic_ratio',
  'Atherogenic_coefficient',
  'BMI_to_age',
  'Glucose_to_Cholesterol',
  'Triglycerides_to_HDL',
  'Systolic_to_age',
  'Diastolic_to_age',
  'Hemoglobin_to_age',
  'GTP_to_age',
  'GTP_to_AST_ALT',
  'GTP_to_Cholesterol',
  'GTP_index'],
 40,
 84)

In [245]:
evaluate_features_auc_df = evaluate_features_auc(merged_data, 'smoking', merged_data.columns.to_list())

100%|██████████| 84/84 [00:27<00:00,  3.04it/s]


In [248]:
evaluate_features_auc_df.sort_values(by='auc_scores', ascending=False) .iloc[:30,:]#['Feature'].to_list()

Unnamed: 0,Feature,auc_scores
23,smoking,1.0
42,GTP_to_AST_ALT,0.647903
43,GTP_to_Cholesterol,0.641975
20,Gtp,0.638047
41,GTP_to_age,0.628335
44,GTP_index,0.62089
12,triglyceride,0.610875
82,GTP_to_Cholesterol_normal,0.610485
37,Triglycerides_to_HDL,0.608918
0,age,0.605216


In [11]:
from sklearn.preprocessing import PolynomialFeatures


continuous_data = merged_data[infinity_features]
# Создание полиномиальных признаков (степень 2)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_data = poly.fit_transform(continuous_data)
poly_features = poly.get_feature_names_out(infinity_features)

# Создание DataFrame с полиномиальными признаками
poly_df = pd.DataFrame(poly_data, columns=poly_features)

# Проверка размерности нового DataFrame
poly_df.shape

(13863, 860)

In [12]:
poly_df['smoking'] = merged_data['smoking'].copy()

In [13]:
poly_df

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),systolic,relaxation,fasting blood sugar,Cholesterol,...,GTP_to_age GTP_to_AST_ALT,GTP_to_age GTP_to_Cholesterol,GTP_to_age GTP_index,GTP_to_AST_ALT^2,GTP_to_AST_ALT GTP_to_Cholesterol,GTP_to_AST_ALT GTP_index,GTP_to_Cholesterol^2,GTP_to_Cholesterol GTP_index,GTP_index^2,smoking
0,65.0,170.0,75.0,91.0,0.6,0.9,122.0,79.0,107.0,119.0,...,6.360684,1.924240,1.346968,11.484568,3.474323,2.432026,1.051056,0.735739,0.515017,0
1,35.0,170.0,85.0,97.0,1.5,1.5,138.0,88.0,117.0,204.0,...,1.015873,0.224090,0.103193,0.790123,0.174292,0.080261,0.038447,0.017705,0.008153,1
2,70.0,165.0,55.0,75.0,0.8,1.0,115.0,63.0,128.0,165.0,...,0.184184,0.031255,0.022921,0.460459,0.078139,0.057302,0.013260,0.009724,0.007131,0
3,35.0,180.0,85.0,83.0,1.2,1.0,130.0,80.0,100.0,209.0,...,0.206429,0.039508,0.022684,0.180625,0.034569,0.019849,0.006616,0.003799,0.002181,0
4,25.0,170.0,65.0,80.0,1.5,1.2,135.0,75.0,94.0,153.0,...,0.237576,0.051242,0.034690,0.179982,0.038820,0.026281,0.008373,0.005668,0.003837,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13858,45.0,175.0,85.0,94.0,0.8,0.8,127.0,71.0,99.0,249.0,...,0.235948,0.032218,0.018744,0.312284,0.042641,0.024808,0.005822,0.003387,0.001971,0
13859,40.0,170.0,75.0,86.0,1.2,1.0,134.0,88.0,97.0,247.0,...,1.200000,0.131174,0.075349,1.777778,0.194332,0.111628,0.021243,0.012202,0.007009,1
13860,65.0,170.0,70.0,85.0,0.6,0.7,131.0,82.0,99.0,180.0,...,0.443077,0.049231,0.033065,1.440000,0.160000,0.107463,0.017778,0.011940,0.008020,1
13861,30.0,160.0,80.0,89.0,1.5,1.5,120.0,80.0,92.0,172.0,...,0.525137,0.186240,0.104004,0.258264,0.091594,0.051150,0.032484,0.018140,0.010130,0


In [14]:
poly_df.columns.to_list()

['age',
 'height(cm)',
 'weight(kg)',
 'waist(cm)',
 'eyesight(left)',
 'eyesight(right)',
 'systolic',
 'relaxation',
 'fasting blood sugar',
 'Cholesterol',
 'triglyceride',
 'HDL',
 'LDL',
 'hemoglobin',
 'Urine protein',
 'serum creatinine',
 'AST',
 'ALT',
 'Gtp',
 'BMI',
 'Chol_HDL_ratio',
 'ldl_hdl',
 'map',
 'Waist_Height_ratio',
 'height_nin_110',
 'Average_hearing',
 'Average_eyesight',
 'ast_alt',
 'Systolic_Diastolic_ratio',
 'Atherogenic_coefficient',
 'BMI_to_age',
 'Glucose_to_Cholesterol',
 'Triglycerides_to_HDL',
 'Systolic_to_age',
 'Diastolic_to_age',
 'Hemoglobin_to_age',
 'GTP_to_age',
 'GTP_to_AST_ALT',
 'GTP_to_Cholesterol',
 'GTP_index',
 'age^2',
 'age height(cm)',
 'age weight(kg)',
 'age waist(cm)',
 'age eyesight(left)',
 'age eyesight(right)',
 'age systolic',
 'age relaxation',
 'age fasting blood sugar',
 'age Cholesterol',
 'age triglyceride',
 'age HDL',
 'age LDL',
 'age hemoglobin',
 'age Urine protein',
 'age serum creatinine',
 'age AST',
 'age ALT'

In [15]:
%%time
evaluate_features_auc_poly_df = evaluate_features_auc(poly_df, 'smoking', poly_df.columns.to_list()) # [:,37:255]

100%|██████████| 861/861 [06:20<00:00,  2.26it/s]

CPU times: user 11min 51s, sys: 3min 15s, total: 15min 6s
Wall time: 6min 20s





In [16]:
evaluate_features_auc_poly_df.sort_values(by='auc_scores', ascending=False).iloc[:50,:]

Unnamed: 0,Feature,auc_scores
860,smoking,1.0
422,triglyceride GTP_to_AST_ALT,0.665334
423,triglyceride GTP_to_Cholesterol,0.659486
361,fasting blood sugar GTP_to_AST_ALT,0.659481
424,triglyceride GTP_index,0.657468
626,Gtp GTP_to_AST_ALT,0.65731
821,Glucose_to_Cholesterol GTP_to_AST_ALT,0.655941
855,GTP_to_AST_ALT GTP_to_Cholesterol,0.655747
829,Triglycerides_to_HDL GTP_to_AST_ALT,0.654313
851,GTP_to_age GTP_to_AST_ALT,0.653787


In [38]:
evaluate_features_auc_poly_df.to_csv('evaluate_features_auc_poly_df.csv', index=False)

In [42]:
evaluate_features_Gini_poly_df = pd.read_csv(r'/kaggle/input/evaluate-features-gini-poly-df/evaluate_features_Gini_poly_df.csv')

In [43]:
# Отсортировать датасеты по соответствующим критериям и присвоить ранг каждому признаку
auc_df_sorted = evaluate_features_auc_poly_df.sort_values(by='auc_scores', ascending=False).reset_index(drop=True)
auc_df_sorted['auc_rank'] = auc_df_sorted.index + 1

gini_df_sorted = evaluate_features_Gini_poly_df.sort_values(by='Gini_Index', ascending=True).reset_index(drop=True)
gini_df_sorted['gini_rank'] = gini_df_sorted.index + 1

# Объединить датасеты по признакам
combined_df = pd.merge(auc_df_sorted, gini_df_sorted, on='Feature')

# Рассчитать совокупный рейтинг для каждого признака (усреднение рангов)
combined_df['combined_rank'] = (combined_df['auc_rank'] + combined_df['gini_rank']) / 2

# Отсортировать признаки по совокупному рейтингу и выбрать топ-50
top_50_features = combined_df.sort_values(by='combined_rank').head(50)

top_50_features


Unnamed: 0,Feature,auc_scores,auc_rank,Gini_Index,gini_rank,combined_rank
0,triglyceride GTP_to_AST_ALT,0.654774,2,0.311241,2,2.0
1,Triglycerides_to_HDL GTP_to_AST_ALT,0.653306,3,0.312005,7,5.0
10,hemoglobin GTP_to_AST_ALT,0.637554,12,0.311536,3,7.5
4,triglyceride GTP_index,0.642338,6,0.312307,12,9.0
2,triglyceride GTP_to_Cholesterol,0.644721,4,0.31264,15,9.5
12,Gtp GTP_to_AST_ALT,0.635941,14,0.312002,6,10.0
15,height(cm) GTP_to_AST_ALT,0.633692,17,0.311771,4,10.5
13,GTP_to_AST_ALT GTP_to_Cholesterol,0.63494,15,0.312131,8,11.5
19,GTP_to_AST_ALT^2,0.632416,22,0.311794,5,13.5
26,height_nin_110 GTP_to_AST_ALT,0.627545,29,0.31103,1,15.0


In [44]:
top_50_features['Feature'].to_list()

['triglyceride GTP_to_AST_ALT',
 'Triglycerides_to_HDL GTP_to_AST_ALT',
 'hemoglobin GTP_to_AST_ALT',
 'triglyceride GTP_index',
 'triglyceride GTP_to_Cholesterol',
 'Gtp GTP_to_AST_ALT',
 'height(cm) GTP_to_AST_ALT',
 'GTP_to_AST_ALT GTP_to_Cholesterol',
 'GTP_to_AST_ALT^2',
 'height_nin_110 GTP_to_AST_ALT',
 'Triglycerides_to_HDL GTP_to_Cholesterol',
 'waist(cm) GTP_to_AST_ALT',
 'GTP_to_age GTP_to_AST_ALT',
 'Triglycerides_to_HDL GTP_index',
 'Chol_HDL_ratio GTP_to_AST_ALT',
 'fasting blood sugar GTP_to_AST_ALT',
 'Urine protein GTP_to_AST_ALT',
 'Waist_Height_ratio GTP_to_AST_ALT',
 'triglyceride Gtp',
 'Gtp Triglycerides_to_HDL',
 'weight(kg) GTP_to_AST_ALT',
 'Systolic_Diastolic_ratio GTP_to_AST_ALT',
 'Cholesterol GTP_to_AST_ALT',
 'Triglycerides_to_HDL GTP_to_age',
 'relaxation GTP_to_AST_ALT',
 'Average_hearing GTP_to_AST_ALT',
 'triglyceride GTP_to_age',
 'map GTP_to_AST_ALT',
 'systolic GTP_to_AST_ALT',
 'BMI GTP_to_AST_ALT',
 'AST GTP_to_AST_ALT',
 'Glucose_to_Cholesterol G

In [73]:
evaluate_features_auc_poly_df.sort_values(by='auc_scores', ascending=False).iloc[1:50,:]['Feature'].to_list()

['triglyceride GTP_to_AST_ALT',
 'Triglycerides_to_HDL GTP_to_AST_ALT',
 'triglyceride GTP_to_Cholesterol',
 'Triglycerides_to_HDL GTP_to_Cholesterol',
 'triglyceride GTP_index',
 'triglyceride Gtp',
 'Triglycerides_to_HDL GTP_index',
 'Gtp Triglycerides_to_HDL',
 'Chol_HDL_ratio GTP_to_AST_ALT',
 'Triglycerides_to_HDL GTP_to_age',
 'hemoglobin GTP_to_AST_ALT',
 'triglyceride GTP_to_age',
 'Gtp GTP_to_AST_ALT',
 'GTP_to_AST_ALT GTP_to_Cholesterol',
 'waist(cm) GTP_to_AST_ALT',
 'height(cm) GTP_to_AST_ALT',
 'Waist_Height_ratio GTP_to_AST_ALT',
 'weight(kg) GTP_to_AST_ALT',
 'GTP_to_age GTP_to_AST_ALT',
 'GTP_to_AST_ALT',
 'GTP_to_AST_ALT^2',
 'fasting blood sugar GTP_to_AST_ALT',
 'Atherogenic_coefficient GTP_to_AST_ALT',
 'BMI GTP_to_AST_ALT',
 'Urine protein GTP_to_AST_ALT',
 'Average_hearing GTP_to_AST_ALT',
 'Chol_HDL_ratio GTP_to_Cholesterol',
 'height_nin_110 GTP_to_AST_ALT',
 'Cholesterol GTP_to_AST_ALT',
 'relaxation GTP_to_AST_ALT',
 'Systolic_Diastolic_ratio GTP_to_AST_ALT',


In [17]:
for feat in evaluate_features_auc_poly_df.sort_values(by='auc_scores', ascending=False).iloc[1:55,:]['Feature'].to_list():
    merged_data[feat] = poly_df[feat]

In [18]:
merged_data#.iloc[:,:45]

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,Chol_HDL_ratio GTP_to_Cholesterol,map GTP_to_AST_ALT,ast_alt GTP_to_age,map GTP_to_Cholesterol,Glucose_to_Cholesterol GTP_to_age,Diastolic_to_age GTP_to_AST_ALT,weight(kg) GTP_to_Cholesterol,fasting blood sugar GTP_to_age,age GTP_to_age,Cholesterol GTP_to_Cholesterol
0,65,170,75,91.0,0.6,0.9,1.0,1.0,122.0,79.0,...,1.718310,316.296296,2.346154,95.686275,1.687654,4.118803,76.890756,200.830769,122.0,122.0
1,35,170,85,97.0,1.5,1.5,1.0,1.0,138.0,88.0,...,0.869565,93.037037,0.914286,20.522876,0.655462,2.234921,16.666667,133.714286,40.0,40.0
2,70,165,55,75.0,0.8,1.0,1.0,1.0,115.0,63.0,...,0.279412,54.511905,0.313187,9.250505,0.210563,0.610714,6.333333,34.742857,19.0,19.0
3,35,180,85,83.0,1.2,1.0,1.0,1.0,130.0,80.0,...,0.320755,41.083333,0.593651,7.862839,0.232399,0.971429,6.913876,48.571429,17.0,17.0
4,25,170,65,80.0,1.5,1.2,1.0,1.0,135.0,75.0,...,0.241379,40.303030,0.861538,8.692810,0.344052,1.272727,5.947712,52.640000,14.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13858,45,175,85,94.0,0.8,0.8,1.0,1.0,127.0,71.0,...,0.387755,50.107843,0.422222,6.842035,0.167871,0.881699,6.485944,41.800000,19.0,19.0
13859,40,170,75,86.0,1.2,1.0,1.0,1.0,134.0,88.0,...,0.580645,137.777778,1.125000,15.060729,0.353441,2.933333,10.931174,87.300000,36.0,36.0
13860,65,170,70,85.0,0.6,0.7,1.0,1.0,131.0,82.0,...,0.510638,118.000000,0.553846,13.111111,0.203077,1.513846,9.333333,36.553846,24.0,24.0
13861,30,160,80,89.0,1.5,1.5,1.0,1.0,120.0,80.0,...,0.632653,47.431694,0.767619,16.821705,0.552713,1.355191,14.418605,95.066667,31.0,31.0


In [134]:
poly_df.iloc[:,:100]

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),systolic,relaxation,fasting blood sugar,Cholesterol,...,height(cm) HDL,height(cm) LDL,height(cm) hemoglobin,height(cm) Urine protein,height(cm) serum creatinine,height(cm) AST,height(cm) ALT,height(cm) Gtp,height(cm) BMI,height(cm) Chol_HDL_ratio
0,65.0,170.0,75.0,91.0,0.6,0.9,122.0,79.0,107.0,119.0,...,12070.0,8840.0,2380.0,510.0,204.0,3400.0,2720.0,20740.0,4411.764706,284.929577
1,35.0,170.0,85.0,97.0,1.5,1.5,138.0,88.0,117.0,204.0,...,7820.0,20400.0,2482.0,170.0,119.0,3400.0,4250.0,6800.0,5000.000000,753.913043
2,70.0,165.0,55.0,75.0,0.8,1.0,115.0,63.0,128.0,165.0,...,11220.0,14025.0,2425.5,165.0,115.5,2475.0,2145.0,3135.0,3333.333333,400.367647
3,35.0,180.0,85.0,83.0,1.2,1.0,130.0,80.0,100.0,209.0,...,9540.0,22500.0,3096.0,180.0,144.0,3960.0,3240.0,3060.0,4722.222222,709.811321
4,25.0,170.0,65.0,80.0,1.5,1.2,135.0,75.0,94.0,153.0,...,9860.0,13600.0,2805.0,170.0,170.0,3400.0,2210.0,2380.0,3823.529412,448.448276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13858,45.0,175.0,85.0,94.0,0.8,0.8,127.0,71.0,99.0,249.0,...,8575.0,28700.0,2870.0,175.0,157.5,2975.0,2975.0,3325.0,4857.142857,889.285714
13859,40.0,170.0,75.0,86.0,1.2,1.0,134.0,88.0,97.0,247.0,...,10540.0,25160.0,2839.0,170.0,153.0,2550.0,2040.0,6120.0,4411.764706,677.258065
13860,65.0,170.0,70.0,85.0,0.6,0.7,131.0,82.0,99.0,180.0,...,7990.0,19550.0,2669.0,170.0,136.0,2040.0,1360.0,4080.0,4117.647059,651.063830
13861,30.0,160.0,80.0,89.0,1.5,1.5,120.0,80.0,92.0,172.0,...,7840.0,15360.0,2368.0,160.0,128.0,4160.0,5600.0,4960.0,5000.000000,561.632653


In [249]:
phik_overview = merged_data.phik_matrix()
# phik_overview['smoking'].sort_values(ascending=False) 
sorted_smoking_values = phik_overview['smoking'].sort_values(ascending=False)

interval columns not set, guessing: ['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)', 'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride', 'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST', 'ALT', 'Gtp', 'dental caries', 'tartar', 'smoking', 'BMI', 'Chol_HDL_ratio', 'ldl_hdl', 'map', 'Waist_Height_ratio', 'height_nin_110', 'Average_hearing', 'Average_eyesight', 'ast_alt', 'Systolic_Diastolic_ratio', 'Atherogenic_coefficient', 'BMI_to_age', 'Glucose_to_Cholesterol', 'Triglycerides_to_HDL', 'Systolic_to_age', 'Diastolic_to_age', 'Hemoglobin_to_age', 'GTP_to_age', 'GTP_to_AST_ALT', 'GTP_to_Cholesterol', 'GTP_index', 'age_normal', 'weight(kg)_normal', 'waist(cm)_normal', 'eyesight(left)_normal', 'eyesight(right)_normal', 'systolic_normal', 'relaxation_normal', 'fasting_blood_sugar_normal', 'Cholesterol_normal', 'triglyceride_normal', 'HDL_normal', 'LDL_normal', 'hemoglobin

In [20]:
# sorted_smoking_values_bu = sorted_smoking_values.copy()

In [250]:
# sorted_smoking_values = phik_overview['smoking'].sort_values(ascending=False)
sorted_smoking_values[:30]
# phik_overview

smoking                           1.000000
GTP_to_AST_ALT_normal             0.278280
GTP_to_Cholesterol_normal         0.244651
Gtp_normal                        0.243253
GTP_to_age_normal                 0.231220
GTP_index_normal                  0.204338
triglyceride_normal               0.203980
Triglycerides_to_HDL_normal       0.203776
triglyceride                      0.190617
GTP_to_AST_ALT                    0.187505
age                               0.183296
Hemoglobin_to_age                 0.160277
Gtp                               0.142844
GTP_to_Cholesterol                0.127258
age_normal                        0.120534
GTP_to_age                        0.120122
Hemoglobin_to_age_normal          0.118743
hemoglobin_normal                 0.117904
hemoglobin                        0.112074
tartar                            0.104470
Diastolic_to_age                  0.101645
Systolic_to_age                   0.099949
Systolic_to_age_normal            0.096548
BMI_to_age 

In [22]:
sorted_smoking_values_df = sorted_smoking_values.reset_index()
sorted_smoking_values_df.columns = ['index', 'value']

In [23]:
sorted_smoking_values_df.to_csv('sorted_smoking_values_зршл_860_куыгде_df.csv')

In [24]:
sorted_smoking_values_df.iloc[:35,:]

Unnamed: 0,index,value
0,smoking,1.0
1,triglyceride height_nin_110,0.207605
2,triglyceride hemoglobin,0.20495
3,height(cm) triglyceride,0.19703
4,BMI_to_age GTP_to_AST_ALT,0.193019
5,systolic GTP_to_AST_ALT,0.191646
6,triglyceride,0.190617
7,HDL Triglycerides_to_HDL,0.190531
8,triglyceride GTP_to_AST_ALT,0.1899
9,GTP_to_AST_ALT,0.187505


In [227]:
for feat in poly_df.columns.to_list():
    merged_data[feat] = poly_df[feat]

  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_d

In [228]:
features_4_learing = list(set(evaluate_features_auc_df.sort_values(by='auc_scores', ascending=False).iloc[:35,:]['Feature'].to_list()) \
                    | set(sorted_smoking_values[:45].index.to_list()))

In [251]:
from sklearn.model_selection import train_test_split

# X = merged_data[evaluate_features_auc_poly_df.sort_values(by='auc_scores', ascending=False).iloc[1:50,:]['Feature'].to_list()]#.drop(['smoking'], axis=1).copy() # [] #   [selected_features.index.to_list()]
# X = merged_data[evaluate_features_auc_df.sort_values(by='auc_scores', ascending=False).iloc[:10,:]['Feature'].to_list()].drop(['smoking'], axis=1).copy()
# X = merged_data[list(set(evaluate_features_auc_df.sort_values(by='auc_scores', ascending=False).iloc[:25,:]['Feature'].to_list()) \
#                      | set(evaluate_features_auc_poly_df.sort_values(by='auc_scores', ascending=False).iloc[1:55,:]['Feature'].to_list()))]
#                            .drop(['smoking'], axis=1).copy()
# X = merged_data[features_4_learing].drop(['smoking'], axis=1).copy() # [] #   [selected_features.index.to_list()]
X = merged_data.drop(['smoking'], axis=1).copy() # [] #   [selected_features.index.to_list()]


# y = merged_data['smoking'].copy()
y = merged_data['smoking'].copy()


# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train.shape, X_test.shape

((11090, 83), (2773, 83))

In [252]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42, stratify=y_train)

X_train = pd.DataFrame(X_train)
X_val = pd.DataFrame(X_val)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((8317, 83), (2773, 83), (8317,), (2773,))

In [257]:
import optuna
from sklearn.metrics import roc_auc_score, average_precision_score


X_train_temp = X_train.copy() # .drop(columns=['hearing(left)'])
X_test_temp = X_test.copy()
X_val_temp = X_val.copy()

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 4000),
        'boosting_type': trial.suggest_categorical('boosting_type', [ 'Plain' ]), # 'Ordered',
        'depth': trial.suggest_int('depth', 6, 11),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.08 ), #  0.05, 0.1
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 20, 35),
        'border_count': trial.suggest_int('border_count', 20, 300),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 9),
        'od_type': 'Iter', 
        'od_wait': 100, 
        'eval_metric': 'AUC',
        'logging_level': 'Silent',
        'random_seed': 42,
        'auto_class_weights': 'Balanced',
#         'task_type': 'GPU'
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train_temp, y_train, eval_set=[(X_val_temp, y_val)], early_stopping_rounds=params['od_wait'], cat_features=[])

    y_pred_proba = model.predict_proba(X_test_temp)[:, 1]
        
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    avg_prec = average_precision_score(y_test, y_pred_proba)
    
    return roc_auc, avg_prec

In [258]:
%%time 
study = optuna.create_study(pruner=MedianPruner(), directions=['maximize', 'maximize'] ) #pruner=MedianPruner(), sampler=optuna.samplers.TPESampler(seed=42)
study.optimize(objective,
               n_jobs=-1,
               n_trials=10, # 25 показывает результат лучше
               show_progress_bar=True
              )

best_trial = study.best_trials[0]
roc_auc_best = best_trial.values[0]
avg_prec_best = best_trial.values[1]

print(f"Лучшее значение ROC AUC: {roc_auc_best:.4f}")
print(f"Лучшее значение Average Precision: {avg_prec_best:.4f}")

[I 2023-10-31 17:11:10,737] A new study created in memory with name: no-name-55474b09-c61c-4612-aae2-315781a87ab7


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2023-10-31 17:11:50,037] Trial 1 finished with values: [0.7257165450906978, 0.4228203500412938] and parameters: {'iterations': 3314, 'boosting_type': 'Plain', 'depth': 8, 'learning_rate': 0.07727299983983583, 'l2_leaf_reg': 22.85633040212972, 'border_count': 39, 'bagging_temperature': 7.821303153298909}. 
[I 2023-10-31 17:12:25,314] Trial 2 finished with values: [0.7294356400490607, 0.4288716249434281] and parameters: {'iterations': 3738, 'boosting_type': 'Plain', 'depth': 10, 'learning_rate': 0.045880388018615284, 'l2_leaf_reg': 34.695174066374186, 'border_count': 88, 'bagging_temperature': 6.946739598972309}. 
[I 2023-10-31 17:13:38,997] Trial 5 finished with values: [0.727470789490672, 0.4277758296075022] and parameters: {'iterations': 1502, 'boosting_type': 'Plain', 'depth': 6, 'learning_rate': 0.013045000352372684, 'l2_leaf_reg': 27.504269063327865, 'border_count': 164, 'bagging_temperature': 4.234007505189664}. 
[I 2023-10-31 17:14:19,392] Trial 6 finished with values: [0.7258

In [259]:
%%time
# Возьмите параметры из лучшего trial на основе первой метрики (ROC AUC в данном случае)
best_params = study.best_trials[0].params

CatBoost_model = CatBoostClassifier(**best_params, auto_class_weights='Balanced', logging_level='Silent') #  task_type='GPU' auto_class_weights='Balanced' , 

CatBoost_model.fit(X_train_temp, y_train, eval_set=[(X_val_temp, y_val)], early_stopping_rounds=100, verbose=100, cat_features=[]) # cat_features

y_pred_proba = CatBoost_model.predict_proba(X_test_temp)[:, 1]

thresholds = np.linspace(0.01, 1, 300)
f1_scores = [f1_score(y_test, y_pred_proba > thresh) for thresh in thresholds]
optimal_threshold = thresholds[np.argmax(f1_scores)]

print(f"Наивысший F1: {max(f1_scores):.5f}")

Наивысший F1: 0.45278
CPU times: user 3min 1s, sys: 3.17 s, total: 3min 4s
Wall time: 51.5 s


In [256]:
# Получение значимости признаков
feature_importances = CatBoost_model.get_feature_importance()

# Создание датафрейма для лучшей визуализации
features_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})

# Сортировка признаков по их значимости  
features_df = features_df.sort_values(by='Importance', ascending=False)

print(features_df)

                   Feature  Importance
41          GTP_to_AST_ALT    4.684443
15              hemoglobin    4.336206
22                  tartar    3.865928
39       Hemoglobin_to_age    3.651757
12            triglyceride    3.562104
..                     ...         ...
52      Cholesterol_normal    0.002561
65              map_normal    0.000569
59              AST_normal    0.000000
70          ast_alt_normal    0.000000
48  eyesight(right)_normal    0.000000

[83 rows x 2 columns]


In [138]:
# df_catboost_test_result.to_csv('df_catboost_test_result.csv', index=False)

In [139]:
import optuna
from optuna.pruners import MedianPruner
from xgboost import XGBClassifier

def XGB_objective(trial):
   
    
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.08),
        'n_estimators': trial.suggest_int('n_estimators', 800, 3500),
        'max_depth': trial.suggest_int('max_depth', 6, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'scale_pos_weight': 1/y_train.mean(),
        'early_stopping_rounds': 50,
        'reg_lambda': trial.suggest_int('reg_lambda', 3, 10),
        'eval_metric': "auc"
    }


    model = XGBClassifier(**params)
    eval_set = [(X_val, y_val)]
    model.fit(X_train, y_train, eval_set=eval_set, verbose=500) # , early_stopping_rounds=50

    y_prob_fold = model.predict_proba(X_test)[:, 1]
        
    roc_auc = roc_auc_score(y_test, y_prob_fold)
#     avg_prec = average_precision_score(y_test, y_prob_fold)
    
    return roc_auc

# Запуск оптимизации Optuna
XGB_study = optuna.create_study(pruner=MedianPruner(), direction='maximize')
XGB_study.optimize(XGB_objective, n_trials=20, n_jobs=-1)   # , show_progress_bar=True

# Лучшие параметры
XGB_best_params = XGB_study.best_params
XGB_roc_auc_best = XGB_study.best_value


print(f"Лучшее значение ROC AUC: {XGB_roc_auc_best:.5f}")
# print(f"Лучшее значение Average Precision: {XGB_avg_prec_best:.4f}")

[I 2023-10-31 15:36:50,387] A new study created in memory with name: no-name-4ee20817-7133-4f7b-963c-e71c1489c99b


[0]	validation_0-auc:0.62146
[0]	validation_0-auc:0.62654
[0]	validation_0-auc:0.63310
[0]	validation_0-auc:0.62237
[186]	validation_0-auc:0.73553


[I 2023-10-31 15:37:25,773] Trial 0 finished with value: 0.7136990833387128 and parameters: {'learning_rate': 0.07965796912601965, 'n_estimators': 1288, 'max_depth': 14, 'subsample': 0.6542533148636265, 'colsample_bytree': 0.7461319807365632, 'gamma': 0.02678420413918503, 'reg_lambda': 10}. Best is trial 0 with value: 0.7136990833387128.


[0]	validation_0-auc:0.64674
[251]	validation_0-auc:0.74561


[I 2023-10-31 15:37:39,645] Trial 3 finished with value: 0.7252892808727648 and parameters: {'learning_rate': 0.06718967320180498, 'n_estimators': 2414, 'max_depth': 19, 'subsample': 0.7760936375039661, 'colsample_bytree': 0.6362842907002032, 'gamma': 0.3441793844422684, 'reg_lambda': 9}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.61038
[442]	validation_0-auc:0.73353


[I 2023-10-31 15:37:42,982] Trial 2 finished with value: 0.7101680007746434 and parameters: {'learning_rate': 0.07959023211942642, 'n_estimators': 872, 'max_depth': 11, 'subsample': 0.6404055504784054, 'colsample_bytree': 0.5763377942446235, 'gamma': 0.10563676706482178, 'reg_lambda': 6}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.64263
[257]	validation_0-auc:0.74202


[I 2023-10-31 15:38:07,999] Trial 4 finished with value: 0.7193184752436899 and parameters: {'learning_rate': 0.07440613709346654, 'n_estimators': 3422, 'max_depth': 14, 'subsample': 0.9504072570333735, 'colsample_bytree': 0.6774973869405374, 'gamma': 0.08572186252417868, 'reg_lambda': 4}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.65954
[431]	validation_0-auc:0.75008


[I 2023-10-31 15:38:26,272] Trial 1 finished with value: 0.7209121425343749 and parameters: {'learning_rate': 0.030099415112926735, 'n_estimators': 1332, 'max_depth': 18, 'subsample': 0.641686753451538, 'colsample_bytree': 0.7924212160033968, 'gamma': 0.038338339072938876, 'reg_lambda': 3}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.64064
[345]	validation_0-auc:0.72961


[I 2023-10-31 15:38:28,266] Trial 7 finished with value: 0.7082414789232456 and parameters: {'learning_rate': 0.05111261293620938, 'n_estimators': 2155, 'max_depth': 6, 'subsample': 0.8547999277328149, 'colsample_bytree': 0.6278312923031155, 'gamma': 0.22638344697717705, 'reg_lambda': 8}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.62386
[486]	validation_0-auc:0.73387


[I 2023-10-31 15:38:39,403] Trial 6 finished with value: 0.717676392744174 and parameters: {'learning_rate': 0.016616307802475916, 'n_estimators': 3187, 'max_depth': 8, 'subsample': 0.692964198339147, 'colsample_bytree': 0.9422820506949727, 'gamma': 0.3640874097056582, 'reg_lambda': 7}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.63201
[161]	validation_0-auc:0.71946


[I 2023-10-31 15:38:41,291] Trial 9 finished with value: 0.7093715705893745 and parameters: {'learning_rate': 0.06325027263998181, 'n_estimators': 3094, 'max_depth': 7, 'subsample': 0.9101409727793444, 'colsample_bytree': 0.824729236183062, 'gamma': 0.2675156952312281, 'reg_lambda': 4}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.63750
[343]	validation_0-auc:0.74195


[I 2023-10-31 15:38:55,522] Trial 5 finished with value: 0.7240123297398491 and parameters: {'learning_rate': 0.03818348621391965, 'n_estimators': 3092, 'max_depth': 17, 'subsample': 0.6617505735762192, 'colsample_bytree': 0.823889138737923, 'gamma': 0.23157825958098938, 'reg_lambda': 5}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.65190
[193]	validation_0-auc:0.73607


[I 2023-10-31 15:39:26,207] Trial 10 finished with value: 0.7234862178038862 and parameters: {'learning_rate': 0.07342936773238143, 'n_estimators': 1455, 'max_depth': 19, 'subsample': 0.9612028557188768, 'colsample_bytree': 0.792112482565507, 'gamma': 0.3508691611758166, 'reg_lambda': 8}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.65227
[329]	validation_0-auc:0.73231


[I 2023-10-31 15:39:27,381] Trial 12 finished with value: 0.7120856465044219 and parameters: {'learning_rate': 0.05169300558512178, 'n_estimators': 1564, 'max_depth': 9, 'subsample': 0.8204996411664061, 'colsample_bytree': 0.6332518284107618, 'gamma': 0.3548461433955553, 'reg_lambda': 7}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.61639
[445]	validation_0-auc:0.73800


[I 2023-10-31 15:39:50,864] Trial 11 finished with value: 0.7225364727906527 and parameters: {'learning_rate': 0.02537885155744196, 'n_estimators': 1482, 'max_depth': 13, 'subsample': 0.5397107091738385, 'colsample_bytree': 0.666805100545615, 'gamma': 0.3434625593816933, 'reg_lambda': 4}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.63066
[319]	validation_0-auc:0.74060


[I 2023-10-31 15:40:15,530] Trial 13 finished with value: 0.721767477890388 and parameters: {'learning_rate': 0.05443112129512843, 'n_estimators': 2263, 'max_depth': 16, 'subsample': 0.7851429427639375, 'colsample_bytree': 0.5057349751965254, 'gamma': 0.48534882793984496, 'reg_lambda': 10}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.64132
[376]	validation_0-auc:0.73960


[I 2023-10-31 15:40:24,142] Trial 14 finished with value: 0.7232086372732555 and parameters: {'learning_rate': 0.037431840892289917, 'n_estimators': 2619, 'max_depth': 17, 'subsample': 0.5435922316373516, 'colsample_bytree': 0.5055229739066887, 'gamma': 0.4881294998126802, 'reg_lambda': 10}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.62915
[500]	validation_0-auc:0.74115
[261]	validation_0-auc:0.73849


[I 2023-10-31 15:40:36,394] Trial 15 finished with value: 0.7251742947517914 and parameters: {'learning_rate': 0.040132768257086275, 'n_estimators': 2560, 'max_depth': 17, 'subsample': 0.7474108318559, 'colsample_bytree': 0.5290996147842864, 'gamma': 0.4526998419519538, 'reg_lambda': 10}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.62826
[102]	validation_0-auc:0.73051


[I 2023-10-31 15:40:52,722] Trial 17 finished with value: 0.7167177716093216 and parameters: {'learning_rate': 0.04096043899371877, 'n_estimators': 2677, 'max_depth': 20, 'subsample': 0.7412679046808979, 'colsample_bytree': 0.8818634674864625, 'gamma': 0.19724287484127037, 'reg_lambda': 6}. Best is trial 3 with value: 0.7252892808727648.


[0]	validation_0-auc:0.62301
[186]	validation_0-auc:0.73835


[I 2023-10-31 15:41:12,924] Trial 18 finished with value: 0.722458201536376 and parameters: {'learning_rate': 0.06218272936249701, 'n_estimators': 2596, 'max_depth': 20, 'subsample': 0.7471130292854244, 'colsample_bytree': 0.5724596335300937, 'gamma': 0.4073109153360035, 'reg_lambda': 9}. Best is trial 3 with value: 0.7252892808727648.


[287]	validation_0-auc:0.74506


[I 2023-10-31 15:41:28,174] Trial 16 finished with value: 0.7224408527532116 and parameters: {'learning_rate': 0.038432139675971165, 'n_estimators': 2675, 'max_depth': 20, 'subsample': 0.7394298269841204, 'colsample_bytree': 0.9238063874295501, 'gamma': 0.19207110189704393, 'reg_lambda': 6}. Best is trial 3 with value: 0.7252892808727648.


[272]	validation_0-auc:0.74516


[I 2023-10-31 15:41:33,400] Trial 19 finished with value: 0.7182452714479375 and parameters: {'learning_rate': 0.061091025677423846, 'n_estimators': 2625, 'max_depth': 20, 'subsample': 0.7373055297544274, 'colsample_bytree': 0.5719821228274851, 'gamma': 0.4388495552755469, 'reg_lambda': 9}. Best is trial 3 with value: 0.7252892808727648.


[837]	validation_0-auc:0.74446


[I 2023-10-31 15:41:34,139] Trial 8 finished with value: 0.730579045897618 and parameters: {'learning_rate': 0.008983383691096733, 'n_estimators': 1360, 'max_depth': 15, 'subsample': 0.8269021597266271, 'colsample_bytree': 0.9255855574853349, 'gamma': 0.07887945401031737, 'reg_lambda': 4}. Best is trial 8 with value: 0.730579045897618.


Лучшее значение ROC AUC: 0.73058


In [140]:
XGB_roc_auc_best = XGB_study.best_value
print(f"Лучшее значение ROC AUC: {XGB_roc_auc_best:.5f}")

Лучшее значение ROC AUC: 0.73058


In [141]:
# Обучение с лучшими гиперпараметрами на всем тренировочном наборе данных
best_model = XGBClassifier(**XGB_best_params, eval_metric='auc')
best_model.fit(X_train, y_train)

# Предсказания на тестовой выборке
y_prob_test = best_model.predict_proba(X_test)[:, 1]

# Определение оптимального порога
thresholds = np.linspace(0, 1, 300)
f1_scores_test = [f1_score(y_test, y_prob_test > thresh) for thresh in thresholds]
optimal_threshold_test = thresholds[np.argmax(f1_scores_test)]

# Применение оптимального порога для классификации
y_pred_test = [1 if prob >= optimal_threshold_test else 0 for prob in y_prob_test]

# Оценка качества модели с новым порогом на тестовой выборке
print("ROC AUC:", roc_auc_score(y_test, y_prob_test))
print("Average Precision:", average_precision_score(y_test, y_prob_test))
print("\nOptimal Threshold:", optimal_threshold_test)
print("\nClassification Report with Optimal Threshold:\n", classification_report(y_test, y_pred_test))
print(f"Наивысший F1: {max(f1_scores_test):.5f}")
0.47829

ROC AUC: 0.7237363630495126
Average Precision: 0.42828280138692154

Optimal Threshold: 0.14046822742474915

Classification Report with Optimal Threshold:
               precision    recall  f1-score   support

           0       0.88      0.74      0.80      2213
           1       0.36      0.59      0.45       560

    accuracy                           0.71      2773
   macro avg       0.62      0.66      0.62      2773
weighted avg       0.77      0.71      0.73      2773

Наивысший F1: 0.44792


0.47829

In [142]:
y_prob_test

array([0.09185857, 0.21478112, 0.09101312, ..., 0.05281026, 0.10517685,
       0.0347778 ], dtype=float32)

In [143]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_recall_curve
from lightgbm.callback import early_stopping, log_evaluation

In [144]:
def lgbm_objective(trial):
    # Гиперпараметры для оптимизации
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 3500),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.07),
        'max_depth': trial.suggest_int('max_depth', 6, 25),
        'num_leaves': trial.suggest_int('num_leaves', 50, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 15, 55),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lambda_l1', 3.0, 12.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 3.0, 12.0),
        'class_weight': 'balanced'
    }

    lgbm_model = LGBMClassifier(**params)
    lgbm_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(50), log_evaluation(0)]
        )

    # Сохранение предсказаний
    lgbm_model_preds = lgbm_model.predict_proba(X_test)[:, 1]

    # Вычисление ROC AUC на out-of-fold предсказаниях
    oof_roc_auc = roc_auc_score(y_test, lgbm_model_preds)

    return oof_roc_auc

In [145]:
%%time
# Запуск оптимизации
lgbm_study = optuna.create_study(pruner=MedianPruner(), direction='maximize')
lgbm_study.optimize(lgbm_objective,
               n_jobs=-1,
               show_progress_bar=True,
               n_trials=20)

lgbm_roc_auc_best = lgbm_study.best_value
print(f"Best roc_auc: {lgbm_roc_auc_best}")

[I 2023-10-31 15:42:44,148] A new study created in memory with name: no-name-243e9f11-aade-45c3-9a68-63ea7ab4f6cd


  0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[474]	valid_0's binary_logloss: 0.523774
[I 2023-10-31 15:43:00,474] Trial 2 finished with value: 0.7113243173455555 and parameters: {'n_estimators': 1221, 'learning_rate': 0.06850433848704582, 'max_depth': 17, 'num_leaves': 118, 'min_child_samples': 25, 'feature_fraction': 0.6758202498293022, 'bagging_fraction': 0.5532948120709098, 'bagging_freq': 2, 'lambda_l1': 8.592927144684042, 'lambda_l2': 10.341200704864093}. Best is trial 2 with value: 0.7113243173455555.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[812]	valid_0's binary_logloss: 0.572087
[I 2023-10-31 15:43:03,254] Trial 1 finished with value: 0.721342230972823 and parameters: {'n_estimators': 2343, 'l

In [146]:
# Обучение с лучшими гиперпараметрами
best_params = lgbm_study.best_params
lgbm_model = LGBMClassifier(**best_params)
lgbm_model.fit(X_train, y_train)

# Предсказания на тестовой выборке
lgbm_test_preds = lgbm_model.predict_proba(X_test)[:, 1]



In [147]:
# Определение оптимального порога
precision, recall, thresholds = precision_recall_curve(y_test, lgbm_test_preds)
f1_scores = 2 * (precision * recall) / (precision + recall)
optimal_threshold = thresholds[f1_scores[:-1].argmax()]
# Применение оптимального порога для классификации
y_pred_optimal = (lgbm_test_preds > optimal_threshold).astype(int)

# Оценка качества модели с новым порогом на тестовой выборке
accuracy = accuracy_score(y_test, y_pred_optimal)
roc_auc = roc_auc_score(y_test, lgbm_test_preds)
f1 = f1_score(y_test, y_pred_optimal)

print(f"Accuracy: {accuracy}")
print(f"ROC AUC: {roc_auc}")
print(f"F1 Score: {f1}")

Accuracy: 0.711864406779661
ROC AUC: 0.717932993350978
F1 Score: 0.4455239417071478


In [148]:
lgbm_test_preds

array([0.09973263, 0.30152035, 0.15214354, ..., 0.12434203, 0.29900278,
       0.05742968])

In [149]:
df_preds = pd.DataFrame()

In [150]:
df_preds['catboost_preds'] = y_pred_proba
df_preds['XGBoost_preds'] = y_prob_test
df_preds['LGBM_preds'] = lgbm_test_preds
df_preds['Real_class'] = y_test.reset_index(drop=True)

In [151]:
df_preds

Unnamed: 0,catboost_preds,XGBoost_preds,LGBM_preds,Real_class
0,0.450911,0.091859,0.099733,0
1,0.399373,0.214781,0.301520,0
2,0.361224,0.091013,0.152144,0
3,0.187130,0.029062,0.040717,0
4,0.298977,0.022129,0.095674,0
...,...,...,...,...
2768,0.742135,0.272055,0.390383,1
2769,0.537838,0.318988,0.403494,0
2770,0.513000,0.052810,0.124342,0
2771,0.581785,0.105177,0.299003,0


In [152]:
# df_preds.to_csv('df_preds_V4.csv', index=False)

In [153]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

In [210]:
# Инициализация словаря для хранения метрик новых данных
metrics_new = {}

model_names = ['catboost_preds', 'XGBoost_preds', 'LGBM_preds']
thresholds = np.linspace(0.01, 1, 300)
metrics_optimal_threshold = {}

# Расчет метрик для каждой модели в новом наборе данных
for model in model_names:
    f1_scores = [f1_score(df_preds['Real_class'], df_preds[model] > thresh) for thresh in thresholds]
    
    # Определение оптимального порога
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    print(model, optimal_threshold)
    # Бинаризация предсказаний на основе оптимального порога
    predictions_optimal = (df_preds[model] >= optimal_threshold).astype(int)
    
    # Расчет метрик
    accuracy_new = accuracy_score(df_preds['Real_class'], predictions_optimal)
    auc_roc_new = roc_auc_score(df_preds['Real_class'], df_preds[model])
    precision_new = precision_score(df_preds['Real_class'], predictions_optimal)
    recall_new = recall_score(df_preds['Real_class'], predictions_optimal)
    f1_new = f1_score(df_preds['Real_class'], predictions_optimal)
    
    # Сохранение метрик в словаре
    metrics_new[model] = {
        'Accuracy': accuracy_new,
        'AUC ROC': auc_roc_new,
        'Precision': precision_new,
        'Recall': recall_new,
        'F1 Score': f1_new
    }

metrics_new_df = pd.DataFrame(metrics_new).T
metrics_new_df

catboost_preds 0.4603010033444816
XGBoost_preds 0.14244147157190637
LGBM_preds 0.2583277591973244


Unnamed: 0,Accuracy,AUC ROC,Precision,Recall,F1 Score
catboost_preds,0.702128,0.732683,0.364008,0.635714,0.462939
XGBoost_preds,0.710783,0.723736,0.364955,0.583929,0.449176
LGBM_preds,0.741435,0.717933,0.392613,0.5125,0.444617


In [264]:
cat_pred_test = CatBoost_model.predict_proba(test_cv)[:, 1]

In [159]:
XGB_pred_test = best_model.predict_proba(test_cv)[:, 1]
LGBM_pred_test = lgbm_model.predict_proba(test_cv)[:, 1]

In [160]:
cat_optimal_threshold = 0.4603010033444816
XGB_optimal_threshold = 0.14244147157190637
LGBM_optimal_threshold = 0.2583277591973244

In [265]:
cat_final_predictions = (cat_pred_test > optimal_threshold).astype(int)
XGB_final_predictions = (XGB_pred_test > optimal_threshold).astype(int)
LGBM_final_predictions = (LGBM_pred_test > optimal_threshold).astype(int)

In [266]:
test_preds_df = pd.DataFrame()
test_preds_df['cat_final_predictions'] = cat_final_predictions
# test_preds_df['XGB_final_predictions'] = XGB_final_predictions
# test_preds_df['LGBM_final_predictions'] = LGBM_final_predictions

In [267]:
test_preds_df

Unnamed: 0,cat_final_predictions
0,1
1,0
2,0
3,0
4,1
...,...
5937,0
5938,0
5939,0
5940,0


In [170]:
test_preds_df.to_csv('test_preds_df.csv', index=False)

In [213]:
test_preds_df['Consensus'] = test_preds_df.apply(lambda x: x.value_counts().max()/3, axis=1)

In [268]:
# Окончательное предсказание на основе голосования
test_preds_df['Final_Prediction'] = test_preds_df.mode(axis=1)[0].astype(int)

In [269]:
test_preds_df.iloc[:30,:]

Unnamed: 0,cat_final_predictions,Final_Prediction
0,1,1
1,0,0
2,0,0
3,0,0
4,1,1
5,0,0
6,1,1
7,0,0
8,1,1
9,0,0


In [270]:
test_preds_df['Final_Prediction'].sum()

2129

In [271]:
submission = pd.read_csv(r'/kaggle/input/leopard-challenge-classification/sample_submission.csv')

In [272]:
submission['smoking'] = test_preds_df['Final_Prediction']
submission.to_csv('Catboost_only_submission_20_4.csv', index = False)