In [1]:
import pandas as pd
import numpy as np
from phik.report import plot_correlation_matrix
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, \
    classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from sklearn.metrics import roc_auc_score, average_precision_score
from optuna.pruners import MedianPruner
from sklearn.metrics import f1_score
# plt.style.use('dark_background')

In [2]:
df_raw = pd.read_csv('/kaggle/input/leopard-challenge-classification/train.csv')
test = pd.read_csv(r'/kaggle/input/leopard-challenge-classification/test.csv')
df_raw.shape, test.shape

((13863, 26), (5942, 25))

In [3]:
test_cv = test.drop(['oral', 'ID'], axis = 1).copy()
test_cv['tartar'] = test_cv['tartar'].apply(lambda x: 1 if x == 'Y' else 0)
merged_data = df_raw.drop(['oral', 'ID'], axis=1).copy()
merged_data['tartar'] = merged_data['tartar'].apply(lambda x: 1 if x == 'Y' else 0)

In [4]:
merged_data['BMI'] = merged_data['weight(kg)'] / ((merged_data['height(cm)'] / 100) ** 2)
merged_data['chol_hdl'] = merged_data['Cholesterol'] / merged_data['HDL']
merged_data['ldl_hdl'] = merged_data['LDL'] / merged_data['HDL']
merged_data['map'] = (merged_data['systolic'] + (2 * merged_data['relaxation'])) / 3
merged_data['wh_ratio'] = merged_data['waist(cm)'] / merged_data['height(cm)']
merged_data['height_nin_110'] = (merged_data['height(cm)'] - 110) / merged_data['weight(kg)']
merged_data['hearing_mean'] = (merged_data['hearing(left)'] + merged_data['hearing(right)']) / 2
merged_data['eyesight_mean'] = (merged_data['eyesight(left)'] + merged_data['eyesight(right)']) / 2
merged_data['ast_alt'] = merged_data['AST'] / merged_data['ALT']
merged_data.shape

(13863, 33)

In [5]:
def gini_impurity(y):
    """
    Вычисление Gini impurity для списка меток y.
    """
    if len(y) == 0:
        return 0
    
    p = sum(y) / len(y)
    return 1 - p**2 - (1-p)**2

def weighted_gini_impurity(y_left, y_right):
    """
    Вычисление взвешенной Gini impurity для двух подмножеств данных.
    """
    n_left, n_right = len(y_left), len(y_right)
    n_total = n_left + n_right
    
    return (n_left / n_total) * gini_impurity(y_left) + (n_right / n_total) * gini_impurity(y_right)

def best_split_threshold(feature, target):
    """
    Нахождение наилучшего порога разделения для заданного признака.
    """
    # Сортировка признака
    sorted_idx = feature.argsort()
    sorted_feature = feature[sorted_idx]
    sorted_target = target[sorted_idx]

    # Инициализация переменных
    best_gini = float('inf')
    best_threshold = None
    
    # Перебор возможных пороговых значений
    for i in range(1, len(sorted_feature)):
        if sorted_feature[i] != sorted_feature[i-1]:
            threshold = (sorted_feature[i] + sorted_feature[i-1]) / 2
            gini = weighted_gini_impurity(sorted_target[:i], sorted_target[i:])
            if gini < best_gini:
                best_gini = gini
                best_threshold = threshold
                
    return best_threshold, best_gini

# Тестирование на одном из признаков
# feature_name = 'age'
# threshold, gini = best_split_threshold(merged_data[feature_name].values, merged_data['smoking'].values)
# threshold, gini


In [6]:
# Список непрерывных признаков
continuous_features = ['BMI', 'chol_hdl', 'ldl_hdl', 'map', 'wh_ratio', 'height_nin_110', 'hearing_mean', 'eyesight_mean', 'ast_alt',
    'age', 'height(cm)', 'weight(kg)', 'waist(cm)', 
    'eyesight(left)', 'eyesight(right)', 'systolic', 'relaxation', 
    'fasting blood sugar', 'Cholesterol', 'triglyceride', 'HDL', 'LDL', 
    'hemoglobin', 'Urine protein', 'serum creatinine', 'AST', 'ALT', 'Gtp'
]

# Нахождение наилучших порогов разделения для каждого признака
best_thresholds = {}
for feature in continuous_features:
    threshold, gini = best_split_threshold(merged_data[feature].values, merged_data['smoking'].values)
    best_thresholds[feature] = (threshold, gini)

best_thresholds


{'BMI': (19.79591836734694, 0.3215269436987101),
 'chol_hdl': (4.456331045003814, 0.32129215554826224),
 'ldl_hdl': (0.7075046904315196, 0.32157596526378096),
 'map': (134.33333333333334, 0.32225199236270957),
 'wh_ratio': (0.408989898989899, 0.32193085250471454),
 'height_nin_110': (1.080128205128205, 0.3216415408356256),
 'hearing_mean': (1.25, 0.3222019768227528),
 'eyesight_mean': (0.325, 0.3221792168351839),
 'ast_alt': (2.8057971014492753, 0.3221808924008923),
 'age': (62.5, 0.32041557759499767),
 'height(cm)': (167.5, 0.3220007667212753),
 'weight(kg)': (52.5, 0.32205507080218254),
 'waist(cm)': (94.05, 0.3219400088264416),
 'eyesight(left)': (0.35, 0.3222692060944869),
 'eyesight(right)': (1.55, 0.3222898572229469),
 'systolic': (140.5, 0.3221474813128374),
 'relaxation': (69.5, 0.32214158188668934),
 'fasting blood sugar': (238.5, 0.3217415349923938),
 'Cholesterol': (316.5, 0.3222386111993474),
 'triglyceride': (114.5, 0.3168128518892239),
 'HDL': (50.5, 0.3212278706120323),


In [7]:
merged_data['age_normal'] = (merged_data['age'] < 62.5).astype(int)
merged_data['weight(kg)_normal'] = (merged_data['weight(kg)'] < 52.5).astype(int)
merged_data['waist(cm)_normal'] = (merged_data['waist(cm)'] < 94.05).astype(int)
merged_data['eyesight(left)_normal'] = (merged_data['eyesight(left)'] < 0.35).astype(int)
merged_data['eyesight(right)_normal'] = (merged_data['eyesight(right)'] < 1.55).astype(int)
merged_data['systolic_normal'] = (merged_data['systolic'] < 140.5).astype(int)
merged_data['relaxation_normal'] = (merged_data['relaxation'] < 69.5).astype(int)
merged_data['fasting_blood_sugar_normal'] = (merged_data['fasting blood sugar'] < 238.5).astype(int)
merged_data['Cholesterol_normal'] = (merged_data['Cholesterol'] < 316.5).astype(int)
merged_data['triglyceride_normal'] = (merged_data['triglyceride'] < 114.5).astype(int)
merged_data['HDL_normal'] = (merged_data['HDL'] < 50.5).astype(int)
merged_data['LDL_normal'] = (merged_data['LDL'] < 88.5).astype(int)
merged_data['hemoglobin_normal'] = (merged_data['hemoglobin'] < 16.45).astype(int)
merged_data['Urine_protein_normal'] = (merged_data['Urine protein'] < 1.5).astype(int)
merged_data['serum_creatinine_normal'] = (merged_data['serum creatinine'] < 0.75).astype(int)
merged_data['AST_normal'] = (merged_data['AST'] < 45.5).astype(int)
merged_data['ALT_normal'] = (merged_data['ALT'] < 29.5).astype(int)
merged_data['Gtp_normal'] = (merged_data['Gtp'] < 43.5).astype(int)
merged_data['BMI_normal'] = (merged_data['BMI'] < 19.8).astype(int)
merged_data['chol_hdl_normal'] = (merged_data['chol_hdl'] < 4.46).astype(int)
merged_data['ldl_hdl_normal'] = (merged_data['ldl_hdl'] < 0.71).astype(int)
merged_data['map_normal'] = (merged_data['map'] < 134.3).astype(int)
merged_data['wh_ratio_normal'] = (merged_data['wh_ratio'] < 0.41).astype(int)
merged_data['height_nin_110_normal'] = (merged_data['height_nin_110'] < 1.08).astype(int)
merged_data['hearing_mean_normal'] = (merged_data['hearing_mean'] < 1.25).astype(int)
merged_data['eyesight_mean_normal'] = (merged_data['eyesight_mean'] < 0.325).astype(int)
merged_data['ast_alt_normal'] = (merged_data['ast_alt'] < 2.81).astype(int)

In [8]:
bin_only_features = merged_data.nunique()[merged_data.nunique() < 3].index.to_list()
bin_only_features

['hearing(left)',
 'hearing(right)',
 'dental caries',
 'tartar',
 'smoking',
 'age_normal',
 'weight(kg)_normal',
 'waist(cm)_normal',
 'eyesight(left)_normal',
 'eyesight(right)_normal',
 'systolic_normal',
 'relaxation_normal',
 'fasting_blood_sugar_normal',
 'Cholesterol_normal',
 'triglyceride_normal',
 'HDL_normal',
 'LDL_normal',
 'hemoglobin_normal',
 'Urine_protein_normal',
 'serum_creatinine_normal',
 'AST_normal',
 'ALT_normal',
 'Gtp_normal',
 'BMI_normal',
 'chol_hdl_normal',
 'ldl_hdl_normal',
 'map_normal',
 'wh_ratio_normal',
 'height_nin_110_normal',
 'hearing_mean_normal',
 'eyesight_mean_normal',
 'ast_alt_normal']

In [9]:
from sklearn.model_selection import train_test_split

X = merged_data.drop(['smoking'], axis=1).copy()
y = merged_data['smoking'].copy()

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train.shape, X_test.shape

((11090, 59), (2773, 59))

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

X_train = pd.DataFrame(X_train)
X_val = pd.DataFrame(X_val)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((8872, 59), (2218, 59), (8872,), (2218,))

In [21]:
import optuna
from sklearn.metrics import roc_auc_score, average_precision_score


X_train_temp = X_train.copy() # .drop(columns=['hearing(left)'])
X_test_temp = X_test.copy()
X_val_temp = X_val.copy()

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 800, 2500),
        'boosting_type': trial.suggest_categorical('boosting_type', [ 'Plain']), # 'Ordered',
        'depth': trial.suggest_int('depth', 5, 13),
        'learning_rate': trial.suggest_float('learning_rate', 0.002, 0.08 ), #  0.05, 0.1
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 6, 20),
        'border_count': trial.suggest_int('border_count', 40, 200),
        'od_type': 'Iter', 
        'od_wait': 100, 
        'eval_metric': 'AUC',
        'logging_level': 'Silent',
        'random_seed': 42,
        'auto_class_weights': 'Balanced'
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train_temp, y_train, eval_set=[(X_val_temp, y_val)], early_stopping_rounds=params['od_wait'], cat_features=[])

    y_pred_proba = model.predict_proba(X_test_temp)[:, 1]
        
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    avg_prec = average_precision_score(y_test, y_pred_proba)
    
    return roc_auc, avg_prec


In [22]:
%%time 
study = optuna.create_study(pruner=MedianPruner(), directions=['maximize', 'maximize'] ) #pruner=MedianPruner(), sampler=optuna.samplers.TPESampler(seed=42)
study.optimize(objective,
               n_jobs=-1,
               n_trials=25, # 25 показывает результат лучше
               show_progress_bar=True
              )

best_trial = study.best_trials[0]
roc_auc_best = best_trial.values[0]
avg_prec_best = best_trial.values[1]

print(f"Лучшее значение ROC AUC: {roc_auc_best:.4f}")
print(f"Лучшее значение Average Precision: {avg_prec_best:.4f}")

[I 2023-10-25 14:01:57,619] A new study created in memory with name: no-name-7054c1ae-8374-4bd3-a677-e2fd307ec8f4


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2023-10-25 14:02:11,577] Trial 1 finished with values: [0.7257238073720225, 0.42130407230476113] and parameters: {'iterations': 1735, 'boosting_type': 'Plain', 'depth': 8, 'learning_rate': 0.057278648147773545, 'l2_leaf_reg': 7.638211594461918, 'border_count': 82}. 
[I 2023-10-25 14:02:17,714] Trial 4 finished with values: [0.7304547801949519, 0.42479235740178045] and parameters: {'iterations': 1679, 'boosting_type': 'Plain', 'depth': 6, 'learning_rate': 0.07701295662223373, 'l2_leaf_reg': 10.480718507697992, 'border_count': 158}. 
[I 2023-10-25 14:02:36,270] Trial 5 finished with values: [0.730463656316571, 0.42194903305648007] and parameters: {'iterations': 1750, 'boosting_type': 'Plain', 'depth': 5, 'learning_rate': 0.017703143110127065, 'l2_leaf_reg': 12.434756574611423, 'border_count': 198}. 
[I 2023-10-25 14:03:30,600] Trial 2 finished with values: [0.7428958750242076, 0.442157418941274] and parameters: {'iterations': 1388, 'boosting_type': 'Plain', 'depth': 12, 'learning_rate

In [23]:
import optuna.visualization as vis


def target_function(trial):
    return trial.values[0]  # 0 соответствует первой метрике (ROC AUC)

vis.plot_slice(study, params=['iterations', 'boosting_type', 'depth', 'learning_rate', 'l2_leaf_reg' ], target=target_function)


`target` is specified, but `target_name` is the default value, 'Objective Value'.



In [24]:
from sklearn.metrics import f1_score

# Возьмите параметры из лучшего trial на основе первой метрики (ROC AUC в данном случае)
best_params = study.best_trials[0].params

model = CatBoostClassifier(**best_params, auto_class_weights='Balanced', logging_level='Silent') #  task_type='GPU' auto_class_weights='Balanced' , 

model.fit(X_train_temp, y_train, eval_set=[(X_val_temp, y_val)], early_stopping_rounds=100, verbose=100, cat_features=[]) # cat_features

y_pred_proba = model.predict_proba(X_test_temp)[:, 1]

thresholds = np.linspace(0.01, 1, 300)
f1_scores = [f1_score(y_test, y_pred_proba > thresh) for thresh in thresholds]
optimal_threshold = thresholds[np.argmax(f1_scores)]

# print(f"Оптимальный порог: {optimal_threshold:.5f}")
print(f"Наивысший F1: {max(f1_scores):.5f}")

Наивысший F1: 0.46843


In [14]:
test_cv['BMI'] = test_cv['weight(kg)'] / ((test_cv['height(cm)'] / 100) ** 2)
test_cv['chol_hdl'] = test_cv['Cholesterol'] / test_cv['HDL']
test_cv['ldl_hdl'] = test_cv['LDL'] / test_cv['HDL']
test_cv['map'] = (test_cv['systolic'] + (2 * test_cv['relaxation'])) / 3
test_cv['wh_ratio'] = test_cv['waist(cm)'] / test_cv['height(cm)']
test_cv['height_nin_110'] = (test_cv['height(cm)'] - 110) / test_cv['weight(kg)']
test_cv['hearing_mean'] = (test_cv['hearing(left)'] + test_cv['hearing(right)']) / 2
test_cv['eyesight_mean'] = (test_cv['eyesight(left)'] + test_cv['eyesight(right)']) / 2
test_cv['ast_alt'] = test_cv['AST'] / test_cv['ALT']

test_cv['age_normal'] = (test_cv['age'] < 62.5).astype(int)
test_cv['weight(kg)_normal'] = (test_cv['weight(kg)'] < 52.5).astype(int)
test_cv['waist(cm)_normal'] = (test_cv['waist(cm)'] < 94.05).astype(int)
test_cv['eyesight(left)_normal'] = (test_cv['eyesight(left)'] < 0.35).astype(int)
test_cv['eyesight(right)_normal'] = (test_cv['eyesight(right)'] < 1.55).astype(int)
test_cv['systolic_normal'] = (test_cv['systolic'] < 140.5).astype(int)
test_cv['relaxation_normal'] = (test_cv['relaxation'] < 69.5).astype(int)
test_cv['fasting_blood_sugar_normal'] = (test_cv['fasting blood sugar'] < 238.5).astype(int)
test_cv['Cholesterol_normal'] = (test_cv['Cholesterol'] < 316.5).astype(int)
test_cv['triglyceride_normal'] = (test_cv['triglyceride'] < 114.5).astype(int)
test_cv['HDL_normal'] = (test_cv['HDL'] < 50.5).astype(int)
test_cv['LDL_normal'] = (test_cv['LDL'] < 88.5).astype(int)
test_cv['hemoglobin_normal'] = (test_cv['hemoglobin'] < 16.45).astype(int)
test_cv['Urine_protein_normal'] = (test_cv['Urine protein'] < 1.5).astype(int)
test_cv['serum_creatinine_normal'] = (test_cv['serum creatinine'] < 0.75).astype(int)
test_cv['AST_normal'] = (test_cv['AST'] < 45.5).astype(int)
test_cv['ALT_normal'] = (test_cv['ALT'] < 29.5).astype(int)
test_cv['Gtp_normal'] = (test_cv['Gtp'] < 43.5).astype(int)
test_cv['BMI_normal'] = (test_cv['BMI'] < 19.8).astype(int)
test_cv['chol_hdl_normal'] = (test_cv['chol_hdl'] < 4.46).astype(int)
test_cv['ldl_hdl_normal'] = (test_cv['ldl_hdl'] < 0.71).astype(int)
test_cv['map_normal'] = (test_cv['map'] < 134.3).astype(int)
test_cv['wh_ratio_normal'] = (test_cv['wh_ratio'] < 0.41).astype(int)
test_cv['height_nin_110_normal'] = (test_cv['height_nin_110'] < 1.08).astype(int)
test_cv['hearing_mean_normal'] = (test_cv['hearing_mean'] < 1.25).astype(int)
test_cv['eyesight_mean_normal'] = (test_cv['eyesight_mean'] < 0.325).astype(int)
test_cv['ast_alt_normal'] = (test_cv['ast_alt'] < 2.81).astype(int)

In [None]:
cat_pred_test = model.predict_proba(test_cv)[:, 1]    

In [None]:
final_predictions = (avg_ens_preds > optimal_threshold).astype(int)
final_predictions.sum()

In [None]:
submission = pd.read_csv(r'/kaggle/input/leopard-challenge-classification/sample_submission.csv')

In [None]:
submission['smoking'] = final_predictions
submission.to_csv('Ensemble_submission_19_1.csv', index = False)