#### Packages

In [1]:
!pip install catboost optuna
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb
import catboost as ct
from sklearn.metrics import log_loss, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import optuna
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
import numpy as np

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna
  Downloading optuna-3.1.1-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.wh

#### Loading data

In [2]:

# Mount Google Drive so the competition CSV files are readable from Colab.
from google.colab import drive
drive.mount('/content/drive')
# Read the four CSV files from the mounted Drive folder.
train = pd.read_csv('/content/drive/MyDrive/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/content/drive/MyDrive/icr-identify-age-related-conditions/test.csv')
greeks = pd.read_csv('/content/drive/MyDrive/icr-identify-age-related-conditions/greeks.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/icr-identify-age-related-conditions/sample_submission.csv')

##---- Copy from here

# Left-join greeks onto train by 'Id', keeping every row of train.
df = pd.merge(train, greeks, how='left', on='Id')
def encode(dataframe):
    """Label-encode the object-dtype columns of ``dataframe`` in place.

    Every column whose dtype is ``object`` is replaced by the integer codes
    produced by ``LabelEncoder.fit_transform``, except the identifier column
    and ``Epsilon``, which are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, for call-chaining convenience.
    """
    # The identifier column is spelled 'Id' (it is the merge key used when
    # joining train and greeks); the lowercase spelling is kept as well so
    # frames using either spelling are handled.
    skip_columns = {'id', 'Id', 'Epsilon'}
    object_columns = list(dataframe.loc[:, dataframe.dtypes == 'object'].columns)
    for column in object_columns:
        if column in skip_columns:
            continue
        # A fresh encoder per column: each fit is independent of the others.
        dataframe[column] = LabelEncoder().fit_transform(dataframe[column])
    return dataframe
# Fill missing feature values from the 2 nearest neighbours.
imputer = KNNImputer(n_neighbors=2)

# Model input columns. Some names carry a trailing space ('BD ', 'CD ',
# 'CW ', 'FD ') and must be kept exactly as written to match the CSV header.
features = ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',
       'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',
       'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
       'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',
       'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL']

# NOTE(review): encode() fits its encoder separately on each frame, so the
# integer codes of train and test only agree when both frames contain the
# same set of category values.
df = encode(df)
test = encode(test)

# Fit the imputer on the training frame only, then apply that same fitted
# imputer to the test frame. Refitting on test would impute test rows from
# test-only neighbours and make train/test preprocessing inconsistent.
df[features] = imputer.fit_transform(df[features])
test[features] = imputer.transform(test[features])

target = 'Class'
X = df[features]
y = df[target]

# 80/20 hold-out split of the training data (train_test_split is already
# imported in the imports cell).
X_t, X_test, y_t, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Mounted at /content/drive


#### Tuning

In [3]:
def balanced_log_loss(y_true, y_pred, class_weights):
    """Class-weighted binary log loss averaged over samples.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Binary labels (0 or 1). Lists, NumPy arrays and pandas Series are
        accepted; values are used positionally.
    y_pred : array-like of shape (n_samples, 2)
        Predicted probabilities; column 0 is read as the probability of
        class 0 and column 1 as the probability of class 1.
    class_weights : mapping or sequence
        Indexed with ``0`` and ``1``. The reciprocal of ``class_weights[0]``
        scales the positive-label term and the reciprocal of
        ``class_weights[1]`` scales the negative-label term.

    Returns
    -------
    float
        Mean of the per-sample weighted loss.
    """
    epsilon = 1e-15
    # Coerce to arrays so plain Python lists work (float * list would raise).
    labels = np.asarray(y_true, dtype=float)
    # Clip so that log() never sees exactly 0 or 1.
    proba = np.clip(np.asarray(y_pred, dtype=float), epsilon, 1 - epsilon)
    w0 = 1 / class_weights[0]
    w1 = 1 / class_weights[1]
    positive_term = w0 * labels * np.log(proba[:, 1])
    negative_term = w1 * (1 - labels) * np.log(proba[:, 0])
    loss = -2 * (positive_term + negative_term) / (w0 + w1)
    return loss.mean()
    

#### Final Model

In [4]:
# Hand-entered hyper-parameters (presumably copied from an earlier tuning
# run — confirm before reuse).
best_params = {'iterations': 212,
 'depth': 4,
 'learning_rate': 0.02995163795315085,
 'random_strength': 0,
 'bagging_temperature': 0.010387485768987258,
 'od_type': 'Iter',
 'od_wait': 37}

# Inverse-frequency class weights, keyed explicitly by label: each class is
# weighted by the share of the *other* class. Looking counts up by label
# (rather than by position in value_counts()) keeps the weights correct
# whichever class happens to be the majority.
class_counts = y.value_counts()
n_samples = class_counts.loc[0] + class_counts.loc[1]
class_weights = {0: class_counts.loc[1] / n_samples,
                 1: class_counts.loc[0] / n_samples}

final_model = CatBoostClassifier(**best_params, loss_function='Logloss', eval_metric='Logloss',
                                 task_type='GPU', class_weights=class_weights)
final_model.fit(X, y, verbose=0)  # Train on the full data
prediction = final_model.predict_proba(test[features])

In [5]:
# Cross-validation settings. A fixed seed makes the fold assignment (and the
# reported score) reproducible; change CV_SEED to probe split sensitivity.
N_SPLITS = 5
CV_SEED = 42

strat_kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CV_SEED)

# Balanced log loss of each validation fold.
log_loss_scores = []

# Fold variables are named *_valid so the hold-out X_test / y_test created
# during data preparation are not overwritten by the loop.
for train_index, valid_index in strat_kfold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    model = CatBoostClassifier(**best_params, loss_function='Logloss', eval_metric='Logloss', class_weights=class_weights)
    model.fit(X_train, y_train, verbose=0)

    y_pred = model.predict_proba(X_valid)
    log_loss_scores.append(balanced_log_loss(y_valid, y_pred, class_weights))

# Report the mean across folds; the fold count in the message is taken from
# N_SPLITS so it cannot drift from the actual configuration.
print(f"Mean Log Loss on test data across {N_SPLITS} folds: ", np.mean(log_loss_scores))

Mean Log Loss on test data across 10 folds:  0.18127967275749446


In [6]:
# Display the individual per-fold scores collected by the cross-validation loop.
log_loss_scores

[0.1802107408611545,
 0.14310649327564143,
 0.26260660254315477,
 0.19017781141942705,
 0.13029671568809467]

In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import log_loss
import optuna

def objective(trial):
    """Optuna objective: mean balanced log loss of CatBoost under repeated CV.

    Samples one CatBoost hyper-parameter set from ``trial``, trains a model on
    every split of a 5x5 repeated stratified K-fold over the module-level
    ``X`` / ``y``, and scores each validation fold with ``balanced_log_loss``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean validation loss over all splits (lower is better).
    """
    # Search space. suggest_float replaces the suggest_uniform /
    # suggest_loguniform helpers, which are deprecated in Optuna 3.x; the
    # parameter names and ranges are unchanged.
    params = {
        'iterations': trial.suggest_int('iterations', 250, 300),
        'depth': trial.suggest_int('depth', 8, 14),
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.03),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 50.00, log=True),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50)
    }

    # Inverse-frequency class weights keyed explicitly by label, so they stay
    # correct whichever class is the majority.
    class_counts = y.value_counts()
    n_samples = class_counts.loc[0] + class_counts.loc[1]
    class_weights = {0: class_counts.loc[1] / n_samples,
                     1: class_counts.loc[0] / n_samples}

    n_splits = 5
    n_repeats = 5  # Number of times the K-fold procedure is repeated.
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)

    log_loss_scores = []
    for train_index, valid_index in rskf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        model = CatBoostClassifier(**params, loss_function='Logloss', eval_metric='Logloss', task_type='GPU', class_weights=class_weights)
        # NOTE(review): the fold used for early stopping is the same fold that
        # is scored below, so the reported loss may be optimistic. Also,
        # early_stopping_rounds may take precedence over the sampled od_wait —
        # confirm against the CatBoost documentation.
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100, verbose=False)

        pred_valid = model.predict_proba(X_valid)
        log_loss_scores.append(balanced_log_loss(y_valid, pred_valid, class_weights))

    return np.mean(log_loss_scores)

# Create the study and run the optimization. The sampler is seeded so the
# sequence of suggested hyper-parameters is reproducible across runs (TPE is
# also what create_study uses when no sampler is given).
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=40)


[32m[I 2023-05-26 17:36:27,286][0m A new study created in memory with name: no-name-b1274099-032d-4b5c-95f7-86ae41916c01[0m
[32m[I 2023-05-26 17:38:25,348][0m Trial 0 finished with value: 0.2128080146597731 and parameters: {'iterations': 296, 'depth': 13, 'learning_rate': 0.02250166425923213, 'random_strength': 2, 'bagging_temperature': 19.944323466207972, 'od_type': 'Iter', 'od_wait': 16}. Best is trial 0 with value: 0.2128080146597731.[0m
[32m[I 2023-05-26 17:40:14,100][0m Trial 1 finished with value: 0.22725899438935554 and parameters: {'iterations': 262, 'depth': 11, 'learning_rate': 0.024281461411614327, 'random_strength': 65, 'bagging_temperature': 0.02872373924991248, 'od_type': 'Iter', 'od_wait': 49}. Best is trial 0 with value: 0.2128080146597731.[0m
[32m[I 2023-05-26 17:41:24,669][0m Trial 2 finished with value: 0.19994706870937054 and parameters: {'iterations': 271, 'depth': 8, 'learning_rate': 0.024675582577956198, 'random_strength': 15, 'bagging_temperature': 2.

In [None]:
# Inverse-frequency class weights keyed explicitly by label, so they stay
# correct whichever class is the majority.
class_counts = y.value_counts()
n_samples = class_counts.loc[0] + class_counts.loc[1]
class_weights = {0: class_counts.loc[1] / n_samples,
                 1: class_counts.loc[0] / n_samples}

# 50/50 split with a fixed seed (reproducible) and stratified on the target
# so both halves keep the class ratio of the full data.
# NOTE(review): the study was tuned on all of X / y, so these held-out rows
# were already seen during tuning; treat the score as a sanity check only.
X_t, X_test, y_t, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)
best_params = study.best_params
model = CatBoostClassifier(**best_params, loss_function='Logloss', eval_metric='Logloss', class_weights=class_weights)
model.fit(X_t, y_t, verbose=0)
y_pred = model.predict_proba(X_test)
balanced_log_loss(y_test, y_pred, class_weights)
