# 0. Default start

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

In [3]:
np.random.seed(42)

In [4]:
df = pd.read_csv('train_dataset_Самолет.csv')

  df = pd.read_csv('train_dataset_Самолет.csv')


# 1. Data analyze

In [5]:
def count_missing_values(df):
    """
    Функция для подсчета пропусков в указанном столбце DataFrame. 
    Параметры:
    df (pd.DataFrame): Исходный DataFrame. 
    Возвращает:
    float: Процент пропусков в столбце."""
    total_rows = df.shape[0]  # Общее количество строк в DataFrame
    missing_values = df.isnull().sum()  # Подсчет пропусков в столбце

    # Вычисление процента пропусков
    missing_percentage = (missing_values / total_rows) * 100

    return missing_percentage

In [6]:
df = df.replace({'nan': None})

In [7]:
df.shape

(14456, 2666)

In [8]:
df = df.T.drop_duplicates().T
df.shape

(14456, 2213)

In [9]:
miss_pos_col = count_missing_values(df[df['target'] == 1])
threshold = 75 # percent value in col for drop
to_drop = miss_pos_col[miss_pos_col >= threshold].index.tolist()
len(to_drop)

1876

In [10]:
df = df.drop(to_drop, axis=1)

In [11]:
df.shape

(14456, 337)

In [12]:
def intersection_col(df, list_cols):
    if isinstance(list_cols, list):
        return list(set(df.columns.tolist()) & set(list_cols))
    else:
        return list(set(df.columns.tolist()) & set(list_cols.columns.tolist()))

In [13]:
cat_all_columns = ['col520', 'col528', 'col536',
 'col544', 'col552', 'col592', 'col600', 'col608', 'col1454']
to_drop_cat = ['col552', 'col1454']

In [14]:
cat_cols = intersection_col(df, cat_all_columns)
to_drop = list(set(to_drop_cat) & set(cat_cols))
cat_cols = list(set(cat_cols) - set(to_drop))
len(cat_cols)

0

In [15]:
df = df.drop(to_drop, axis=1)

In [16]:
df.shape

(14456, 336)

In [17]:
cols = df.columns.tolist()
cols.remove('client_id'), cols.remove('target'), cols.remove('report_date')
df[cols] = df[cols].astype(np.float64)

category_cols = []

for col in df.columns.tolist():
    if col == 'report_date':
        continue
    if df[col].max() <= 40:
        category_cols.append(col)
category_cols.remove('col2663'), category_cols.remove('target')

(None, None)

In [18]:
# df[category_cols] = df[category_cols].astype(str)

# 2. Model builder

In [19]:
from sklearn.metrics import classification_report
from eli5.sklearn import PermutationImportance
from lightgbm import LGBMClassifier
from category_encoders import CatBoostEncoder

In [20]:
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    average_precision_score,
    log_loss,
    )
def evaluate_metrics(y_true, y_pred):
    roc_auc = roc_auc_score(y_true, y_pred) 
    accuracy = accuracy_score(y_true, (y_pred > 0.5).astype(int)) 
    precision = precision_score(y_true, (y_pred > 0.5).astype(int)) 
    recall = recall_score(y_true, (y_pred > 0.5).astype(int))
    f1 = f1_score(y_true, (y_pred > 0.5).astype(int)) 
    confusion = confusion_matrix(y_true, (y_pred > 0.5).astype(int)) ок
    pr_auc = average_precision_score(y_true, y_pred) 
    logloss = log_loss(y_true, y_pred)

    print(f"ROC AUC: {roc_auc}", end='; ')
    print(f"Accuracy: {accuracy}", end='; ')
    print(f"Precision: {precision}", end='; ')
    print(f"Recall: {recall}", end='; ')
    print(f"F1 Score: {f1}", end='; ')
    print(f"PR AUC: {pr_auc}", end='; ')
    print(f"Log Loss: {logloss}")

In [144]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin

from eli5.sklearn import PermutationImportance
from imblearn.over_sampling import SMOTE
import optuna
from optuna.integration import LightGBMPruningCallback
import lightgbm as lgb

    
class FeatureSelector(BaseEstimator, ClassifierMixin):
    def __init__(self, features):
        """
        Класс для отбора заданных признаков.
        
        Параметры:
        features (list): Список заданных признаков.
        """
        self.features = features
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        """
        Метод для отбора заданных признаков из датасета.
        
        Параметры:
        df (pd.DataFrame): Входной датасет.
        
        Возвращает:
        pd.DataFrame: Датасет, содержащий только отобранные признаки.
        
        Исключения:
        ValueError: Генерируется, если какие-то из заданных признаков отсутствуют во входном датасете.
        """
        # Проверяем наличие всех заданных признаков во входном датасете
        missing_features = set(self.features) - set(df.columns)
        if missing_features:
            raise ValueError(f"Признаки {missing_features} отсутствуют во входном датасете.")
        
        # Отбираем заданные признаки из датасета
        selected_df = X[self.features]
        
        return selected_df

def objective(trial, trf, df_train, df_val, target_col='target'):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'min_sum_hessian_in_leaf ': trial.suggest_float('min_sum_hessian_in_leaf', 0.0, 10.0),
        'is_unbalance': trial.suggest_categorical('is_unbalance', [True]),
        'random_state': trial.suggest_categorical('random_state', [42])
    }
    pruning_callback = LightGBMPruningCallback(trial, "auc")
    trf = trf.fit(df_train, df_train[target_col])

    model = lgb.train(params, lgb.Dataset(trf.transform(df_train), label=df_train[target_col]), num_boost_round=1000,
                    valid_sets=[lgb.Dataset(trf.transform(df_val), label=df_val[target_col])], 
                    early_stopping_rounds=100, verbose_eval=False, callbacks=[pruning_callback])
   
    y_pred = model.predict(trf.transform(df_val))

    evaluate_metrics(df_val[target_col], y_pred)
    roc_auc = roc_auc_score(df_val[target_col], y_pred) 

    return roc_auc
    
class ModelBuilder(object):
    def __init__(self, cat_cols, features_name, ml_cfg, target_col='target', params=None):
        
        self.features = features_name
        self.cat_cols = cat_cols
        self.target_col = target_col
        self.cfg = ml_cfg
        selector = FeatureSelector(features=features_name)
        enc = CatBoostEncoder(cols=cat_cols)
        lgbm = LGBMClassifier()
        self.model = Pipeline([('sel', selector), ('enc', enc), ('lgbm', lgbm)])      
        self.model.set_params(**params) 
        
        
    def train(self, df_train):
        model = self.model.fit(df_train, df_train[self.target_col])
        return model
  
    def get_transformer_data(self):
        return Pipeline(self.model.steps[:-1])
        
    def select_features(self, df_train, df_val):

        self.train(df_train)
        transformer = self.get_transformer_data()
        model = self.model.steps[-1][-1]
        transformed_val_data = transformer.transform(df_val)

        print('start feature select...')
        perm = PermutationImportance(model, random_state=42) \
                                    .fit(transformed_val_data, df_val[self.target_col])
        print('end feature select...')

        feature_importances = list(zip(transformed_val_data.columns, perm.feature_importances_))
        feature_importances.sort(key=lambda x: x[1], reverse=True)
        selectd_featurs = [name_col for name_col, value in feature_importances if value > 0]
        print(f'selected {len(selectd_featurs)} out of {len(df_train.columns)}')
        
        cat_cols = list(set(self.cat_cols) & set(selectd_featurs))
        self.model.set_params(**{'sel__features': selectd_featurs,
                                'enc__cols': cat_cols})
    
    def params_tuning(self, df_train, df_val):
        print('start tuning...')
        study = optuna.create_study(direction='maximize')
        transformer = self.get_transformer_data()
        study.optimize(lambda trial: objective(trial, transformer, df_train, df_val),
                       n_jobs=1, n_trials=self.cfg['n_trails'])
        print('end tuning...')

        best_params = study.best_params
        best_score = study.best_value
        best_params = {f'lgbm__{k}':v for k,v in best_params.items()}
        self.model.set_params(**best_params)
        print(f'best params: {best_params}')
        print(f'best score: {best_score}')
    
    # def sampling_dataset(self, x, y):
    #     print('start sampling with SMOTE ...')
    #     trf = self.get_transformer_data().fit(x, y)
    #     sampler = SMOTE(sampling_strategy='auto', random_state=42)
    #     X_train_resampled, y_train_resampled = sampler.fit_resample(trf.transform(x.fillna(-1)), y)
    #     return X_train_resampled, y_train_resampled
    
    def build(self, df):
        print('start build model...')
        df_train, df_val = train_test_split(df, random_state=42, test_size=self.cfg['test_size'],
                                            shuffle=True, stratify=df[self.target_col])
        
        self.select_features(df_train, df_val)
        
        self.params_tuning(df_train, df_val)

        self.model.fit(df_train, df_train[self.target_col])
        
        y_pred = self.model.predict(df_val)
        metrics = roc_auc_score(df_val[self.target_col], y_pred)
        print(f'roc_auc on validate data = {metrics}')
        
        return self.model

# 3. Create and get trainable model

In [149]:
exclude_cols = ['target', 'client_id', 'report_date']
features = df.columns.tolist()
features = list(set(features) - set(exclude_cols))
category_cols = list(set(features) & set(category_cols))
default_params = {'lgbm__n_estimators': 100, 'lgbm__max_depth': 10, 
                  'lgbm__random_state': 42, 'lgbm__n_jobs': -1}
ml_cfg =  {'n_trails': 100, 'test_size': 0.25}

In [150]:
df['target'] = df['target'].astype(int)

In [151]:
builder = ModelBuilder(cat_cols = category_cols, features_name=features, ml_cfg=ml_cfg, params=default_params)

In [152]:
model = builder.build(df)

start build model...
start feature select...


[I 2023-09-09 21:10:04,357] A new study created in memory with name: no-name-eeb6bf39-0271-4eca-9aa2-715848b80a70


end feature select...
selected 88 out of 336
start tuning...


[I 2023-09-09 21:10:05,129] Trial 0 finished with value: 0.8764203169783665 and parameters: {'num_leaves': 98, 'learning_rate': 0.03708942446287805, 'feature_fraction': 0.17052261372406863, 'bagging_fraction': 0.35884062766941294, 'bagging_freq': 8, 'max_depth': 3, 'min_child_samples': 33, 'min_sum_hessian_in_leaf': 2.0667689890434837, 'is_unbalance': True, 'random_state': 42}. Best is trial 0 with value: 0.8764203169783665.


ROC AUC: 0.8764203169783665; Accuracy: 0.9222468179302712; Precision: 0.23939393939393938; Recall: 0.7247706422018348; F1 Score: 0.35990888382687924; PR AUC: 0.34264731153556455; Log Loss: 0.20487345956900088


[I 2023-09-09 21:10:06,301] Trial 1 finished with value: 0.866502637123899 and parameters: {'num_leaves': 255, 'learning_rate': 0.004773572927877699, 'feature_fraction': 0.4597915389271122, 'bagging_fraction': 0.6262217892169022, 'bagging_freq': 5, 'max_depth': 3, 'min_child_samples': 48, 'min_sum_hessian_in_leaf': 2.8707060890315517, 'is_unbalance': True, 'random_state': 42}. Best is trial 0 with value: 0.8764203169783665.


ROC AUC: 0.866502637123899; Accuracy: 0.8696734919756502; Precision: 0.15458015267175573; Recall: 0.7431192660550459; F1 Score: 0.2559241706161137; PR AUC: 0.31856578420396064; Log Loss: 0.2718442886260174


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:07,410] Trial 2 finished with value: 0.8893481134421337 and parameters: {'num_leaves': 111, 'learning_rate': 0.0020946324633802295, 'feature_fraction': 0.12649439298837145, 'bagging_fraction': 0.6489174087629024, 'bagging_freq': 10, 'max_depth': 9, 'min_child_samples': 64, 'min_sum_hessian_in_leaf': 1.8250215929321467, 'is_unbalance': True, 'random_state': 42}. Best is trial 2 with value: 0.8893481134421337.


ROC AUC: 0.8893481134421337; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.38201964090737395; Log Loss: 0.1328231716712364


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:08,288] Trial 3 finished with value: 0.8794487560365926 and parameters: {'num_leaves': 117, 'learning_rate': 0.0017878337373206368, 'feature_fraction': 0.865655847505695, 'bagging_fraction': 0.8638405076539262, 'bagging_freq': 10, 'max_depth': 7, 'min_child_samples': 28, 'min_sum_hessian_in_leaf': 3.6768229724548576, 'is_unbalance': True, 'random_state': 42}. Best is trial 2 with value: 0.8893481134421337.


ROC AUC: 0.8794487560365926; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.23236354156217243; Log Loss: 0.1307365739539597


[I 2023-09-09 21:10:09,113] Trial 4 finished with value: 0.8829928411574552 and parameters: {'num_leaves': 164, 'learning_rate': 0.05465776906186318, 'feature_fraction': 0.5532804523760733, 'bagging_fraction': 0.24664099934199152, 'bagging_freq': 9, 'max_depth': 7, 'min_child_samples': 55, 'min_sum_hessian_in_leaf': 0.18521887428950445, 'is_unbalance': True, 'random_state': 42}. Best is trial 2 with value: 0.8893481134421337.


ROC AUC: 0.8829928411574552; Accuracy: 0.9557277255118982; Precision: 0.3575418994413408; Recall: 0.5871559633027523; F1 Score: 0.4444444444444444; PR AUC: 0.3840254996421908; Log Loss: 0.12805939478455444


[I 2023-09-09 21:10:09,574] Trial 5 pruned. Trial was pruned at iteration 14.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:10,515] Trial 6 finished with value: 0.8970435419911268 and parameters: {'num_leaves': 199, 'learning_rate': 0.01231359676866343, 'feature_fraction': 0.3067815357807885, 'bagging_fraction': 0.469021719383125, 'bagging_freq': 5, 'max_depth': 11, 'min_child_samples': 29, 'min_sum_hessian_in_leaf': 6.219319562894894, 'is_unbalance': True, 'random_state': 42}. Best is trial 6 with value: 0.8970435419911268.


ROC AUC: 0.8970435419911268; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.3784473845400746; Log Loss: 0.11045560016953054


[I 2023-09-09 21:10:11,605] Trial 7 finished with value: 0.8924236673690271 and parameters: {'num_leaves': 67, 'learning_rate': 0.034354993669382085, 'feature_fraction': 0.42046975788078733, 'bagging_fraction': 0.9805527151935329, 'bagging_freq': 10, 'max_depth': 10, 'min_child_samples': 76, 'min_sum_hessian_in_leaf': 1.840743334786894, 'is_unbalance': True, 'random_state': 42}. Best is trial 6 with value: 0.8970435419911268.


ROC AUC: 0.8924236673690271; Accuracy: 0.9579413392363033; Precision: 0.39408866995073893; Recall: 0.7339449541284404; F1 Score: 0.5128205128205129; PR AUC: 0.5619696248266239; Log Loss: 0.1211636557981145


[I 2023-09-09 21:10:12,087] Trial 8 pruned. Trial was pruned at iteration 13.
[I 2023-09-09 21:10:12,547] Trial 9 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:13,020] Trial 10 pruned. Trial was pruned at iteration 0.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:13,862] Trial 11 finished with value: 0.898475310500072 and parameters: {'num_leaves': 188, 'learning_rate': 0.021955915034922963, 'feature_fraction': 0.2882063536499666, 'bagging_fraction': 0.984888969585022, 'bagging_freq': 7, 'max_depth': 10, 'min_child_samples': 98, 'min_sum_hessian_in_leaf': 5.950678344685625, 'is_unbalance': True, 'random_state': 42}. Best is trial 11 with value: 0.898475310500072.


ROC AUC: 0.898475310500072; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.40250212360846827; Log Loss: 0.11912727613510136


[I 2023-09-09 21:10:14,349] Trial 12 pruned. Trial was pruned at iteration 0.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:15,310] Trial 13 finished with value: 0.8889685770000917 and parameters: {'num_leaves': 187, 'learning_rate': 0.019861825691994267, 'feature_fraction': 0.27115646683272066, 'bagging_fraction': 0.510572763233321, 'bagging_freq': 3, 'max_depth': 9, 'min_child_samples': 12, 'min_sum_hessian_in_leaf': 5.419875183904099, 'is_unbalance': True, 'random_state': 42}. Best is trial 11 with value: 0.898475310500072.


ROC AUC: 0.8889685770000917; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.30342542007913864; Log Loss: 0.11534976158275262


[I 2023-09-09 21:10:16,179] Trial 14 finished with value: 0.8940556740698086 and parameters: {'num_leaves': 213, 'learning_rate': 0.08517046843169176, 'feature_fraction': 0.10993590930333669, 'bagging_fraction': 0.8225766949903536, 'bagging_freq': 7, 'max_depth': 10, 'min_child_samples': 34, 'min_sum_hessian_in_leaf': 7.403558981848619, 'is_unbalance': True, 'random_state': 42}. Best is trial 11 with value: 0.898475310500072.


ROC AUC: 0.8940556740698086; Accuracy: 0.9562811289429994; Precision: 0.3793103448275862; Recall: 0.7064220183486238; F1 Score: 0.4935897435897435; PR AUC: 0.4929506889342698; Log Loss: 0.13625950835281023


[I 2023-09-09 21:10:16,692] Trial 15 pruned. Trial was pruned at iteration 17.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:17,600] Trial 16 finished with value: 0.8842309152063239 and parameters: {'num_leaves': 217, 'learning_rate': 0.011072322067355754, 'feature_fraction': 0.5953166336291329, 'bagging_fraction': 0.5674727429841825, 'bagging_freq': 7, 'max_depth': 11, 'min_child_samples': 45, 'min_sum_hessian_in_leaf': 7.9796843042183765, 'is_unbalance': True, 'random_state': 42}. Best is trial 11 with value: 0.898475310500072.


ROC AUC: 0.8842309152063239; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.27829056598849106; Log Loss: 0.11824881242826829


[I 2023-09-09 21:10:18,079] Trial 17 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:18,578] Trial 18 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:19,079] Trial 19 pruned. Trial was pruned at iteration 1.
[I 2023-09-09 21:10:21,738] Trial 20 finished with value: 0.8982685285764765 and parameters: {'num_leaves': 38, 'learning_rate': 0.013686224481584072, 'feature_fraction': 0.36319167786280315, 'bagging_fraction': 0.8392393197480867, 'bagging_freq': 8, 'max_depth': 10, 'min_child_samples': 16, 'min_sum_hessian_in_leaf': 3.7901279371245193, 'is_unbalance': True, 'random_state': 42}. Best is trial 11 with value: 0.898475310500072.


ROC AUC: 0.8982685285764765; Accuracy: 0.9795240730492529; Precision: 0.6521739130434783; Recall: 0.6880733944954128; F1 Score: 0.6696428571428572; PR AUC: 0.6973585629787779; Log Loss: 0.083554090365897


[I 2023-09-09 21:10:22,230] Trial 21 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:22,711] Trial 22 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:23,194] Trial 23 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:23,699] Trial 24 pruned. Trial was pruned at iteration 0.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:25,185] Trial 25 finished with value: 0.9016267717153739 and parameters: {'num_leaves': 233, 'learning_rate': 0.0205088875437792, 'feature_fraction': 0.38260944326557517, 'bagging_fraction': 0.9166780816990325, 'bagging_freq': 4, 'max_depth': 11, 'min_child_samples': 10, 'min_sum_hessian_in_leaf': 5.543423711311589, 'is_unbalance': True, 'random_state': 42}. Best is trial 25 with value: 0.9016267717153739.


ROC AUC: 0.9016267717153739; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.4008154045989917; Log Loss: 0.11503275484933044


[I 2023-09-09 21:10:27,184] Trial 26 finished with value: 0.8896831525082124 and parameters: {'num_leaves': 234, 'learning_rate': 0.023001552696032036, 'feature_fraction': 0.49986141546692436, 'bagging_fraction': 0.9129768129179157, 'bagging_freq': 3, 'max_depth': 9, 'min_child_samples': 8, 'min_sum_hessian_in_leaf': 5.354098296814049, 'is_unbalance': True, 'random_state': 42}. Best is trial 25 with value: 0.9016267717153739.


ROC AUC: 0.8896831525082124; Accuracy: 0.9659656889872718; Precision: 0.4583333333333333; Recall: 0.7064220183486238; F1 Score: 0.555956678700361; PR AUC: 0.5665745004661124; Log Loss: 0.1148293516304991


[I 2023-09-09 21:10:27,712] Trial 27 pruned. Trial was pruned at iteration 7.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:28,683] Trial 28 finished with value: 0.8969126673559397 and parameters: {'num_leaves': 240, 'learning_rate': 0.03393164815199157, 'feature_fraction': 0.38498888992899377, 'bagging_fraction': 0.8087511229494359, 'bagging_freq': 4, 'max_depth': 11, 'min_child_samples': 40, 'min_sum_hessian_in_leaf': 4.61860103840676, 'is_unbalance': True, 'random_state': 42}. Best is trial 25 with value: 0.9016267717153739.


ROC AUC: 0.8969126673559397; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.266746727227693; Log Loss: 0.11304562458963566


[I 2023-09-09 21:10:29,168] Trial 29 pruned. Trial was pruned at iteration 1.
[I 2023-09-09 21:10:29,651] Trial 30 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:30,137] Trial 31 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:30,630] Trial 32 pruned. Trial was pruned at iteration 0.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:31,671] Trial 33 finished with value: 0.891323011687105 and parameters: {'num_leaves': 201, 'learning_rate': 0.00819122135458437, 'feature_fraction': 0.38365756336359785, 'bagging_fraction': 0.9277188334304183, 'bagging_freq': 6, 'max_depth': 9, 'min_child_samples': 17, 'min_sum_hessian_in_leaf': 4.766172850998122, 'is_unbalance': True, 'random_state': 42}. Best is trial 25 with value: 0.9016267717153739.


ROC AUC: 0.891323011687105; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.3671853095226744; Log Loss: 0.11853430866483418


[I 2023-09-09 21:10:32,555] Trial 34 finished with value: 0.8929222997290893 and parameters: {'num_leaves': 137, 'learning_rate': 0.02207316127598945, 'feature_fraction': 0.1359431101633581, 'bagging_fraction': 0.6938362156559582, 'bagging_freq': 5, 'max_depth': 10, 'min_child_samples': 49, 'min_sum_hessian_in_leaf': 5.675777164233615, 'is_unbalance': True, 'random_state': 42}. Best is trial 25 with value: 0.9016267717153739.


ROC AUC: 0.8929222997290893; Accuracy: 0.9703929164360819; Precision: 0.5098039215686274; Recall: 0.47706422018348627; F1 Score: 0.4928909952606635; PR AUC: 0.4403893527417063; Log Loss: 0.14055531503399876


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:33,718] Trial 35 finished with value: 0.8891033778743342 and parameters: {'num_leaves': 180, 'learning_rate': 0.009671921427060158, 'feature_fraction': 0.4642451253061314, 'bagging_fraction': 0.8544857831683672, 'bagging_freq': 8, 'max_depth': 12, 'min_child_samples': 24, 'min_sum_hessian_in_leaf': 3.8890871567684977, 'is_unbalance': True, 'random_state': 42}. Best is trial 25 with value: 0.9016267717153739.


ROC AUC: 0.8891033778743342; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.47444654546124476; Log Loss: 0.11556973968201203


[I 2023-09-09 21:10:34,200] Trial 36 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:34,698] Trial 37 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:35,217] Trial 38 pruned. Trial was pruned at iteration 0.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:36,296] Trial 39 finished with value: 0.8951982096349906 and parameters: {'num_leaves': 111, 'learning_rate': 0.009464216705827254, 'feature_fraction': 0.42722132850639255, 'bagging_fraction': 0.9033572752170425, 'bagging_freq': 9, 'max_depth': 11, 'min_child_samples': 54, 'min_sum_hessian_in_leaf': 4.158183632087535, 'is_unbalance': True, 'random_state': 42}. Best is trial 25 with value: 0.9016267717153739.


ROC AUC: 0.8951982096349906; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.4245013037524537; Log Loss: 0.11251973028520314


[I 2023-09-09 21:10:36,799] Trial 40 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:38,040] Trial 41 finished with value: 0.8944993390830924 and parameters: {'num_leaves': 244, 'learning_rate': 0.0394852695333054, 'feature_fraction': 0.38237946386730576, 'bagging_fraction': 0.8220766921797481, 'bagging_freq': 3, 'max_depth': 11, 'min_child_samples': 42, 'min_sum_hessian_in_leaf': 4.4477064505536745, 'is_unbalance': True, 'random_state': 42}. Best is trial 25 with value: 0.9016267717153739.


ROC AUC: 0.8944993390830924; Accuracy: 0.9609850581073602; Precision: 0.41397849462365593; Recall: 0.7064220183486238; F1 Score: 0.5220338983050847; PR AUC: 0.4956929247704636; Log Loss: 0.12036626221157119


[I 2023-09-09 21:10:39,458] Trial 42 finished with value: 0.895878757737963 and parameters: {'num_leaves': 255, 'learning_rate': 0.034656172892978405, 'feature_fraction': 0.3760535620959377, 'bagging_fraction': 0.875804306221381, 'bagging_freq': 4, 'max_depth': 11, 'min_child_samples': 38, 'min_sum_hessian_in_leaf': 5.034223647685395, 'is_unbalance': True, 'random_state': 42}. Best is trial 25 with value: 0.9016267717153739.


ROC AUC: 0.895878757737963; Accuracy: 0.9670724958494743; Precision: 0.46987951807228917; Recall: 0.7155963302752294; F1 Score: 0.5672727272727274; PR AUC: 0.5814298690848548; Log Loss: 0.10694630110502999


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:40,725] Trial 43 finished with value: 0.8962151055503933 and parameters: {'num_leaves': 170, 'learning_rate': 0.03081875499446824, 'feature_fraction': 0.42737452452791735, 'bagging_fraction': 0.9567552945831027, 'bagging_freq': 4, 'max_depth': 12, 'min_child_samples': 14, 'min_sum_hessian_in_leaf': 3.452273203508689, 'is_unbalance': True, 'random_state': 42}. Best is trial 25 with value: 0.9016267717153739.


ROC AUC: 0.8962151055503933; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.38753209045462994; Log Loss: 0.1084957784581006


[I 2023-09-09 21:10:41,217] Trial 44 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:41,714] Trial 45 pruned. Trial was pruned at iteration 1.
[I 2023-09-09 21:10:42,214] Trial 46 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:42,708] Trial 47 pruned. Trial was pruned at iteration 0.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:43,629] Trial 48 finished with value: 0.9060228507113037 and parameters: {'num_leaves': 78, 'learning_rate': 0.017828664475184237, 'feature_fraction': 0.4521387703062011, 'bagging_fraction': 0.8355327814866393, 'bagging_freq': 2, 'max_depth': 10, 'min_child_samples': 38, 'min_sum_hessian_in_leaf': 5.169824939514641, 'is_unbalance': True, 'random_state': 42}. Best is trial 48 with value: 0.9060228507113037.


ROC AUC: 0.9060228507113037; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.31473990779120864; Log Loss: 0.12057579098665268


[I 2023-09-09 21:10:44,121] Trial 49 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:44,619] Trial 50 pruned. Trial was pruned at iteration 1.
[I 2023-09-09 21:10:45,140] Trial 51 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:46,467] Trial 52 finished with value: 0.8992762632674162 and parameters: {'num_leaves': 102, 'learning_rate': 0.030102390627177247, 'feature_fraction': 0.4591150994108716, 'bagging_fraction': 0.9416027609866848, 'bagging_freq': 2, 'max_depth': 10, 'min_child_samples': 53, 'min_sum_hessian_in_leaf': 4.295963961246065, 'is_unbalance': True, 'random_state': 42}. Best is trial 48 with value: 0.9060228507113037.


ROC AUC: 0.8992762632674162; Accuracy: 0.9620918649695628; Precision: 0.4263157894736842; Recall: 0.7431192660550459; F1 Score: 0.5418060200668896; PR AUC: 0.584198393829086; Log Loss: 0.11698556100160927


[I 2023-09-09 21:10:46,980] Trial 53 pruned. Trial was pruned at iteration 4.
[I 2023-09-09 21:10:47,479] Trial 54 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:47,983] Trial 55 pruned. Trial was pruned at iteration 2.
[I 2023-09-09 21:10:48,499] Trial 56 pruned. Trial was pruned at iteration 4.
[I 2023-09-09 21:10:49,005] Trial 57 pruned. Trial was pruned at iteration 1.
[I 2023-09-09 21:10:49,506] Trial 58 pruned. Trial was pruned at iteration 1.
[I 2023-09-09 21:10:50,006] Trial 59 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:50,508] Trial 60 pruned. Trial was pruned at iteration 0.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:10:51,548] Trial 61 finished with value: 0.8925780994385479 and parameters: {'num_leaves': 110, 'learning_rate': 0.025442806289176775, 'feature_fraction': 0.40206178153240757, 'bagging_fraction': 0.8972117906528683, 'bagging_freq': 4, 'max_depth': 11, 'min_child_samples': 39, 'min_sum_hessian_in_leaf': 

ROC AUC: 0.8925780994385479; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.33972593694229414; Log Loss: 0.11147215203463623


[I 2023-09-09 21:10:52,034] Trial 62 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:52,546] Trial 63 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:53,041] Trial 64 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:53,543] Trial 65 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:54,037] Trial 66 pruned. Trial was pruned at iteration 1.
[I 2023-09-09 21:10:54,555] Trial 67 pruned. Trial was pruned at iteration 4.
[I 2023-09-09 21:10:55,056] Trial 68 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:55,549] Trial 69 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:56,034] Trial 70 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:10:56,567] Trial 71 pruned. Trial was pruned at iteration 6.
[I 2023-09-09 21:11:00,284] Trial 72 finished with value: 0.9083013781099085 and parameters: {'num_leaves': 175, 'learning_rate': 0.038350742865861515, 'feature_fraction': 0.379488857027627, 'bagging_fraction': 0.9493003

ROC AUC: 0.9083013781099085; Accuracy: 0.9842280022136137; Precision: 0.7888888888888889; Recall: 0.6513761467889908; F1 Score: 0.7135678391959799; PR AUC: 0.7039960237813871; Log Loss: 0.09591265242650228


[I 2023-09-09 21:11:00,809] Trial 73 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:01,366] Trial 74 pruned. Trial was pruned at iteration 2.
[I 2023-09-09 21:11:01,887] Trial 75 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:02,428] Trial 76 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:04,134] Trial 77 finished with value: 0.894994045204099 and parameters: {'num_leaves': 52, 'learning_rate': 0.017583014995885764, 'feature_fraction': 0.3896853016846249, 'bagging_fraction': 0.9463472887912912, 'bagging_freq': 7, 'max_depth': 12, 'min_child_samples': 43, 'min_sum_hessian_in_leaf': 5.6743910008622525, 'is_unbalance': True, 'random_state': 42}. Best is trial 72 with value: 0.9083013781099085.


ROC AUC: 0.894994045204099; Accuracy: 0.9714997232982845; Precision: 0.5192307692307693; Recall: 0.7431192660550459; F1 Score: 0.6113207547169811; PR AUC: 0.6244597925269145; Log Loss: 0.09946150560939501


[I 2023-09-09 21:11:04,721] Trial 78 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:05,286] Trial 79 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:05,826] Trial 80 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:10,098] Trial 81 finished with value: 0.9099032836445969 and parameters: {'num_leaves': 176, 'learning_rate': 0.037676630634207287, 'feature_fraction': 0.4267394208152833, 'bagging_fraction': 0.9761636622221321, 'bagging_freq': 4, 'max_depth': 12, 'min_child_samples': 15, 'min_sum_hessian_in_leaf': 3.667235736293018, 'is_unbalance': True, 'random_state': 42}. Best is trial 81 with value: 0.9099032836445969.


ROC AUC: 0.9099032836445969; Accuracy: 0.9836745987825124; Precision: 0.7777777777777778; Recall: 0.6422018348623854; F1 Score: 0.7035175879396985; PR AUC: 0.7024399815096778; Log Loss: 0.10264596091936701


[I 2023-09-09 21:11:10,697] Trial 82 pruned. Trial was pruned at iteration 1.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:11:12,295] Trial 83 finished with value: 0.8993534793021765 and parameters: {'num_leaves': 176, 'learning_rate': 0.05676772305189751, 'feature_fraction': 0.38851588139485255, 'bagging_fraction': 0.9331892685300602, 'bagging_freq': 5, 'max_depth': 12, 'min_child_samples': 8, 'min_sum_hessian_in_leaf': 4.074533567496684, 'is_unbalance': True, 'random_state': 42}. Best is trial 81 with value: 0.9099032836445969.


ROC AUC: 0.8993534793021765; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.3732532632998716; Log Loss: 0.10467457304782517


[I 2023-09-09 21:11:12,914] Trial 84 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:13,523] Trial 85 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:14,119] Trial 86 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:14,729] Trial 87 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:15,336] Trial 88 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:15,925] Trial 89 pruned. Trial was pruned at iteration 1.
[I 2023-09-09 21:11:16,517] Trial 90 pruned. Trial was pruned at iteration 1.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:11:17,781] Trial 91 finished with value: 0.901416063552723 and parameters: {'num_leaves': 162, 'learning_rate': 0.03409791865722902, 'feature_fraction': 0.39313132558329983, 'bagging_fraction': 0.9060438886979448, 'bagging_freq': 4, 'max_depth': 11, 'min_child_samples': 36, 'min_sum_hessian_in_leaf': 4.616386520755907, 'is_unbalance': True, 'random_state': 42}. Best is trial 81 w

ROC AUC: 0.901416063552723; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.28733297056764595; Log Loss: 0.1113816545842916


[I 2023-09-09 21:11:18,379] Trial 92 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:18,896] Trial 93 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:19,432] Trial 94 pruned. Trial was pruned at iteration 0.
[I 2023-09-09 21:11:19,936] Trial 95 pruned. Trial was pruned at iteration 0.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:11:21,153] Trial 96 finished with value: 0.9019709720059156 and parameters: {'num_leaves': 161, 'learning_rate': 0.018612265050895072, 'feature_fraction': 0.37370675585114943, 'bagging_fraction': 0.9461762963155449, 'bagging_freq': 5, 'max_depth': 10, 'min_child_samples': 8, 'min_sum_hessian_in_leaf': 3.710591242936376, 'is_unbalance': True, 'random_state': 42}. Best is trial 81 with value: 0.9099032836445969.


ROC AUC: 0.9019709720059156; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.3942588098725532; Log Loss: 0.11255295617047217


[I 2023-09-09 21:11:22,115] Trial 97 pruned. Trial was pruned at iteration 80.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-09-09 21:11:23,314] Trial 98 finished with value: 0.8971901215825361 and parameters: {'num_leaves': 124, 'learning_rate': 0.023233657231561452, 'feature_fraction': 0.4339722885989097, 'bagging_fraction': 0.8626743522874187, 'bagging_freq': 5, 'max_depth': 10, 'min_child_samples': 9, 'min_sum_hessian_in_leaf': 3.7057090780398503, 'is_unbalance': True, 'random_state': 42}. Best is trial 81 with value: 0.9099032836445969.


ROC AUC: 0.8971901215825361; Accuracy: 0.9698395130049806; Precision: 0.0; Recall: 0.0; F1 Score: 0.0; PR AUC: 0.395287612481583; Log Loss: 0.10888544577016179


[I 2023-09-09 21:11:23,818] Trial 99 pruned. Trial was pruned at iteration 0.


end tuning...
best params: {'lgbm__num_leaves': 176, 'lgbm__learning_rate': 0.037676630634207287, 'lgbm__feature_fraction': 0.4267394208152833, 'lgbm__bagging_fraction': 0.9761636622221321, 'lgbm__bagging_freq': 4, 'lgbm__max_depth': 12, 'lgbm__min_child_samples': 15, 'lgbm__min_sum_hessian_in_leaf': 3.667235736293018, 'lgbm__is_unbalance': True, 'lgbm__random_state': 42}
best score: 0.9099032836445969
roc_auc on validate data = 0.719492468164745


# 4. Submission file

In [None]:
test = pd.read_csv(...)

In [None]:
test['score'] = model.predict_proba(test)

In [None]:
sample_submission = test.loc[:, ['report_date', 'client_id', 'score']]

In [None]:
sample_submission.to_csv('result_novichki.csv')