# Модель оценки кредитного риска - выход клиента в дефолт по кредиту

In [8]:
import numpy as np
import pandas as pd

import missingno as msno
import matplotlib.pyplot as plt
from matplotlib.pylab import rc, plot
import seaborn as sns
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_curve, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score, roc_curve, auc, precision_score, recall_score, f1_score, make_scorer, confusion_matrix

from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [9]:
# путь данным
path = 'train_data'

In [10]:
import os
import pandas as pd
import tqdm


def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0,
                                     num_parts_to_read: int = 2, columns=None, verbose=False) -> pd.DataFrame:
    res = []
    dataset_paths = sorted([os.path.join(path_to_dataset, filename) for filename in os.listdir(path_to_dataset)
                              if filename.startswith('train')])
    print(dataset_paths)

    start_from = max(0, start_from)
    chunks = dataset_paths[start_from: start_from + num_parts_to_read]
    if verbose:
        print('Reading chunks:\n')
        for chunk in chunks:
            print(chunk)
    for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):
        print('chunk_path', chunk_path)
        chunk = pd.read_parquet(chunk_path,columns=columns)
        res.append(chunk)

    return pd.concat(res).reset_index(drop=True)

In [11]:
def prepare_transactions_dataset(path_to_dataset: str, num_parts_to_preprocess_at_once: int = 2, num_parts_total: int=50,
                                 save_to_path=None, verbose: bool=False):
    preprocessed_frames = []

    for step in tqdm.tqdm_notebook(range(0, num_parts_total, num_parts_to_preprocess_at_once),
                                   desc="Transforming transactions data"):
        transactions_frame = read_parquet_dataset_from_local(path_to_dataset, step, num_parts_to_preprocess_at_once,
                                                             verbose=verbose)
        


   #здесь должен быть препроцессинг данных
       
        transactions_frame = ohe(transactions_frame)

   #записываем подготовленные данные в файл
        if save_to_path:
            block_as_str = str(step)
            if len(block_as_str) == 1:
                block_as_str = '00' + block_as_str
            else:
                block_as_str = '0' + block_as_str
            transactions_frame.to_parquet(os.path.join(save_to_path, f'processed_chunk_{block_as_str}.parquet'))

        preprocessed_frames.append(transactions_frame)
    return pd.concat(preprocessed_frames)

In [None]:
# Удаление столбцов, кодирование OHE, агрегирование

In [12]:
def ohe(df):
    df1 = df['id']
    df = df.drop(columns=['id','rn', 'pre_since_opened', 'pre_since_confirmed', 'pre_pterm',
       'pre_fterm', 'pre_till_pclose', 'pre_till_fclose'])
    ohe = OneHotEncoder(sparse_output=False, dtype=np.int8)
    df = ohe.fit_transform(df)
    df = pd.DataFrame(df, columns=ohe.get_feature_names_out())
    df = pd.concat([df1, df], axis=1)
    
    #агрегация
    df = agg_count(df)
    return df

    
    

In [None]:
#группировка по id

In [13]:
def agg_count(df):
    df = df.groupby("id").sum().reset_index()
    return df

In [None]:
#загрузка данных

In [74]:
data = prepare_transactions_dataset(path, num_parts_to_preprocess_at_once=2, num_parts_total=11,
                                    save_to_path='train_data/')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step in tqdm.tqdm_notebook(range(0, num_parts_total, num_parts_to_preprocess_at_once),


Transforming transactions data:   0%|          | 0/6 [00:00<?, ?it/s]

['train_data\\train_data_00.pq', 'train_data\\train_data_01.pq', 'train_data\\train_data_02.pq', 'train_data\\train_data_03.pq', 'train_data\\train_data_04.pq', 'train_data\\train_data_05.pq', 'train_data\\train_data_06.pq', 'train_data\\train_data_07.pq', 'train_data\\train_data_08.pq', 'train_data\\train_data_09.pq', 'train_data\\train_data_10.pq', 'train_data\\train_data_11.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

chunk_path train_data\train_data_00.pq
chunk_path train_data\train_data_01.pq
['train_data\\train_data_00.pq', 'train_data\\train_data_01.pq', 'train_data\\train_data_02.pq', 'train_data\\train_data_03.pq', 'train_data\\train_data_04.pq', 'train_data\\train_data_05.pq', 'train_data\\train_data_06.pq', 'train_data\\train_data_07.pq', 'train_data\\train_data_08.pq', 'train_data\\train_data_09.pq', 'train_data\\train_data_10.pq', 'train_data\\train_data_11.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

chunk_path train_data\train_data_02.pq
chunk_path train_data\train_data_03.pq
['train_data\\train_data_00.pq', 'train_data\\train_data_01.pq', 'train_data\\train_data_02.pq', 'train_data\\train_data_03.pq', 'train_data\\train_data_04.pq', 'train_data\\train_data_05.pq', 'train_data\\train_data_06.pq', 'train_data\\train_data_07.pq', 'train_data\\train_data_08.pq', 'train_data\\train_data_09.pq', 'train_data\\train_data_10.pq', 'train_data\\train_data_11.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

chunk_path train_data\train_data_04.pq
chunk_path train_data\train_data_05.pq
['train_data\\train_data_00.pq', 'train_data\\train_data_01.pq', 'train_data\\train_data_02.pq', 'train_data\\train_data_03.pq', 'train_data\\train_data_04.pq', 'train_data\\train_data_05.pq', 'train_data\\train_data_06.pq', 'train_data\\train_data_07.pq', 'train_data\\train_data_08.pq', 'train_data\\train_data_09.pq', 'train_data\\train_data_10.pq', 'train_data\\train_data_11.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

chunk_path train_data\train_data_06.pq
chunk_path train_data\train_data_07.pq
['train_data\\train_data_00.pq', 'train_data\\train_data_01.pq', 'train_data\\train_data_02.pq', 'train_data\\train_data_03.pq', 'train_data\\train_data_04.pq', 'train_data\\train_data_05.pq', 'train_data\\train_data_06.pq', 'train_data\\train_data_07.pq', 'train_data\\train_data_08.pq', 'train_data\\train_data_09.pq', 'train_data\\train_data_10.pq', 'train_data\\train_data_11.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

chunk_path train_data\train_data_08.pq
chunk_path train_data\train_data_09.pq
['train_data\\train_data_00.pq', 'train_data\\train_data_01.pq', 'train_data\\train_data_02.pq', 'train_data\\train_data_03.pq', 'train_data\\train_data_04.pq', 'train_data\\train_data_05.pq', 'train_data\\train_data_06.pq', 'train_data\\train_data_07.pq', 'train_data\\train_data_08.pq', 'train_data\\train_data_09.pq', 'train_data\\train_data_10.pq', 'train_data\\train_data_11.pq']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

chunk_path train_data\train_data_10.pq
chunk_path train_data\train_data_11.pq


In [75]:
data

Unnamed: 0,id,pre_loans_credit_limit_0,pre_loans_credit_limit_1,pre_loans_credit_limit_2,pre_loans_credit_limit_3,pre_loans_credit_limit_4,pre_loans_credit_limit_5,pre_loans_credit_limit_6,pre_loans_credit_limit_7,pre_loans_credit_limit_8,...,pre_loans3060_3,pre_loans90_3,pre_loans5_10,pre_loans530_5,pre_loans530_8,pre_loans530_9,pre_loans530_17,pre_loans3060_4,pre_loans3060_6,pre_loans6090_0
0,0,1,0,1,0,1,0,0,0,0,...,,,,,,,,,,
1,1,1,2,0,1,0,0,4,0,0,...,,,,,,,,,,
2,2,0,2,0,1,0,0,0,0,0,...,,,,,,,,,,
3,3,0,2,0,1,1,1,0,0,0,...,,,,,,,,,,
4,4,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,2999995,2,0,0,1,0,0,1,1,1,...,,0.0,,0.0,0.0,0.0,,,0.0,
499996,2999996,0,1,0,1,0,1,0,0,2,...,,0.0,,0.0,0.0,0.0,,,0.0,
499997,2999997,1,1,0,1,0,0,1,1,2,...,,0.0,,0.0,0.0,0.0,,,0.0,
499998,2999998,0,0,0,0,1,0,0,1,0,...,,0.0,,0.0,0.0,0.0,,,0.0,


In [84]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000000 entries, 0 to 2999999
Columns: 314 entries, pre_loans_credit_limit_0 to flag
dtypes: float64(19), int64(1), int8(294)
memory usage: 1.3 GB


In [None]:
# столбец id делаем индексом, заполнение NAN

In [77]:
data.set_index('id', inplace=True)

In [78]:
data.fillna(0, inplace=True)

In [None]:
#загрузка целевой переменной

In [82]:
targets = pd.read_csv('train_target.csv')
targets.head(10)

Unnamed: 0,id,flag
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0


In [None]:
# объединение датафрейма с целевой переменной

In [83]:
data['flag'] = targets['flag']
data

Unnamed: 0_level_0,pre_loans_credit_limit_0,pre_loans_credit_limit_1,pre_loans_credit_limit_2,pre_loans_credit_limit_3,pre_loans_credit_limit_4,pre_loans_credit_limit_5,pre_loans_credit_limit_6,pre_loans_credit_limit_7,pre_loans_credit_limit_8,pre_loans_credit_limit_9,...,pre_loans90_3,pre_loans5_10,pre_loans530_5,pre_loans530_8,pre_loans530_9,pre_loans530_17,pre_loans3060_4,pre_loans3060_6,pre_loans6090_0,flag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,1,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1,2,0,1,0,0,4,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0,2,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0,2,0,1,1,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,2,0,0,1,0,0,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2999996,0,1,0,1,0,1,0,0,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2999997,1,1,0,1,0,0,1,1,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2999998,0,0,0,0,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
# поиск дубликатов

In [15]:
data[data.duplicated()]

Unnamed: 0_level_0,pre_loans_credit_limit_0,pre_loans_credit_limit_1,pre_loans_credit_limit_2,pre_loans_credit_limit_3,pre_loans_credit_limit_4,pre_loans_credit_limit_5,pre_loans_credit_limit_6,pre_loans_credit_limit_7,pre_loans_credit_limit_8,pre_loans_credit_limit_9,...,pre_loans3060_3,pre_loans90_3,pre_loans5_10,pre_loans530_5,pre_loans530_8,pre_loans530_9,pre_loans530_17,pre_loans3060_4,pre_loans3060_6,pre_loans6090_0
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
488,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2677,0,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4042,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4058,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999865,0,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2999887,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2999902,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2999923,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
data[data.duplicated()]

Unnamed: 0_level_0,pre_loans_credit_limit_0,pre_loans_credit_limit_1,pre_loans_credit_limit_2,pre_loans_credit_limit_3,pre_loans_credit_limit_4,pre_loans_credit_limit_5,pre_loans_credit_limit_6,pre_loans_credit_limit_7,pre_loans_credit_limit_8,pre_loans_credit_limit_9,...,pre_loans90_3,pre_loans5_10,pre_loans530_5,pre_loans530_8,pre_loans530_9,pre_loans530_17,pre_loans3060_4,pre_loans3060_6,pre_loans6090_0,flag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
488,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2677,0,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3949,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4042,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4058,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999865,0,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2999887,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2999902,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2999923,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
#удаляю дубли 

In [85]:
data.drop_duplicates(inplace=True)

In [86]:
data.shape

(2901143, 314)

In [None]:
#список признаков без целевой переменной 

In [87]:
col1= data.columns.to_list()

In [88]:
col1.pop(-1)

'flag'

In [27]:
# полностью удаляю строки которые по признакам дубликаты, а по целевой переменной отличаются   

In [89]:
data.drop_duplicates(subset=col1, inplace=True, keep=False)

In [90]:
data.shape

(2892227, 314)

In [91]:
data.to_pickle('train_data/train_data.pkl')

In [None]:
#определяю х и у

In [30]:
x = data.drop(columns=['flag']).astype('int8')

In [32]:
y = data['flag']

In [None]:
#разделение  датасета на треин тест, со стратификацией

In [64]:
x_train, x_test, y_train, y_test = train_test_split(x, data['flag'], stratify=data['flag'], test_size=0.2, random_state=42)

In [None]:
# пробую различные модели

In [40]:
models_result_test_data = dict() 
models = [
    #MLPClassifier(hidden_layer_sizes=(5), random_state=42, max_iter=500),
    #SVC(C=7.0, probability=True),
    GradientBoostingClassifier(),
    CatBoostClassifier(iterations=200,learning_rate=0.15, auto_class_weights='Balanced' )
]

for m in models:
    dict_temp = []
    model = m
    m.fit(x_train, y_train)
    predict_train = m.predict(x_train)
    predict_test = m.predict(x_test)
    acc_tr = accuracy_score(y_train, predict_train)
    dict_temp.append(accuracy_score(y_test, predict_test))
    
    prob = m.predict_proba(x_test)
    prob = prob[:, 1]
    dict_temp.append(roc_auc_score(y_test, prob))
    
    dict_temp.append(precision_score(y_test, predict_test))
    dict_temp.append(recall_score(y_test, predict_test))
    dict_temp.append(f1_score(y_test, predict_test))
    models_result_test_data[type(model).__name__] = dict_temp
    

### 'GradientBoostingClassifier'

In [41]:
models_result_test_data

{'GradientBoostingClassifier': [0.9654332470100926,
  0.7435538651553806,
  0.5,
  0.0007001750437609402,
  0.0013983918493732208]}

In [42]:
confusion_matrix(y_test, predict_test)

array([[558437,     14],
       [ 19981,     14]], dtype=int64)

### CatBoostClassifier

In [38]:
models_result_test_data

{'CatBoostClassifier': [0.6794255643569149,
  0.7543577992820171,
  0.07223675909358872,
  0.698624656164041,
  0.13093502926799547]}

### MLPClassifier

In [47]:
models_result_test_data

{'MLPClassifier': [0.9653623674465723,
  0.7410426622997379,
  0.30097087378640774,
  0.0015503875968992248,
  0.0030848840680664743]}

In [39]:
confusion_matrix(y_test, predict_test)

array([[379042, 179409],
       [  6026,  13969]], dtype=int64)

### LGBMClassifier

In [None]:
# обучение модели LGBMClassifier

In [43]:
import lightgbm as lgb


In [66]:
# Создание и обучение модели
model = lgb.LGBMClassifier(class_weight='balanced')
model.fit(x_train, y_train)

# Предсказание на тестовом наборе данных
y_pred_lgb = model.predict(x_test)

[LightGBM] [Info] Number of positive: 79865, number of negative: 2233916
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.900756 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4448
[LightGBM] [Info] Number of data points in the train set: 2313781, number of used features: 301
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [67]:
prob_lgb = model.predict_proba(x_test)
prob_lgb = prob_lgb[:, 1]

In [68]:
roc_auc_score(y_test, prob_lgb)

0.7536507064068076

In [69]:
prob_lgb

array([0.46087936, 0.74749905, 0.06057924, ..., 0.7009286 , 0.3628281 ,
       0.13457457])

In [70]:
precision_score(y_test, y_pred_lgb)

0.07033802121885023

In [71]:
(recall_score(y_test, y_pred_lgb))

0.7139136532104577

In [72]:
(f1_score(y_test, y_pred_lgb))

0.12805907931146007

In [73]:
confusion_matrix(y_test, y_pred_lgb)

array([[370084, 188396],
       [  5712,  14254]], dtype=int64)

### roc_auc_score на тесте 0.7536
### recall_score 0,71
### При использовании данной модели классификации, наименьшее значение ошибки первого рода, что важно для нашей задачи выявления дефолта.

In [None]:
# попытка подбора лучших гиперпараметров по 'roc_auc'

In [52]:
from sklearn.model_selection import GridSearchCV

In [53]:
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 500, 1000],
    'num_leaves': [31, 63, 127]
}


In [54]:
model_gs = GridSearchCV(lgb.LGBMClassifier(class_weight='balanced'), param_grid, scoring='roc_auc', n_jobs=-1)

In [55]:
model_gs.fit(x_train, y_train)

1 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Zaraz\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Zaraz\anaconda3\Lib\site-packages\lightgbm\sklearn.py", line 1142, in fit
    super().fit(
  File "C:\Users\Zaraz\anaconda3\Lib\site-packages\lightgbm\sklearn.py", line 842, in fit
    self._Booster = train(
                    ^^^^^^
  File "C:\Users\Zaraz\anaconda3\Lib\site-packages\lightgbm\engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
              ^^^

[LightGBM] [Info] Number of positive: 79836, number of negative: 2233945
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.856216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4453
[LightGBM] [Info] Number of data points in the train set: 2313781, number of used features: 300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [57]:
pred_gs = model_gs.predict(x_test)

In [58]:
prob_lgb_gs = model_gs.predict_proba(x_test)
prob_lgb_gs = prob_lgb_gs[:, 1]

In [59]:
roc_auc_score(y_test, prob_lgb_gs)

0.7553911183458493

In [60]:
precision_score(y_test, pred_gs)

0.07254881486895688

In [61]:
(recall_score(y_test, pred_gs))

0.6949737434358589

In [62]:
(f1_score(y_test, pred_gs))

0.1313825135320396

In [63]:
confusion_matrix(y_test, pred_gs)

array([[380807, 177644],
       [  6099,  13896]], dtype=int64)

In [None]:
# на тесте происходит ухудшение метрик 

# Модель LGBMClassifier выбрана как наилучшая для дальнейшего внедрения в пайплайн 