# Заготовка

# ДЗ 6 Гладышев В.В.

Использую датасет Kaggle соревнования: https://www.kaggle.com/c/geekbrains-competitive-data-analysis/overview 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from scipy.stats import ttest_rel

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score, cross_validate

from typing import List, Optional, Dict, Tuple

## Загрузка данных

In [2]:
src_list = ["train", "test", "client_profile"]

In [3]:
data_dir = './data/'

In [4]:
source = {}
for name in src_list:
    source[name] = pd.read_csv(data_dir + name + ".csv")

## Ключевые характеристики

In [5]:
for name in src_list:
    print(f"{name} \n {source[name].shape} \n")

train 
 (110093, 3) 

test 
 (165141, 2) 

client_profile 
 (250000, 24) 



In [6]:
source['train'].head(3)

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE
0,123687442,0,Cash
1,123597908,1,Cash
2,123526683,0,Cash


In [7]:
source['client_profile'].head(3)

Unnamed: 0,APPLICATION_NUMBER,GENDER,CHILDRENS,TOTAL_SALARY,AMOUNT_CREDIT,AMOUNT_ANNUITY,EDUCATION_LEVEL,FAMILY_STATUS,REGION_POPULATION,AGE,...,FAMILY_SIZE,EXTERNAL_SCORING_RATING_1,EXTERNAL_SCORING_RATING_2,EXTERNAL_SCORING_RATING_3,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,...,2.0,0.329471,0.236315,0.678568,0.0,0.0,0.0,0.0,1.0,2.0
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,...,2.0,,0.442295,0.802745,0.0,0.0,0.0,0.0,1.0,1.0
2,123501780,M,1,427500.0,239850.0,23850.0,Incomplete higher,Married,0.072508,14387,...,3.0,0.409017,0.738159,,,,,,,


In [8]:
source['test'].head(3)

Unnamed: 0,APPLICATION_NUMBER,NAME_CONTRACT_TYPE
0,123724268,Cash
1,123456549,Cash
2,123428178,Credit Card


In [9]:
for name in src_list:
    print(f"{name} \n")
    source[name].info()
    print("\n")

train 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110093 entries, 0 to 110092
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   APPLICATION_NUMBER  110093 non-null  int64 
 1   TARGET              110093 non-null  int64 
 2   NAME_CONTRACT_TYPE  110093 non-null  object
dtypes: int64(2), object(1)
memory usage: 2.5+ MB


test 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165141 entries, 0 to 165140
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   APPLICATION_NUMBER  165141 non-null  int64 
 1   NAME_CONTRACT_TYPE  165141 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.5+ MB


client_profile 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 24 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      -

## Формируем "Большую таблицу"

### Подшиваем test к train

In [10]:
test = source['test']

In [11]:
test["TARGET"] = math.nan

In [12]:
test.head

<bound method NDFrame.head of         APPLICATION_NUMBER NAME_CONTRACT_TYPE  TARGET
0                123724268               Cash     NaN
1                123456549               Cash     NaN
2                123428178        Credit Card     NaN
3                123619984               Cash     NaN
4                123671104               Cash     NaN
...                    ...                ...     ...
165136           123487967               Cash     NaN
165137           123536402               Cash     NaN
165138           123718238               Cash     NaN
165139           123631557               Cash     NaN
165140           123433260               Cash     NaN

[165141 rows x 3 columns]>

In [13]:
data = pd.concat([source['train'], test], axis=0)
data.tail()

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE
165136,123487967,,Cash
165137,123536402,,Cash
165138,123718238,,Cash
165139,123631557,,Cash
165140,123433260,,Cash


In [14]:
data.shape

(275234, 3)

### Добавляем client_profile

In [15]:
def create_client_profile_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Создание признаков на основе профиля клиентов.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков с исходным профилем клиента.

    copy: bool, optional, default = True
        Флаг использования копии датафрейма X.
        Опциональный параметр, по умолчанию, равен True.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Расширенная матрица признаков с профилем клиентов.

    """
    if copy:
        X = X.copy()

    X["DAYS_ON_LAST_JOB"] = X["DAYS_ON_LAST_JOB"].replace(365243, np.nan)
    bki_flags = [flag for flag in X.columns if "AMT_REQ_CREDIT_BUREAU" in flag]
    X["BKI_REQUESTS_COUNT"] = X[bki_flags].sum(axis=1)
    X["BKI_KURTOSIS"] = X[bki_flags].kurtosis(axis=1)

    X["EXTERNAL_SCORING_PROD"] = X["EXTERNAL_SCORING_RATING_1"] * X["EXTERNAL_SCORING_RATING_2"] * X["EXTERNAL_SCORING_RATING_3"]
    X["EXTERNAL_SCORING_WEIGHTED"] = X.EXTERNAL_SCORING_RATING_1 * 2 + X.EXTERNAL_SCORING_RATING_2 * 1 + X.EXTERNAL_SCORING_RATING_3 * 3

    for function_name in ["min", "max", "mean", "nanmedian", "var"]:
        feature_name = "EXTERNAL_SCORING_RATING_{}".format(function_name)
        X[feature_name] = eval("np.{}".format(function_name))(
            X[["EXTERNAL_SCORING_RATING_1", "EXTERNAL_SCORING_RATING_2", "EXTERNAL_SCORING_RATING_3"]], axis=1
        )

    # Отношение между основными фин. показателями
    X['RATIO_CREDIT_TO_ANNUITY'] = X['AMOUNT_CREDIT'] / X['AMOUNT_ANNUITY']
    X["RATIO_ANNUITY_TO_SALARY"] = X['AMOUNT_ANNUITY'] / X['TOTAL_SALARY']
    X['RATIO_CREDIT_TO_SALARY'] = X['AMOUNT_CREDIT'] / X['TOTAL_SALARY']
    #X["TOTAL_SALARY_NET"] = X["TOTAL_SALARY"] - X["AMOUNT_ANNUITY"]

    # Отношение фин. показателей к возрасту и временным фичам
    X["RATIO_ANNUITY_TO_AGE"] = X["AMOUNT_ANNUITY"] / X["AGE"]
    X["RATIO_CREDIT_TO_AGE"] = X["AMOUNT_CREDIT"] / X["AGE"]
    X["RATIO_SALARY_TO_AGE"] = X["TOTAL_SALARY"] / X["AGE"]
    X["RATIO_SALARY_TO_EXPERIENCE"] = X["TOTAL_SALARY"] / X["DAYS_ON_LAST_JOB"]
    X["RATIO_CREDIT_TO_EXPERIENCE"] = X["AMOUNT_CREDIT"] / X["DAYS_ON_LAST_JOB"]
    X["RATIO_ANNUITY_TO_EXPERIENCE"] = X["AMOUNT_ANNUITY"] / X["DAYS_ON_LAST_JOB"]

    # Отношение врменных признаков
    X["RATIO_AGE_TO_EXPERIENCE"] = X["AGE"] / X["DAYS_ON_LAST_JOB"]
    X["RATIO_SALARY_TO_REGION_POPULATION"] = X["TOTAL_SALARY"] * X["REGION_POPULATION"]
    X["RATIO_CAR_TO_EXPERIENCE"] = X["OWN_CAR_AGE"] / X["DAYS_ON_LAST_JOB"]
    X["RATIO_CAR_TO_AGE"] = X["OWN_CAR_AGE"] / X["AGE"]

    # Произведение фин. показателей кредита на вероятность дефолта
    # Такая штука называется математическим ожиданием дефолта или ожидаемыми потерями
    X["EXPECTED_TOTAL_LOSS_1"] = X["EXTERNAL_SCORING_RATING_1"] * X["AMOUNT_CREDIT"]
    X["EXPECTED_TOTAL_LOSS_2"] = X["EXTERNAL_SCORING_RATING_2"] * X["AMOUNT_CREDIT"]
    X["EXPECTED_TOTAL_LOSS_3"] = X["EXTERNAL_SCORING_RATING_3"] * X["AMOUNT_CREDIT"]
    X["EXPECTED_MONTHLY_LOSS_1"] = X["EXTERNAL_SCORING_RATING_1"] * X["AMOUNT_ANNUITY"]
    X["EXPECTED_MONTHLY_LOSS_2"] = X["EXTERNAL_SCORING_RATING_2"] * X["AMOUNT_ANNUITY"]
    X["EXPECTED_MONTHLY_LOSS_3"] = X["EXTERNAL_SCORING_RATING_3"] * X["AMOUNT_ANNUITY"]

    return X

In [16]:
client_profile = create_client_profile_features(source["client_profile"])
client_profile.head(n=2)

  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,


Unnamed: 0,APPLICATION_NUMBER,GENDER,CHILDRENS,TOTAL_SALARY,AMOUNT_CREDIT,AMOUNT_ANNUITY,EDUCATION_LEVEL,FAMILY_STATUS,REGION_POPULATION,AGE,...,RATIO_AGE_TO_EXPERIENCE,RATIO_SALARY_TO_REGION_POPULATION,RATIO_CAR_TO_EXPERIENCE,RATIO_CAR_TO_AGE,EXPECTED_TOTAL_LOSS_1,EXPECTED_TOTAL_LOSS_2,EXPECTED_TOTAL_LOSS_3,EXPECTED_MONTHLY_LOSS_1,EXPECTED_MONTHLY_LOSS_2,EXPECTED_MONTHLY_LOSS_3
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,...,5.526146,1270.71,,,88957.124333,63804.96656,183213.275945,4447.856217,3190.248328,9160.663797
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,...,,5466.42,,,,237475.743779,431008.094056,,12590.802122,22851.755462


In [17]:
data = pd.merge(data, client_profile, on='APPLICATION_NUMBER', how='left')
data.head()

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE,GENDER,CHILDRENS,TOTAL_SALARY,AMOUNT_CREDIT,AMOUNT_ANNUITY,EDUCATION_LEVEL,FAMILY_STATUS,...,RATIO_AGE_TO_EXPERIENCE,RATIO_SALARY_TO_REGION_POPULATION,RATIO_CAR_TO_EXPERIENCE,RATIO_CAR_TO_AGE,EXPECTED_TOTAL_LOSS_1,EXPECTED_TOTAL_LOSS_2,EXPECTED_TOTAL_LOSS_3,EXPECTED_MONTHLY_LOSS_1,EXPECTED_MONTHLY_LOSS_2,EXPECTED_MONTHLY_LOSS_3
0,123687442,0.0,Cash,M,1.0,157500.0,855000.0,25128.0,Secondary / secondary special,Married,...,9.149506,3008.4075,0.006399,0.000699,599170.547652,552256.266546,612667.559305,17609.307043,16230.521013,18005.977111
1,123597908,1.0,Cash,,,,,,,,...,,,,,,,,,,
2,123526683,0.0,Cash,F,0.0,135000.0,1006920.0,42660.0,Higher education,Married,...,5.958264,3562.92,,,,686869.876357,269722.58888,,29100.49351,11427.288803
3,123710391,1.0,Cash,M,0.0,180000.0,518562.0,22972.5,Secondary / secondary special,Married,...,,5639.22,,,,88829.188848,88386.882459,,3935.167908,3915.573562
4,123590329,1.0,Cash,,,,,,,,...,,,,,,,,,,


In [18]:
data.tail()

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE,GENDER,CHILDRENS,TOTAL_SALARY,AMOUNT_CREDIT,AMOUNT_ANNUITY,EDUCATION_LEVEL,FAMILY_STATUS,...,RATIO_AGE_TO_EXPERIENCE,RATIO_SALARY_TO_REGION_POPULATION,RATIO_CAR_TO_EXPERIENCE,RATIO_CAR_TO_AGE,EXPECTED_TOTAL_LOSS_1,EXPECTED_TOTAL_LOSS_2,EXPECTED_TOTAL_LOSS_3,EXPECTED_MONTHLY_LOSS_1,EXPECTED_MONTHLY_LOSS_2,EXPECTED_MONTHLY_LOSS_3
275229,123487967,,Cash,,,,,,,,...,,,,,,,,,,
275230,123536402,,Cash,M,0.0,135000.0,450000.0,16807.5,Secondary / secondary special,Single / not married,...,1.789767,3869.505,0.002212,0.001236,,299686.84168,240874.312713,,11193.303537,8996.65558
275231,123718238,,Cash,,,,,,,,...,,,,,,,,,,
275232,123631557,,Cash,F,0.0,112500.0,350181.0,36769.5,Secondary / secondary special,Married,...,,3459.9375,,,,266619.257702,,,27995.39894,
275233,123433260,,Cash,M,0.0,247500.0,1241437.5,36427.5,Secondary / secondary special,Married,...,2.8814,4462.1775,0.002293,0.000796,,612794.525674,677853.672632,,17981.229489,19890.260009


In [19]:
data.shape

(275234, 54)

### Заполняем пропуски

In [20]:
obj_col_names = data.select_dtypes(include=['object']).columns
obj_col_names

Index(['NAME_CONTRACT_TYPE', 'GENDER', 'EDUCATION_LEVEL', 'FAMILY_STATUS'], dtype='object')

In [21]:
data.loc[data[obj_col_names[1]].isnull(), obj_col_names[1]]

1         NaN
4         NaN
10        NaN
16        NaN
17        NaN
         ... 
275225    NaN
275226    NaN
275228    NaN
275229    NaN
275231    NaN
Name: GENDER, Length: 51519, dtype: object

In [22]:
for col in obj_col_names:
    data.loc[data[col].isnull(), col] = 'No'

In [23]:
num_col_names = data.select_dtypes(exclude=['object']).columns
num_col_names

Index(['APPLICATION_NUMBER', 'TARGET', 'CHILDRENS', 'TOTAL_SALARY',
       'AMOUNT_CREDIT', 'AMOUNT_ANNUITY', 'REGION_POPULATION', 'AGE',
       'DAYS_ON_LAST_JOB', 'OWN_CAR_AGE', 'FLAG_PHONE', 'FLAG_EMAIL',
       'FAMILY_SIZE', 'EXTERNAL_SCORING_RATING_1', 'EXTERNAL_SCORING_RATING_2',
       'EXTERNAL_SCORING_RATING_3', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'BKI_REQUESTS_COUNT', 'BKI_KURTOSIS',
       'EXTERNAL_SCORING_PROD', 'EXTERNAL_SCORING_WEIGHTED',
       'EXTERNAL_SCORING_RATING_min', 'EXTERNAL_SCORING_RATING_max',
       'EXTERNAL_SCORING_RATING_mean', 'EXTERNAL_SCORING_RATING_nanmedian',
       'EXTERNAL_SCORING_RATING_var', 'RATIO_CREDIT_TO_ANNUITY',
       'RATIO_ANNUITY_TO_SALARY', 'RATIO_CREDIT_TO_SALARY',
       'RATIO_ANNUITY_TO_AGE', 'RATIO_CREDIT_TO_AGE', 'RATIO_SALARY_TO_AGE',
       'RATIO_SALARY_TO_EXPERIEN

In [24]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [25]:
for col in num_col_names:
    data.loc[data[col].isnull(), col] = -1

In [26]:
data.head()

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE,GENDER,CHILDRENS,TOTAL_SALARY,AMOUNT_CREDIT,AMOUNT_ANNUITY,EDUCATION_LEVEL,FAMILY_STATUS,...,RATIO_AGE_TO_EXPERIENCE,RATIO_SALARY_TO_REGION_POPULATION,RATIO_CAR_TO_EXPERIENCE,RATIO_CAR_TO_AGE,EXPECTED_TOTAL_LOSS_1,EXPECTED_TOTAL_LOSS_2,EXPECTED_TOTAL_LOSS_3,EXPECTED_MONTHLY_LOSS_1,EXPECTED_MONTHLY_LOSS_2,EXPECTED_MONTHLY_LOSS_3
0,123687442,0.0,Cash,M,1.0,157500.0,855000.0,25128.0,Secondary / secondary special,Married,...,9.149506,3008.4075,0.006399,0.000699,599170.547652,552256.266546,612667.559305,17609.307043,16230.521013,18005.977111
1,123597908,1.0,Cash,No,-1.0,-1.0,-1.0,-1.0,No,No,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,123526683,0.0,Cash,F,0.0,135000.0,1006920.0,42660.0,Higher education,Married,...,5.958264,3562.92,-1.0,-1.0,-1.0,686869.876357,269722.58888,-1.0,29100.49351,11427.288803
3,123710391,1.0,Cash,M,0.0,180000.0,518562.0,22972.5,Secondary / secondary special,Married,...,-1.0,5639.22,-1.0,-1.0,-1.0,88829.188848,88386.882459,-1.0,3935.167908,3915.573562
4,123590329,1.0,Cash,No,-1.0,-1.0,-1.0,-1.0,No,No,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


### Преобразуем категориальные признаки в числовые

In [27]:
data_cat = data.copy()

In [28]:
obj_col = data.select_dtypes(exclude=[np.number])

In [29]:
def data_obj_discribe(data: pd.DataFrame):
# если столбец имеет тип 'object' и содержит менее 30 уникальных 
# распечатывает значения столбцов и подсчитывает количество значений

    # data: pd.DataFrame исходный датафрейм
    
    obj_fit = data.select_dtypes(include='object')
    print('*' * 100)
    for cat_colname in obj_fit.columns:
        if len(obj_fit[cat_colname].unique()) < 30:
            print(str(cat_colname) + '\n\n' + str(obj_fit[cat_colname].unique()) + '\n\n')
            print(obj_fit.groupby(cat_colname)[cat_colname].count())
            print('*' * 100 + '\n')

In [30]:
if not obj_col.empty:
    data_obj_discribe(data)

****************************************************************************************************
NAME_CONTRACT_TYPE

['Cash' 'Credit Card']


NAME_CONTRACT_TYPE
Cash           248983
Credit Card     26251
Name: NAME_CONTRACT_TYPE, dtype: int64
****************************************************************************************************

GENDER

['M' 'No' 'F' 'XNA']


GENDER
F      147444
M       76267
No      51519
XNA         4
Name: GENDER, dtype: int64
****************************************************************************************************

EDUCATION_LEVEL

['Secondary / secondary special' 'No' 'Higher education' 'Lower secondary'
 'Incomplete higher' 'Academic degree']


EDUCATION_LEVEL
Academic degree                     120
Higher education                  54516
Incomplete higher                  7456
Lower secondary                    2736
No                                51519
Secondary / secondary special    158887
Name: EDUCATION_LEVEL, dtype: int64
*

In [31]:
def data_obj_map(data: pd.DataFrame):
# если столбец имеет тип 'object' и содержит менее 30 уникальных 
# преобразует категориальные признак в числовые

    # data: pd.DataFrame исходный датафрейм
    
    # изменяет датафрейм источник
    
    obj_fit = data.select_dtypes(include='object')
    for cat_colname in obj_fit.columns:
        map_tr = {}
        un_val = obj_fit[cat_colname].unique()
        if len(un_val) < 30:
            print(str(cat_colname) + '\n')
            for i in range(len(un_val)):
                map_tr[un_val[i]] = i
            print(map_tr)
            data[cat_colname] = data[cat_colname].map(map_tr)
            print('*' * 100 + '\n')

In [32]:
if not obj_col.empty:
    data_obj_map(data)

NAME_CONTRACT_TYPE

{'Cash': 0, 'Credit Card': 1}
****************************************************************************************************

GENDER

{'M': 0, 'No': 1, 'F': 2, 'XNA': 3}
****************************************************************************************************

EDUCATION_LEVEL

{'Secondary / secondary special': 0, 'No': 1, 'Higher education': 2, 'Lower secondary': 3, 'Incomplete higher': 4, 'Academic degree': 5}
****************************************************************************************************

FAMILY_STATUS

{'Married': 0, 'No': 1, 'Single / not married': 2, 'Separated': 3, 'Civil marriage': 4, 'Widow': 5, 'Unknown': 6}
****************************************************************************************************



In [33]:
data.head()

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE,GENDER,CHILDRENS,TOTAL_SALARY,AMOUNT_CREDIT,AMOUNT_ANNUITY,EDUCATION_LEVEL,FAMILY_STATUS,...,RATIO_AGE_TO_EXPERIENCE,RATIO_SALARY_TO_REGION_POPULATION,RATIO_CAR_TO_EXPERIENCE,RATIO_CAR_TO_AGE,EXPECTED_TOTAL_LOSS_1,EXPECTED_TOTAL_LOSS_2,EXPECTED_TOTAL_LOSS_3,EXPECTED_MONTHLY_LOSS_1,EXPECTED_MONTHLY_LOSS_2,EXPECTED_MONTHLY_LOSS_3
0,123687442,0.0,0,0,1.0,157500.0,855000.0,25128.0,0,0,...,9.149506,3008.4075,0.006399,0.000699,599170.547652,552256.266546,612667.559305,17609.307043,16230.521013,18005.977111
1,123597908,1.0,0,1,-1.0,-1.0,-1.0,-1.0,1,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,123526683,0.0,0,2,0.0,135000.0,1006920.0,42660.0,2,0,...,5.958264,3562.92,-1.0,-1.0,-1.0,686869.876357,269722.58888,-1.0,29100.49351,11427.288803
3,123710391,1.0,0,0,0.0,180000.0,518562.0,22972.5,0,0,...,-1.0,5639.22,-1.0,-1.0,-1.0,88829.188848,88386.882459,-1.0,3935.167908,3915.573562
4,123590329,1.0,0,1,-1.0,-1.0,-1.0,-1.0,1,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [34]:
for col in data.columns:
    data.loc[data[col].isnull(), col] = -1

### Разделяем train и test

In [35]:
train_prep = data.loc[data['TARGET'] != -1]
train_prep.shape

(110093, 54)

In [36]:
train_prep.tail()

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE,GENDER,CHILDRENS,TOTAL_SALARY,AMOUNT_CREDIT,AMOUNT_ANNUITY,EDUCATION_LEVEL,FAMILY_STATUS,...,RATIO_AGE_TO_EXPERIENCE,RATIO_SALARY_TO_REGION_POPULATION,RATIO_CAR_TO_EXPERIENCE,RATIO_CAR_TO_AGE,EXPECTED_TOTAL_LOSS_1,EXPECTED_TOTAL_LOSS_2,EXPECTED_TOTAL_LOSS_3,EXPECTED_MONTHLY_LOSS_1,EXPECTED_MONTHLY_LOSS_2,EXPECTED_MONTHLY_LOSS_3
110088,123458312,0.0,0,1,-1.0,-1.0,-1.0,-1.0,1,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
110089,123672463,0.0,0,2,0.0,175500.0,269550.0,12618.0,0,0,...,7.768833,1760.616,-1.0,-1.0,142348.303154,127166.558499,156738.9329,6663.516562,5952.838565,7337.1614
110090,123723001,0.0,0,1,-1.0,-1.0,-1.0,-1.0,1,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
110091,123554358,0.0,0,2,2.0,270000.0,1024740.0,49428.0,4,0,...,41.573407,5157.27,0.060942,0.001466,670001.539719,667338.205604,572324.081646,32317.305956,32188.840903,27605.865593
110092,123676265,0.0,1,2,1.0,112500.0,337500.0,16875.0,0,0,...,61.166667,3524.5125,-1.0,-1.0,-1.0,186314.656055,-1.0,-1.0,9315.732803,-1.0


In [37]:
source['train'].tail()

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE
110088,123458312,0,Cash
110089,123672463,0,Cash
110090,123723001,0,Cash
110091,123554358,0,Cash
110092,123676265,0,Credit Card


In [38]:
test_prep = data.loc[data['TARGET'] == -1].copy().reset_index(drop=True)
test_prep.drop('TARGET', axis=1, inplace=True)
test_prep.shape

(165141, 53)

In [39]:
test_prep.head(3)

Unnamed: 0,APPLICATION_NUMBER,NAME_CONTRACT_TYPE,GENDER,CHILDRENS,TOTAL_SALARY,AMOUNT_CREDIT,AMOUNT_ANNUITY,EDUCATION_LEVEL,FAMILY_STATUS,REGION_POPULATION,...,RATIO_AGE_TO_EXPERIENCE,RATIO_SALARY_TO_REGION_POPULATION,RATIO_CAR_TO_EXPERIENCE,RATIO_CAR_TO_AGE,EXPECTED_TOTAL_LOSS_1,EXPECTED_TOTAL_LOSS_2,EXPECTED_TOTAL_LOSS_3,EXPECTED_MONTHLY_LOSS_1,EXPECTED_MONTHLY_LOSS_2,EXPECTED_MONTHLY_LOSS_3
0,123724268,0,0,0.0,117000.0,1125000.0,32895.0,0,0,0.028663,...,6.049509,3353.571,0.007559,0.001249,-1.0,706799.427892,-1.0,-1.0,20666.815272,-1.0
1,123456549,0,2,2.0,81000.0,312768.0,17095.5,0,0,0.019689,...,22.472767,1594.809,-1.0,-1.0,-1.0,180830.265914,58953.494506,-1.0,9883.951718,3222.322825
2,123428178,1,2,2.0,157500.0,450000.0,22500.0,0,0,0.019101,...,13.322416,3008.4075,-1.0,-1.0,-1.0,68644.98432,172125.811865,-1.0,3432.249216,8606.290593


In [40]:
test_prep.tail()

Unnamed: 0,APPLICATION_NUMBER,NAME_CONTRACT_TYPE,GENDER,CHILDRENS,TOTAL_SALARY,AMOUNT_CREDIT,AMOUNT_ANNUITY,EDUCATION_LEVEL,FAMILY_STATUS,REGION_POPULATION,...,RATIO_AGE_TO_EXPERIENCE,RATIO_SALARY_TO_REGION_POPULATION,RATIO_CAR_TO_EXPERIENCE,RATIO_CAR_TO_AGE,EXPECTED_TOTAL_LOSS_1,EXPECTED_TOTAL_LOSS_2,EXPECTED_TOTAL_LOSS_3,EXPECTED_MONTHLY_LOSS_1,EXPECTED_MONTHLY_LOSS_2,EXPECTED_MONTHLY_LOSS_3
165136,123487967,0,1,-1.0,-1.0,-1.0,-1.0,1,1,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
165137,123536402,0,0,0.0,135000.0,450000.0,16807.5,0,2,0.028663,...,1.789767,3869.505,0.002212,0.001236,-1.0,299686.84168,240874.312713,-1.0,11193.303537,8996.65558
165138,123718238,0,1,-1.0,-1.0,-1.0,-1.0,1,1,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
165139,123631557,0,2,0.0,112500.0,350181.0,36769.5,0,0,0.030755,...,-1.0,3459.9375,-1.0,-1.0,-1.0,266619.257702,-1.0,-1.0,27995.39894,-1.0
165140,123433260,0,0,0.0,247500.0,1241437.5,36427.5,0,0,0.018029,...,2.8814,4462.1775,0.002293,0.000796,-1.0,612794.525674,677853.672632,-1.0,17981.229489,19890.260009


### Сохраняем train и test

In [41]:
train_prep.to_csv(data_dir + "train_prep.csv", sep=";", index=False)

In [42]:
test_prep.to_csv(data_dir + "test_prep.csv", sep=";", index=False)

## XGBoost

In [43]:
def get_model_hp(parameters, X, y):
    clf = GridSearchCV(
        estimator=xgb.XGBClassifier(silent=False, subsample=0.5, random_state=42),
        param_grid=parameters,
        scoring='roc_auc',
        cv=5,
        return_train_score = True
    )
    
    clf.fit(X, y)
    
    cv_results = pd.DataFrame(clf.cv_results_)

    #print(cv_results.columns)
    
    param_columns = [
        column
        for column in cv_results.columns
        if column.startswith('param_')
    ]

    score_columns = ['mean_test_score', 'mean_train_score']

    cv_results = (cv_results[param_columns + score_columns]
                  .sort_values(by=score_columns, ascending=False))

    print(cv_results)
    print(f"\nbest params: {clf.best_params_}\n")
    return clf

In [44]:
def make_cross_validation_adapt(X: pd.DataFrame,
                                y: pd.Series,
                                params: Dict,
                                metric: callable,
                                cv_strategy):
    """
    Кросс-валидация.

    Parameters
    ----------
    X: pd.DataFrame
        Матрица признаков.

    y: pd.Series
        Вектор целевой переменной.

    params: Dict
        Параметры модели.

    metric: callable
        Метрика для оценки качества решения.
        Ожидается, что на вход будет передана функция,
        которая принимает 2 аргумента: y_true, y_pred.

    cv_strategy: cross-validation generator
        Объект для описания стратегии кросс-валидации.
        Ожидается, что на вход будет передан объект типа
        KFold или StratifiedKFold.

    Returns
    -------
    oof_score: float
        Значение метрики качества на OOF-прогнозах.

    fold_train_scores: List[float]
        Значение метрики качества на каждом обучающем датасете кросс-валидации.

    fold_valid_scores: List[float]
        Значение метрики качества на каждом валидационном датасете кросс-валидации.

    oof_predictions: np.array
        Прогнозы на OOF.

    """
    estimators, fold_train_scores, fold_valid_scores = [], [], []
    oof_predictions = np.zeros(X.shape[0])

    plst = list(params.items())

    num_rounds=1000 
    
    for fold_number, (train_idx, valid_idx) in enumerate(cv_strategy.split(X, y)):
        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]

        xgtrain = xgb.DMatrix(x_train, label=y_train)
        xgval = xgb.DMatrix(x_valid, label=y_valid)

        watchlist = [(xgtrain,'train'), (xgval,'eval')]

        model = xgb.train(plst, 
                          xgtrain, 
                          num_rounds,
                          watchlist,
                          verbose_eval=False,
                          early_stopping_rounds=10)

        y_train_pred = model.predict(xgtrain)
        y_valid_pred = model.predict(xgval)

        fold_train_scores.append(metric(y_train, y_train_pred))
        fold_valid_scores.append(metric(y_valid, y_valid_pred))
        oof_predictions[valid_idx] = y_valid_pred

        msg = (
            f"Fold: {fold_number+1}, train-observations = {len(train_idx)}, "
            f"valid-observations = {len(valid_idx)}\n"
            f"train-score = {round(fold_train_scores[fold_number], 4)}, "
            f"valid-score = {round(fold_valid_scores[fold_number], 4)}" 
        )
        print(msg)
        print("="*69)
        estimators.append(model)

    oof_score = metric(y, oof_predictions)
    print(f"CV-results train: {round(np.mean(fold_train_scores), 4)} +/- {round(np.std(fold_train_scores), 3)}")
    print(f"CV-results valid: {round(np.mean(fold_valid_scores), 4)} +/- {round(np.std(fold_valid_scores), 3)}")
    print(f"OOF-score = {round(oof_score, 4)}")

    return estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions

In [45]:
params = {}
params["objective"] = "binary:logistic"
params["eta"] = 0.3 # clf_1.best_params_['eta']
# params["n_estimators"] = clf.best_params_['n_estimators']
params["random_state"] = 42
# params["silent"] = 1
params["max_depth"] = 4 # clf_1.best_params_['max_depth']
params["eval_metric"] = "auc"

In [46]:
cv_strategy = KFold(n_splits=5) #, random_state=42)

estimators_xgb_c1, oof_score_xgb_c1, fold_train_scores_xgb_c1, fold_valid_scores_xgb_c1, oof_predictions_xgb_c1 = make_cross_validation_adapt(
    train_prep.drop("TARGET", axis=1),
    train_prep["TARGET"],
    params=params,
    metric=roc_auc_score, 
    cv_strategy=cv_strategy
)

Fold: 1, train-observations = 88074, valid-observations = 22019
train-score = 0.768, valid-score = 0.7209
Fold: 2, train-observations = 88074, valid-observations = 22019
train-score = 0.7625, valid-score = 0.7153
Fold: 3, train-observations = 88074, valid-observations = 22019
train-score = 0.7624, valid-score = 0.7274
Fold: 4, train-observations = 88075, valid-observations = 22018
train-score = 0.7649, valid-score = 0.7224
Fold: 5, train-observations = 88075, valid-observations = 22018
train-score = 0.7598, valid-score = 0.728
CV-results train: 0.7635 +/- 0.003
CV-results valid: 0.7228 +/- 0.005
OOF-score = 0.722


In [47]:
cv_results_1 = xgb.cv(dtrain=xgb.DMatrix(train_prep.drop("TARGET", axis=1), label=train_prep["TARGET"]), 
                      params=params, 
                      nfold=5,
                      num_boost_round=1000,
                      early_stopping_rounds=50,
                      metrics="auc", 
                      as_pandas=True, 
                      seed=42)

In [48]:
cv_results_1.tail()

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
47,0.772572,0.001716,0.724329,0.008409
48,0.773166,0.001564,0.724491,0.00837
49,0.774416,0.001415,0.724683,0.008533
50,0.775206,0.001545,0.724765,0.008553
51,0.776092,0.001449,0.724844,0.008754


### XGBoost prediction

In [49]:
prediction_xgb = test_prep[['APPLICATION_NUMBER']].copy()
prediction_xgb["TARGET"] = 0
prediction_xgb.tail()

Unnamed: 0,APPLICATION_NUMBER,TARGET
165136,123487967,0
165137,123536402,0
165138,123718238,0
165139,123631557,0
165140,123433260,0


In [50]:
test_predictions_xgb = np.zeros(test_prep.shape[0])

for i in range(5):
    test_predictions_xgb += estimators_xgb_c1[i].predict(xgb.DMatrix(test_prep))

test_predictions_xgb /= 5

In [51]:
prediction_xgb["TARGET"] = test_predictions_xgb.copy()
prediction_xgb.tail()

Unnamed: 0,APPLICATION_NUMBER,TARGET
165136,123487967,0.084091
165137,123536402,0.046914
165138,123718238,0.083943
165139,123631557,0.012307
165140,123433260,0.04349
