# Решение задачи предсказания LGD-рисков

# Загружаем данные и устанавливаем нужные библиотеки

In [None]:
!pip install catboost
!pip install lightgbm
!pip install xgboost
!pip install -U dask    # после установки рекомендую перезагрузить ядро
!pip install featuretools
!pip install evalml
!pip install mljar-supervised
!pip install requests
!pip install tabulate
!pip install "colorama>=0.3.8"
!pip install future
!pip uninstall h2o
!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

In [None]:
import pandas as pd
import numpy as np
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
from math import log
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import *
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from catboost import *
from xgboost import XGBRegressor, XGBRFRegressor
from lightgbm import LGBMRegressor
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from supervised.automl import AutoML
import pandas as pd
import numpy as np
#from sklearn.externals.six import StringIO
import h2o
from h2o.automl import H2OAutoML
from h2o.sklearn import H2OAutoMLRegressor

# Initialise the H2O connection used by the H2O AutoML section of this notebook.
# NOTE(review): presumably starts a local cluster when none is running — confirm against the h2o.init docs.
h2o.init()

In [None]:
# Download the train table, the test table and the column-description file into the working directory.
!wget https://raw.githubusercontent.com/BKHV/risk_models/master/data/LGD-data-train.csv
!wget https://raw.githubusercontent.com/BKHV/risk_models/master/data/LGD-data-test.csv
!wget https://raw.githubusercontent.com/BKHV/risk_models/master/data/PD-data-desc.csv

In [None]:
# Load the three semicolon-separated tables downloaded above.
train_df, test_df, desc_df = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('LGD-data-train.csv', 'LGD-data-test.csv', 'PD-data-desc.csv')
)

In [None]:
# Display the column-description table.
desc_df

In [None]:
# Display the training table.
train_df

# Пишем полезные функции

In [None]:
def add_new_features(df, df1):
    """Add the engineered columns ``ead``, ``prob_recovery`` and ``ub`` to both frames.

    Both frames are modified in place and must contain the ``ab_*`` columns
    read below.

    Args:
        df: First frame to extend.
        df1: Second frame to extend.

    Returns:
        The tuple ``(df, df1)`` — the same objects that were passed in.
    """
    for frame in (df, df1):
        payable = frame['ab_accounts_payable']
        borrowings = frame['ab_other_borrowings']
        borrowed_capital = frame['ab_borrowed_capital']
        receivable = frame['ab_accounts_receivable']
        frame['ead'] = payable + borrowings + borrowed_capital + receivable

        frame['prob_recovery'] = frame['ab_inventory'] + frame['ab_own_capital']

        # Same left-to-right subtraction as before, so floating-point results stay identical.
        frame['ub'] = frame['ead'] - frame['ab_inventory'] - frame['ab_own_capital']
    return df, df1

def get_two_data_frames(data):
    """Split ``data`` into a complete-rows frame and a complete-columns frame.

    Args:
        data: DataFrame that may contain missing values.

    Returns:
        A tuple ``(d1, d2)``:
        ``d1`` holds only the rows of ``data`` without any missing value (all columns kept);
        ``d2`` holds every row of ``data`` but only the columns without any missing value.
        Both frames get a fresh 0..n-1 index and keep the original column dtypes.
    """
    # Positional boolean mask, so a non-unique index on ``data`` cannot interfere.
    complete_rows = data.notna().all(axis=1).to_numpy()
    d1 = data.loc[complete_rows].reset_index(drop=True)
    d2 = data.dropna(axis=1, how='any').reset_index(drop=True)
    return (d1, d2)

def make_predict(model, test, name):
    """Predict with ``model``, clamp the predictions to [0, 1] and write them to a CSV file.

    Args:
        model: Fitted estimator exposing ``predict(test)``.
        test: DataFrame passed to the model; must contain a ``record_id`` column.
        name: Path of the CSV file to write. The file has the columns ``id``
            (copied from ``record_id``) and ``predict``.
    """
    # np.asarray strips any index from the model output, so the column assignment
    # below is positional; np.clip replaces the element-by-element clamp.
    preds = np.clip(np.asarray(model.predict(test)), 0, 1)
    submit_df = test[['record_id']].rename(columns={'record_id': 'id'})
    submit_df['predict'] = preds
    submit_df.to_csv(name, index=False)

def make_two_models_predict(model_0, model_24, test, name):
    """Predict complete rows with ``model_0`` and the remaining rows with ``model_24``.

    ``test`` is split with ``get_two_data_frames``: ``model_0`` scores the rows
    without missing values, ``model_24`` scores the frame restricted to columns
    without missing values. Predictions are clamped to [0, 1] and written to
    ``name`` as a CSV with the columns ``id`` and ``predict``.

    Args:
        model_0: Fitted estimator for rows without missing values.
        model_24: Fitted estimator for the frame without incomplete columns.
        test: DataFrame containing a ``record_id`` column.
        name: Path of the CSV file to write.
    """
    test_0, test_24 = get_two_data_frames(test)

    preds_0 = pd.DataFrame({
        'id': test_0['record_id'].tolist(),
        'predict': np.clip(np.asarray(model_0.predict(test_0)), 0, 1),
    })
    preds_24 = pd.DataFrame({
        'id': test_24['record_id'].tolist(),
        'predict': np.clip(np.asarray(model_24.predict(test_24)), 0, 1),
    })

    # get_two_data_frames keeps every row in its second frame, so rows already
    # scored by model_0 are dropped here; otherwise each complete row would be
    # written twice with two different predictions.
    preds_24 = preds_24[~preds_24['id'].isin(preds_0['id'])]

    res = pd.concat([preds_0, preds_24])
    res.to_csv(name, index=False)

def make_sber_predict(model, test, name):
    """Predict with a model whose output exposes ``.data``, clamp to [0, 1] and write a CSV.

    Args:
        model: Fitted model whose ``predict(test)`` result has a 2-D ``data``
            attribute; the first column is used as the prediction.
        test: DataFrame passed to the model; must contain a ``record_id`` column.
        name: Path of the CSV file to write. The file has the columns ``id``
            (copied from ``record_id``) and ``predict``.
    """
    # np.clip returns a new array, so the model's own prediction buffer is left untouched.
    preds = np.clip(model.predict(test).data[:, 0], 0, 1)
    submit_df = test[['record_id']].rename(columns={'record_id': 'id'})
    submit_df['predict'] = preds
    submit_df.to_csv(name, index=False)

def make_predict_h2o(model, test_df, name):
    """Predict with an H2O model, clamp the predictions to [0, 1] and write them to a CSV file.

    Args:
        model: Fitted H2O model exposing ``predict(test_df)``; the result must
            have a ``predict`` column.
        test_df: H2O frame to score; must contain a ``record_id`` column.
        name: Path of the CSV file to write (columns ``id`` and ``predict``).
    """
    preds = h2o.as_list(model.predict(test_df))
    ids = h2o.as_list(test_df)['record_id']
    # Series.clip replaces the element-wise chained assignment
    # (preds['predict'][i] = ...), which is not guaranteed to write back into the frame.
    clipped = preds['predict'].clip(lower=0, upper=1)
    result = pd.DataFrame({'id': ids.tolist(), 'predict': clipped.tolist()})
    result.to_csv(name, index=False)

# Готовим данные для обучения

In [None]:
# Frames for the CatBoost-based models: every column except the target plus the engineered
# features. drop() hands add_new_features independent frames, so it does not write into
# selections of train_df / test_df. To train without the engineered features, skip the
# add_new_features call.
cat_features, cat_test = add_new_features(
    train_df.drop(columns=['lgd'], errors='ignore'),
    test_df.drop(columns=['lgd'], errors='ignore'),
)
cat_target = train_df[['lgd']]
# Names of the string-typed (categorical) columns of the raw training table.
cat_columns = [x for x in train_df.columns if train_df[x].dtype == 'object']

# All non-target training columns, and the numeric-only test columns.
features = train_df[[x for x in train_df.columns if x != 'lgd']]
target = train_df[['lgd']]
test = test_df[[x for x in test_df.columns if x != 'lgd' and test_df[x].dtype != 'object']]


# Numeric columns without any missing value, selected separately for train and test.
features_na = train_df[[x for x in train_df.columns if x != 'lgd' and train_df[x].dtype != 'object' and train_df[x].isna().sum() == 0]]
target_na = train_df[['lgd']]
test_na = test_df[[x for x in test_df.columns if x != 'lgd' and test_df[x].dtype != 'object' and test_df[x].isna().sum() == 0]]

# Различные решения данной задачи

## Feature tools + Catboost

In [None]:
# Stack train and test rows into one frame (train rows first); a later cell splits the
# result back by len(cat_features). pd.concat replaces DataFrame.append, which was removed
# in pandas 2.0.
df = pd.concat([cat_features, cat_test], ignore_index=True)
for x in df:
    if df[x].isna().sum() == 0:
        continue
    # Assign the filled column back: fillna(inplace=True) on a column selection is not
    # guaranteed to update df itself.
    df[x] = df[x].fillna('Unknown' if df[x].dtype == 'object' else 0)
# Surrogate row key used as the entity index in the next cell.
df['id'] = np.arange(len(df))

In [None]:
# `ft` is used throughout this section but is not imported at the top of the notebook.
import featuretools as ft

# Register the combined frame as entity 'lgd' (indexed by the surrogate 'id'), then move the
# 'ar_*' / 'ab_*' columns into a second entity keyed by 'record_id'.
es = ft.EntitySet(id = 'sber')
es = es.entity_from_dataframe(entity_id = 'lgd', dataframe = df, index = 'id')
es.normalize_entity(base_entity_id='lgd', new_entity_id='lgd_adv_train', index = 'record_id',
additional_variables = [x for x in df.columns if 'ar_' in x or 'ab_' in x])

In [None]:
# Generate the feature matrix for the 'lgd' entity with featuretools' dfs.
dfs_options = dict(
    entityset=es,
    target_entity='lgd',
    max_depth=5,
    verbose=1,
    n_jobs=3,
)
feature_matrix, feature_names = ft.dfs(**dfs_options)

In [None]:
# The first len(cat_features) rows are the training rows; the remainder are the test rows.
tr = feature_matrix.iloc[:len(cat_features)]
ts = feature_matrix.iloc[len(cat_features):]

In [None]:
# Hold out 10% of the generated training features for validation.
X_train, X_val, y_train, y_val = train_test_split(tr, cat_target, test_size=0.1)

catboost_params = dict(
    cat_features=cat_columns,
    random_seed=337,
    loss_function='MAE',
    l2_leaf_reg=2.8,
    nan_mode='Min',
    score_function='L2',
    n_estimators=618,
    max_depth=6,
    random_strength=1.5,
    boosting_type='Plain',
    rsm=1,
)
model = CatBoostRegressor(**catboost_params)
model.fit(X_train, y_train)

# Validation MAE; kept as the last expression so the notebook displays it.
val_predictions = model.predict(X_val)
mean_absolute_error(y_val, val_predictions)

In [None]:
# Write clamped predictions for the test rows of the feature matrix to a submission CSV.
make_predict(model, ts, 'feature_tools_catboost.csv')

MAE: 0.11

## Catboost

In [None]:
catboost_params = dict(
    cat_features=cat_columns,
    random_seed=337,
    loss_function='MAE',
    l2_leaf_reg=2.8,
    nan_mode='Min',
    score_function='L2',
    n_estimators=618,
    max_depth=6,
    random_strength=1.5,
    boosting_type='Plain',
    rsm=1,
)
model = CatBoostRegressor(**catboost_params)
model.fit(cat_features, target)

# MAE on the same rows the model was fitted on (no hold-out); displayed as the cell output.
train_predictions = model.predict(cat_features)
mean_absolute_error(target, train_predictions)

In [None]:
# Write clamped test predictions to a submission file; the name now carries the .csv
# extension used by every other submission in this notebook.
make_predict(model, cat_test, 'catboost.csv')

MAE: 0.0974

## Catboost with feature selection

In [None]:
# Keep the features whose importance exceeds 1; 'record_id' is always kept because
# make_predict needs it to build the submission.
fi = model.get_feature_importance(prettified=True)
is_important = fi['Importances'] > 1
good_cols = fi.loc[is_important, 'Feature Id'].to_list()
if 'record_id' not in good_cols:
    good_cols += ['record_id']
good_cols

In [None]:
# Training frame restricted to the selected columns, and its string-typed columns.
selected_features = cat_features[good_cols]
selected_cat_columns = [x for x in selected_features if selected_features[x].dtype == 'object']

new_model = CatBoostRegressor(
    cat_features=selected_cat_columns,
    random_seed=337,
    loss_function='MAE',
    l2_leaf_reg=2.8,
    nan_mode='Min',
    score_function='L2',
    n_estimators=818,
    max_depth=6,
    random_strength=1.5,
    boosting_type='Plain',
    rsm=1,
)
new_model.fit(selected_features, target)

# MAE on the training rows themselves (no hold-out); displayed as the cell output.
mean_absolute_error(target, new_model.predict(selected_features))

In [None]:
# Write clamped test predictions of the feature-selected model to a submission CSV.
make_predict(new_model, cat_test[good_cols], 'catboost_fs.csv')

MAE: 0.0981

## Grid search on catboost

In [None]:
# Search the hyperparameter grid below with CatBoost's built-in grid_search on the full training data.
# NOTE(review): the next cell predicts with `model`, which relies on grid_search leaving the
# model fitted afterwards — confirm against the CatBoost documentation.
model = CatBoostRegressor(cat_features=cat_columns, verbose=500, loss_function='MAE', eval_metric='MAE', random_seed=337)
# Candidate values per hyperparameter; learning_rate is currently excluded from the search.
params = {'n_estimators': [700, 800, 900, 1000],
          'l2_leaf_reg': [2.6, 2.7, 2.8, 2.9, 3, 3.1],
          'depth': [4,5,6,7],
          #'learning_rate': [0.1, 0.05, 0.025, 0.01],
          'score_function': ['Cosine', 'L2']}
grid_search_result = model.grid_search(params, 
                                       X=cat_features, 
                                       y=cat_target, 
                                       plot=True)

In [None]:
# Write clamped test predictions of the grid-searched model to a submission CSV.
make_predict(model, cat_test, 'gs_catboost.csv')

MAE: 0.10

## Two catboosts

In [None]:
# train_0: rows of train_df without missing values (all columns);
# train_24: every row of train_df, restricted to columns without missing values.
train_0, train_24 = get_two_data_frames(train_df)

In [None]:
# Both models share one hyperparameter set; they differ only in the data they are fitted on.
two_model_params = dict(
    cat_features=cat_columns,
    random_seed=337,
    loss_function='MAE',
    l2_leaf_reg=2.8,
    nan_mode='Min',
    score_function='L2',
    n_estimators=618,
    max_depth=6,
    random_strength=1.5,
    boosting_type='Plain',
    rsm=1,
)
model_0 = CatBoostRegressor(**two_model_params)
model_24 = CatBoostRegressor(**two_model_params)

# Feature frames: everything except the target column.
features_0 = train_0[[x for x in train_0 if x != 'lgd']]
features_24 = train_24[[x for x in train_24 if x != 'lgd']]

model_0.fit(features_0, train_0['lgd'])
model_24.fit(features_24, train_24['lgd'])

# Training-set MAE of each model (no hold-out); displayed as the cell output.
(
    mean_absolute_error(train_0['lgd'], model_0.predict(features_0)),
    mean_absolute_error(train_24['lgd'], model_24.predict(features_24)),
)

In [None]:
# Score the raw test table with the two models and write the combined submission CSV.
make_two_models_predict(model_0, model_24, test_df, 'great.csv')

MAE: 0.12

# AutoML решения

## LightAutoML

In [None]:
# Task is used below but is not imported at the top of the notebook.
from lightautoml.tasks import Task

# Training frame with the target attached, plus an untouched copy of the test features.
sber_train = cat_features.copy()
sber_train['lgd'] = cat_target['lgd']
sber_test = cat_test.copy()
# Hold out 20% of the training rows for validation.
train, val = train_test_split(sber_train, test_size=0.2)
# Regression task with MAE as both loss and metric.
task = Task('reg', loss = 'mae', metric = 'mae')
# 'lgd' is the target; 'record_id' is listed under the 'drop' role.
roles = {
    'target': 'lgd',
    'drop': ['record_id'],
}

In [None]:
# TabularAutoML is used below but is not imported at the top of the notebook.
from lightautoml.automl.presets.tabular_presets import TabularAutoML

automl = TabularAutoML(task = task, 
                       timeout = 300,
                       cpu_limit = 1,
                       reader_params = {'n_jobs': 1, 'cv': 5, 'random_state': 337},
                      verbose = 1)
# fit_predict trains on the training split and returns predictions for it.
preds = automl.fit_predict(train, roles = roles)
# MAE on the held-out validation split; displayed as the cell output.
mean_absolute_error(val['lgd'], automl.predict(val[[x for x in val if x != 'lgd']]).data[:, 0])

In [None]:
# Write clamped test predictions of the LightAutoML model to a submission CSV.
make_sber_predict(automl, sber_test, 'sber.csv')

MAE: 0.0993

## MLJAR - supervised

In [None]:
# Hold out 25% of the training rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(cat_features, cat_target, test_size=0.25)

mljar_options = dict(
    random_state=337,
    eval_metric='mae',
    mode='Compete',
    ml_task='regression',
    hill_climbing_steps=3,
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    top_models_to_improve=4,
    kmeans_features=True,
)
automl = AutoML(**mljar_options)
automl.fit(X_train, y_train)

# Predictions for the held-out rows.
predictions = automl.predict(X_test)

In [None]:
# Evaluate on this section's own hold-out split. The previous y_val / X_val were left over
# from the featuretools section and do not match the data this model was trained on.
mean_absolute_error(y_test, automl.predict(X_test))

In [None]:
# Write clamped test predictions of the MLJAR model to a submission CSV.
make_predict(automl, cat_test, 'mljar.csv')

MAE: 0.10

## H2O

In [None]:
# Load the train and test tables through H2O.
# NOTE(review): this rebinds train_df / test_df, which held pandas DataFrames until here;
# later cells that expect pandas objects under these names need them reloaded.
train_df = h2o.import_file('LGD-data-train.csv')
test_df = h2o.import_file('LGD-data-test.csv')
# The description table is re-read with pandas, exactly as at the top of the notebook.
desc_df = pd.read_csv('PD-data-desc.csv', sep=';')

In [None]:
# Predictor names: every column of the training frame except the target 'lgd'.
# NOTE(review): 'record_id' stays in the predictor list — confirm that is intended.
x = train_df.columns
y = "lgd"
x.remove(y)

In [None]:
# H2OAutoML is trained through train(x=..., y=..., training_frame=...); it has no
# scikit-learn style fit() method (that interface lives in h2o.sklearn).
aml = H2OAutoML(max_models=10, seed=337)
aml.train(x=x, y=y, training_frame=train_df)

In [None]:
# Display the AutoML leaderboard.
aml.leaderboard

In [None]:
# Write clamped test predictions of the H2O AutoML model to a submission CSV.
make_predict_h2o(aml, test_df, 'h2o.csv')

MAE: 0.113

## Autosklearn

In [None]:
# The H2O section rebinds train_df / test_df to H2O frames; reload them with pandas so the
# dtype checks and dropna() below work on DataFrames again.
train_df = pd.read_csv('LGD-data-train.csv', sep=';')
test_df = pd.read_csv('LGD-data-test.csv', sep=';')

# Numeric (non-object) predictor columns of the training table.
numeric_columns = [x for x in train_df.columns if x != 'lgd' and train_df[x].dtype != 'object']
features = train_df[numeric_columns]
cat_columns = [x for x in train_df if train_df[x].dtype == 'object']
cat_features = train_df[[x for x in train_df.columns if x != 'lgd']]

# Variants restricted to the rows without any missing value.
complete_rows = train_df.dropna()
lin_features = complete_rows[numeric_columns]
lin_target = complete_rows[['lgd']]

target = train_df[['lgd']]
test = test_df[[x for x in test_df.columns if x != 'lgd' and test_df[x].dtype != 'object']]

In [None]:
# autosklearn is used below but is not imported at the top of the notebook.
import autosklearn.regression

# 600 s overall budget, at most 10 s per candidate run.
automl = autosklearn.regression.AutoSklearnRegressor(
                            time_left_for_this_task=600,
                            per_run_time_limit=10,
                            tmp_folder='/tmp/autosklearn_regression_example_tmp1',
                            output_folder='/tmp/autosklearn_regression_example_out1',
                            )
automl.fit(features, target)

In [None]:
# MAE on the same rows the model was fitted on (no hold-out).
mean_absolute_error(target, automl.predict(features))

In [None]:
# Write clamped test predictions of the auto-sklearn model to a submission CSV.
make_predict(automl, test, 'autosklearn.csv')

MAE: 0.17