In [10]:
import pandas as pd
import numpy as np
import catboost as catb
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

In [2]:
import warnings
warnings.simplefilter('ignore')

In [3]:
def balance_df_by_target(df, target_name):

    target_counts = df[target_name].value_counts()

    major_class_name = target_counts.argmax()
    minor_class_name = target_counts.argmin()

    disbalance_coeff = int(target_counts[major_class_name] / target_counts[minor_class_name]) - 1

    for i in range(disbalance_coeff):
        sample = df[df[target_name] == minor_class_name].sample(target_counts[minor_class_name])
        df = df.append(sample, ignore_index=True)

    return df.sample(frac=1)

In [4]:
# Read the train and test tables from CSV files under the relative Data/ folder.
# Paths are relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='Data/prepared_train_data_without_new_features.csv',
)
df_test = pd.read_csv(
    filepath_or_buffer='Data/prepared_test_data_without_new_features.csv',
)

In [5]:
# Name of the label column in the training table; used below to split
# features from labels and to drive the class balancing.
target = "Credit Default"

In [7]:
# Separate the training table into the feature matrix (every column except
# the label) and the label column itself.
X_train = df.drop(columns=[target])
y_train = df[target]

In [8]:
# Put features and labels back side by side (label column last), then
# oversample the minority class with the helper defined earlier.
df_for_balancing = pd.concat(objs=[X_train, y_train], axis='columns')
df_balanced = balance_df_by_target(df=df_for_balancing, target_name=target)

In [9]:
# Re-derive features and labels from the balanced frame. This rebinds
# X_train / y_train, replacing the unbalanced versions created earlier.
X_train = df_balanced.drop(columns=[target])
y_train = df_balanced[target]

In [11]:
# Gradient-boosting classifier with fixed hyper-parameters and a fixed seed.
# NOTE(review): how these values were chosen is not shown in this notebook.
model = catb.CatBoostClassifier(
    iterations=500,
    learning_rate=0.2,
    depth=12,
    random_state=42,
)

In [12]:
# Train on the balanced data. No evaluation set is passed, so the log below
# reports only the training ("learn") loss per iteration.
model.fit(X=X_train, y=y_train)

0:	learn: 0.6428412	total: 124ms	remaining: 1m 1s
1:	learn: 0.6096777	total: 190ms	remaining: 47.2s
2:	learn: 0.5872194	total: 257ms	remaining: 42.5s
3:	learn: 0.5702499	total: 320ms	remaining: 39.7s
4:	learn: 0.5517810	total: 384ms	remaining: 38.1s
5:	learn: 0.5373118	total: 448ms	remaining: 36.9s
6:	learn: 0.5136281	total: 512ms	remaining: 36s
7:	learn: 0.5001378	total: 577ms	remaining: 35.5s
8:	learn: 0.4855301	total: 642ms	remaining: 35s
9:	learn: 0.4748958	total: 706ms	remaining: 34.6s
10:	learn: 0.4661252	total: 771ms	remaining: 34.3s
11:	learn: 0.4497127	total: 833ms	remaining: 33.9s
12:	learn: 0.4354024	total: 898ms	remaining: 33.6s
13:	learn: 0.4271494	total: 964ms	remaining: 33.4s
14:	learn: 0.4227220	total: 1.03s	remaining: 33.3s
15:	learn: 0.4071013	total: 1.09s	remaining: 33.1s
16:	learn: 0.3921973	total: 1.16s	remaining: 32.9s
17:	learn: 0.3838247	total: 1.23s	remaining: 32.8s
18:	learn: 0.3728771	total: 1.29s	remaining: 32.7s
19:	learn: 0.3666883	total: 1.35s	remaining: 

162:	learn: 0.0453067	total: 10.9s	remaining: 22.5s
163:	learn: 0.0447623	total: 10.9s	remaining: 22.4s
164:	learn: 0.0443413	total: 11s	remaining: 22.4s
165:	learn: 0.0437483	total: 11.1s	remaining: 22.3s
166:	learn: 0.0433521	total: 11.2s	remaining: 22.3s
167:	learn: 0.0431556	total: 11.2s	remaining: 22.2s
168:	learn: 0.0428290	total: 11.3s	remaining: 22.1s
169:	learn: 0.0423029	total: 11.4s	remaining: 22.1s
170:	learn: 0.0417338	total: 11.4s	remaining: 22s
171:	learn: 0.0413132	total: 11.5s	remaining: 22s
172:	learn: 0.0410997	total: 11.6s	remaining: 21.9s
173:	learn: 0.0406576	total: 11.7s	remaining: 21.9s
174:	learn: 0.0403167	total: 11.7s	remaining: 21.8s
175:	learn: 0.0398591	total: 11.8s	remaining: 21.8s
176:	learn: 0.0392726	total: 11.9s	remaining: 21.7s
177:	learn: 0.0389653	total: 12s	remaining: 21.6s
178:	learn: 0.0386866	total: 12s	remaining: 21.6s
179:	learn: 0.0381442	total: 12.1s	remaining: 21.5s
180:	learn: 0.0379483	total: 12.2s	remaining: 21.4s
181:	learn: 0.0374325	

322:	learn: 0.0152339	total: 21.8s	remaining: 12s
323:	learn: 0.0151917	total: 21.9s	remaining: 11.9s
324:	learn: 0.0151147	total: 22s	remaining: 11.8s
325:	learn: 0.0150024	total: 22.1s	remaining: 11.8s
326:	learn: 0.0149268	total: 22.1s	remaining: 11.7s
327:	learn: 0.0147891	total: 22.2s	remaining: 11.6s
328:	learn: 0.0147635	total: 22.3s	remaining: 11.6s
329:	learn: 0.0146862	total: 22.4s	remaining: 11.5s
330:	learn: 0.0145670	total: 22.4s	remaining: 11.5s
331:	learn: 0.0144803	total: 22.5s	remaining: 11.4s
332:	learn: 0.0144059	total: 22.6s	remaining: 11.3s
333:	learn: 0.0143227	total: 22.6s	remaining: 11.2s
334:	learn: 0.0142614	total: 22.7s	remaining: 11.2s
335:	learn: 0.0141846	total: 22.8s	remaining: 11.1s
336:	learn: 0.0140947	total: 22.8s	remaining: 11s
337:	learn: 0.0140014	total: 22.9s	remaining: 11s
338:	learn: 0.0139521	total: 23s	remaining: 10.9s
339:	learn: 0.0139080	total: 23s	remaining: 10.8s
340:	learn: 0.0138239	total: 23.1s	remaining: 10.8s
341:	learn: 0.0137673	to

481:	learn: 0.0080825	total: 32.7s	remaining: 1.22s
482:	learn: 0.0080634	total: 32.8s	remaining: 1.15s
483:	learn: 0.0080369	total: 32.9s	remaining: 1.09s
484:	learn: 0.0080174	total: 33s	remaining: 1.02s
485:	learn: 0.0079828	total: 33s	remaining: 951ms
486:	learn: 0.0079462	total: 33.1s	remaining: 883ms
487:	learn: 0.0079231	total: 33.2s	remaining: 815ms
488:	learn: 0.0078951	total: 33.2s	remaining: 747ms
489:	learn: 0.0078593	total: 33.3s	remaining: 680ms
490:	learn: 0.0078283	total: 33.4s	remaining: 612ms
491:	learn: 0.0078282	total: 33.5s	remaining: 544ms
492:	learn: 0.0077973	total: 33.5s	remaining: 476ms
493:	learn: 0.0077784	total: 33.6s	remaining: 408ms
494:	learn: 0.0077484	total: 33.7s	remaining: 340ms
495:	learn: 0.0077280	total: 33.7s	remaining: 272ms
496:	learn: 0.0077016	total: 33.8s	remaining: 204ms
497:	learn: 0.0076750	total: 33.9s	remaining: 136ms
498:	learn: 0.0076632	total: 34s	remaining: 68.1ms
499:	learn: 0.0076391	total: 34s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1af76647c88>

In [14]:
# Per-class probabilities for every row of the test table; the next cell
# reads column 1 of this array.
# NOTE(review): assumes df_test has the same feature columns as X_train — confirm.
y_pred_proba = model.predict_proba(
    df_test,
)

In [15]:
# Probability cut-off: rows whose column-1 probability is strictly greater
# than this value are labelled 1, all others 0.
# NOTE(review): 0.2 is the value used by the original cell; how it was tuned
# is not shown in this notebook.
DEFAULT_PROBA_THRESHOLD = 0.2

# Vectorised threshold instead of a Python loop; .tolist() keeps `result` a
# plain list of Python ints, exactly as the loop produced.
result = (y_pred_proba[:, 1] > DEFAULT_PROBA_THRESHOLD).astype(int).tolist()

# NOTE(review): the header is 'Credit default' (lower-case "d") while the
# training target is 'Credit Default' — kept as-is; confirm which spelling
# the consumer of the predictions file expects.
result_data = pd.DataFrame({'Credit default': result})

In [16]:
# Preview the first five predicted labels.
result_data.head(n=5)

Unnamed: 0,Credit default
0,1
1,1
2,0
3,1
4,0


In [18]:
# Write the predictions to CSV without the row index, so the file contains
# only the single label column.
result_data.to_csv(
    path_or_buf='Data/Nikita_Krasheninnikov_predictions.csv',
    index=False,
)