In [19]:
import pandas as pd
import numpy as np
import lightgbm as lgbm
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

In [2]:
import warnings
warnings.simplefilter('ignore')

In [3]:
def balance_df_by_target(df, target_name, random_state=None):
    """Oversample the minority class of ``target_name`` by whole-copy duplication.

    The rows of the least frequent class are appended
    ``int(n_major / n_minor) - 1`` times, where ``n_major`` and ``n_minor`` are
    the row counts of the most and least frequent classes. The combined frame
    is then returned in shuffled row order. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the target column.
    target_name : str
        Name of the column holding the class labels.
    random_state : optional
        Passed to ``DataFrame.sample`` for the final shuffle; give a fixed
        value to make the returned row order reproducible. Defaults to
        ``None`` (non-deterministic shuffle).

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with the minority class duplicated. When no
        duplication is needed the original index labels are kept; otherwise
        the index is renumbered before shuffling.
    """
    target_counts = df[target_name].value_counts()

    # Nothing to balance when the target column has no (non-null) values.
    if target_counts.empty:
        return df.sample(frac=1, random_state=random_state)

    # idxmax/idxmin return the class *labels*; argmax/argmin return integer
    # positions in current pandas and would pick the wrong rows.
    major_class_name = target_counts.idxmax()
    minor_class_name = target_counts.idxmin()

    # .loc forces label-based lookup even when class labels are integers.
    disbalance_coeff = int(
        target_counts.loc[major_class_name] / target_counts.loc[minor_class_name]
    ) - 1

    # Already balanced (or a single class): only shuffle.
    if disbalance_coeff <= 0:
        return df.sample(frac=1, random_state=random_state)

    minority_rows = df[df[target_name] == minor_class_name]

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    balanced = pd.concat(
        [df] + [minority_rows] * disbalance_coeff,
        ignore_index=True,
    )

    return balanced.sample(frac=1, random_state=random_state)

In [4]:
# Read the prepared train and test tables from CSV files under Data/.
train_csv_path = 'Data/prepared_train_data_without_new_features.csv'
test_csv_path = 'Data/prepared_test_data_without_new_features.csv'

df = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [5]:
# Name of the label column in the training frame.
target: str = "Credit Default"

In [7]:
# Separate the label column from the remaining (feature) columns.
X_train = df.drop(columns=[target])
y_train = df[target]

In [8]:
# Put features and labels back into one frame so that duplicated rows keep
# their labels, then oversample the minority class.
df_for_balancing = pd.concat(objs=[X_train, y_train], axis='columns')
df_balanced = balance_df_by_target(df=df_for_balancing, target_name=target)

In [9]:
# Rebind the training features/labels to the balanced frame.
X_train = df_balanced.drop(columns=[target])
y_train = df_balanced[target]

In [20]:
# LightGBM classifier with fixed hyper-parameters and a fixed seed.
model = lgbm.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.2,
    max_depth=12,
    num_leaves=50,
    reg_lambda=1,
    random_state=42,
)

In [21]:
# Train on the balanced features/labels; the fitted estimator is the cell output.
model.fit(X=X_train, y=y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.2, max_depth=12,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=300, n_jobs=-1, num_leaves=50, objective=None,
               random_state=42, reg_alpha=0.0, reg_lambda=1, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [22]:
# Per-class probabilities for the test frame; column 1 is thresholded below.
y_pred_proba = model.predict_proba(X=df_test)

In [23]:
# Label a row 1 when its column-1 probability is strictly above 0.147, else 0.
# NOTE(review): 0.147 is a hard-coded cut-off — presumably tuned elsewhere; confirm.
# NOTE(review): the output column is 'Credit default' while the training target
# is 'Credit Default' — confirm which spelling the consumer expects.
positive_proba = y_pred_proba[:, 1]
result = (positive_proba > 0.147).astype(int).tolist()
result_data = pd.DataFrame({'Credit default': result})

In [24]:
# Show the first five predicted labels.
result_data.head(n=5)

Unnamed: 0,Credit default
0,1
1,1
2,0
3,1
4,0


In [25]:
# Write the predictions to CSV without the row index.
result_data.to_csv(
    path_or_buf='Data/Nikita_Krasheninnikov_predictions.csv',
    index=False,
)