# imports

In [1]:
from warnings import filterwarnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score, roc_curve
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import optuna

sns.set()
filterwarnings("ignore")

# getting and preprocessing data

In [2]:
DATA_DIR = 'kaggle/input'

df_train = pd.read_csv(DATA_DIR +'/train.csv')
df_test = pd.read_csv(DATA_DIR +'/test.csv')
sample_submission = pd.read_csv(DATA_DIR+'/sample_submission.csv')

# ВАЖНО! дря корректной обработки признаков объединяем трейн и тест в один датасет
df_train['sample'] = 1 # помечаем где у нас трейн
df_test['sample'] = 0  # помечаем где у нас тест
df_test['default'] = 0 # в тесте у нас нет значения Rating, мы его должны предсказать, по этому пока просто заполняем нулями

data = df_test.append(df_train, sort=False).reset_index(drop=True) # объединяем

In [3]:
num_cols = ['age', 'score_bki', 'decline_app_cnt', 'bki_request_cnt', 'income']
cat_cols = ['education', 'first_time', 'sna', 'work_address', 'home_address', 'region_rating']
bin_cols = ['sex', 'car', 'car_type', 'good_work', 'foreign_passport']

In [4]:
ids = data["client_id"]  # сохраним на всякий пожарный
data.drop(['client_id','app_date',], axis = 1, inplace=True)

# Заполнение пропуски 'education' наиболее частым значением 'SCH'
# Можно было бы выделить в отдельную категорию, "без образования", пробовал, качество от этого не повышается 
data["education"].fillna(data.education.mode(), inplace=True)

# dummies
data = pd.get_dummies(data, columns=['education'], dummy_na=True)

# label encoding
le = LabelEncoder()

for column in bin_cols:
    data[column] = le.fit_transform(data[column])
    
columns = ['first_time', 'sna', 'work_address', 'home_address', 'region_rating']

for column in columns:
    data[column] = le.fit_transform(data[column])

# логорифмируем хвостатых
data['age'] = np.log(data['age'] + 1)
data['decline_app_cnt'] = np.log(data['decline_app_cnt'] + 1)
data['income'] = np.log(data['income'] + 1)
data['bki_request_cnt'] = np.log(data.bki_request_cnt + 1)

# Нормализируем численные
for column in num_cols:
    data[column] = StandardScaler().fit_transform(np.array(data[column].values).reshape(-1, 1))

In [5]:
X, y = (
    data.query("sample == 1").drop(columns=["sample", "default"]),
    data.query("sample == 1")["default"].values,
)
test = data.query("sample == 0").drop(columns=["sample", "default"])

In [6]:
rus = RandomUnderSampler(random_state=42)
X, y = rus.fit_resample(X, y)

# modelling

### base models

In [7]:
simple_models = [
    LinearRegression(),
    LogisticRegression(random_state=42),
    RandomForestClassifier(random_state=42),
    LGBMClassifier(
        **{
            "lambda_l1": 1.6758594914477627e-05,
            "lambda_l2": 1.555459299457268e-07,
            "num_leaves": 8,
            "feature_fraction": 0.4030892028571655,
            "bagging_fraction": 0.7039470678830023,
            "bagging_freq": 3,
            "min_child_samples": 27,
        },
        random_state=42,
        silent=True
    ),
    CatBoostClassifier(random_state=42, silent=True),
]
simple_models_names = ["linear", "logistic", "rf", "lgbm", "catboost"]

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2, shuffle=True)

In [8]:
meta_df = pd.DataFrame()
for name, model in zip(simple_models_names, simple_models):
    print("Fitting", name, end = " ")
    if name != "linear":
        model.fit(X_train, y_train)
        meta_df[name] = model.predict_proba(X_val)[:,1]
    else:
        model.fit(X_train, y_train)
        meta_df[name] = model.predict(X_val)
    print("done")

meta_df.head(3)

Fitting linear done
Fitting logistic done
Fitting rf done
done
Fitting catboost done


Unnamed: 0,linear,logistic,rf,lgbm,catboost
0,0.391481,0.375536,0.38,0.379378,0.335971
1,0.654895,0.675221,0.66,0.655999,0.654139
2,0.628189,0.642456,0.66,0.663686,0.736752


### meta model

In [9]:
meta_model = CatBoostClassifier(random_state=42, silent=True)
meta_model.fit(meta_df, y_val)

test_meta_df = pd.DataFrame()
for name, model in zip(simple_models_names, simple_models):
    print("Predicting using", name, end = " ")
    if name != "linear":
        test_meta_df[name] = model.predict_proba(test)[:,1]
    else:
        test_meta_df[name] = model.predict(test)
    print("done")

test_meta_df["preds"] = meta_model.predict(test_meta_df)
test_meta_df.head(3)

Predicting using linear done
Predicting using logistic done
Predicting using rf done
Predicting using lgbm done
Predicting using catboost done


Unnamed: 0,linear,logistic,rf,lgbm,catboost,preds
0,0.257588,0.244739,0.21,0.198108,0.178658,0
1,0.733078,0.753684,0.82,0.774777,0.927645,1
2,0.336925,0.317852,0.42,0.368779,0.44333,0


In [10]:
sample_submission[:3]

Unnamed: 0,client_id,default
0,74835,0
1,17527,0
2,75683,0


In [11]:
sample_submission["default"] = test_meta_df["preds"]

In [12]:
sample_submission.to_csv("submission.csv", index=False)

## kaggle: 0.33879