# imports

In [1]:
from warnings import filterwarnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score, roc_curve
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import optuna

sns.set()
filterwarnings("ignore")

# getting and preprocessing data

In [2]:
DATA_DIR = 'kaggle/input'

df_train = pd.read_csv(DATA_DIR +'/train.csv')
df_test = pd.read_csv(DATA_DIR +'/test.csv')
sample_submission = pd.read_csv(DATA_DIR+'/sample_submission.csv')

# ВАЖНО! дря корректной обработки признаков объединяем трейн и тест в один датасет
df_train['sample'] = 1 # помечаем где у нас трейн
df_test['sample'] = 0  # помечаем где у нас тест
df_test['default'] = 0 # в тесте у нас нет значения Rating, мы его должны предсказать, по этому пока просто заполняем нулями

data = df_test.append(df_train, sort=False).reset_index(drop=True) # объединяем

In [3]:
num_cols = ['age', 'score_bki', 'decline_app_cnt', 'bki_request_cnt', 'income']
cat_cols = ['education', 'first_time', 'sna', 'work_address', 'home_address', 'region_rating']
bin_cols = ['sex', 'car', 'car_type', 'good_work', 'foreign_passport']

In [4]:
ids = data["client_id"]  # сохраним на всякий пожарный
data.drop(['client_id','app_date',], axis = 1, inplace=True)

# Заполнение пропуски 'education' наиболее частым значением 'SCH'
# Можно было бы выделить в отдельную категорию, "без образования", пробовал, качество от этого не повышается 
data["education"].fillna(data.education.mode(), inplace=True)

# dummies
data = pd.get_dummies(data, columns=['education'], dummy_na=True)

# label encoding
le = LabelEncoder()

for column in bin_cols:
    data[column] = le.fit_transform(data[column])
    
columns = ['first_time', 'sna', 'work_address', 'home_address', 'region_rating']

for column in columns:
    data[column] = le.fit_transform(data[column])

# логорифмируем хвостатых
data['age'] = np.log(data['age'] + 1)
data['decline_app_cnt'] = np.log(data['decline_app_cnt'] + 1)
data['income'] = np.log(data['income'] + 1)
data['bki_request_cnt'] = np.log(data.bki_request_cnt + 1)

# Нормализируем численные
for column in num_cols:
    data[column] = StandardScaler().fit_transform(np.array(data[column].values).reshape(-1, 1))

In [5]:
X, y = (
    data.query("sample == 1").drop(columns=["sample", "default"]),
    data.query("sample == 1")["default"].values,
)
test = data.query("sample == 0").drop(columns=["sample", "default"])

In [6]:
rus = RandomUnderSampler(random_state=42)
X, y = rus.fit_resample(X, y)

# modelling

### model tuning

#### Logistic Regression

In [7]:
def objective(trial):
    params = {
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
        "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs", "newton-cg", "liblinear", "saga"]),
        "multi_class": trial.suggest_categorical("multi_class", ["auto", "ovr"]),
        "C": trial.suggest_float("C", 1e-10, 1e10, log=True),
    }

    lr = LogisticRegression(**params)

    cv = cross_val_score(lr, X, y, scoring="roc_auc", cv=8, n_jobs=-1)
    return np.mean(cv)


study = optuna.create_study(
    direction="maximize",
    storage="sqlite:///LogisticRegression.db",
    study_name="LogisticRegression",
    load_if_exists=True,
)
study.optimize(objective, timeout=600, n_trials=0)


[32m[I 2022-02-14 23:23:03,201][0m Using an existing study with name 'LogisticRegression' instead of creating a new one.[0m


In [8]:
study.best_params

{'C': 0.07213528501783559,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'solver': 'liblinear'}

#### LightGBM

In [9]:
def objective(trial):

    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    gbm = LGBMClassifier(**param, silent=True)
    cv_roc_auc = cross_val_score(gbm, X, y, cv=8, scoring="roc_auc", n_jobs=-1)

    return np.mean(cv_roc_auc)


study = optuna.create_study(
    direction="maximize",
    storage="sqlite:///LGBMClassifier.db",
    study_name="LGBMClassifier",
    load_if_exists=True,
)
study.optimize(objective, timeout=600, n_trials=0)

[32m[I 2022-02-14 23:23:03,576][0m Using an existing study with name 'LGBMClassifier' instead of creating a new one.[0m


In [10]:
study.best_params

{'bagging_fraction': 0.6641812860565748,
 'bagging_freq': 3,
 'feature_fraction': 0.4257771924417812,
 'lambda_l1': 0.41391873003799096,
 'lambda_l2': 0.009238732952784297,
 'min_child_samples': 76,
 'num_leaves': 9}

#### CatBoost

In [11]:
def objective(trial):

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "used_ram_limit": "6gb",
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = CatBoostClassifier(**param, silent=True)
    cv_roc_auc = cross_val_score(gbm, X, y, cv=8, scoring="roc_auc", n_jobs=-1)

    return np.mean(cv_roc_auc)


study = optuna.create_study(
    direction="maximize",
    storage="sqlite:///CatBoostClassifier.db",
    study_name="CatBoostClassifier",
    load_if_exists=True,
)
study.optimize(objective, timeout=600, n_trials=0)

[32m[I 2022-02-14 23:23:04,004][0m Using an existing study with name 'CatBoostClassifier' instead of creating a new one.[0m


In [12]:
study.best_params

{'boosting_type': 'Ordered',
 'bootstrap_type': 'Bernoulli',
 'colsample_bylevel': 0.07494834574844549,
 'depth': 12,
 'objective': 'CrossEntropy',
 'subsample': 0.9809549562352028}

### base models

In [13]:
simple_models = [
    LinearRegression(),
    LogisticRegression(
        **{
            "C": 0.07213528501783559,
            "multi_class": "ovr",
            "penalty": "l2",
            "solver": "liblinear",
        },
        random_state=42
    ),
    RandomForestClassifier(n_estimators=256, random_state=42),
    LGBMClassifier(
        **{
            "bagging_fraction": 0.6641812860565748,
            "bagging_freq": 3,
            "feature_fraction": 0.4257771924417812,
            "lambda_l1": 0.41391873003799096,
            "lambda_l2": 0.009238732952784297,
            "min_child_samples": 76,
            "num_leaves": 9,
        },
        random_state=42,
        silent=True
    ),
    CatBoostClassifier(
        # **{
        #     "boosting_type": "Ordered",
        #     "bootstrap_type": "Bernoulli",
        #     "colsample_bylevel": 0.07494834574844549,
        #     "depth": 12,
        #     "objective": "CrossEntropy",
        #     "subsample": 0.9809549562352028,
        # },
        random_state=42,
        silent=True
    ),
]
simple_models_names = ["linear", "logistic", "rf", "lgbm", "catboost"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2, shuffle=True
)


In [14]:
meta_df = pd.DataFrame()
for name, model in zip(simple_models_names, simple_models):
    print("Fitting", name, end = " ")
    if name != "linear":
        model.fit(X_train, y_train)
        meta_df[name] = model.predict_proba(X_val)[:,1]
    else:
        model.fit(X_train, y_train)
        meta_df[name] = model.predict(X_val)
    print("done")

meta_df.head(3)

Fitting linear done
Fitting logistic done
Fitting rf done
done
Fitting catboost done


Unnamed: 0,linear,logistic,rf,lgbm,catboost
0,0.391481,0.376763,0.328125,0.382711,0.335971
1,0.654895,0.672962,0.707031,0.665023,0.654139
2,0.628189,0.641612,0.667969,0.716308,0.736752


### meta model

In [15]:
meta_model = CatBoostClassifier(random_state=42, silent=True)
meta_model.fit(meta_df, y_val)

test_meta_df = pd.DataFrame()
for name, model in zip(simple_models_names, simple_models):
    print("Predicting using", name, end = " ")
    if name != "linear":
        test_meta_df[name] = model.predict_proba(test)[:,1]
    else:
        test_meta_df[name] = model.predict(test)
    print("done")

test_meta_df["preds"] = meta_model.predict(test_meta_df)
test_meta_df.head(3)

Predicting using linear done
Predicting using logistic done
Predicting using rf done
Predicting using lgbm done
Predicting using catboost done


Unnamed: 0,linear,logistic,rf,lgbm,catboost,preds
0,0.257588,0.245833,0.207031,0.209238,0.178658,0
1,0.733078,0.754758,0.820312,0.692517,0.927645,1
2,0.336925,0.320274,0.367188,0.358034,0.44333,0


In [16]:
sample_submission[:3]

Unnamed: 0,client_id,default
0,74835,0
1,17527,0
2,75683,0


In [17]:
sample_submission["default"] = test_meta_df["preds"]

In [18]:
sample_submission.to_csv("submission.csv", index=False)

## kaggle: 0.34142