In [3]:
# A test for tree models with SMOTE sampling on 2016Q1-2018Q4 datasets

import numpy as np
import pandas as pd


path = 'D:\\2022lendingclub_ML\\'

In [7]:
data = pd.read_parquet(path + '2016Q1_to_2018Q4.parquet', engine='fastparquet')

In [8]:
# 已清洗好的数据（未归一化，树模型不需要归一化）。直接建模

idata = data.reset_index(drop=True)

In [15]:
# 备选模型：随机森林、XGBoost、LightGBM（快速）、Catboost；神经网络（后续补充）。先尝试未过采样的数据

from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm import LGBMClassifier
import catboost as cb

np.random.seed(0)

In [10]:
X = idata.drop(columns=['target'])
y = idata['target']

In [11]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [14]:
# 随机森林，先粗设一些参数，如果出现过拟合再使用更严格剪枝策略

rf0 = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0)
y_pred_rf = rf0.fit(x_train, y_train).predict(x_test)

In [15]:
# XGBoost

xgb0 = XGBClassifier(seed=0)
y_pred_xgb = xgb0.fit(x_train, y_train).predict(x_test)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




In [16]:
# lightGBM

lgb0 = LGBMClassifier(max_depth=6, learning_rate=0.5, n_estimators=250, random_state=0)
y_pred_lgb = lgb0.fit(x_train, y_train).predict(x_test)

In [17]:
# catboost

cb0 = cb.CatBoostClassifier(random_state=0)
y_pred_cb = cb0.fit(x_train, y_train, verbose = 0).predict(x_test)

In [13]:
# 事实证明，kaggle参赛者偏爱lightGBM不是没道理，训练速度确实很快。

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def score_sum(y_pred, y_true):
    acc = accuracy_score(y_true, y_pred)
    p = precision_score(y_true, y_pred)
    r = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f'The accuracy score is:{acc}, the precision score is:{p}, the recall score is:{r}, and the f1 score is:{f1}.')


In [23]:
score_sum(y_test, y_pred_rf) # 模型过于保守，查全率太高，查准率有点低

The accuracy score is:0.9772706861272055, the precision score is:0.7862174940898345, the recall score is:0.9904401691583775, and the f1 score is:0.8765913703576794.


In [24]:
score_sum(y_test, y_pred_xgb) # 双高，性能很好

The accuracy score is:0.9944657373312717, the precision score is:0.9541371158392435, the recall score is:0.9916461916461916, and the f1 score is:0.9725301204819278.


In [17]:
score_sum(y_test, y_pred_lgb) # 性能略低于xgb，但是它的训练速度真的很快

The accuracy score is:0.9899994902652806, the precision score is:0.9363829787234043, the recall score is:0.9651786149422487, and the f1 score is:0.9505627684849649.


In [26]:
score_sum(y_test, y_pred_cb) # 效果基本同xgboost，问题也和xgboost一样

The accuracy score is:0.9944511734821435, the precision score is:0.953404255319149, the recall score is:0.9922497785651019, and the f1 score is:0.9724392361111112.


In [23]:
# 针对LightGBM进行调参。只调了一部分，因为电脑性能不够。

# from sklearn.model_selection import GridSearchCV
from tune_sklearn import TuneGridSearchCV

lgb_params = {
    'max_depth' : [15, 20, 25],
    'n_estimators' : [400, 700, 1000],
    'num_leaves': [50, 100, 200],
    #'reg_alpha' : [1.1, 1.2],
    #'min_split_gain': [0.25, 0.5]
}

lgb_search = TuneGridSearchCV(
    LGBMClassifier(),
    lgb_params,
    early_stopping='MedianStoppingRule',
    max_iters=10
)


In [24]:
y_pred_lgb_tuned = lgb_search.fit(x_train, y_train).predict(x_test) 

2023-01-27 18:32:55,793	ERROR tune.py:758 -- Trials did not complete: [_Trainable_badbc_00000, _Trainable_badbc_00001, _Trainable_badbc_00002, _Trainable_badbc_00003, _Trainable_badbc_00004, _Trainable_badbc_00005, _Trainable_badbc_00006, _Trainable_badbc_00007, _Trainable_badbc_00008, _Trainable_badbc_00009, _Trainable_badbc_00010, _Trainable_badbc_00011, _Trainable_badbc_00012, _Trainable_badbc_00013, _Trainable_badbc_00014, _Trainable_badbc_00015, _Trainable_badbc_00016, _Trainable_badbc_00017, _Trainable_badbc_00018, _Trainable_badbc_00019, _Trainable_badbc_00020, _Trainable_badbc_00021, _Trainable_badbc_00022, _Trainable_badbc_00023]


ZeroDivisionError: division by zero