# Setting working directory

## Mount Google Drive

In [None]:
from google.colab import drive

# Path inside the Colab VM at which the Drive contents are mounted.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


## Change the working directory to:

`'/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl'`

In [None]:
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl')
!ls

2.0-EDA-1.ipynb
3.0-FeatureEngineering-original.ipynb
3.1-FeatureEngineering-LagrangeInterpolate.ipynb
3.2-FeatureEngineering-From3.1-Lgrg+onehot.ipynb
3.3-FeatureEngineering-From3.2+Log1p.ipynb
3.4-FeatureEngineering.ipynb
3.5-FeatureEngineering-backToOrigin.ipynb
3.6-FeatureEngineering-brandNewScheme.ipynb
3.7-FeatureEngineering-GoBack.ipynb
4-Tweaking.ipynb
5.1-Ensemble-Stacking.ipynb
5.2-Ensemble-Stacking-weightedKFold.ipynb
originalDataset
preprocessedData
submissionResults
Untitled
wasted


## The original dataset is located at:

`'/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl/originalDataset'`

# Importing libraries

In [None]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/52/39/128fff65072c8327371e3c594f3c826d29c85b21cb6485980353b168e0e4/catboost-0.24.2-cp36-none-manylinux1_x86_64.whl (66.1MB)
[K     |████████████████████████████████| 66.2MB 47kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.2


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')

# Loading preprocessed data

In [None]:
# Preprocessed feature matrices and labels (paths are relative to the working
# directory set earlier in the notebook).
train_features_csv = "preprocessedData/x_train-1110-3_7-1.csv"
test_features_csv = "preprocessedData/x_test-1110-3_7-1.csv"
train_labels_csv = "preprocessedData/y_train-1110-3_7-1.csv"

x_train = pd.read_csv(train_features_csv)
x_test = pd.read_csv(test_features_csv)
y_train = pd.read_csv(train_labels_csv)

In [None]:
# info is a method: without the call parentheses the cell only echoes the
# bound-method repr instead of the per-column dtype / non-null summary.
x_train.info()

<bound method DataFrame.info of         loanAmnt  term  ...  grade_to_mean_n14  grade_to_std_n14
0        35000.0     5  ...           1.845480          3.899647
1        18000.0     5  ...           1.476384          3.119718
2        12000.0     5  ...           1.299807          3.019553
3        11000.0     3  ...           0.391052          0.820342
4         3000.0     3  ...           0.974855          2.264665
...          ...   ...  ...                ...               ...
799995   25000.0     3  ...           1.021537          2.302376
799996   17000.0     3  ...           0.419385          0.853507
799997    6000.0     3  ...           0.974855          2.264665
799998   19200.0     3  ...           0.314779          0.743271
799999    9000.0     3  ...           0.738192          1.559859

[800000 rows x 84 columns]>

In [None]:
# info is a method: without the call parentheses the cell only echoes the
# bound-method repr instead of the per-column dtype / non-null summary.
x_test.info()

<bound method DataFrame.info of         loanAmnt  term  ...  grade_to_mean_n14  grade_to_std_n14
0        14000.0     3  ...           0.682304          1.531164
1        20000.0     5  ...           1.104785          2.336065
2        12000.0     3  ...           1.176775          2.884269
3        17500.0     5  ...           1.023456          2.296746
4        35000.0     3  ...           1.562159          3.262761
...          ...   ...  ...                ...               ...
199995    7000.0     3  ...           0.651624          1.509376
199996    6000.0     3  ...           0.419487          0.855134
199997   14000.0     5  ...           1.258461          2.565402
199998    8000.0     3  ...           1.473047          3.114754
199999    8000.0     3  ...           0.419487          0.855134

[200000 rows x 84 columns]>

In [None]:
# info is a method: without the call parentheses the cell only echoes the
# bound-method repr instead of the per-column dtype / non-null summary.
y_train.info()

<bound method DataFrame.info of         isDefault
0               1
1               0
2               0
3               0
4               0
...           ...
799995          0
799996          0
799997          1
799998          0
799999          0

[800000 rows x 1 columns]>

In [None]:
# data_train = pd.read_csv("preprocessedData/data_train.csv")
# data_test_a = pd.read_csv("preprocessedData/data_test_a.csv")

# x_train_small = pd.read_csv("preprocessedData/x_train_small.csv") #.head(200)
# x_test_small = pd.read_csv("preprocessedData/x_test_small.csv") #.head(200)
# y_train = pd.read_csv("preprocessedData/y_train.csv") #.head(200)

In [None]:
# folds = 5
# seed = 34
# kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

In [None]:
# train_x = x_train_small
# train_y = y_train
# for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
#     print('************************************ {} ************************************'.format(i+1))
#     print(train_index, valid_index)
#     trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y.iloc[train_index], train_x.iloc[valid_index], train_y.iloc[valid_index]


# Greedy search (stage-wise grid search)

In [None]:
from sklearn.model_selection import GridSearchCV

def get_best_cv_params_xgb(gamma=1, min_child_weight=1.5, max_depth=5,
                           reg_lambda=10, subsample=0.7, colsample_bytree=0.7,
                           colsample_bylevel=0.7, eta=0.04,
                           param_grid=None):
    """Grid-search XGBoost hyper-parameters with 5-fold stratified CV and print the winner.

    An ``XGBClassifier`` is built from the keyword arguments, then ``GridSearchCV``
    evaluates every combination in ``param_grid`` (scored by ROC AUC). Parameters
    that appear in ``param_grid`` are set by the search for each candidate, so the
    value passed here for such a parameter is only a placeholder.

    The training data is read from the notebook-level ``x_train`` / ``y_train``.

    Args:
        gamma, min_child_weight, max_depth, reg_lambda, subsample,
        colsample_bytree, colsample_bylevel, eta: forwarded unchanged to
            ``xgb.XGBClassifier``.
        param_grid: mapping of parameter name -> candidate values, forwarded to
            ``GridSearchCV``. Required despite the ``None`` default.

    Returns:
        None. The best parameter combination and its mean CV score are printed.

    Raises:
        ValueError: if ``param_grid`` is None.
    """
    # Fail early with a clear message instead of an obscure error from inside
    # GridSearchCV when the grid is forgotten.
    if param_grid is None:
        raise ValueError(
            "param_grid is required: pass a dict mapping parameter names to candidate values"
        )

    # 5-fold stratified cross-validation with a fixed shuffle seed.
    cv_fold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

    model_xgb = xgb.XGBClassifier(
        booster='gbtree',
        objective='binary:logistic',
        eval_metric='auc',

        gamma=gamma,
        min_child_weight=min_child_weight,
        max_depth=max_depth,
        reg_lambda=reg_lambda,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        colsample_bylevel=colsample_bylevel,
        eta=eta,

        # NOTE(review): "gpu_hist" presumably requires a GPU runtime; the original
        # cell mentions 'exact' as the alternative - confirm before running on CPU.
        tree_method="gpu_hist",
        seed=2020,
        nthread=36,
        silent=True,
    )

    grid_search = GridSearchCV(estimator=model_xgb,
                               cv=cv_fold,
                               param_grid=param_grid,
                               scoring='roc_auc',
                               verbose=2,
                               # n_jobs=2
                               )
    # y_train is loaded as a one-column DataFrame; flatten it to the 1-D label
    # vector that scikit-learn expects.
    grid_search.fit(x_train, np.ravel(y_train))

    print('模型当前最优参数为:{}'.format(grid_search.best_params_))
    print('模型当前最优得分为:{}'.format(grid_search.best_score_))

In [None]:
# params = {'booster': 'gbtree',
#                   'objective': 'binary:logistic',
#                   'eval_metric': 'auc',
#                   'gamma': 1,
#                   'min_child_weight': 1.5,
#                   'max_depth': 5,
#                   'lambda': 10,
#                   'subsample': 0.7,
#                   'colsample_bytree': 0.7,
#                   'colsample_bylevel': 0.7,
#                   'eta': 0.04,
#                   'tree_method': "gpu_hist", #'exact',
#                   'seed': 2020,
#                   'nthread': 36,
#                   "silent": True,
#                   }

In [None]:
# Stage 1: search gamma together with min_child_weight; every other parameter
# stays at its baseline value.
# round() strips binary floating-point noise (3 * 0.2 == 0.6000000000000001) so
# the candidates, and the reported best_params_, are clean decimals.
xgb_params = {
    'gamma': [round(i * 0.2, 1) for i in range(0, 10 + 1)],            # 0.0 ... 2.0
    "min_child_weight": [round(i * 0.5, 1) for i in range(0, 4 + 1)],  # 0.0 ... 2.0
}

# gamma is passed as None because the grid supplies its value for each candidate.
get_best_cv_params_xgb(gamma = None, min_child_weight = 1.5, max_depth = 5, 
                    reg_lambda = 10, subsample = 0.7, colsample_bytree = 0.7, 
                    colsample_bylevel = 0.7, eta = 0.04,
                            param_grid=xgb_params)

* 模型当前最优参数为:{'gamma': 1.0, 'min_child_weight': 0.0}
* 模型当前最优得分为:0.7285129387837725

In [None]:
# Stage 2: search max_depth together with reg_lambda, holding gamma=1 and
# min_child_weight=0. Both searched parameters are passed as None because the
# grid supplies their values for each candidate.
xgb_params = {
    'max_depth': [3, 4, 5, 6, 7],
    'reg_lambda': [0, 5, 10, 15],
}

get_best_cv_params_xgb(
    param_grid=xgb_params,
    gamma=1,
    min_child_weight=0,
    max_depth=None,
    reg_lambda=None,
    subsample=0.7,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    eta=0.04,
)

* 模型当前最优参数为:{'max_depth': 7, 'reg_lambda': 10}
* 模型当前最优得分为:0.7311631311823202

In [None]:
# Stage 3 (coarse): search subsample together with eta.
# round() strips binary floating-point noise (3 * 0.1 == 0.30000000000000004) so
# the candidates, and the reported best_params_, are clean decimals.
xgb_params = {
    'subsample': [round(i * 0.1, 1) for i in range(1, 9 + 1)],   # 0.1 ... 0.9
    # "colsample_bytree": [i * 0.1 for i in range(1, 9 + 1)], 
    # 'colsample_bylevel': [i * 0.1 for i in range(1, 9 + 1)], 
    "eta": [round(i * 0.005, 3) for i in range(4, 12 + 1)],      # 0.02 ... 0.06
}

# The subsample / eta values passed below are placeholders; the grid sets them
# for each candidate.
get_best_cv_params_xgb(gamma = 1, min_child_weight = 0, max_depth = 7, 
                    reg_lambda = 10, subsample = 0.7, colsample_bytree = 0.7, 
                    colsample_bylevel = 0.7, eta = 0.04,
                            param_grid=xgb_params)

* 模型当前最优参数为:{'eta': 0.06, 'subsample': 0.9}
* 模型当前最优得分为:0.7280618319887984

In [None]:
# Stage 3 (fine): refine subsample and eta on a 0.01 step.
# Without round(), 0.85 + 7 * 0.01 evaluates to 0.9199999999999999, which is what
# ended up being reported as the best subsample; rounding keeps the grid exact.
xgb_params = {
    'subsample': [round(0.85 + i * 0.01, 2) for i in range(0, 10 + 1)],  # 0.85 ... 0.95
    "eta": [round(0.06 + i * 0.01, 2) for i in range(0, 5 + 1)],         # 0.06 ... 0.11
}

# The subsample / eta values passed below are placeholders; the grid sets them
# for each candidate.
get_best_cv_params_xgb(gamma = 1, min_child_weight = 0, max_depth = 7, 
                    reg_lambda = 10, subsample = 0.7, colsample_bytree = 0.7, 
                    colsample_bylevel = 0.7, eta = 0.04,
                            param_grid=xgb_params)

* 模型当前最优参数为:{'eta': 0.06, 'subsample': 0.9199999999999999}
* 模型当前最优得分为:0.7313398934800516

In [None]:
# Stage 4: search the two column-sampling ratios with subsample=0.92 and eta=0.06
# held fixed.
# round() strips binary floating-point noise (3 * 0.1 == 0.30000000000000004) so
# the candidates, and the reported best_params_, are clean decimals.
xgb_params = {
    # 'subsample': [i * 0.1 for i in range(1, 9 + 1)], 
    # "eta":  [i * 0.005 for i in range(1, 20 + 1)], 
    "colsample_bytree": [round(i * 0.1, 1) for i in range(1, 9 + 1)],   # 0.1 ... 0.9
    'colsample_bylevel': [round(i * 0.1, 1) for i in range(1, 9 + 1)],  # 0.1 ... 0.9
}

# The colsample_* values passed below are placeholders; the grid sets them for
# each candidate.
get_best_cv_params_xgb(gamma = 1, min_child_weight = 0, max_depth = 7, 
                    reg_lambda = 10, subsample = 0.92, colsample_bytree = 0.7, 
                    colsample_bylevel = 0.7, eta = 0.06,
                            param_grid=xgb_params)

* 模型当前最优参数为:{'colsample_bylevel': 0.9, 'colsample_bytree': 0.8}
* 模型当前最优得分为:0.7316998565644242

The final result is as follows:

```
get_best_cv_params_xgb(gamma = 1, min_child_weight = 0, max_depth = 7, 
                    reg_lambda = 10, subsample = 0.92, colsample_bytree = 0.8, 
                    colsample_bylevel = 0.9, eta = 0.06,
                            param_grid=xgb_params)
```

--------------------------------

In [None]:
# # from sklearn.model_selection import GridSearchCV

# def get_best_cv_params(learning_rate=0.1, n_estimators=581, num_leaves=31, max_depth=-1, bagging_fraction=1.0, 
#                        feature_fraction=1.0, bagging_freq=0, min_data_in_leaf=20, min_child_weight=0.001, 
#                        min_split_gain=0, reg_lambda=0, reg_alpha=0, param_grid=None):
#     # 设置5折交叉验证
#     cv_fold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True, verbose)
    
#     model_lgb = lgb.LGBMClassifier(learning_rate=learning_rate,
#                                    n_estimators=n_estimators,
#                                    num_leaves=num_leaves,
#                                    max_depth=max_depth,
#                                    bagging_fraction=bagging_fraction,
#                                    feature_fraction=feature_fraction,
#                                    bagging_freq=bagging_freq,
#                                    min_data_in_leaf=min_data_in_leaf,
#                                    min_child_weight=min_child_weight,
#                                    min_split_gain=min_split_gain,
#                                    reg_lambda=reg_lambda,
#                                    reg_alpha=reg_alpha,
#                                    n_jobs= 8
#                                   )
#     grid_search = GridSearchCV(estimator=model_lgb, 
#                                cv=cv_fold,
#                                param_grid=param_grid,
#                                scoring='roc_auc'
#                               )
#     grid_search.fit(x_train, y_train)

#     print('模型当前最优参数为:{}'.format(grid_search.best_params_))
#     print('模型当前最优得分为:{}'.format(grid_search.best_score_))

In [None]:
# lgb_params = {'num_leaves': range(10, 80, 5), 'max_depth': range(3,10,2)}
# get_best_cv_params(learning_rate=0.1, n_estimators=581, num_leaves=None, max_depth=None, min_data_in_leaf=20, 
#                    min_child_weight=0.001,bagging_fraction=1.0, feature_fraction=1.0, bagging_freq=0, 
#                    min_split_gain=0, reg_lambda=0, reg_alpha=0, param_grid=lgb_params)

# Bayesian Optimization

**Note**: other optimization methods can be used as well.

Some references: 
* https://blog.csdn.net/ssswill/article/details/85274097
* https://www.cnblogs.com/PythonLearner/p/12903602.html
* https://www.kaggle.com/tilii7/bayesian-optimization-of-xgboost-parameters/notebook

In [None]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp36-none-any.whl size=11685 sha256=2da83f52d4e154876115743e3baf1757098495891f132eed79b7a27d7154c303
  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
"""定义优化函数"""
def rf_cv_xgb(gamma, min_child_weight, max_depth,
              reg_lambda, subsample, colsample_bytree,
              colsample_bylevel, eta):
    """Objective for Bayesian optimisation: mean 5-fold ROC AUC of an XGBoost classifier.

    The optimiser proposes every argument as a float. Only ``max_depth`` is
    truncated to an integer; all other values are forwarded unchanged.

    The training data is read from the notebook-level ``x_train`` / ``y_train``.

    Args:
        gamma, min_child_weight, max_depth, reg_lambda, subsample,
        colsample_bytree, colsample_bylevel, eta: candidate hyper-parameter
            values forwarded to ``xgb.XGBClassifier``.

    Returns:
        The mean of the five ``roc_auc`` cross-validation scores.
    """
    # Hand-picked baseline before tuning, for reference:
    #   gamma=1, min_child_weight=1.5, max_depth=5, lambda=10, subsample=0.7,
    #   colsample_bytree=0.7, colsample_bylevel=0.7, eta=0.04

    # Parameter reference: https://xgboost.readthedocs.io/en/latest/python/python_api.html
    model_xgb = xgb.XGBClassifier(
        booster='gbtree',
        objective='binary:logistic',
        eval_metric='auc',

        # gamma is a continuous parameter: casting it to int would collapse the
        # searched (0, 2) interval to the two values 0 and 1.
        gamma=gamma,
        min_child_weight=min_child_weight,
        # max_depth must be an integer, so the optimiser's float proposal is truncated.
        max_depth=int(max_depth),
        reg_lambda=reg_lambda,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        colsample_bylevel=colsample_bylevel,
        eta=eta,

        tree_method="gpu_hist",
        seed=2020,
        nthread=36,
        silent=True,
    )

    # y_train is loaded as a one-column DataFrame; flatten it to the 1-D label
    # vector that scikit-learn expects.
    val = cross_val_score(model_xgb, x_train, np.ravel(y_train), cv=5, scoring='roc_auc').mean()

    return val

In [None]:
from bayes_opt import BayesianOptimization

# Search bounds for each hyper-parameter passed to rf_cv_xgb.
# The lower bounds of the sampling ratios, eta and max_depth are kept strictly
# above zero: a ratio of 0 samples no rows/columns, eta == 0 learns nothing, and
# int(max_depth) == 0 does not describe a depth-limited tree.
bayes_xgb = BayesianOptimization(
    rf_cv_xgb,
    {
        'gamma': (0.0, 2),
        'min_child_weight': (0.0, 3),
        'max_depth': (1, 10),
        'reg_lambda': (0, 20),
        'subsample': (0.1, 1),
        'colsample_bytree': (0.1, 1),
        'colsample_bylevel': (0.1, 1),
        'eta': (0.005, 0.2),
    },
    # Fixed seed so the sequence of probed points is reproducible across runs.
    random_state=2020,
)

# Run the optimisation: 10 guided iterations after the library's default
# random initial points.
bayes_xgb.maximize(n_iter=10)

|   iter    |  target   | colsam... | colsam... |    eta    |   gamma   | max_depth | min_ch... | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7199  [0m | [0m 0.5707  [0m | [0m 0.1549  [0m | [0m 0.08093 [0m | [0m 1.919   [0m | [0m 4.933   [0m | [0m 2.785   [0m | [0m 6.981   [0m | [0m 0.404   [0m |
| [0m 2       [0m | [0m 0.7085  [0m | [0m 0.4175  [0m | [0m 0.1295  [0m | [0m 0.1964  [0m | [0m 0.6126  [0m | [0m 2.868   [0m | [0m 0.2835  [0m | [0m 7.766   [0m | [0m 0.3502  [0m |
| [95m 3       [0m | [95m 0.7286  [0m | [95m 0.89    [0m | [95m 0.7455  [0m | [95m 0.1479  [0m | [95m 0.2351  [0m | [95m 5.643   [0m | [95m 0.7774  [0m | [95m 11.81   [0m | [95m 0.7485  [0m |
| [0m 4       [0m | [0m 0.7157  [0m | [0m 0.3534  [0m | [0m 0.4306  [0m | [0m 0.04366 [0m | [0m 0.6346  [0m | [0m 2.226   [0m 