# Setting working directory

## Load the Google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Change the workding dir to: 

`'/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl'`

In [2]:
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl')
!ls

2.0-EDA-1.ipynb
3.0-FeatureEngineering-original.ipynb
3.1-FeatureEngineering-LagrangeInterpolate.ipynb
3.2-FeatureEngineering-From3.1-Lgrg+onehot.ipynb
3.3-FeatureEngineering-From3.2+Log1p.ipynb
3.4-FeatureEngineering.ipynb
3.5-FeatureEngineering-backToOrigin.ipynb
4-Tweaking.ipynb
5.1-Ensemble-Stacking.ipynb
5.2-Ensemble-Stacking-weightedKFold.ipynb
originalDataset
preprocessedData
submissionResults
wasted


## Go to this place for original dataset: 

`'/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl/originalDataset'`

# Importing libraries

In [3]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/52/39/128fff65072c8327371e3c594f3c826d29c85b21cb6485980353b168e0e4/catboost-0.24.2-cp36-none-manylinux1_x86_64.whl (66.1MB)
[K     |████████████████████████████████| 66.2MB 43kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.2


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')

# Loading preprocessed data

In [7]:
# reduce_mem_usage 函数通过调整数据类型，帮助我们减少数据在内存中占用的空间
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [8]:
x_train = pd.read_csv("preprocessedData/x_train-1108-3.5-1.csv") #.head(200)
x_test = pd.read_csv("preprocessedData/x_test-1108-3.5-1.csv") #.head(200)
y_train = pd.read_csv("preprocessedData/y_train-1108-3.5-1.csv") #.head(200)

In [9]:
x_train_small = reduce_mem_usage(x_train)
x_test_small = reduce_mem_usage(x_test)

Memory usage of dataframe is 633600128.00 MB
Memory usage after optimization is: 273600128.00 MB
Decreased by 56.8%
Memory usage of dataframe is 158400128.00 MB
Memory usage after optimization is: 61200128.00 MB
Decreased by 61.4%


In [10]:
x_train_small.shape

(800000, 99)

In [11]:
x_train.isnull().sum()

loanAmnt             0
interestRate         0
installment          0
grade                0
subGrade             0
                    ..
grade_to_min_n13     0
grade_to_mean_n14    0
grade_to_std_n14     0
grade_to_max_n14     0
grade_to_min_n14     0
Length: 99, dtype: int64

------------

In [None]:
x_train_small = pd.read_csv("preprocessedData/x_train-1105-2.csv") #.head(200)
x_test_small = pd.read_csv("preprocessedData/x_test-1105-2.csv") #.head(200)
y_train = pd.read_csv("preprocessedData/y_train-1105-2.csv") #.head(200)

---------------

In [None]:
# data_train = pd.read_csv("preprocessedData/data_train.csv")
# data_test_a = pd.read_csv("preprocessedData/data_test_a.csv")

x_train_small = pd.read_csv("preprocessedData/x_train_small.csv") #.head(200)
x_test_small = pd.read_csv("preprocessedData/x_test_small.csv") #.head(200)
y_train = pd.read_csv("preprocessedData/y_train.csv") #.head(200)

In [None]:
# folds = 5
# seed = 34
# kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

In [None]:
# train_x = x_train_small
# train_y = y_train
# for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
#     print('************************************ {} ************************************'.format(i+1))
#     print(train_index, valid_index)
#     trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y.iloc[train_index], train_x.iloc[valid_index], train_y.iloc[valid_index]


# Stacking Ensembling

https://zhuanlan.zhihu.com/p/81220131

## Training separate classifiers

In [12]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y.iloc[train_index], train_x.iloc[valid_index], train_y.iloc[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': "gpu_hist", #'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            # https://stackoverflow.com/questions/55579610/xgboost-attributeerror-dataframe-object-has-no-attribute-feature-names
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(xgb.DMatrix(test_x) , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        ## 下面这个test，可以考虑像如下这样进行，或者是求平均亦可。
        # test += test_pred / kf.n_splits ## Minke's invention, no basis or reference.
        test += test_pred
        ##
        train[valid_index] = val_pred
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
        
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    print("what is kf.n_splits?", kf.n_splits)
    return train, test / kf.n_splits

In [13]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test

### LGB

In [None]:
lgb_train, lgb_test = lgb_model(x_train_small, y_train, x_test_small)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.747799	valid_1's auc: 0.734891
[400]	training's auc: 0.760757	valid_1's auc: 0.735722
Early stopping, best iteration is:
[392]	training's auc: 0.760396	valid_1's auc: 0.735735
[0.735735071101635]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.748844	valid_1's auc: 0.732017
[400]	training's auc: 0.761416	valid_1's auc: 0.732508
Early stopping, best iteration is:
[305]	training's auc: 0.75571	valid_1's auc: 0.732579
[0.735735071101635, 0.7325788268485839]
************************************ 3 ************************************
Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.747933	valid_1's auc: 0.735456
[400]	training's auc: 0.760537	valid_1's auc: 0.736203
[600]	training

### CAT

In [None]:
cat_train, cat_test = cat_model(x_train_small, y_train, x_test_small)

************************************ 1 ************************************
0:	learn: 0.3985414	test: 0.3966309	best: 0.3966309 (0)	total: 231ms	remaining: 1h 16m 50s
500:	learn: 0.3760707	test: 0.3747870	best: 0.3747870 (500)	total: 1m 12s	remaining: 47m 18s
1000:	learn: 0.3745442	test: 0.3741192	best: 0.3741192 (1000)	total: 2m 20s	remaining: 44m 24s
1500:	learn: 0.3734311	test: 0.3738159	best: 0.3738159 (1500)	total: 3m 27s	remaining: 42m 37s
2000:	learn: 0.3725252	test: 0.3736800	best: 0.3736799 (1983)	total: 4m 33s	remaining: 41m 4s
2500:	learn: 0.3717084	test: 0.3736113	best: 0.3736113 (2500)	total: 5m 41s	remaining: 39m 47s
3000:	learn: 0.3709331	test: 0.3735546	best: 0.3735546 (3000)	total: 6m 48s	remaining: 38m 32s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3735327145
bestIteration = 3182

Shrink model to first 3183 iterations.
[0.7362366190112378]
************************************ 2 ************************************
0:	learn: 0.3979542	test: 0.3

### XGBoost

At least for XGBoost, the `test_pred` will be probabilities, not the classification results. See https://blog.csdn.net/weixin_42320576/article/details/88414238 

In [14]:
xgb_train, xgb_test = xgb_model(x_train_small, y_train, x_test_small)

************************************ 1 ************************************
[0]	train-auc:0.695651	eval-auc:0.696424
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.726243	eval-auc:0.723101
[400]	train-auc:0.735759	eval-auc:0.728556
[600]	train-auc:0.741566	eval-auc:0.730737
[800]	train-auc:0.746227	eval-auc:0.731971
[1000]	train-auc:0.750178	eval-auc:0.732669
[1200]	train-auc:0.753772	eval-auc:0.733188
[1400]	train-auc:0.757131	eval-auc:0.733503
[1600]	train-auc:0.760268	eval-auc:0.733842
[1800]	train-auc:0.763354	eval-auc:0.73398
[2000]	train-auc:0.766289	eval-auc:0.73403
[2200]	train-auc:0.769025	eval-auc:0.734183
[2400]	train-auc:0.771768	eval-auc:0.734333
[2600]	train-auc:0.774361	eval-auc:0.734376
[2800]	train-auc:0.776962	eval-auc:0.734466
Stopping. Best iteration:
[2767]	train-auc:0.776514	eval-auc:0.734482

[0.734481616377555]
************************************ 2 *

## See results of individual model

In [None]:
testA_result = pd.read_csv('originalDataset/sample_submit.csv')
testA_result_pred = testA_result.copy()

In [None]:
testA_result_pred["isDefault"] = xgb_test
# testA_result_pred

In [None]:
testA_result_pred.to_csv("submissionResults/xgboost-1107-3.4.csv", index=False)

## Generating the stacked data sets

In [None]:
newStackingTrainingSet = pd.DataFrame(columns=["lgb", "cat", "xgb"])
newStackingTrainingSet["lgb"] = lgb_train
newStackingTrainingSet["cat"] = cat_train
newStackingTrainingSet["xgb"] = xgb_train
newStackingTrainingSet.head()

Unnamed: 0,lgb,cat,xgb
0,0.303128,0.261341,0.3185
1,0.294998,0.295826,0.263322
2,0.483363,0.442711,0.463407
3,0.077385,0.062399,0.062771
4,0.468531,0.384302,0.478617


In [None]:
newStackingTestSet = pd.DataFrame(columns=["lgb", "cat", "xgb"])
newStackingTestSet["lgb"] = lgb_test
newStackingTestSet["cat"] = cat_test
newStackingTestSet["xgb"] = xgb_test

In [None]:
newStackingTestSet.head()

Unnamed: 0,lgb,cat,xgb
0,0.158495,0.186207,0.229685
1,0.256325,0.264853,0.286334
2,0.21193,0.248287,0.230568
3,0.199787,0.217511,0.293853
4,0.304836,0.353438,0.399229


## Using the new data set to train the final model

In [None]:
xgb_train_final, xgb_test_final = xgb_model(newStackingTrainingSet, y_train, newStackingTestSet)

************************************ 1 ************************************
[0]	train-auc:0.734887	eval-auc:0.735389
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.737649	eval-auc:0.737626
Stopping. Best iteration:
[73]	train-auc:0.737197	eval-auc:0.737662

[0.7376617806789582]
************************************ 2 ************************************
[0]	train-auc:0.735759	eval-auc:0.732344
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.738446	eval-auc:0.734387
Stopping. Best iteration:
[35]	train-auc:0.737877	eval-auc:0.734477

[0.7376617806789582, 0.7344774043563302]
************************************ 3 ************************************
[0]	train-auc:0.734887	eval-auc:0.736044
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopp

# Generate submission result

In [None]:
testA_result = pd.read_csv('originalDataset/sample_submit.csv')
testA_result_pred = testA_result.copy()

In [None]:
# testA_result = pd.read_csv('originalDataset/sample_submit.csv')
# testA_result_pred = testA_result.copy()

In [None]:
testA_result_pred["isDefault"] = xgb_test_final
# testA_result_pred

In [None]:
testA_result_pred.to_csv("submissionResults/ensemble-lgb_cat_xgb-1105-3.csv", index=False)

In [None]:
newStackingTestSet.to_csv("preprocessedData/lgb_cat_xgb-test.csv", index=False)
newStackingTrainingSet.to_csv("preprocessedData/lgb_cat_xgb-train.csv", index=False)