In [1]:
import warnings
warnings.simplefilter('ignore')

import os
import re
import gc

import numpy as np
import pandas as pd
# Use fully-qualified option names: the short patterns 'max_columns' / 'max_rows'
# also match the 'styler.render.*' options on newer pandas, which makes
# set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)   # never truncate columns when displaying
pd.set_option('display.max_rows', 200)       # show up to 200 rows before truncating
pd.set_option('display.float_format', '{:.3f}'.format)  # render floats with 3 decimals

from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
from lightgbm import LGBMClassifier

In [2]:
import logging

# Notebook-level logger that writes every record (DEBUG and above) to the
# default stream using the default "%(message)s" layout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# getLogger returns the same object every time this cell runs, so only attach
# a handler when none is present; otherwise each re-run would add one more
# handler and every message would be printed several times.
if not logger.handlers:
    formatter = logging.Formatter()
    sh = logging.StreamHandler()
    sh.setFormatter(formatter)
    logger.addHandler(sh)

In [3]:
# Load the training split; its rows are the ones with a non-null `isDefault`
# after the train/test concat further down.
train_data = pd.read_csv('train_public.csv')

In [4]:
# Load the test split; later cells identify these rows by `isDefault` being null.
test_data = pd.read_csv('test_public.csv')

In [5]:
# Stack train and test so the feature engineering below is applied to both at once.
# NOTE: no ignore_index is passed, so the original row labels are kept and the
# index values of the two parts overlap.
data = pd.concat([train_data, test_data])

# Sanity check: total row/column count, then a peek at the last (test) rows.
print(data.shape)
data.tail()

(15000, 39)


Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,known_outstanding_loan,known_dero,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,app_type,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault
4995,1008856,208856,9454.545,5,12.015,183.47,C,政府机构,信息传输、软件和信息技术服务业,< 1 year,1,0,2011/1/1,0,48,10,14.9,0,568.636,710.795,11,0,0.0,5370.462,75.785,1,0,5-Jul,10595,1,,,,,,0,0,0.0,
4996,1016651,216651,5500.0,3,7.97,172.28,A,政府机构,房地产业,5 years,1,0,2017/5/1,0,31,2,12.16,0,796.364,995.455,5,0,0.0,6326.538,43.077,0,0,12-Apr,0,1,3.0,0.0,2.0,5.0,3.0,3,1564,0.0,
4997,1024140,224140,30545.455,3,8.9,889.09,A,上市企业,房地产业,10+ years,0,2,2013/12/1,0,464,37,21.55,0,710.0,769.167,17,0,0.0,63903.692,79.385,0,0,Oct-86,21735,1,8.0,0.0,17.0,20.0,14.0,2,5456,1510.892,
4998,1014316,214316,4090.909,3,6.03,152.18,A,政府机构,文化和体育业,10+ years,0,2,2012/9/1,0,566,30,13.222,0,709.091,768.182,7,0,0.0,2703.692,29.423,1,0,Mar-99,24272,1,1.0,0.0,6.0,10.0,10.0,3,223,41.169,
4999,1012946,212946,14727.273,3,7.97,563.81,A,普通企业,交通运输、仓储和邮政业,4 years,1,1,2017/6/1,0,724,2,16.74,0,622.727,778.409,7,0,0.0,41270.923,107.538,0,0,5-Jul,0,1,6.0,0.0,5.0,8.0,7.0,3,1561,360.231,


In [6]:
# Collapse the issue date into a sortable YYYYMM integer, then discard the raw column.
data['issue_date'] = pd.to_datetime(data['issue_date'])
issue_year, issue_month = data['issue_date'].dt.year, data['issue_date'].dt.month
data['issue_mon'] = issue_year * 100 + issue_month
data = data.drop(columns=['issue_date'])

# Loan grade is ordinal: 'A' -> 0 ... 'G' -> 6 (any other value becomes NaN).
data['class'] = data['class'].map(dict(zip('ABCDEFG', range(7))))

# Employer type has no natural order, so it only gets integer labels.
lbe = LabelEncoder().fit(data['employer_type'])
data['employer_type'] = lbe.transform(data['employer_type'])

In [7]:
# Integer-label the industry names; `lbe` is left bound to this fitted encoder.
lbe = LabelEncoder().fit(data['industry'])
data['industry'] = lbe.transform(data['industry'])

In [8]:
# Ordinal-encode employment length. Values that are not keys of the mapping
# (including missing values) come out of .map() as NaN and are then set to -1.
work_year_codes = {
    '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4,
    '5 years': 5, '6 years': 6, '7 years': 7, '8 years': 8, '9 years': 9,
    '10+ years': 10
}

# Assign the filled result back instead of calling fillna(inplace=True) on
# data['work_year']: that chained in-place call operates on a temporary object
# under pandas Copy-on-Write and would leave `data` unchanged.
# NOTE: this cell is not re-runnable -- already-encoded numbers are not keys of
# the mapping, so a second run would turn every value into -1.
data['work_year'] = data['work_year'].map(work_year_codes).fillna(-1)

In [9]:
# Month abbreviation -> month number; built once instead of on every call.
_MONTH_NUMBERS = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,  'may': 5,  'jun': 6,
                  'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}


def clean_mon(x):
    """Convert a month/two-digit-year string into a YYYYMM integer.

    The first run of digits is read as the year and the first run of letters
    as the month name, in whichever order they appear (e.g. 'Mar-99' and
    '5-Jul' are both accepted).

    Year rules:
      * below 22            -> 2000 + year
      * between 23 and 99   -> 1900 + year
      * anything else (22, 100 or more, or no digits at all) -> 2022

    Month rules:
      * recognised month name (case-insensitive, matched on its first three
        letters) -> 1..12
      * no letters, or an unrecognised name -> 0

    Parameters
    ----------
    x : str
        Raw value. Non-string input (for example NaN) is treated as having
        neither a year nor a month.

    Returns
    -------
    int
        year * 100 + month.
    """
    # Missing values arrive as float NaN; re.search would raise TypeError on them.
    if not isinstance(x, str):
        return 2022 * 100

    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    year_match = re.search(r'(\d+)', x)
    if year_match:
        year = int(year_match.group(1))
        if year < 22:
            year += 2000
        elif 22 < year < 100:
            year += 1900
        else:
            year = 2022
    else:
        year = 2022

    month_match = re.search(r'([a-zA-Z]+)', x)
    if month_match:
        # Unknown tokens fall back to 0, the same value used when no month is present.
        month = _MONTH_NUMBERS.get(month_match.group(1).lower()[:3], 0)
    else:
        month = 0

    return year * 100 + month

# Normalise the earliest-credit month strings to YYYYMM integers.
data['earlies_credit_mon'] = data['earlies_credit_mon'].apply(clean_mon)

In [10]:
# Idea for later: cross the categorical features with each other and
# add/subtract the numeric features.
amount_feas = ['issue_mon','recircle_b','interest','debt_loan_ratio','recircle_u','monthly_payment','total_loan','scoring_low','scoring_high','earlies_credit_mon']

# A column counts as categorical when it is neither an id nor the target and has
# at most 1000 distinct values (a missing value counts as one distinct value).
cat_feas = [
    col
    for col in data.columns
    if col not in ('loan_id', 'user_id', 'isDefault')
    and data[col].nunique(dropna=False) <= 1000
]

In [30]:
# Frequency encoding: for every categorical feature add `<feature>_count`, the
# share of rows (train and test together, missing values excluded) that carry
# the same value. Rows whose value is missing get NaN.
#
# value_counts(normalize=True) + map produces the same numbers as the previous
# groupby/size -> divide -> merge sequence, but avoids writing floats into an
# integer column through iloc, avoids one full-frame merge per feature, and can
# be re-run safely because each column is simply overwritten.

# The merge-based version left `data` with a fresh 0..n-1 index; keep that.
data = data.reset_index(drop=True)

for col in cat_feas:
    value_share = data[col].value_counts(normalize=True)
    data['{}_count'.format(col)] = data[col].map(value_share)

In [31]:
# Inspect the first rows, including the newly added `*_count` columns.
data.head()

Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,known_outstanding_loan,known_dero,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,app_type,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault,issue_mon,year_of_loan_count,class_count,employer_type_count,industry_count,work_year_count,house_exist_count,censor_status_count,use_count,post_code_count,region_count,del_in_18month_count,scoring_low_count,scoring_high_count,known_outstanding_loan_count,known_dero_count,pub_dero_bankrup_count,initial_list_status_count,app_type_count,earlies_credit_mon_count,policy_code_count,f0_count,f1_count,f2_count,f3_count,f4_count,early_return_count,issue_mon_count
0,1040418,240418,31818.182,3,11.466,1174.91,2,3,13,3.0,0,1,2,193,13,2.43,0,556.364,649.091,3,0,0.0,7734.231,91.8,0,0,200112,5,1,1.0,0.0,4.0,5.0,4.0,3,9927,0.0,0.0,201610,0.758,0.286,0.25,0.163,0.077,0.493,0.383,0.066,0.001,0.082,0.812,0.021,0.007,0.013,0.824,0.87,0.588,0.981,0.007,1.0,0.035,0.999,0.087,0.035,0.097,0.285,0.014
1,1025197,225197,28000.0,5,16.841,670.69,2,3,13,10.0,0,2,0,491,30,11.005,1,715.0,893.75,3,0,0.0,31329.0,54.8,1,0,199004,40642,1,7.0,0.0,4.0,45.0,22.0,0,0,0.0,0.0,201306,0.242,0.286,0.25,0.163,0.333,0.493,0.313,0.576,0.002,0.034,0.125,0.009,0.003,0.013,0.824,0.87,0.412,0.981,0.002,1.0,0.091,0.999,0.087,0.001,0.004,0.454,0.009
2,1009360,209360,17272.727,3,8.9,603.32,0,3,3,10.0,1,0,4,459,8,6.409,0,774.545,903.636,5,0,0.0,18514.0,57.692,1,0,199110,154,1,6.0,0.0,6.0,28.0,19.0,0,0,0.0,0.0,201401,0.758,0.172,0.25,0.108,0.333,0.401,0.304,0.218,0.002,0.145,0.812,0.011,0.003,0.045,0.824,0.87,0.412,0.981,0.002,1.0,0.117,0.999,0.077,0.012,0.009,0.454,0.011
3,1039708,239708,20000.0,3,4.788,602.3,0,1,10,6.0,0,1,0,157,8,9.205,0,750.0,875.0,3,0,0.0,20707.0,42.6,0,0,200106,0,1,5.0,0.0,10.0,15.0,9.0,0,0,0.0,0.0,201507,0.758,0.172,0.055,0.077,0.047,0.493,0.383,0.576,0.004,0.145,0.812,0.003,0.006,0.013,0.824,0.87,0.588,0.981,0.005,1.0,0.136,0.999,0.045,0.048,0.072,0.454,0.032
4,1027483,227483,15272.727,3,12.79,470.31,2,3,2,0.0,2,1,0,38,21,15.578,0,609.091,710.606,15,0,0.0,14016.154,30.462,0,0,200205,0,1,10.0,0.0,6.0,15.0,4.0,0,0,0.0,0.0,201607,0.758,0.286,0.25,0.082,0.076,0.106,0.383,0.576,0.006,0.065,0.812,0.021,0.006,0.044,0.824,0.87,0.588,0.981,0.006,1.0,0.034,0.999,0.077,0.048,0.097,0.454,0.014


In [32]:
def stat(df, df_merge, group_by, agg):
    """Aggregate `df` per group and left-join the statistics onto `df_merge`.

    Parameters
    ----------
    df : DataFrame
        Frame the statistics are computed from.
    df_merge : DataFrame
        Frame that receives the statistics; it is not modified, a merged
        copy is returned.
    group_by : list of str
        Key columns, used both for the grouping and for the join.
    agg : dict
        Maps a column name to a list of aggregation method names, as accepted
        by `DataFrameGroupBy.agg`.

    Returns
    -------
    DataFrame
        `df_merge` plus one column per (column, method) pair, named
        '<keys joined by _>_<column>_<method>'. Keys missing from `df`
        yield NaN.
    """
    grouped = df.groupby(group_by).agg(agg)

    # Flatten the (column, method) header into single-level names, in the same
    # order in which the aggregation spec lists them.
    key_prefix = '_'.join(group_by)
    grouped.columns = [
        '{}_{}_{}'.format(key_prefix, on, method)
        for on, methods in agg.items()
        for method in methods
    ]
    grouped = grouped.reset_index()

    merged = df_merge.merge(grouped, on=group_by, how='left')

    # Release the intermediate table before returning.
    del grouped
    gc.collect()

    return merged


def statis_feat(df_know, df_unknow):
    """Target-encode every feature in the module-level `cat_feas` list.

    For each feature, the mean of `isDefault` per feature value is computed on
    `df_know` and joined onto `df_unknow` as '<feature>_isDefault_mean'.

    Parameters
    ----------
    df_know : DataFrame
        Rows used to compute the per-value target means.
    df_unknow : DataFrame
        Rows that receive the encoded columns.

    Returns
    -------
    DataFrame
        `df_unknow` with one extra column per feature in `cat_feas`.
    """
    target_mean = {'isDefault': ['mean']}
    encoded = df_unknow
    for feature in tqdm(cat_feas):
        encoded = stat(df_know, encoded, [feature], target_mean)

    return encoded

In [33]:
# 5-fold out-of-fold target encoding.
# Each labelled row gets its `*_isDefault_mean` features from the other four
# folds only; the unlabelled (test) rows get theirs from all labelled rows.
seed = 2022
df_train = data[data['isDefault'].notna()]
df_test = data[data['isDefault'].isna()]

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
oof_parts = []
for train_index, val_index in kfold.split(df_train, df_train['isDefault']):
    # Statistics come from the training part of the fold, features go to the held-out part.
    oof_parts.append(
        statis_feat(df_train.iloc[train_index], df_train.iloc[val_index])
    )
    gc.collect()

df_stas_feat = pd.concat(oof_parts, axis=0)
df_test = statis_feat(df_train, df_test)

# Labelled rows (now in fold order) first, test rows after them.
data = pd.concat([df_stas_feat, df_test], axis=0)

del oof_parts, df_stas_feat, df_train, df_test
gc.collect()

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0

In [35]:
import toad

# Continuous columns to discretise, plus the target that supervises the binning.
chosen_cols = ['total_loan','interest','monthly_payment','debt_loan_ratio','scoring_low','scoring_high','recircle_b','recircle_u','early_return_amount','early_return_amount_3mon','isDefault']

# Only labelled rows can supervise the binning.
# NOTE(review): the bins are learned from every labelled row, including rows
# that later serve as validation folds.
box_train = data[data['isDefault'].notna()]

# Decision-tree binning with at least 5% of the samples in each bin. According
# to the original author's note, missing values are automatically placed in the
# best bin.
c = toad.transform.Combiner()
c.fit(box_train[chosen_cols], y='isDefault', method='dt', min_samples=0.05)

<toad.transform.Combiner at 0x25ac2bd8ca0>

In [36]:
# Apply the fitted binning only; no WOE encoding is done.
data_tmp = c.transform(data)

In [37]:
# Inspect the binned frame: the columns passed to the combiner now hold bin indices.
data_tmp.head(10)

Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,known_outstanding_loan,known_dero,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,app_type,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault,issue_mon,year_of_loan_count,class_count,employer_type_count,industry_count,work_year_count,house_exist_count,censor_status_count,use_count,post_code_count,region_count,del_in_18month_count,scoring_low_count,scoring_high_count,known_outstanding_loan_count,known_dero_count,pub_dero_bankrup_count,initial_list_status_count,app_type_count,earlies_credit_mon_count,policy_code_count,f0_count,f1_count,f2_count,f3_count,f4_count,early_return_count,issue_mon_count,year_of_loan_isDefault_mean,class_isDefault_mean,employer_type_isDefault_mean,industry_isDefault_mean,work_year_isDefault_mean,house_exist_isDefault_mean,censor_status_isDefault_mean,use_isDefault_mean,post_code_isDefault_mean,region_isDefault_mean,del_in_18month_isDefault_mean,scoring_low_isDefault_mean,scoring_high_isDefault_mean,known_outstanding_loan_isDefault_mean,known_dero_isDefault_mean,pub_dero_bankrup_isDefault_mean,initial_list_status_isDefault_mean,app_type_isDefault_mean,earlies_credit_mon_isDefault_mean,policy_code_isDefault_mean,f0_isDefault_mean,f1_isDefault_mean,f2_isDefault_mean,f3_isDefault_mean,f4_isDefault_mean,early_return_isDefault_mean,issue_mon_isDefault_mean
0,1040418,240418,14,3,5,14,2,3,13,3.0,0,1,2,193,13,0,0,1,1,3,0,0.0,4,14,0,0,200112,5,1,1.0,0.0,4.0,5.0,4.0,3,9,0,0.0,201610,0.758,0.286,0.25,0.163,0.077,0.493,0.383,0.066,0.001,0.082,0.812,0.021,0.007,0.013,0.824,0.87,0.588,0.981,0.007,1.0,0.035,0.999,0.087,0.035,0.097,0.285,0.014,0.143,0.193,0.179,0.153,0.186,0.145,0.174,0.142,0.167,0.158,0.166,0.257,0.25,0.156,0.16,0.163,0.166,0.168,0.154,0.168,0.167,0.169,0.168,0.136,0.172,0.026,0.139
1,1027483,227483,9,3,7,9,2,3,2,0.0,2,1,0,38,21,6,0,3,4,15,0,0.0,8,3,0,0,200205,0,1,10.0,0.0,6.0,15.0,4.0,0,0,0,0.0,201607,0.758,0.286,0.25,0.082,0.076,0.106,0.383,0.576,0.006,0.065,0.812,0.021,0.006,0.044,0.824,0.87,0.588,0.981,0.006,1.0,0.034,0.999,0.077,0.048,0.097,0.454,0.014,0.143,0.193,0.179,0.192,0.147,0.182,0.174,0.179,0.143,0.177,0.166,0.251,0.292,0.188,0.16,0.163,0.166,0.168,0.133,0.168,0.2,0.169,0.176,0.152,0.172,0.321,0.273
2,1024822,224822,6,3,5,8,1,4,5,2.0,1,1,4,242,8,0,0,7,11,4,0,0.0,2,4,0,0,200810,4,1,4.0,0.0,0.0,9.0,9.0,0,0,0,0.0,201703,0.758,0.294,0.458,0.029,0.087,0.401,0.383,0.218,0.004,0.145,0.812,0.017,0.005,0.028,0.824,0.87,0.588,0.981,0.002,1.0,0.156,0.999,0.031,0.061,0.072,0.454,0.013,0.143,0.118,0.165,0.171,0.17,0.194,0.174,0.153,0.088,0.164,0.166,0.192,0.146,0.127,0.16,0.163,0.166,0.168,0.176,0.168,0.158,0.169,0.172,0.18,0.148,0.321,0.146
3,1045305,245305,6,3,9,6,2,2,1,4.0,1,1,0,263,0,11,0,2,4,8,0,0.0,1,14,0,0,200112,37768,1,3.0,0.0,15.0,6.0,6.0,0,0,0,0.0,201307,0.758,0.286,0.104,0.089,0.059,0.401,0.383,0.576,0.002,0.035,0.812,0.015,0.005,0.086,0.824,0.87,0.588,0.981,0.007,1.0,0.135,0.999,0.02,0.047,0.1,0.454,0.009,0.143,0.193,0.161,0.161,0.166,0.194,0.174,0.179,0.125,0.171,0.166,0.148,0.18,0.182,0.16,0.163,0.166,0.168,0.154,0.168,0.147,0.169,0.219,0.176,0.153,0.321,0.114
4,1010877,210877,14,5,7,13,2,0,7,3.0,0,2,0,19,14,14,0,12,16,17,0,0.0,1,0,0,0,199512,0,1,3.0,0.0,10.0,18.0,6.0,0,0,0,1.0,201504,0.242,0.286,0.097,0.068,0.077,0.493,0.313,0.576,0.012,0.08,0.812,0.001,0.0,0.031,0.824,0.87,0.588,0.981,0.003,1.0,0.135,0.999,0.045,0.034,0.1,0.454,0.023,0.247,0.193,0.167,0.177,0.186,0.145,0.2,0.179,0.119,0.151,0.166,0.0,0.0,0.193,0.16,0.163,0.166,0.168,0.032,0.168,0.147,0.169,0.128,0.19,0.153,0.321,0.162
5,1025265,225265,13,3,0,14,0,2,7,10.0,0,1,0,54,28,14,0,12,15,13,0,0.0,11,3,1,0,199812,0,1,6.0,0.0,9.0,10.0,6.0,0,0,0,0.0,201804,0.758,0.172,0.104,0.068,0.333,0.493,0.383,0.576,0.003,0.005,0.812,0.001,0.0,0.063,0.824,0.87,0.412,0.981,0.004,1.0,0.117,0.999,0.056,0.055,0.1,0.454,0.004,0.143,0.058,0.161,0.177,0.152,0.145,0.174,0.179,0.103,0.1,0.166,0.0,0.0,0.179,0.16,0.163,0.171,0.168,0.1,0.168,0.15,0.169,0.155,0.156,0.153,0.321,0.114
6,1000808,200808,14,5,8,12,2,3,10,8.0,0,1,3,340,14,4,0,2,2,10,1,0.0,9,12,1,0,198807,3,1,5.0,0.0,6.0,8.0,6.0,0,0,0,0.0,201702,0.242,0.286,0.25,0.077,0.045,0.493,0.383,0.022,0.002,0.08,0.812,0.015,0.005,0.085,0.147,0.87,0.412,0.981,0.001,1.0,0.136,0.999,0.077,0.057,0.1,0.454,0.011,0.247,0.193,0.179,0.168,0.144,0.145,0.174,0.131,0.105,0.151,0.166,0.17,0.205,0.143,0.19,0.163,0.171,0.168,0.2,0.168,0.168,0.169,0.176,0.165,0.153,0.321,0.18
7,1001157,201157,1,3,3,1,1,4,1,0.0,1,1,0,267,27,3,0,0,0,8,2,2.0,0,0,0,0,199611,0,1,3.0,0.0,5.0,15.0,6.0,1,3,4,0.0,201508,0.758,0.294,0.458,0.089,0.076,0.401,0.383,0.576,0.002,0.012,0.812,0.022,0.007,0.086,0.019,0.006,0.588,0.981,0.004,1.0,0.135,0.999,0.084,0.048,0.1,0.107,0.023,0.143,0.118,0.165,0.161,0.147,0.194,0.174,0.179,0.182,0.121,0.166,0.217,0.268,0.182,0.333,0.432,0.166,0.168,0.233,0.168,0.147,0.169,0.182,0.152,0.153,0.018,0.166
8,1046115,246115,3,3,1,3,0,4,3,2.0,1,0,2,167,8,3,1,10,15,18,0,0.0,3,0,0,0,200108,5,1,8.0,0.0,7.0,23.0,17.0,3,3,3,0.0,201506,0.758,0.172,0.458,0.108,0.087,0.401,0.304,0.066,0.008,0.145,0.125,0.02,0.006,0.024,0.824,0.87,0.588,0.981,0.007,1.0,0.069,0.999,0.069,0.019,0.013,0.285,0.02,0.143,0.058,0.165,0.181,0.17,0.194,0.128,0.142,0.179,0.164,0.175,0.17,0.17,0.22,0.16,0.163,0.166,0.168,0.175,0.168,0.189,0.169,0.15,0.143,0.139,0.026,0.163
9,1000323,200323,9,3,0,8,0,3,11,10.0,0,0,2,230,2,3,0,9,15,12,0,0.0,7,6,0,0,200210,5,1,6.0,0.0,4.0,12.0,4.0,3,6,5,0.0,201509,0.758,0.172,0.25,0.124,0.333,0.493,0.304,0.066,0.005,0.04,0.812,0.007,0.002,0.07,0.824,0.87,0.588,0.981,0.005,1.0,0.117,0.999,0.087,0.056,0.097,0.285,0.016,0.143,0.058,0.179,0.142,0.152,0.145,0.128,0.142,0.154,0.161,0.166,0.13,0.071,0.193,0.16,0.163,0.166,0.168,0.125,0.168,0.15,0.169,0.168,0.181,0.172,0.026,0.162


In [16]:
# Model
# train = data[data['isDefault'].notna()]
# test  = data[data['isDefault'].isna()]

# # Initialise
# transer = toad.transform.WOETransformer()

# to_drop = ['loan_id']

# # Use combiner.transform() and transer.fit_transform() to convert the training data, dropping the target column
# transer.fit_transform(train, 'isDefault', exclude=to_drop)
# train_woe = transer.transform(c.transform(train))
# test_woe = transer.transform(c.transform(test))

In [22]:
# Run stepwise regression on the WOE-transformed data

# final_data = toad.selection.stepwise(train_woe, target = 'isDefault', estimator='ols', direction = 'both', criterion = 'aic', exclude = to_drop)

# Apply the selected variables to the test/OOT data
# final_train = train_woe[final_data.columns]
# final_test = test_woe[final_data.columns]

In [16]:
# Set to True to run the (slow) exhaustive hyper-parameter search below.
is_gridsearch = False

# Model
# Build the masks from data_tmp's own target column so the split never depends
# on `data` and `data_tmp` having identical indexes.
train = data_tmp[data_tmp['isDefault'].notna()]
test = data_tmp[data_tmp['isDefault'].isna()]
print(train.shape)
print(test.shape)

ycol = 'isDefault'
# Everything except the target and the loan id is used as a feature.
feature_names = [col for col in train.columns if col not in (ycol, 'loan_id')]

# Cross-validated LightGBM grid search
if is_gridsearch:
    grid = [{
        'num_leaves': [10, 16, 25, 30],
        'max_depth': [3, 4, 5, 6],
        'reg_alpha': [0, 0.1, 0.2],
        'reg_lambda': [0, 0.2, 0.5, 1.0],
        'learning_rate': [0.01, 0.02, 0.05]
    }]
    score_detail = []
    best_score = 0

    # One fixed-seed splitter shared by all candidates, so every parameter set
    # is scored on the same folds. This splitter was previously created but
    # never used: the loop iterated over the global `kfold` left behind by
    # another cell.
    skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

    for param in ParameterGrid(grid):
        logger.info(param)
        param['random_state'] = 1
        scores = []
        for fold_id, (trn_idx, val_idx) in enumerate(skf.split(train[feature_names], train[ycol])):
            X_train = train.iloc[trn_idx][feature_names]
            Y_train = train.iloc[trn_idx][ycol]

            X_val = train.iloc[val_idx][feature_names]
            Y_val = train.iloc[val_idx][ycol]

            clf = LGBMClassifier(**param)
            print('\nFold_{} Training ================================\n'.format(fold_id+1))

            # NOTE(review): `verbose` and `early_stopping_rounds` are fit()
            # keyword arguments of older LightGBM releases; newer releases
            # expect callbacks instead -- confirm against the installed version.
            lgb_model = clf.fit(X_train,
                                Y_train,
                                eval_names=['train', 'valid'],
                                eval_set=[(X_train, Y_train), (X_val, Y_val)],
                                verbose=200,
                                eval_metric='auc',
                                early_stopping_rounds=500)

            y_proba = lgb_model.predict_proba(X_val)[:, 1]
            scores.append(roc_auc_score(Y_val, y_proba))

        scores = np.array(scores)
        mean_score = scores.mean()
        logger.info(scores)
        logger.info(mean_score)
        if mean_score > best_score:
            best_score = mean_score
        logger.info(f'best score {best_score}')
        score_detail.append([param, scores, mean_score])

    # Parameter set with the highest mean validation AUC.
    best_param = sorted(score_detail, key=lambda x: x[2])[-1][0]

(10000, 93)
(5000, 93)


In [25]:
# Model
# Build the masks from data_tmp's own target column so the split never depends
# on `data` and `data_tmp` having identical indexes.
train = data_tmp[data_tmp['isDefault'].notna()]
test = data_tmp[data_tmp['isDefault'].isna()]

ycol = 'isDefault'
# Everything except the target and the loan id is used as a feature.
# NOTE(review): `user_id` stays in the feature list -- confirm that is intended.
feature_names = [col for col in train.columns if col not in (ycol, 'loan_id')]

# NOTE(review): in LightGBM, row subsampling normally only takes effect when a
# bagging frequency (`subsample_freq`) is set as well; as written `subsample`
# may be a no-op -- confirm against the LightGBM parameter documentation.
model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           tree_learner='serial',
                           num_leaves=16,
                           max_depth=5,
                           learning_rate=0.02,
                           n_estimators=10000,
                           subsample=0.45,
                           feature_fraction=0.5,
                           reg_alpha=0.1,
                           reg_lambda=0.5,
                           random_state=2021,
                           is_unbalance=True,
                           num_thread=40,
                           metric='auc')

# .copy() gives `prediction` its own data; without it the column assignment
# below writes into a slice of `test` (SettingWithCopyWarning).
prediction = test[['loan_id']].copy()
prediction[ycol] = 0.0
df_importance_list = []  # kept for later cells; nothing in this cell appends to it
seeds = [2021]  # more seeds can be added later for extra robustness
for seed in seeds:
    # Let the model seed follow the fold seed so that additional seeds really
    # produce different models (a no-op for the default seeds=[2021]).
    model.set_params(random_state=seed)

    oof = []
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):
        X_train = train.iloc[trn_idx][feature_names]
        Y_train = train.iloc[trn_idx][ycol]

        X_val = train.iloc[val_idx][feature_names]
        Y_val = train.iloc[val_idx][ycol]

        print('\nFold_{} Training ================================\n'.format(fold_id+1))

        # fit() retrains the same estimator object from scratch on every fold.
        # NOTE(review): `verbose` and `early_stopping_rounds` are fit() keyword
        # arguments of older LightGBM releases; newer releases expect callbacks
        # instead -- confirm against the installed version.
        lgb_model = model.fit(X_train,
                              Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              verbose=200,
                              eval_metric='auc',
                              early_stopping_rounds=500)

        # Out-of-fold predictions, taken at the early-stopped iteration.
        pred_val = lgb_model.predict_proba(
            X_val, num_iteration=lgb_model.best_iteration_)
        df_oof = train.iloc[val_idx][['loan_id', ycol]].copy()
        df_oof['pred'] = pred_val[:, 1]
        oof.append(df_oof)

        # Test prediction = plain average over all folds and all seeds.
        pred_test = lgb_model.predict_proba(
            test[feature_names], num_iteration=lgb_model.best_iteration_)
        prediction[ycol] += pred_test[:, 1] / (kfold.n_splits*len(seeds))
        del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
        gc.collect()
    oof = pd.concat(oof)
    print('roc_auc_score:', roc_auc_score(oof['isDefault'], oof['pred']))



Training until validation scores don't improve for 500 rounds
[200]	train's auc: 0.92199	valid's auc: 0.882841
[400]	train's auc: 0.956015	valid's auc: 0.883426
[600]	train's auc: 0.975737	valid's auc: 0.882709
Early stopping, best iteration is:
[246]	train's auc: 0.93106	valid's auc: 0.884077


Training until validation scores don't improve for 500 rounds
[200]	train's auc: 0.923245	valid's auc: 0.880455
[400]	train's auc: 0.953359	valid's auc: 0.879817
[600]	train's auc: 0.973819	valid's auc: 0.880033
Early stopping, best iteration is:
[184]	train's auc: 0.920171	valid's auc: 0.880927


Training until validation scores don't improve for 500 rounds
[200]	train's auc: 0.923683	valid's auc: 0.883825
[400]	train's auc: 0.956343	valid's auc: 0.885451
[600]	train's auc: 0.974951	valid's auc: 0.88522
[800]	train's auc: 0.986119	valid's auc: 0.885699
[1000]	train's auc: 0.993037	valid's auc: 0.885954
[1200]	train's auc: 0.996435	valid's auc: 0.885561
[1400]	train's auc: 0.998275	valid's au

In [26]:
# Rename by name rather than overwriting the column list positionally: the
# rename cannot mislabel a column and is harmless if the cell is run again.
prediction = prediction.rename(columns={'loan_id': 'id'})
prediction.to_csv('submission.csv', index=False)

# Last expression of the cell, so the preview is actually displayed.
prediction.head()