In [1]:
import logging

import numpy as np
import toad
import pandas as pd
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

import config

In [2]:
# Notebook-level logger; DEBUG level so every logger.info call in later cells is emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter()
sh = logging.StreamHandler()
sh.setFormatter(formatter)
# logging.getLogger returns the same logger object each time this cell runs, so
# attaching a handler unconditionally would print every message once per re-run.
if not logger.handlers:
    logger.addHandler(sh)

# Switches for optional notebook stages (not referenced by the cells shown here).
is_eda = False
is_gridsearch = False

In [3]:
# Load the three raw tables from the paths declared in config
train_public, train_internet, test_public = (
    pd.read_csv(csv_path)
    for csv_path in (
        config.TRAIN_PUBLIC_PATH,
        config.TRAIN_INTERNET_PATH,
        config.TEST_PUBLIC_PATH,
    )
)

In [4]:
# Rename the label column of the internet table to 'isDefault'
train_internet = train_internet.rename(columns=dict(is_default='isDefault'))

In [5]:
# work_year is converted to an ordinal code first; missing values are imputed afterwards
def work_year_apply(x):
    """Map a raw ``work_year`` label to an ordinal code.

    '< 1 year' maps to 1, '1 year' to 2, and so on up to '10+ years', which
    maps to 11. Any value that equals none of the known labels (missing
    values included) yields None.
    """
    known_labels = (
        '< 1 year',
        '1 year',
        '2 years',
        '3 years',
        '4 years',
        '5 years',
        '6 years',
        '7 years',
        '8 years',
        '9 years',
        '10+ years',
    )
    # Codes start at 1, in the order the labels are listed above.
    for code, label in enumerate(known_labels, start=1):
        if x == label:
            return code
    return None

In [6]:
# Convert work_year labels to ordinal codes in all three raw tables
for frame in (train_public, train_internet, test_public):
    frame['work_year'] = frame['work_year'].apply(work_year_apply)

# Fill the remaining gaps with each table's own median
# (train_public is not filled in this cell)
for frame in (train_internet, test_public):
    frame['work_year'] = frame['work_year'].fillna(frame['work_year'].median())

In [7]:
# Feature engineering
## Columns present in both training tables, kept in train_public order
common_cols = [col for col in train_public.columns if col in train_internet.columns]
logger.info(len(common_cols))

## Keep only the shared columns. .copy() gives each *_less table its own data,
## so the column assignments in later cells never write to a slice of the raw table.
train_public_less = train_public.loc[:, common_cols].copy()
train_internet_less = train_internet.loc[:, common_cols].copy()
## The test table has no label column: exclude 'isDefault' by name instead of
## assuming it is the last entry of common_cols.
test_public_less = test_public.loc[:, [col for col in common_cols if col != 'isDefault']].copy()

36


In [8]:
# Fill the missing values of train_public_less
# work_year: median
train_public_less['work_year'] = train_public_less['work_year'].fillna(
    train_public_less['work_year'].median()
)
# pub_dero_bankrup: mode
train_public_less['pub_dero_bankrup'] = train_public_less['pub_dero_bankrup'].fillna(
    train_public_less['pub_dero_bankrup'].mode()[0]
)
# f0-f4: mean of each column
for col in ('f0', 'f1', 'f2', 'f3', 'f4'):
    train_public_less[col] = train_public_less[col].fillna(train_public_less[col].mean())

In [9]:
# Fill the missing values of train_internet_less
# debt_loan_ratio, recircle_u and f0-f4: mean of each column
for col in ('debt_loan_ratio', 'recircle_u', 'f0', 'f1', 'f2', 'f3', 'f4'):
    train_internet_less[col] = train_internet_less[col].fillna(train_internet_less[col].mean())
# pub_dero_bankrup, post_code and title: mode of each column
for col in ('pub_dero_bankrup', 'post_code', 'title'):
    train_internet_less[col] = train_internet_less[col].fillna(train_internet_less[col].mode()[0])

In [10]:
# Fill the missing values of test_public_less
# work_year: median
test_public_less['work_year'] = test_public_less['work_year'].fillna(
    test_public_less['work_year'].median()
)
# pub_dero_bankrup: mode
test_public_less['pub_dero_bankrup'] = test_public_less['pub_dero_bankrup'].fillna(
    test_public_less['pub_dero_bankrup'].mode()[0]
)
# f0-f4: mean of each column
for col in ('f0', 'f1', 'f2', 'f3', 'f4'):
    test_public_less[col] = test_public_less[col].fillna(test_public_less[col].mean())

In [11]:
# Categorical feature handling
def class_apply(x):
    """Map a ``class`` grade letter to an integer code.

    'A' maps to 1, 'B' to 2, ... 'G' to 7. Any value that equals none of these
    letters yields 0.
    """
    for code, grade in enumerate('ABCDEFG', start=1):
        if x == grade:
            return code
    return 0

In [12]:
# Encode the 'class' grade as an integer in all three tables
# ('employer_type' and 'industry' are not touched by this cell)
for frame in (train_public_less, train_internet_less, test_public_less):
    frame['class'] = frame['class'].apply(class_apply)

In [13]:
from tensorflow.keras.utils import to_categorical

In [14]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['employer_type', 'industry']
# One encoder per column, fitted on train_public_less only and reused for the
# other two tables.
# NOTE(review): presumably a value absent from train_public_less makes
# transform fail on the other tables — confirm against the data.
for col in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(train_public_less[col])
    for frame in (train_public_less, train_internet_less, test_public_less):
        frame[col] = lbl.transform(frame[col])

In [15]:
# One-hot encode employer_type and append the indicator columns to each table.
# The names are listed in the order of the integer codes stored in 'employer_type'.
employer_type_names = ['上市企业','世界五百强','幼教与中小学校','政府机构','普通企业','高等教育机构']


def _employer_type_onehot(frame):
    """Return the one-hot indicator columns for ``frame['employer_type']``.

    num_classes is fixed to the number of names so every table gets the same
    six columns even when its largest code is absent (to_categorical would
    otherwise size the output from the largest code it sees). The result
    carries the index of ``frame`` so the subsequent join aligns row by row.
    """
    onehot = to_categorical(frame['employer_type'], num_classes=len(employer_type_names))
    return pd.DataFrame(onehot, columns=employer_type_names, index=frame.index)


train_public_employer_type = _employer_type_onehot(train_public_less)
train_public_less = train_public_less.join(train_public_employer_type)

train_internet_employer_type = _employer_type_onehot(train_internet_less)
train_internet_less = train_internet_less.join(train_internet_employer_type)

test_public_employer_type = _employer_type_onehot(test_public_less)
test_public_less = test_public_less.join(test_public_employer_type)

In [16]:
# Parse issue_date once per table, then derive year / month / day-of-week from it
for frame in (train_public_less, train_internet_less, test_public_less):
    frame['issue_date'] = pd.to_datetime(frame['issue_date'])
    issue_dt = frame['issue_date'].dt
    frame['issue_date_year'] = issue_dt.year
    frame['issue_date_month'] = issue_dt.month
    frame['issue_date_dayofweek'] = issue_dt.dayofweek

# The raw date columns are removed once the derived features exist
raw_date_cols = ['issue_date', 'earlies_credit_mon']
train_public_less = train_public_less.drop(columns=raw_date_cols)
train_internet_less = train_internet_less.drop(columns=raw_date_cols)
test_public_less = test_public_less.drop(columns=raw_date_cols)

In [17]:
# initialise
c = toad.transform.Combiner()

# Both training tables stacked together are used to fit the bins
box_train = pd.concat([train_public_less, train_internet_less])

# Columns left out of the binning fit
to_drop = [
    'loan_id', 'user_id', 'employer_type', 'industry', 'censor_status',
    'use', 'post_code', 'region', 'initial_list_status', 'title', 'policy_code',
    '上市企业', '世界五百强', '幼教与中小学校', '政府机构', '普通企业', '高等教育机构',
    'issue_date_year', 'issue_date_month', 'issue_date_dayofweek',
]

# Fit on the filtered features: decision-tree binning, each bin must hold at
# least 5% of the data; nulls are automatically assigned to the best bin.
# Chi-square binning seems very slow?
c.fit(box_train.drop(columns=to_drop), y='isDefault', method='dt', min_samples=0.05)

<toad.transform.Combiner at 0x231cd40b4f0>

In [18]:
# Apply the fitted Combiner `c` to the test table.
# NOTE(review): labels=False presumably yields bin indices rather than interval labels — confirm against the toad docs.
test_public_less = c.transform(test_public_less, labels=False)

In [19]:
# Inspect the column layout of the public training table before it is binned
train_public_less.columns

Index(['loan_id', 'user_id', 'total_loan', 'year_of_loan', 'interest',
       'monthly_payment', 'class', 'employer_type', 'industry', 'work_year',
       'house_exist', 'censor_status', 'use', 'post_code', 'region',
       'debt_loan_ratio', 'del_in_18month', 'scoring_low', 'scoring_high',
       'pub_dero_bankrup', 'recircle_b', 'recircle_u', 'initial_list_status',
       'title', 'policy_code', 'f0', 'f1', 'f2', 'f3', 'f4', 'early_return',
       'early_return_amount', 'early_return_amount_3mon', 'isDefault', '上市企业',
       '世界五百强', '幼教与中小学校', '政府机构', '普通企业', '高等教育机构', 'issue_date_year',
       'issue_date_month', 'issue_date_dayofweek'],
      dtype='object')

In [20]:
# Apply the fitted Combiner `c` to both training tables
train_public_less, train_internet_less = (
    c.transform(frame, labels=False)
    for frame in (train_public_less, train_internet_less)
)

In [21]:
# Display the public training table after the Combiner transform.
# NOTE(review): this renders the whole frame; .head() would keep the output short.
train_public_less

Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,...,isDefault,上市企业,世界五百强,幼教与中小学校,政府机构,普通企业,高等教育机构,issue_date_year,issue_date_month,issue_date_dayofweek
0,1040418,240418,12,0,6,14,2,3,13,3,...,0,0.0,0.0,0.0,1.0,0.0,0.0,2016,10,5
1,1025197,225197,11,1,12,12,2,3,13,8,...,0,0.0,0.0,0.0,1.0,0.0,0.0,2013,6,5
2,1009360,209360,9,0,3,11,0,3,3,8,...,0,0.0,0.0,0.0,1.0,0.0,0.0,2014,1,2
3,1039708,239708,10,0,0,11,0,1,10,6,...,0,0.0,1.0,0.0,0.0,0.0,0.0,2015,7,2
4,1027483,227483,8,0,8,9,2,3,2,0,...,0,0.0,0.0,0.0,1.0,0.0,0.0,2016,7,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1028093,228093,9,0,11,10,1,4,7,7,...,0,0.0,0.0,0.0,0.0,1.0,0.0,2013,11,4
9996,1043911,243911,6,0,0,9,0,3,4,2,...,0,0.0,0.0,0.0,1.0,0.0,0.0,2015,12,1
9997,1023503,223503,11,0,10,12,1,4,2,8,...,0,0.0,0.0,0.0,0.0,1.0,0.0,2012,12,5
9998,1024616,224616,10,0,13,12,3,3,4,8,...,0,0.0,0.0,0.0,1.0,0.0,0.0,2018,3,3


In [22]:
# Column layout after the Combiner transform
train_public_less.columns

Index(['loan_id', 'user_id', 'total_loan', 'year_of_loan', 'interest',
       'monthly_payment', 'class', 'employer_type', 'industry', 'work_year',
       'house_exist', 'censor_status', 'use', 'post_code', 'region',
       'debt_loan_ratio', 'del_in_18month', 'scoring_low', 'scoring_high',
       'pub_dero_bankrup', 'recircle_b', 'recircle_u', 'initial_list_status',
       'title', 'policy_code', 'f0', 'f1', 'f2', 'f3', 'f4', 'early_return',
       'early_return_amount', 'early_return_amount_3mon', 'isDefault', '上市企业',
       '世界五百强', '幼教与中小学校', '政府机构', '普通企业', '高等教育机构', 'issue_date_year',
       'issue_date_month', 'issue_date_dayofweek'],
      dtype='object')

In [23]:
# Drop the columns that are not model inputs
# drop_cols = list(train_public_less.select_dtypes('object').columns) + ['isDefault']
drop_cols = ['employer_type'] + ['isDefault']

# Fix the feature list once, from the public training table, and select by name
# in every table. The matrices below are plain arrays, so nothing downstream
# could detect a table whose columns were in a different order; selecting by
# name also raises a KeyError if a table lacks one of the features.
feature_cols = list(train_public_less.columns.drop(drop_cols))

# Build the model inputs
X_train = train_public_less[feature_cols].values
y_train = train_public_less['isDefault'].values

X_internet = train_internet_less[feature_cols].values
y_internet = train_internet_less['isDefault'].values

X_test = test_public_less[feature_cols].values

In [24]:
# Show the columns excluded from the model input
drop_cols

['employer_type', 'isDefault']

In [25]:
# 5-fold stratified split of the public training data; the internet data is
# appended in full to the training part of every fold, and each fold is scored
# on its held-out public rows only.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
scores = []
clf = LGBMClassifier(num_leaves=32, random_state=1, max_depth=5, n_estimators=100, metric='auc')
# Both tables are used together
for count, (train_index, valid_index) in enumerate(skf.split(X_train, y_train), start=1):
    X_less, y_less = X_train[valid_index], y_train[valid_index]
    X_more_internet = np.concatenate([X_train[train_index], X_internet])
    y_more_internet = np.concatenate([y_train[train_index], y_internet])
    clf.fit(X_more_internet, y_more_internet, eval_metric='auc')
    y_proba = clf.predict_proba(X_less)[:, 1]
    score = roc_auc_score(y_less, y_proba)
    scores.append(score)
    logger.info(f'train k fold {count}:{score}')
scores = np.array(scores)
logger.info(scores)
logger.info(scores.mean())

train k fold 1:0.8826872996794871
train k fold 2:0.8742416437728938
train k fold 3:0.8764897016760314
train k fold 4:0.8806739812751258
train k fold 5:0.87631483625995
[0.8826873  0.87424164 0.8764897  0.88067398 0.87631484]
0.8780814925326975


In [26]:
# Merge the two training sets (public rows first, internet rows after)
X_final_train = np.concatenate((X_train, X_internet), axis=0)
y_final_train = np.concatenate((y_train, y_internet), axis=0)

In [27]:
# Test set: fit on all labelled rows, then predict the default probability
lgbm_params = dict(num_leaves=32, random_state=1, max_depth=5, n_estimators=100, metric='auc')
clf_ex = LGBMClassifier(**lgbm_params)
clf_ex.fit(X_final_train, y_final_train, eval_metric='auc')
# Column 1 of predict_proba is kept as the score
y_proba = clf_ex.predict_proba(X_test)[:, 1]

In [28]:
# Write the predicted probabilities, keyed by the test table's loan_id
submission = pd.DataFrame({'id': test_public['loan_id'], 'isDefault': y_proba})
# index is documented as a bool; False states explicitly that no row index is written
submission.to_csv('submission.csv', index=False)

In [29]:
# Feature importances reported by the final model (clf_ex)
clf_ex.feature_importances_

array([199,  54, 102,  80, 228,  89, 102,  11,  78, 107,  29,  40, 105,
       124, 141,  16, 114,  20,  15, 131,  49,   1,  76,   0, 131,  30,
       119,  84,  48,  15,  90,  17,  11,  31,  13,  78,  68,  50, 245,
       104,  13])

In [43]:
# NOTE(review): this cell's execution count is out of sequence, and its saved output lists
# 'issue_date', 'earlies_credit_mon' and 'stable_factor', which the cells shown here drop or
# never create — the output looks stale; re-run after Restart & Run All.
train_public_less.columns

Index(['loan_id', 'user_id', 'total_loan', 'year_of_loan', 'interest',
       'monthly_payment', 'class', 'employer_type', 'industry', 'work_year',
       'house_exist', 'censor_status', 'issue_date', 'use', 'post_code',
       'region', 'debt_loan_ratio', 'del_in_18month', 'scoring_low',
       'scoring_high', 'pub_dero_bankrup', 'recircle_b', 'recircle_u',
       'initial_list_status', 'earlies_credit_mon', 'title', 'policy_code',
       'f0', 'f1', 'f2', 'f3', 'f4', 'early_return', 'early_return_amount',
       'early_return_amount_3mon', 'isDefault', '上市企业', '世界五百强', '幼教与中小学校',
       '政府机构', '普通企业', '高等教育机构', 'issue_date_year', 'issue_date_month',
       'issue_date_dayofweek', 'stable_factor'],
      dtype='object')