In [1]:
import logging

import numpy as np
import toad
import pandas as pd
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

import config

In [2]:
# Notebook-wide logger that writes every record (DEBUG and above) to a stream handler.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter()
sh = logging.StreamHandler()
sh.setFormatter(formatter)
# Re-running this cell would otherwise stack one more handler per execution and
# print each message several times; remove leftovers so exactly one is attached.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
logger.addHandler(sh)

# Run-mode switches, both disabled. Presumably they gate EDA and grid-search
# code paths; they are not referenced in the visible cells -- TODO confirm.
is_eda = False
is_gridsearch = False

In [3]:
# Load the three CSV tables from the paths declared in the config module.
train_public, train_internet, test_public = (
    pd.read_csv(csv_path)
    for csv_path in (
        config.TRAIN_PUBLIC_PATH,
        config.TRAIN_INTERNET_PATH,
        config.TEST_PUBLIC_PATH,
    )
)

In [4]:
# Rename the label column of the internet table so it matches the name used
# elsewhere in this notebook ('isDefault').
train_internet = train_internet.rename({'is_default': 'isDefault'}, axis='columns')

In [5]:
# work_year is converted to an ordinal number first so that missing values can
# be imputed numerically afterwards.
def work_year_apply(x):
    """Map a work_year label to its ordinal code.

    '< 1 year' maps to 1, '1 year' to 2, ... and '10+ years' to 11.
    Any other value (including a missing value) yields None.
    """
    ordered_labels = (
        '< 1 year',
        '1 year',
        '2 years',
        '3 years',
        '4 years',
        '5 years',
        '6 years',
        '7 years',
        '8 years',
        '9 years',
        '10+ years',
    )
    for code, label in enumerate(ordered_labels, start=1):
        if x == label:
            return code
    return None

In [6]:
# Convert the work_year labels of every table to ordinal codes.
for frame in (train_public, train_internet, test_public):
    frame['work_year'] = frame['work_year'].apply(work_year_apply)

# Impute the internet and test tables with their own median here;
# train_public is imputed later, after the shared columns have been selected.
for frame in (train_internet, test_public):
    frame['work_year'] = frame['work_year'].fillna(frame['work_year'].median())

In [7]:
# Feature engineering
## Columns present in both training tables, in train_public's column order.
common_cols = [col for col in train_public.columns if col in train_internet.columns]
logger.info(len(common_cols))

## Keep only the shared columns.
train_public_less = train_public.loc[:, common_cols]
train_internet_less = train_internet.loc[:, common_cols]
# The test table has no label. Remove it by name instead of slicing off the
# last entry, which would only be right if 'isDefault' happened to be the
# final shared column.
shared_feature_cols = [col for col in common_cols if col != 'isDefault']
test_public_less = test_public.loc[:, shared_feature_cols]

36


In [8]:
# Impute the missing values of train_public_less, column by column.
# work_year: median
public_work_year_median = train_public_less['work_year'].median()
train_public_less['work_year'] = train_public_less['work_year'].fillna(public_work_year_median)
# pub_dero_bankrup: most frequent value
public_bankrup_mode = train_public_less['pub_dero_bankrup'].mode()[0]
train_public_less['pub_dero_bankrup'] = train_public_less['pub_dero_bankrup'].fillna(public_bankrup_mode)
# f0-f4: mean of the respective column
for col in ('f0', 'f1', 'f2', 'f3', 'f4'):
    train_public_less[col] = train_public_less[col].fillna(train_public_less[col].mean())

In [9]:
# Impute the missing values of train_internet_less, column by column.
# Mean imputation: debt_loan_ratio, recircle_u and f0-f4.
for col in ('debt_loan_ratio', 'recircle_u', 'f0', 'f1', 'f2', 'f3', 'f4'):
    train_internet_less[col] = train_internet_less[col].fillna(train_internet_less[col].mean())
# Most-frequent-value imputation: pub_dero_bankrup, post_code and title.
for col in ('pub_dero_bankrup', 'post_code', 'title'):
    train_internet_less[col] = train_internet_less[col].fillna(train_internet_less[col].mode()[0])

In [10]:
# Impute the missing values of test_public_less using the test table's own statistics.
# work_year: median
test_work_year_median = test_public_less['work_year'].median()
test_public_less['work_year'] = test_public_less['work_year'].fillna(test_work_year_median)
# pub_dero_bankrup: most frequent value
test_bankrup_mode = test_public_less['pub_dero_bankrup'].mode()[0]
test_public_less['pub_dero_bankrup'] = test_public_less['pub_dero_bankrup'].fillna(test_bankrup_mode)
# f0-f4: mean of the respective column
for col in ('f0', 'f1', 'f2', 'f3', 'f4'):
    test_public_less[col] = test_public_less[col].fillna(test_public_less[col].mean())

In [11]:
# Encoding of categorical features
def class_apply(x):
    """Map a class grade 'A'..'G' to 1..7; every other value maps to 0."""
    for code, grade in enumerate(('A', 'B', 'C', 'D', 'E', 'F', 'G'), start=1):
        if x == grade:
            return code
    return 0


# def employer_type_apply(x):
#     if x == '普通企业':
#         return 1
#     elif x == '幼教与中小学校':
#         return 2
#     elif x == '高等教育机构':
#         return 3
#     elif x == '政府机构':
#         return 4
#     elif x == '上市企业':
#         return 5
#     elif x == '世界五百强':
#         return 6
#     else:
#         return 0



# def industry_apply(x):
#     if x == '金融业':
#         return 1
#     elif x == '公共服务、社会组织':
#         return 2
#     elif x == '文化和体育业':
#         return 3
#     elif x == '信息传输、软件和信息技术服务业':
#         return 4
#     elif x == '制造业':
#         return 5
#     elif x == '住宿和餐饮业':
#         return 6
#     elif x == '建筑业':
#         return 7
#     elif x == '电力、热力生产供应业':
#         return 8
#     elif x == '房地产业':
#         return 9
#     elif x == '交通运输、仓储和邮政业':
#         return 10
#     elif x == '批发和零售业':
#         return 11
#     elif x == '农、林、牧、渔业':
#         return 12
#     elif x == '采矿业':
#         return 13
#     elif x == '国际组织':
#         return 14
#     else:
#         return 0

In [12]:
# Encode the 'class' grade as an integer in all three tables.
# 'employer_type' and 'industry' are not mapped by hand here; they are
# label-encoded in a later cell.
for frame in (train_public_less, train_internet_less, test_public_less):
    frame['class'] = frame['class'].map(class_apply)

In [13]:
from tensorflow.keras.utils import to_categorical

In [14]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the remaining categorical columns. The encoder is fitted on
# train_public_less only and then applied to all three tables.
cat_cols = ['employer_type', 'industry']
for col in cat_cols:
    lbl = LabelEncoder().fit(train_public_less[col])
    for frame in (train_public_less, train_internet_less, test_public_less):
        frame[col] = lbl.transform(frame[col])

In [15]:
# One-hot encode employer_type and append the indicator columns to each table.
# Column i of the one-hot matrix corresponds to integer code i; the names below
# are assumed to be listed in the order of the label-encoded codes -- verify if
# the category set ever changes.
employer_type_names = ['上市企业','世界五百强','幼教与中小学校','政府机构','普通企业','高等教育机构']


def _employer_type_one_hot(frame):
    """Return the employer_type indicator columns for `frame`.

    The width is fixed to the number of names instead of being inferred from
    the largest code in `frame`, so a table that lacks the highest category
    still yields the same columns. The result reuses `frame`'s index so the
    subsequent join matches rows one-to-one.
    """
    encoded = to_categorical(frame['employer_type'], num_classes=len(employer_type_names))
    return pd.DataFrame(encoded, columns=employer_type_names, index=frame.index)


train_public_employer_type = _employer_type_one_hot(train_public_less)
train_public_less = train_public_less.join(train_public_employer_type)

train_internet_employer_type = _employer_type_one_hot(train_internet_less)
train_internet_less = train_internet_less.join(train_internet_employer_type)

test_public_employer_type = _employer_type_one_hot(test_public_less)
test_public_less = test_public_less.join(test_public_employer_type)

In [16]:
# One-hot encode industry and append the indicator columns (named 0..k-1) to each table.
# The industry encoder was fitted on train_public_less, so the codes in that
# table cover every class; use it to fix the one-hot width for all tables.
# Without this, a table missing the highest code would get fewer columns and
# the feature matrices would no longer line up.
n_industry_classes = int(train_public_less['industry'].max()) + 1


def _industry_one_hot(frame):
    """Return the industry indicator columns for `frame`, indexed like `frame`."""
    encoded = to_categorical(frame['industry'], num_classes=n_industry_classes)
    return pd.DataFrame(encoded, index=frame.index)


train_public_industry = _industry_one_hot(train_public_less)
train_public_less = train_public_less.join(train_public_industry)

train_internet_industry = _industry_one_hot(train_internet_less)
train_internet_less = train_internet_less.join(train_internet_industry)

test_public_industry = _industry_one_hot(test_public_less)
test_public_less = test_public_less.join(test_public_industry)

In [17]:
# Parse issue_date and derive year / month / day-of-week columns in every table.
for frame in (train_public_less, train_internet_less, test_public_less):
    parsed_issue_date = pd.to_datetime(frame['issue_date'])
    frame['issue_date'] = parsed_issue_date
    frame['issue_date_year'] = parsed_issue_date.dt.year
    frame['issue_date_month'] = parsed_issue_date.dt.month
    frame['issue_date_dayofweek'] = parsed_issue_date.dt.dayofweek

In [18]:
# Columns excluded from the model input: the raw categorical/date columns
# and the label.
drop_cols = ['employer_type'] + ['industry'] + ['issue_date'] + ['earlies_credit_mon'] + ['isDefault']

# Build the model data.
# One explicit feature list is applied to every table so that the three
# matrices are guaranteed to share column order and count; a table missing
# one of these columns fails here with a KeyError instead of silently
# producing a misaligned matrix.
model_feature_cols = [col for col in train_public_less.columns if col not in drop_cols]

X_train = train_public_less[model_feature_cols].values
y_train = train_public_less['isDefault'].values

X_internet = train_internet_less[model_feature_cols].values
y_internet = train_internet_less['isDefault'].values

X_test = test_public_less[model_feature_cols].values

In [19]:
# Display the columns that were excluded from the feature matrices.
drop_cols

['employer_type', 'industry', 'issue_date', 'earlies_credit_mon', 'isDefault']

In [20]:
# Display (rows, columns) of the public training table after feature engineering.
train_public_less.shape

(10000, 59)

In [21]:
# Display (rows, columns) of the internet training table after feature engineering.
train_internet_less.shape

(750000, 59)

In [26]:
# 5-fold stratified cross-validation on the public training data.
# Each fold trains on the public training part plus the whole internet table
# and is scored (ROC AUC) on the held-out public part.
skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
clf = LGBMClassifier(num_leaves=32, random_state=1, max_depth=6, n_estimators=100, metric='auc')
scores = []
count = 0
for count, (train_index, valid_index) in enumerate(skf.split(X_train, y_train), start=1):
    fold_features = np.concatenate([X_train[train_index], X_internet])
    fold_labels = np.concatenate([y_train[train_index], y_internet])
    clf.fit(fold_features, fold_labels, eval_metric='auc')
    fold_proba = clf.predict_proba(X_train[valid_index])[:, 1]
    score = roc_auc_score(y_train[valid_index], fold_proba)
    scores.append(score)
    logger.info(f'train k fold {count}:{score}')
scores = np.array(scores)
logger.info(scores)
logger.info(scores.mean())

train k fold 1:0.8867527329441391
train k fold 2:0.8685486063415752
train k fold 3:0.8748695200658065
train k fold 4:0.8826260503077098
train k fold 5:0.874551907371291
[0.88675273 0.86854861 0.87486952 0.88262605 0.87455191]
0.8774697634061044


In [27]:
# Stack the public and internet training data (public rows first) for the final fit.
X_final_train = np.concatenate((X_train, X_internet), axis=0)
y_final_train = np.concatenate((y_train, y_internet), axis=0)

In [28]:
# Fit the final model on the merged training data and score the test set.
final_model_params = dict(
    num_leaves=32,
    random_state=1,
    max_depth=6,
    n_estimators=100,
    metric='auc',
)
clf_ex = LGBMClassifier(**final_model_params)
clf_ex.fit(X_final_train, y_final_train, eval_metric='auc')
# Keep only the probability of the positive class (second column).
y_proba = clf_ex.predict_proba(X_test)[:, 1]

In [29]:
# Write the submission file: one row per test loan with its predicted default probability.
submission = pd.DataFrame({'id': test_public['loan_id'], 'isDefault': y_proba})
# `index` is documented as a bool; pass False explicitly instead of relying on
# None being falsy. The row index is not written in either case.
submission.to_csv('submission.csv', index=False)