In [1]:
import logging

import numpy as np
import toad
import pandas as pd
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

import config

In [2]:
# Notebook-wide logger: DEBUG level, messages echoed through a stream handler.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter()
sh = logging.StreamHandler()
sh.setFormatter(formatter)
# getLogger() returns the same logger object every time this cell runs, so a
# plain addHandler() would stack one more handler per re-run and print every
# message several times. Drop whatever is attached before adding the new one.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
logger.addHandler(sh)

# Feature flags for the optional (slow) sections of the notebook.
is_eda = False
is_gridsearch = False

In [3]:
# Load the three CSV files whose locations are defined in the config module.
train_public, train_internet, test_public = (
    pd.read_csv(csv_path)
    for csv_path in (
        config.TRAIN_PUBLIC_PATH,
        config.TRAIN_INTERNET_PATH,
        config.TEST_PUBLIC_PATH,
    )
)

In [4]:
# Rename the label column of the internet data to 'isDefault', the label name
# used everywhere else in this notebook.
label_rename = {'is_default': 'isDefault'}
train_internet = train_internet.rename(columns=label_rename)

In [5]:
# Feature engineering
## Find the columns present in both training sets (order follows train_public).
common_cols = [col for col in train_public.columns if col in train_internet.columns]
logger.info(len(common_cols))

## Keep only the shared columns.
# .copy() makes each frame independent of the raw data: later cells assign
# encoded columns into these frames, which should not act on a slice of the
# original DataFrame.
train_public_less = train_public.loc[:, common_cols].copy()
train_internet_less = train_internet.loc[:, common_cols].copy()
# The test frame gets every shared column except the label. Exclude the label
# by name instead of relying on it being the last entry of common_cols.
feature_cols = [col for col in common_cols if col != 'isDefault']
test_public_less = test_public.loc[:, feature_cols].copy()

36


In [6]:
# Optional exploratory pass (enabled with is_eda): run toad's detect() and
# quality() on both reduced training frames, using 'isDefault' as the target
# for quality().
if is_eda:
    train_public_detect, train_internet_detect = (
        toad.detect(frame) for frame in (train_public_less, train_internet_less)
    )
    train_public_quality, train_internet_quality = (
        toad.quality(frame, 'isDefault')
        for frame in (train_public_less, train_internet_less)
    )

In [7]:
# Encoders for the categorical features
# Lookup table for the `class` column: 'A' -> 1, 'B' -> 2, ..., 'G' -> 7.
_CLASS_CODES = {letter: code for code, letter in enumerate('ABCDEFG', start=1)}


def class_apply(x):
    """Encode a `class` value as an integer.

    Args:
        x: Raw cell value; expected to be one of the letters 'A'..'G'.

    Returns:
        int: 1..7 for 'A'..'G', and 0 for any other value (including
        missing values).
    """
    return _CLASS_CODES.get(x, 0)


# Known employer types, in code order (code = position + 1).
_EMPLOYER_TYPES = (
    '普通企业',        # ordinary enterprise
    '幼教与中小学校',  # preschool / primary / secondary school
    '高等教育机构',    # higher-education institution
    '政府机构',        # government agency
    '上市企业',        # listed company
    '世界五百强',      # Fortune Global 500
)
_EMPLOYER_TYPE_CODES = {
    label: code for code, label in enumerate(_EMPLOYER_TYPES, start=1)
}


def employer_type_apply(x):
    """Encode an `employer_type` value as an integer.

    Args:
        x: Raw cell value; expected to be one of the six known labels.

    Returns:
        int: 1..6 for a known label (in the order listed in
        ``_EMPLOYER_TYPES``), and 0 for any other value.
    """
    return _EMPLOYER_TYPE_CODES.get(x, 0)


# Known industries, in code order (code = position + 1).
_INDUSTRIES = (
    '金融业',
    '公共服务、社会组织',
    '文化和体育业',
    '信息传输、软件和信息技术服务业',
    '制造业',
    '住宿和餐饮业',
    '建筑业',
    '电力、热力生产供应业',
    '房地产业',
    '交通运输、仓储和邮政业',
    '批发和零售业',
    '农、林、牧、渔业',
    '采矿业',
    '国际组织',
)
_INDUSTRY_CODES = {label: code for code, label in enumerate(_INDUSTRIES, start=1)}


def industry_apply(x):
    """Encode an `industry` value as an integer.

    Args:
        x: Raw cell value; expected to be one of the fourteen known labels.

    Returns:
        int: 1..14 for a known label (in the order listed in
        ``_INDUSTRIES``), and 0 for any other value.
    """
    return _INDUSTRY_CODES.get(x, 0)


# Known work-year buckets, shortest first (code = position + 1).
_WORK_YEAR_LABELS = (
    '< 1 year',
    '1 year',
    '2 years',
    '3 years',
    '4 years',
    '5 years',
    '6 years',
    '7 years',
    '8 years',
    '9 years',
    '10+ years',
)
_WORK_YEAR_CODES = {
    label: code for code, label in enumerate(_WORK_YEAR_LABELS, start=1)
}


def work_year_apply(x):
    """Encode a `work_year` value as an integer.

    Args:
        x: Raw cell value; expected to be one of the labels from
            '< 1 year' to '10+ years'.

    Returns:
        int: 1 for '< 1 year', 2 for '1 year', ... 11 for '10+ years',
        and 0 for any other value (including missing values).
    """
    return _WORK_YEAR_CODES.get(x, 0)

In [8]:
# Replace the four categorical columns with their integer codes, in each of
# the three frames. Column order and frame order match the encoders' use below.
categorical_encoders = {
    'class': class_apply,
    'employer_type': employer_type_apply,
    'industry': industry_apply,
    'work_year': work_year_apply,
}
for frame in (train_public_less, train_internet_less, test_public_less):
    for col_name, encoder in categorical_encoders.items():
        frame[col_name] = frame[col_name].apply(encoder)

In [9]:
# Columns still of object dtype in the public training frame are excluded
# from the feature matrices; the label column is excluded as well.
object_cols = list(train_public_less.select_dtypes('object').columns)
drop_cols = object_cols + ['isDefault']

# Build the model inputs as plain arrays.
y_train = train_public_less['isDefault'].values
X_train = train_public_less.drop(columns=drop_cols).values

y_internet = train_internet_less['isDefault'].values
X_internet = train_internet_less.drop(columns=drop_cols).values

# The test frame has no label column, so only the object columns are dropped.
X_test = test_public_less.drop(columns=object_cols).values

In [26]:
is_gridsearch = True
# Grid search over LightGBM hyper-parameters, scored by 5-fold stratified
# cross-validation on the public data. The internet data is appended to the
# training part of every fold and is never used for validation.
if is_gridsearch:
    grid = [{
        'num_leaves': [5, 10, 20, 30],
        'learning_rate': [0.01, 0.03, 0.1, 0.3],
        'reg_alpha': [0, 0.1, 0.2],
        'reg_lambda': [0, 0.1, 0.2]
    }]
    score_detail = []
    best_score = 0
    for param in ParameterGrid(grid):
        logger.info(param)
        param['random_state'] = 1
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
        scores = []
        for count, (train_index, valid_index) in enumerate(skf.split(X_train, y_train), start=1):
            logger.info(f'train k fold {count}')
            X_more, y_more = X_train[train_index], y_train[train_index]
            X_less, y_less = X_train[valid_index], y_train[valid_index]
            X_more_internet = np.concatenate([X_more, X_internet])
            y_more_internet = np.concatenate([y_more, y_internet])
            clf = LGBMClassifier(**param)
            clf.fit(X_more_internet, y_more_internet)
            # Probability of the positive class for the held-out fold.
            y_proba = clf.predict_proba(X_less)[:, 1]
            score = roc_auc_score(y_less, y_proba)
            scores.append(score)
        scores = np.array(scores)
        mean_score = scores.mean()
        logger.info(scores)
        logger.info(mean_score)
        if mean_score > best_score:
            best_score = mean_score
            logger.info(f'best score {best_score}')
        score_detail.append([param, scores, mean_score])
    # Each entry is [param, per-fold scores, mean score]; rank by the mean and
    # take the parameters of the top entry.
    ranked_detail = sorted(score_detail, key=lambda entry: entry[2])
    best_param = ranked_detail[-1][0]

{'learning_rate': 0.01, 'num_leaves': 5, 'reg_alpha': 0, 'reg_lambda': 0}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.85440991 0.82812589 0.83809693 0.83663109 0.82682257]
0.8368172799207312
best score 0.8368172799207312
{'learning_rate': 0.01, 'num_leaves': 5, 'reg_alpha': 0, 'reg_lambda': 0.1}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.85416488 0.82812589 0.83809693 0.83663109 0.82682257]
0.8367682729381304
{'learning_rate': 0.01, 'num_leaves': 5, 'reg_alpha': 0, 'reg_lambda': 0.2}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.85416488 0.82812589 0.83809693 0.83663109 0.82682257]
0.8367682729381304
{'learning_rate': 0.01, 'num_leaves': 5, 'reg_alpha': 0.1, 'reg_lambda': 0}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.85416488 0.82812589 0.83809693 0.83663109 0.82682257]
0.8367682729381304
{'learning_rate': 0.01, 'num_leaves': 5, 'reg_alpha': 0.1, 'reg_

0.8554294191283558
{'learning_rate': 0.01, 'num_leaves': 30, 'reg_alpha': 0.2, 'reg_lambda': 0.2}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.86874177 0.84875891 0.85406678 0.86250047 0.84363374]
0.8555403335653835
{'learning_rate': 0.03, 'num_leaves': 5, 'reg_alpha': 0, 'reg_lambda': 0}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.86931591 0.84691399 0.8541221  0.85589395 0.84419938]
0.854089062550976
{'learning_rate': 0.03, 'num_leaves': 5, 'reg_alpha': 0, 'reg_lambda': 0.1}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.86931591 0.84691041 0.8541221  0.85589395 0.84419581]
0.854087633384181
{'learning_rate': 0.03, 'num_leaves': 5, 'reg_alpha': 0, 'reg_lambda': 0.2}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.8694581  0.84691041 0.8541221  0.85589395 0.84419581]
0.8541160717426974
{'learning_rate': 0.03, 'num_leaves': 5, 'reg_alpha': 0.1, 'reg_lambda': 0

train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.87639956 0.85620564 0.86392616 0.866873   0.84647263]
0.8619753964737005
{'learning_rate': 0.03, 'num_leaves': 30, 'reg_alpha': 0.2, 'reg_lambda': 0.2}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.87643891 0.85590874 0.86423306 0.86678646 0.84659575]
0.8619925826124358
{'learning_rate': 0.1, 'num_leaves': 5, 'reg_alpha': 0, 'reg_lambda': 0}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.87640582 0.8569014  0.86757335 0.86492003 0.85422826]
0.8640057736567407
best score 0.8640057736567407
{'learning_rate': 0.1, 'num_leaves': 5, 'reg_alpha': 0, 'reg_lambda': 0.1}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.87639509 0.8569014  0.86756978 0.86492003 0.85422469]
0.8640021998927473
{'learning_rate': 0.1, 'num_leaves': 5, 'reg_alpha': 0, 'reg_lambda': 0.2}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
trai

train k fold 4
train k fold 5
[0.88111693 0.86121544 0.87439845 0.87946777 0.86621368]
0.8724824554891393
{'learning_rate': 0.1, 'num_leaves': 30, 'reg_alpha': 0.2, 'reg_lambda': 0.1}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.88522708 0.86487487 0.87322793 0.8785185  0.86438473]
0.8732466210063684
{'learning_rate': 0.1, 'num_leaves': 30, 'reg_alpha': 0.2, 'reg_lambda': 0.2}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.88353866 0.86308093 0.8744841  0.87829545 0.86451321]
0.872782470704767
{'learning_rate': 0.3, 'num_leaves': 5, 'reg_alpha': 0, 'reg_lambda': 0}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.88091124 0.85613231 0.86723254 0.86955754 0.85806638]
0.866380002818552
{'learning_rate': 0.3, 'num_leaves': 5, 'reg_alpha': 0, 'reg_lambda': 0.1}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.87848057 0.85817665 0.86849585 0.86833526 0.86109262]
0.86691

train k fold 5
[0.86774554 0.86039449 0.85985608 0.86592283 0.85909951]
0.862603689593152
{'learning_rate': 0.3, 'num_leaves': 30, 'reg_alpha': 0.2, 'reg_lambda': 0.1}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.87183959 0.86071464 0.86398682 0.86891517 0.86249155]
0.865589554989896
{'learning_rate': 0.3, 'num_leaves': 30, 'reg_alpha': 0.2, 'reg_lambda': 0.2}
train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.87460651 0.8607683  0.86285912 0.86745915 0.86311963]
0.8657625443196977


In [28]:
# Display the hyper-parameter set picked by the grid search. `best_param` is
# only assigned inside the `if is_gridsearch:` branch, so this cell needs that
# branch to have run.
best_param

{'learning_rate': 0.1,
 'num_leaves': 30,
 'reg_alpha': 0,
 'reg_lambda': 0,
 'random_state': 1}

In [27]:
# Re-run the 5-fold cross-validation with fixed hyper-parameters.
# best_param reported by the grid search:
# {'learning_rate': 0.1,
#  'num_leaves': 30,
#  'reg_alpha': 0,
#  'reg_lambda': 0,
#  'random_state': 1}
# Only num_leaves and random_state are passed explicitly below.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
scores = []
for count, (train_index, valid_index) in enumerate(skf.split(X_train, y_train), start=1):
    logger.info(f'train k fold {count}')
    X_more, y_more = X_train[train_index], y_train[train_index]
    X_less, y_less = X_train[valid_index], y_train[valid_index]
    # The internet data is appended to the training part of every fold.
    X_more_internet = np.concatenate([X_more, X_internet])
    y_more_internet = np.concatenate([y_more, y_internet])
    clf = LGBMClassifier(num_leaves=30, random_state=1)
    clf.fit(X_more_internet, y_more_internet)
    y_proba = clf.predict_proba(X_less)[:, 1]
    score = roc_auc_score(y_less, y_proba)
    scores.append(score)
scores = np.array(scores)
logger.info(scores)
logger.info(scores.mean())

train k fold 1
train k fold 2
train k fold 3
train k fold 4
train k fold 5
[0.88568853 0.86587826 0.87484632 0.87806527 0.86769825]
0.8744353289285414


In [30]:
# Final training set: every public row followed by every internet row.
X_final_train = np.concatenate((X_train, X_internet), axis=0)
y_final_train = np.concatenate((y_train, y_internet), axis=0)

In [31]:
# Fit the final model on the full training set (public + internet).
clf_ex = LGBMClassifier(num_leaves=30, random_state=1)
clf_ex.fit(X_final_train, y_final_train)
# Score the test set with clf_ex, the model fitted just above. The name `clf`
# is the leftover classifier from the last cross-validation fold, which was
# trained without that fold's validation rows and must not be used here.
y_proba = clf_ex.predict_proba(X_test)[:, 1]

In [32]:
# Build the submission table: one predicted default probability per test loan.
submission = pd.DataFrame({'id': test_public['loan_id'], 'isDefault': y_proba})
# to_csv documents `index` as a bool; pass False explicitly rather than
# relying on None being falsy. The row index is not written either way.
submission.to_csv('submission.csv', index=False)