In [1]:
# 首先 import 必要的模块
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

## 读取数据

In [2]:
# Directory that holds the feature-engineered data (kept with a trailing slash
# because other cells build paths by plain string concatenation).
dpath = './data/FE/'

# File name of the test set
test_file = 'test_FE.csv'

# Read the test set into a DataFrame
test = pd.read_csv("{}{}".format(dpath, test_file))

In [3]:
# Print a summary of the freshly loaded test frame (index range, column count,
# dtypes and memory usage).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130000 entries, 0 to 129999
Columns: 470 entries, user_id to id_city_355
dtypes: float64(22), int64(442), object(6)
memory usage: 466.2+ MB


In [4]:
# File names of the two auxiliary per-user tables
tag_file = 'my_user_taglist_embedding_16.csv'
behavior_file = 'new_user_behavior.csv'

# Tag table; tag_insert_date is parsed into datetimes while reading
tag_df = pd.read_csv(
    dpath + tag_file,
    parse_dates=['tag_insert_date'],
)

# Behaviour table
behavior_df = pd.read_csv(dpath + behavior_file)

## 准备数据

### 更改数据类型

In [5]:
# Convert the date columns to datetime values.
date_cols = ['auditing_date', 'due_date', 'reg_mon', 'info_insert_date', 'tag_insert_date']
for col in date_cols:
    test[col] = pd.to_datetime(test[col])

# Drop the one-hot encoded columns. Their names are read from the `oh_name`
# column of onehot_names.csv; taking the column as a list replaces the former
# row-by-row iterrows() loop and yields the same names in the same order.
oh_names = pd.read_csv(dpath + 'onehot_names.csv')
del_oh_names = oh_names['oh_name'].tolist()
test = test.drop(del_oh_names, axis=1)

In [6]:
# Attach the tag table and then the behaviour table to the test rows by
# user_id. Both are left joins, so every test row is kept and rows without a
# match get NaN in the joined columns.
test = (
    test
    .merge(tag_df, on='user_id', how='left')
    .merge(behavior_df, on='user_id', how='left')
)

In [7]:
# Print the frame summary again after dropping the one-hot columns and merging
# the per-user tables, to check the resulting columns and non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130000 entries, 0 to 129999
Data columns (total 70 columns):
user_id                       130000 non-null int64
listing_id                    130000 non-null int64
auditing_date                 130000 non-null datetime64[ns]
due_date                      130000 non-null datetime64[ns]
due_amt                       130000 non-null float64
term                          130000 non-null int64
rate                          130000 non-null float64
principal                     130000 non-null int64
reg_mon                       130000 non-null datetime64[ns]
gender                        130000 non-null int64
age                           130000 non-null int64
cell_province                 130000 non-null int64
id_province                   130000 non-null int64
id_city                       130000 non-null int64
info_insert_date              130000 non-null datetime64[ns]
taglist                       79786 non-null object
tag_insert_date_x

In [8]:
# Display the first five rows of the merged test frame.
test.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal,reg_mon,gender,...,user_tag_em10,user_tag_em11,user_tag_em12,user_tag_em13,user_tag_em14,user_tag_em15,sum,behaviro_type1,behaviro_type2,behaviro_type3
0,498765,5431438,2019-03-12,2019-04-12,138.5903,12,7.2,1600,2017-05-01,0,...,0.100971,-0.044635,0.135056,0.141117,0.104204,-0.020327,20.0,10.0,,10.0
1,34524,5443211,2019-03-15,2019-04-15,208.0805,9,6.9,1820,2015-07-01,0,...,0.317818,0.087033,0.104103,-0.086061,-0.212869,-0.043935,77.0,65.0,,12.0
2,821741,5461707,2019-03-22,2019-04-22,421.2097,6,6.5,2480,2018-03-01,1,...,,,,,,,86.0,72.0,,14.0
3,263534,5472320,2019-03-26,2019-04-26,212.6537,9,6.9,1860,2016-09-01,1,...,0.11324,0.070465,0.163667,-0.078055,0.02437,-0.124732,97.0,89.0,8.0,
4,238853,5459750,2019-03-21,2019-04-21,817.4593,9,6.9,7150,2016-08-01,0,...,0.196384,0.252185,0.216401,-0.011173,0.075562,0.06221,30.0,27.0,3.0,


In [9]:
# Keep the raw tag strings and the identifier columns aside.
tag_test = test['taglist']

ID = ['user_id', 'listing_id']
id_test = test.loc[:, ID]

# Feature matrix: the merged frame without the date columns and the raw tag list.
# NOTE(review): user_id and listing_id are not dropped here, so they are passed
# to the model as features - confirm the model was trained the same way.
X_test = test.drop(
    columns=[
        'auditing_date',
        'due_date',
        'reg_mon',
        'info_insert_date',
        'tag_insert_date_x',
        'tag_insert_date_y',
        'taglist',
    ]
)

# Remember the feature names for later use.
feat_names = X_test.columns

# Most sklearn learners support sparse input, which can make training much faster.
# To check whether a learner supports it, see whether its fit() documents
# X: {array-like, sparse matrix}. timeit can be used to compare dense vs. sparse.
from scipy.sparse import csr_matrix
X_test = csr_matrix(X_test)

## 测试模型

### 导入模型

In [10]:
# Load the trained model from disk.
import pickle

# NOTE(security): unpickling executes code stored in the file, so only load
# model files that you produced yourself or otherwise trust.
# The `with` block closes the file handle once the model has been read; the
# previous pickle.load(open(...)) form left the handle open.
with open("lgb_gbdt.pkl", 'rb') as model_file:
    model = pickle.load(model_file)

In [11]:
# Predict the probability of each day (class) for every row of X_test.
day_prob = model.predict_proba(X_test)

# Wrap the probabilities in a DataFrame and place listing_id in front of them.
# The two pieces are aligned on their row index by pd.concat.
day_prob_df = pd.concat(
    [test['listing_id'], pd.DataFrame(day_prob)],
    axis=1,
)

day_prob_df.head()

Unnamed: 0,listing_id,0,1,2,3,4,5,6,7,8,...,23,24,25,26,27,28,29,30,31,32
0,5431438,0.969389,0.009059,0.002481,0.002688,0.001551,0.001018,0.000902,0.00056,0.000548,...,0.000199,0.000156,0.00018,0.000126,0.000122,0.000147,0.00011,0.000114,9.5e-05,0.005572
1,5443211,0.945763,0.016165,0.004098,0.005647,0.003608,0.001921,0.001733,0.001529,0.001039,...,0.000311,0.000259,0.000233,0.000302,0.000268,0.000187,0.000381,0.000589,0.000358,0.005821
2,5461707,0.252121,0.045484,0.025451,0.026841,0.017898,0.013545,0.008757,0.011312,0.007481,...,0.002708,0.002866,0.053085,0.000932,0.001148,0.001332,0.002304,0.001242,0.000987,0.457811
3,5472320,0.181246,0.238131,0.073464,0.065719,0.031339,0.026933,0.020551,0.012595,0.020125,...,0.002305,0.001301,0.000903,0.001553,0.000857,0.000494,0.000653,0.000692,0.000439,0.16621
4,5459750,0.274355,0.044381,0.011738,0.015167,0.005998,0.003781,0.002425,0.002272,0.001441,...,0.000392,0.000323,0.000371,0.000348,0.000325,0.000434,0.000544,0.000928,0.001704,0.621116


In [13]:
# Predict the most likely repayment day for each row of X_test.
# NOTE(review): `repay_day` is not referenced by any later cell visible in this
# notebook; the submission below is built from `day_prob` instead.
repay_day = model.predict(X_test)

In [12]:
# Show the Python type of the predict_proba result.
type(day_prob)

numpy.ndarray

## 预测结果

In [15]:
# Read the submission template; repay_date is parsed into datetimes so it can
# be subtracted from due_date later on.
sub_sample = pd.read_csv(
    './data/' + 'submission.csv',
    parse_dates=['repay_date'],
)
sub_sample.head()

Unnamed: 0,listing_id,repay_amt,repay_date
0,5431438,4.3309,2019-03-12
1,5431438,4.3309,2019-03-13
2,5431438,4.3309,2019-03-14
3,5431438,4.3309,2019-03-15
4,5431438,4.3309,2019-03-16


In [16]:
# Bring each listing's due date and due amount onto the submission rows
# (left join on listing_id keeps every submission row).
sub = test.loc[:, ['listing_id', 'due_date', 'due_amt']]
sub_sample = pd.merge(sub_sample, sub, on='listing_id', how='left')
sub_sample.head()

Unnamed: 0,listing_id,repay_amt,repay_date,due_date,due_amt
0,5431438,4.3309,2019-03-12,2019-04-12,138.5903
1,5431438,4.3309,2019-03-13,2019-04-12,138.5903
2,5431438,4.3309,2019-03-14,2019-04-12,138.5903
3,5431438,4.3309,2019-03-15,2019-04-12,138.5903
4,5431438,4.3309,2019-03-16,2019-04-12,138.5903


In [17]:
# Whole days between each candidate repay_date and the listing's due_date.
sub_sample['days'] = sub_sample['due_date'].sub(sub_sample['repay_date']).dt.days

In [18]:
# Attach the per-listing probability columns to every submission row of that
# listing (left join on listing_id).
sub_sample = pd.merge(sub_sample, day_prob_df, on='listing_id', how='left')
sub_sample.head()

Unnamed: 0,listing_id,repay_amt,repay_date,due_date,due_amt,days,0,1,2,3,...,23,24,25,26,27,28,29,30,31,32
0,5431438,4.3309,2019-03-12,2019-04-12,138.5903,31,0.969389,0.009059,0.002481,0.002688,...,0.000199,0.000156,0.00018,0.000126,0.000122,0.000147,0.00011,0.000114,9.5e-05,0.005572
1,5431438,4.3309,2019-03-13,2019-04-12,138.5903,30,0.969389,0.009059,0.002481,0.002688,...,0.000199,0.000156,0.00018,0.000126,0.000122,0.000147,0.00011,0.000114,9.5e-05,0.005572
2,5431438,4.3309,2019-03-14,2019-04-12,138.5903,29,0.969389,0.009059,0.002481,0.002688,...,0.000199,0.000156,0.00018,0.000126,0.000122,0.000147,0.00011,0.000114,9.5e-05,0.005572
3,5431438,4.3309,2019-03-15,2019-04-12,138.5903,28,0.969389,0.009059,0.002481,0.002688,...,0.000199,0.000156,0.00018,0.000126,0.000122,0.000147,0.00011,0.000114,9.5e-05,0.005572
4,5431438,4.3309,2019-03-16,2019-04-12,138.5903,27,0.969389,0.009059,0.002481,0.002688,...,0.000199,0.000156,0.00018,0.000126,0.000122,0.000147,0.00011,0.000114,9.5e-05,0.005572


In [19]:
# Expected repayment for each submission row:
#     due_amt * value of the probability column whose label equals the row's `days`.
# The former iterrows() loop did this one row at a time in Python (and wrote
# into a throwaway copy of the row). Here the work is vectorised by looping
# only over the distinct `days` values and filling all matching rows at once.
days = sub_sample['days'].values
due_amt = sub_sample['due_amt'].values
expected = np.full(len(sub_sample), np.nan)
for day in np.unique(days):
    rows = days == day
    # int(day) turns the numpy scalar into the plain int used as column label.
    expected[rows] = due_amt[rows] * sub_sample[int(day)].values[rows]

# Keep `repay_amt` a plain list of floats, as before.
repay_amt = expected.tolist()

In [20]:
# Show the first five computed repayment amounts as a quick check.
repay_amt[:5]

[0.013156367585050592,
 0.015829527150250877,
 0.01530350210702447,
 0.020324145820360632,
 0.016885983834516376]

In [21]:
# Overwrite the template's repay_amt column with the computed amounts.
sub_sample['repay_amt'] = repay_amt

In [22]:
# Remove the helper columns so only the submission columns remain.
# The probability columns carry the default integer labels of
# pd.DataFrame(day_prob), i.e. 0 .. day_prob.shape[1] - 1, so the label list is
# derived from day_prob instead of hard-coding the number of classes.
num_cols = list(range(day_prob.shape[1]))
del_cols = ['due_date', 'due_amt', 'days'] + num_cols
sub_sample = sub_sample.drop(del_cols, axis=1)

In [23]:
# Display the first five rows of the final submission frame.
sub_sample.head()

Unnamed: 0,listing_id,repay_amt,repay_date
0,5431438,0.013156,2019-03-12
1,5431438,0.01583,2019-03-13
2,5431438,0.015304,2019-03-14
3,5431438,0.020324,2019-03-15
4,5431438,0.016886,2019-03-16


## 保存测试结果

In [24]:
# Write the submission to sub6.csv in the working directory, without the row index.
sub_sample.to_csv('sub6.csv', index=False)