# 训练集特征工程

In [1]:
# 导入工具包
import os

import numpy as np
import pandas as pd
from scipy import sparse
from scipy.stats import kurtosis
from sklearn.feature_extraction.text import CountVectorizer

## 1.连接train和test

In [2]:
# Directory that holds the raw CSV files.
dpath = './data/'

# File name of the training set.
filename = 'train.csv'

# Load the training set and ask pandas to parse the three date columns.
# repay_date contains the literal marker '\N' for unpaid rows, so it is
# cleaned and converted in a later cell.
train = pd.read_csv(
    dpath + filename,
    parse_dates=['auditing_date', 'due_date', 'repay_date'],
)

In [3]:
# Preview the first five rows of the raw training set.
train.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,repay_date,repay_amt
0,748147,3163926,2018-04-25,2018-05-25,72.1167,2018-05-25,72.1167
1,672952,3698760,2018-06-09,2018-07-09,258.7045,2018-07-08,258.7045
2,404196,2355665,2018-02-18,2018-03-18,307.927,\N,\N
3,342769,1994522,2018-01-13,2018-02-13,252.9809,2018-02-13,252.9809
4,828139,3602352,2018-06-01,2018-07-01,107.6503,2018-06-25,107.6503


In [4]:
# Check column dtypes and non-null counts of the raw training set.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
user_id          1000000 non-null int64
listing_id       1000000 non-null int64
auditing_date    1000000 non-null datetime64[ns]
due_date         1000000 non-null datetime64[ns]
due_amt          1000000 non-null float64
repay_date       1000000 non-null object
repay_amt        1000000 non-null object
dtypes: datetime64[ns](2), float64(1), int64(2), object(2)
memory usage: 53.4+ MB


In [5]:
# Unpaid rows carry the literal marker '\N' in repay_date. Swap it for the
# far-future sentinel 2200-01-01 so the whole column parses as datetimes.
train['repay_date'] = pd.to_datetime(
    train['repay_date'].replace('\\N', '2200-01-01')
)

# Same marker in repay_amt: treat it as an amount of 0, then cast to float32.
train['repay_amt'] = train['repay_amt'].replace('\\N', 0).astype(np.float32)

In [6]:
# Classification label: number of days the loan was repaid before its due date.
# A negative difference (late repayment, or the 2200-01-01 sentinel used for
# rows that were never repaid) is collapsed to -1, meaning "overdue".
# Computed with vectorised datetime arithmetic instead of a Python-level
# iterrows() loop over every row of the training set.
days_early = (train['due_date'] - train['repay_date']).dt.days

clf_labels = days_early.where(days_early >= 0, -1).values
amt_labels = train['repay_amt'].values

# The label-bearing columns must not remain among the features.
train = train.drop(columns=['repay_amt', 'repay_date'])

In [7]:
# Confirm the label-bearing columns are gone from the training frame.
train.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt
0,748147,3163926,2018-04-25,2018-05-25,72.1167
1,672952,3698760,2018-06-09,2018-07-09,258.7045
2,404196,2355665,2018-02-18,2018-03-18,307.927
3,342769,1994522,2018-01-13,2018-02-13,252.9809
4,828139,3602352,2018-06-01,2018-07-01,107.6503


In [8]:
# File name of the test set.
filename = 'test.csv'

# Load the test set; it has no repayment columns, so only two dates are parsed.
test = pd.read_csv(
    dpath + filename,
    parse_dates=['auditing_date', 'due_date'],
)

In [9]:
# Preview the first five rows of the test set.
test.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt
0,498765,5431438,2019-03-12,2019-04-12,138.5903
1,34524,5443211,2019-03-15,2019-04-15,208.0805
2,821741,5461707,2019-03-22,2019-04-22,421.2097
3,263534,5472320,2019-03-26,2019-04-26,212.6537
4,238853,5459750,2019-03-21,2019-04-21,817.4593


In [10]:
# Check column dtypes and non-null counts of the test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130000 entries, 0 to 129999
Data columns (total 5 columns):
user_id          130000 non-null int64
listing_id       130000 non-null int64
auditing_date    130000 non-null datetime64[ns]
due_date         130000 non-null datetime64[ns]
due_amt          130000 non-null float64
dtypes: datetime64[ns](2), float64(1), int64(2)
memory usage: 5.0 MB


In [11]:
# Keep the key columns of the test set (listing_id, auditing_date, due_amt)
# in a separate file for use at prediction time.
sub = test.loc[:, ['listing_id', 'auditing_date', 'due_amt']]
sub.to_csv(dpath + 'mysub.csv', index=False, header=True)

In [12]:
# Remember how many rows belong to the training set so the combined frame
# can be split again later.
train_num = len(train)

# Stack train on top of test with a fresh 0..N-1 index so that all feature
# engineering is applied to both at once.
df_all = pd.concat([train, test], axis=0, ignore_index=True)

df_all.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt
0,748147,3163926,2018-04-25,2018-05-25,72.1167
1,672952,3698760,2018-06-09,2018-07-09,258.7045
2,404196,2355665,2018-02-18,2018-03-18,307.927
3,342769,1994522,2018-01-13,2018-02-13,252.9809
4,828139,3602352,2018-06-01,2018-07-01,107.6503


In [13]:
# Show the rows right after the training block; .loc slicing is label-based
# and inclusive, so this displays six rows starting at label train_num.
df_all.loc[train_num:train_num+5]

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt
1000000,498765,5431438,2019-03-12,2019-04-12,138.5903
1000001,34524,5443211,2019-03-15,2019-04-15,208.0805
1000002,821741,5461707,2019-03-22,2019-04-22,421.2097
1000003,263534,5472320,2019-03-26,2019-04-26,212.6537
1000004,238853,5459750,2019-03-21,2019-04-21,817.4593
1000005,21071,5393299,2019-03-01,2019-04-01,112.0961


In [14]:
# Check dtypes and non-null counts of the combined train+test frame.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1130000 entries, 0 to 1129999
Data columns (total 5 columns):
user_id          1130000 non-null int64
listing_id       1130000 non-null int64
auditing_date    1130000 non-null datetime64[ns]
due_date         1130000 non-null datetime64[ns]
due_amt          1130000 non-null float64
dtypes: datetime64[ns](2), float64(1), int64(2)
memory usage: 43.1 MB


## 2.连接listing_info

In [15]:
# File name of the listing (loan) attribute table.
filename = 'listing_info.csv'

# Load the listing attributes, parsing the auditing date.
listing_info = pd.read_csv(
    dpath + filename,
    parse_dates=['auditing_date'],
)

In [16]:
# Preview the first five rows of the listing table.
listing_info.head()

Unnamed: 0,user_id,listing_id,auditing_date,term,rate,principal
0,316610,1556649,2017-11-26,9,7.6,4800
1,62002,1556633,2017-11-26,6,7.6,4000
2,192135,1556629,2017-11-26,12,8.0,8660
3,487382,1556628,2017-11-26,9,7.6,4780
4,235186,1556627,2017-11-26,9,7.6,1480


In [17]:
# user_id and auditing_date already exist in df_all, so remove them here and
# leave listing_id as the only column shared between the two frames.
listing_info = listing_info.drop(columns=['user_id', 'auditing_date'])
listing_info.head()

Unnamed: 0,listing_id,term,rate,principal
0,1556649,9,7.6,4800
1,1556633,6,7.6,4000
2,1556629,12,8.0,8660
3,1556628,9,7.6,4780
4,1556627,9,7.6,1480


In [18]:
# Attach the listing attributes (term, rate, principal) to every row.
# The join key is named explicitly and a left join is used so that every
# train/test row is kept in its original position: the later train/test split
# relies on the first train_num rows being the training set. The default
# merge (inner join on all shared columns) would silently drop unmatched rows.
rows_before_listing_merge = len(df_all)
df_all = pd.merge(df_all, listing_info, on='listing_id', how='left')

# Duplicate listing_id rows in listing_info would multiply rows and shift the
# train/test boundary, so fail loudly instead.
assert len(df_all) == rows_before_listing_merge, \
    'listing_info contains duplicate listing_id rows'

df_all.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal
0,748147,3163926,2018-04-25,2018-05-25,72.1167,9,7.2,630
1,672952,3698760,2018-06-09,2018-07-09,258.7045,9,7.2,2260
2,404196,2355665,2018-02-18,2018-03-18,307.927,9,7.2,2690
3,342769,1994522,2018-01-13,2018-02-13,252.9809,9,7.2,2210
4,828139,3602352,2018-06-01,2018-07-01,107.6503,6,8.6,630


In [19]:
# Check dtypes and non-null counts after joining the listing attributes.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1130000 entries, 0 to 1129999
Data columns (total 8 columns):
user_id          1130000 non-null int64
listing_id       1130000 non-null int64
auditing_date    1130000 non-null datetime64[ns]
due_date         1130000 non-null datetime64[ns]
due_amt          1130000 non-null float64
term             1130000 non-null int64
rate             1130000 non-null float64
principal        1130000 non-null int64
dtypes: datetime64[ns](2), float64(2), int64(4)
memory usage: 77.6 MB


## 3.连接user_info

In [20]:
# File name of the user profile table.
filename = "user_info.csv"

# Load the user profiles, parsing the registration month and insert date.
user_info = pd.read_csv(
    dpath + filename,
    parse_dates=['reg_mon', 'insertdate'],
)

In [21]:
# Preview the first five rows of the user profile table.
user_info.head()

Unnamed: 0,user_id,reg_mon,gender,age,cell_province,id_province,id_city,insertdate
0,483833,2017-04-01,男,19,c29,c26,c26241,2018-12-11
1,156772,2016-05-01,男,31,c11,c11,c11159,2018-02-13
2,173388,2016-05-01,男,34,c02,c02,c02182,2018-08-21
3,199107,2016-07-01,女,25,c09,c09,c09046,2018-06-05
4,122560,2016-03-01,男,23,c05,c05,c05193,2018-04-02


In [22]:
# Order the profiles newest-first by insertdate, so that the first row seen
# for each user is that user's most recent record.
user_info = user_info.sort_values(by='insertdate', ascending=False)
user_info.head()

Unnamed: 0,user_id,reg_mon,gender,age,cell_province,id_province,id_city,insertdate
410345,907196,2018-09-01,男,30,c04,c11,c11076,2019-03-30
511927,504119,2017-05-01,男,30,c02,c02,c02139,2019-03-30
864634,909870,2018-10-01,男,25,c02,c02,c02321,2019-03-30
511594,542229,2017-06-01,男,35,c09,c09,c09205,2019-03-30
307635,554821,2017-06-01,男,27,c04,c04,c04344,2019-03-30


In [23]:
# Keep one profile per user. The frame is expected to be sorted newest-first
# by insertdate, so keep='first' retains the most recent record.
# `keep` is passed by keyword: pandas >= 2.0 rejects it as a positional argument.
user_info = user_info.drop_duplicates(subset='user_id', keep='first')

# reg_mon and the insert date are kept on purpose; the insert date is renamed
# so it cannot clash with the insertdate column of other tables.
user_info = user_info.rename(columns={'insertdate': 'info_insert_date'})

# Left join on user_id so every train/test row is preserved in place.
df_all = pd.merge(df_all, user_info, on='user_id', how='left')

df_all.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal,reg_mon,gender,age,cell_province,id_province,id_city,info_insert_date
0,748147,3163926,2018-04-25,2018-05-25,72.1167,9,7.2,630,2017-12-01,男,21,c20,c20,c20089,2018-04-24
1,672952,3698760,2018-06-09,2018-07-09,258.7045,9,7.2,2260,2017-09-01,男,37,c14,c17,c17250,2018-06-04
2,404196,2355665,2018-02-18,2018-03-18,307.927,9,7.2,2690,2017-02-01,男,24,c04,c04,c04251,2018-02-17
3,342769,1994522,2018-01-13,2018-02-13,252.9809,9,7.2,2210,2016-12-01,男,23,c17,c17,c17246,2018-01-12
4,828139,3602352,2018-06-01,2018-07-01,107.6503,6,8.6,630,2018-04-01,男,23,c02,c17,c17096,2018-05-31


In [24]:
# Check dtypes and non-null counts after joining the user profiles.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1130000 entries, 0 to 1129999
Data columns (total 15 columns):
user_id             1130000 non-null int64
listing_id          1130000 non-null int64
auditing_date       1130000 non-null datetime64[ns]
due_date            1130000 non-null datetime64[ns]
due_amt             1130000 non-null float64
term                1130000 non-null int64
rate                1130000 non-null float64
principal           1130000 non-null int64
reg_mon             1130000 non-null datetime64[ns]
gender              1130000 non-null object
age                 1130000 non-null int64
cell_province       1130000 non-null object
id_province         1130000 non-null object
id_city             1130000 non-null object
info_insert_date    1130000 non-null datetime64[ns]
dtypes: datetime64[ns](4), float64(2), int64(5), object(4)
memory usage: 137.9+ MB


## 4.连接user_taglist

In [25]:
# File name of the user tag-list table.
filename = "user_taglist.csv"

# Load the tag lists, parsing the insert date.
user_taglist = pd.read_csv(
    dpath + filename,
    parse_dates=['insertdate'],
)

In [26]:
# Preview the first five rows of the tag-list table.
user_taglist.head()

Unnamed: 0,user_id,taglist,insertdate
0,113401,4707|473|3498|4759|1654|298|2869|1164|212|1885...,2018-10-03
1,378358,751|2207|1100|2099|1832|1911|5347|2254|171|360...,2018-11-30
2,434838,877|3795|5628|70|2684|691|719|4228|631|1541|12...,2018-03-25
3,577061,2431|3242|340|1823|4020|4357|164|620|2168|1192...,2018-05-25
4,566753,3980|3125|1819|1333|1177|3972|621|5800|3632|16...,2018-12-02


In [27]:
# Order the tag lists newest-first by insertdate, so that the first row seen
# for each user is that user's most recent record.
user_taglist = user_taglist.sort_values(by='insertdate', ascending=False)
user_taglist.head()

Unnamed: 0,user_id,taglist,insertdate
160864,847942,271|5639|1314|404|2017|631|2365|3576|1804|1541...,2019-03-30
571295,13643,2017|3006|1178|2466|1804|1493|70|727|1522|2500...,2019-03-30
410834,740882,5682|2117|440|2824|1981|124|393|4926|340|830|2...,2019-03-30
267027,905326,2267|1189|1654|5660|5756|298|212|3972|4481|286...,2019-03-30
543141,411367,530|42|2682|2596|1100|1279|2073|1180|4749|5269...,2019-03-30


In [28]:
# Keep one tag list per user. The frame is expected to be sorted newest-first
# by insertdate, so keep='first' retains the most recent record.
# `keep` is passed by keyword: pandas >= 2.0 rejects it as a positional argument.
user_taglist = user_taglist.drop_duplicates(subset='user_id', keep='first')

# Rename the insert date so it stays distinguishable from info_insert_date.
user_taglist = user_taglist.rename(columns={'insertdate': 'tag_insert_date'})

# Left join on user_id; users without a tag list get NaN / NaT in the new columns.
df_all4 = pd.merge(df_all, user_taglist, on='user_id', how='left')

df_all4.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal,reg_mon,gender,age,cell_province,id_province,id_city,info_insert_date,taglist,tag_insert_date
0,748147,3163926,2018-04-25,2018-05-25,72.1167,9,7.2,630,2017-12-01,男,21,c20,c20,c20089,2018-04-24,127|5539|5556|1749|1100|239|1803|1911|2519|351...,2018-11-12
1,672952,3698760,2018-06-09,2018-07-09,258.7045,9,7.2,2260,2017-09-01,男,37,c14,c17,c17250,2018-06-04,1421|3116|5145|5201|2785|1812|4671|3125|1058|2...,2018-06-04
2,404196,2355665,2018-02-18,2018-03-18,307.927,9,7.2,2690,2017-02-01,男,24,c04,c04,c04251,2018-02-17,4617|436|1338|3871|1667|2828|4121|1478|1654|30...,2018-02-17
3,342769,1994522,2018-01-13,2018-02-13,252.9809,9,7.2,2210,2016-12-01,男,23,c17,c17,c17246,2018-01-12,1100|472|3422|3296|4949|4025|3328|974|4092|218...,2018-01-12
4,828139,3602352,2018-06-01,2018-07-01,107.6503,6,8.6,630,2018-04-01,男,23,c02,c17,c17096,2018-05-31,5784|3643|2017|3432|469|5151|1625|1753|2762|15...,2018-05-31


In [29]:
# #先后调用CountVectorizer和TfidfTransformer两种方法
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# vectorizer = CountVectorizer(min_df=10)
# transformer = TfidfTransformer()
# user_taglist_tfidf = transformer.fit_transform(vectorizer.fit_transform(user_taglist['taglist'])).toarray()

# #重新组成DataFrame，可视化
# df_user_taglist_tfidf = pd.DataFrame(data=user_taglist_tfidf)

# df_user_taglist_tfidf.head()

In [30]:
# Check dtypes and non-null counts after joining the tag lists.
df_all4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1130000 entries, 0 to 1129999
Data columns (total 17 columns):
user_id             1130000 non-null int64
listing_id          1130000 non-null int64
auditing_date       1130000 non-null datetime64[ns]
due_date            1130000 non-null datetime64[ns]
due_amt             1130000 non-null float64
term                1130000 non-null int64
rate                1130000 non-null float64
principal           1130000 non-null int64
reg_mon             1130000 non-null datetime64[ns]
gender              1130000 non-null object
age                 1130000 non-null int64
cell_province       1130000 non-null object
id_province         1130000 non-null object
id_city             1130000 non-null object
info_insert_date    1130000 non-null datetime64[ns]
taglist             654255 non-null object
tag_insert_date     654255 non-null datetime64[ns]
dtypes: datetime64[ns](5), float64(2), int64(5), object(5)
memory usage: 155.2+ MB


## 5.连接user_behavior_logs

In [31]:
# # 数据路径
# filename = "user_behavior_logs.csv"

# # 导入数据
# user_behavior_logs = pd.read_csv(dpath + filename)

In [32]:
# # 删除大于'2019-03-02'的数据
# user_behavior_logs['behavior_time'] = pd.to_datetime(user_behavior_logs['behavior_time'])
# del_index = []
# for index, row in user_behavior_logs.iterrows():
#     if row['behavior_time'] >= DATELINE:
#         del_index.append(index)
# user_behavior_logs = user_behavior_logs.drop([del_index])

In [33]:
# # 设置新的行为表，存放用户三种行为的次数
# df_user_behavior_logs = pd.DataFrame(columns=['user_id', 'behavior_1', 'behavior_2', 'behavior_3'])
# df_user_behavior_logs['user_id'] = user_behavior_logs['user_id'].unique()
# df_user_behavior_logs['behavior_1'] = df_user_behavior_logs['behavior_1'].replace(np.nan, 0)
# df_user_behavior_logs['behavior_2'] = df_user_behavior_logs['behavior_2'].replace(np.nan, 0)
# df_user_behavior_logs['behavior_3'] = df_user_behavior_logs['behavior_3'].replace(np.nan, 0)
# df_user_behavior_logs.head()

In [34]:
# # 统计每个用户三种行为的次数
# for index,row1 in user_behavior_logs.iterrows():
#     if row1['behavior_type'] == 1:
#         user_id = row1['user_id']
#         for index,row2 in df_user_behavior_logs.iterrows():
#             if row2['user_id'] == user_id:
#                 row2['behavior_1'] += 1
#                 break
#     elif row1['behavior_type'] == 2:
#         user_id = row1['user_id']
#         for index,row2 in df_user_behavior_logs.iterrows():
#             if row2['user_id'] == user_id:
#                 row2['behavior_2'] += 1
#                 break
#     elif row1['behavior_type'] == 3:
#         user_id = row1['user_id']
#         for index,row2 in df_user_behavior_logs.iterrows():
#             if row2['user_id'] == user_id:
#                 row2['behavior_3'] += 1
#                 break
#     else:

## 6.连接user_repay_logs

In [35]:
# File name of the historical repayment log.
filename = "user_repay_logs.csv"

# Load the repayment log, parsing the due date and the repayment date.
user_repay_logs = pd.read_csv(
    dpath + filename,
    parse_dates=['due_date', 'repay_date'],
)

In [36]:
# Inspect the first rows of the repayment log (label-based, inclusive slice).
user_repay_logs.loc[0:12]

Unnamed: 0,user_id,listing_id,order_id,due_date,due_amt,repay_date,repay_amt
0,748483,1858122,6,2018-06-29,528.6365,2018-06-20,528.6365
1,748483,1858122,4,2018-04-29,528.6365,2200-01-01,528.6365
2,748483,1858122,7,2018-07-29,528.6365,2018-06-20,528.6365
3,748483,1858122,5,2018-05-29,528.6365,2018-05-29,528.6365
4,748483,1858122,1,2018-01-29,528.6365,2018-01-28,528.6365
5,748483,1858122,2,2018-02-28,528.6365,2018-02-16,528.6365
6,748483,1858122,3,2018-03-29,528.6365,2018-03-28,528.6365
7,445749,185013,11,2018-03-05,197.9548,2017-05-25,197.9548
8,445749,185013,6,2017-10-05,197.9548,2017-05-25,197.9548
9,445749,185013,10,2018-02-05,197.9548,2017-05-25,197.9548


In [37]:
# Per-user repayment-history features.
#
# The task only predicts the first instalment, so only first-instalment
# records (order_id == 1) are used here. Later instalments could provide
# further features.
# NOTE(review): these statistics are computed over all first-instalment
# records, which presumably include the listings being predicted. Verify that
# this does not leak the target.
first_period = user_repay_logs.loc[
    user_repay_logs['order_id'] == 1,
    ['user_id', 'due_date', 'due_amt', 'repay_date'],
]

# Days repaid ahead of the due date; late or never-repaid records become -1.
days_early = (first_period['due_date'] - first_period['repay_date']).dt.days

per_loan = pd.DataFrame({
    'user_id': first_period['user_id'],
    # 1 when the instalment was repaid, 0 when repay_date is the 2200-01-01
    # "never repaid" sentinel. Compared as a timestamp rather than through a
    # string round-trip.
    'repay': first_period['repay_date'].ne(pd.Timestamp('2200-01-01')).astype(int),
    'early_repay_days': days_early.where(days_early >= 0, -1),
    'due_amt': first_period['due_amt'],
})

# One row per user, built with named aggregation (requires pandas >= 0.25).
# The dict-renaming form of SeriesGroupBy.agg was removed in pandas 1.0.
# sort=False keeps users in order of first appearance.
repay_log_df = (
    per_loan
    .groupby('user_id', sort=False)
    .agg(
        repay_mean=('repay', 'mean'),
        early_repay_days_max=('early_repay_days', 'max'),
        early_repay_days_median=('early_repay_days', 'median'),
        early_repay_days_sum=('early_repay_days', 'sum'),
        early_repay_days_mean=('early_repay_days', 'mean'),
        early_repay_days_std=('early_repay_days', 'std'),
        due_amt_max=('due_amt', 'max'),
        due_amt_min=('due_amt', 'min'),
        due_amt_median=('due_amt', 'median'),
        due_amt_mean=('due_amt', 'mean'),
        due_amt_sum=('due_amt', 'sum'),
        due_amt_std=('due_amt', 'std'),
        due_amt_skew=('due_amt', 'skew'),
        due_amt_kurt=('due_amt', kurtosis),
        # Peak-to-peak range, written out to avoid np.ptp on a Series.
        due_amt_ptp=('due_amt', lambda s: s.max() - s.min()),
    )
    .reset_index()
)
repay_log_df.head()

  return ptp(axis=axis, out=out, **kwargs)


Unnamed: 0,user_id,repay_mean,early_repay_days_max,early_repay_days_median,early_repay_days_sum,early_repay_days_mean,early_repay_days_std,due_amt_max,due_amt_min,due_amt_median,due_amt_mean,due_amt_sum,due_amt_std,due_amt_skew,due_amt_kurt,due_amt_ptp
0,748483,1.0,14,0.5,15,3.75,6.849574,528.6365,95.0109,216.76895,264.296325,1057.1853,191.972245,1.162271,-1.087608,433.6256
1,369368,1.0,18,12.0,34,11.333333,7.023769,394.6893,385.5078,386.1742,388.790433,1166.3713,5.119423,1.699092,-1.5,9.1815
2,749102,1.0,31,31.0,31,31.0,,338.5357,338.5357,338.5357,338.5357,338.5357,,,-3.0,0.0
3,385257,1.0,12,2.0,19,3.8,4.658326,1847.3679,573.1446,1374.8646,1259.75706,6298.7853,467.02447,-0.492076,-0.749738,1274.2233
4,648677,1.0,31,18.0,501,17.892857,10.640034,1130.0933,72.1167,201.3691,318.442114,8916.3792,239.271844,2.125041,4.095778,1057.9766


In [38]:
# Attach the per-user repayment-history features; users with no history
# get NaN in the new columns because this is a left join.
df_all = pd.merge(df_all4, repay_log_df, how='left', on='user_id')
df_all.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal,reg_mon,gender,...,early_repay_days_std,due_amt_max,due_amt_min,due_amt_median,due_amt_mean,due_amt_sum,due_amt_std,due_amt_skew,due_amt_kurt,due_amt_ptp
0,748147,3163926,2018-04-25,2018-05-25,72.1167,9,7.2,630,2017-12-01,男,...,16.165808,195.7454,72.1167,102.1022,123.321433,369.9643,64.488028,1.320375,-1.5,123.6287
1,672952,3698760,2018-06-09,2018-07-09,258.7045,9,7.2,2260,2017-09-01,男,...,4.969909,945.0062,133.9311,133.9311,321.1008,1605.504,352.933682,2.113048,0.126236,811.0751
2,404196,2355665,2018-02-18,2018-03-18,307.927,9,7.2,2690,2017-02-01,男,...,17.67767,307.927,102.481,205.204,205.204,410.408,145.27226,,-2.0,205.446
3,342769,1994522,2018-01-13,2018-02-13,252.9809,9,7.2,2210,2016-12-01,男,...,1.732051,758.1972,457.9412,622.6028,612.913733,1838.7412,150.362312,-0.288768,-1.5,300.256
4,828139,3602352,2018-06-01,2018-07-01,107.6503,6,8.6,630,2018-04-01,男,...,,148.9888,148.9888,148.9888,148.9888,148.9888,,,-3.0,0.0


In [39]:
# Check dtypes and non-null counts after joining the repayment-history features.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1130000 entries, 0 to 1129999
Data columns (total 32 columns):
user_id                    1130000 non-null int64
listing_id                 1130000 non-null int64
auditing_date              1130000 non-null datetime64[ns]
due_date                   1130000 non-null datetime64[ns]
due_amt                    1130000 non-null float64
term                       1130000 non-null int64
rate                       1130000 non-null float64
principal                  1130000 non-null int64
reg_mon                    1130000 non-null datetime64[ns]
gender                     1130000 non-null object
age                        1130000 non-null int64
cell_province              1130000 non-null object
id_province                1130000 non-null object
id_city                    1130000 non-null object
info_insert_date           1130000 non-null datetime64[ns]
taglist                    654255 non-null object
tag_insert_date            654255 non-null 

## 7.其他特征工程

### 离散型数据

In [40]:
# Label-encode the categorical columns as int32 codes, numbered in order of
# first appearance.
# pd.factorize replaces the hand-built unique()/nunique() mapping: unique()
# includes NaN while nunique() does not, so the zipped mapping would silently
# drop a category (and the int cast would fail) if a column had missing
# values. factorize encodes missing values as -1 instead.
cate_cols = ['gender', 'cell_province', 'id_province', 'id_city']
for f in cate_cols:
    df_all[f] = pd.factorize(df_all[f])[0].astype('int32')

df_all.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal,reg_mon,gender,...,early_repay_days_std,due_amt_max,due_amt_min,due_amt_median,due_amt_mean,due_amt_sum,due_amt_std,due_amt_skew,due_amt_kurt,due_amt_ptp
0,748147,3163926,2018-04-25,2018-05-25,72.1167,9,7.2,630,2017-12-01,0,...,16.165808,195.7454,72.1167,102.1022,123.321433,369.9643,64.488028,1.320375,-1.5,123.6287
1,672952,3698760,2018-06-09,2018-07-09,258.7045,9,7.2,2260,2017-09-01,0,...,4.969909,945.0062,133.9311,133.9311,321.1008,1605.504,352.933682,2.113048,0.126236,811.0751
2,404196,2355665,2018-02-18,2018-03-18,307.927,9,7.2,2690,2017-02-01,0,...,17.67767,307.927,102.481,205.204,205.204,410.408,145.27226,,-2.0,205.446
3,342769,1994522,2018-01-13,2018-02-13,252.9809,9,7.2,2210,2016-12-01,0,...,1.732051,758.1972,457.9412,622.6028,612.913733,1838.7412,150.362312,-0.288768,-1.5,300.256
4,828139,3602352,2018-06-01,2018-07-01,107.6503,6,8.6,630,2018-04-01,0,...,,148.9888,148.9888,148.9888,148.9888,148.9888,,,-3.0,0.0


### 日期型数据

In [41]:
# Amount due per day between auditing and the due date.
# The day count must come from df_all itself: using `train` (training rows
# only) leaves every test row NaN after index alignment.
df_all['due_amt_per_days'] = (
    df_all['due_amt'] / (df_all['due_date'] - df_all['auditing_date']).dt.days
)

date_cols = ['auditing_date', 'due_date', 'reg_mon', 'info_insert_date', 'tag_insert_date']
# Split each date column into calendar parts:
#   year                 -> reg_mon and the two insert dates only
#   month                -> every date column
#   day, day of the week -> every date column except reg_mon
for f in date_cols:
    if f in ['reg_mon', 'info_insert_date', 'tag_insert_date']:
        df_all[f + '_year'] = df_all[f].dt.year
    df_all[f + '_month'] = df_all[f].dt.month
    if f in ['auditing_date', 'due_date', 'info_insert_date', 'tag_insert_date']:
        df_all[f + '_day'] = df_all[f].dt.day
        df_all[f + '_dayofweek'] = df_all[f].dt.dayofweek
# The raw date columns are intentionally left in the frame.
df_all.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal,reg_mon,gender,...,reg_mon_year,reg_mon_month,info_insert_date_year,info_insert_date_month,info_insert_date_day,info_insert_date_dayofweek,tag_insert_date_year,tag_insert_date_month,tag_insert_date_day,tag_insert_date_dayofweek
0,748147,3163926,2018-04-25,2018-05-25,72.1167,9,7.2,630,2017-12-01,0,...,2017,12,2018,4,24,1,2018.0,11.0,12.0,0.0
1,672952,3698760,2018-06-09,2018-07-09,258.7045,9,7.2,2260,2017-09-01,0,...,2017,9,2018,6,4,0,2018.0,6.0,4.0,0.0
2,404196,2355665,2018-02-18,2018-03-18,307.927,9,7.2,2690,2017-02-01,0,...,2017,2,2018,2,17,5,2018.0,2.0,17.0,5.0
3,342769,1994522,2018-01-13,2018-02-13,252.9809,9,7.2,2210,2016-12-01,0,...,2016,12,2018,1,12,4,2018.0,1.0,12.0,4.0
4,828139,3602352,2018-06-01,2018-07-01,107.6503,6,8.6,630,2018-04-01,0,...,2018,4,2018,5,31,3,2018.0,5.0,31.0,3.0


### taglist

In [42]:
# df_all['taglist'] = df_all['taglist'].astype('str').apply(lambda x: x.strip().replace('|', ' ').strip())
# tag_cv = CountVectorizer(min_df=10, max_df=0.9).fit_transform(df_all['taglist'])

### 独热编码

In [43]:
# One-hot encode every categorical column, prefixing each dummy column with
# its source column name. The pieces are collected in a list and concatenated
# once, instead of re-copying a growing wide frame on every iteration.
# (cate_cols must be non-empty: pd.concat rejects an empty list.)
onehot_parts = []
for col in cate_cols:
    onehot_parts.append(pd.get_dummies(df_all[col], prefix=col))
    print("%s encoding is completed." % col)
    print("%s onehot is connected." % col)
df_onehot = pd.concat(onehot_parts, axis=1)
df_onehot.head()

gender encoding is completed.
gender onehot is connected.
cell_province encoding is completed.
cell_province onehot is connected.
id_province encoding is completed.
id_province onehot is connected.
id_city encoding is completed.
id_city onehot is connected.


Unnamed: 0,gender_0,gender_1,cell_province_0,cell_province_1,cell_province_2,cell_province_3,cell_province_4,cell_province_5,cell_province_6,cell_province_7,...,id_city_346,id_city_347,id_city_348,id_city_349,id_city_350,id_city_351,id_city_352,id_city_353,id_city_354,id_city_355
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# df_all = sparse.hstack((df_all.values, tag_cv), format='csr', dtype='float32')

In [65]:
# Save the generated one-hot column names, one name per row in column 'oh_name'.
onehot_name = pd.DataFrame({'oh_name': df_onehot.columns})

# Build the path the same way as the final save cell (dpath already ends with
# '/') and create the output directory so the write cannot fail on a fresh
# checkout.
os.makedirs(dpath + 'FE/', exist_ok=True)
onehot_name.to_csv(dpath + 'FE/' + 'onehot_names.csv', index=False, header=True)

In [45]:
# Append the one-hot columns to the right of the feature frame; rows are
# matched on the index.
df_all = pd.concat((df_all, df_onehot), axis='columns')
df_all.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal,reg_mon,gender,...,id_city_346,id_city_347,id_city_348,id_city_349,id_city_350,id_city_351,id_city_352,id_city_353,id_city_354,id_city_355
0,748147,3163926,2018-04-25,2018-05-25,72.1167,9,7.2,630,2017-12-01,0,...,0,0,0,0,0,0,0,0,0,0
1,672952,3698760,2018-06-09,2018-07-09,258.7045,9,7.2,2260,2017-09-01,0,...,0,0,0,0,0,0,0,0,0,0
2,404196,2355665,2018-02-18,2018-03-18,307.927,9,7.2,2690,2017-02-01,0,...,0,0,0,0,0,0,0,0,0,0
3,342769,1994522,2018-01-13,2018-02-13,252.9809,9,7.2,2210,2016-12-01,0,...,0,0,0,0,0,0,0,0,0,0
4,828139,3602352,2018-06-01,2018-07-01,107.6503,6,8.6,630,2018-04-01,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Check the size and dtype summary of the final feature frame.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1130000 entries, 0 to 1129999
Columns: 470 entries, user_id to id_city_355
dtypes: datetime64[ns](5), float64(22), int32(4), int64(17), object(1), uint8(421)
memory usage: 907.5+ MB


## 8.保存为新数据

In [54]:
# Split the combined frame back into train and test.
# train_num is a row count, so slice by position (iloc) rather than by index
# label: the result no longer depends on the index being exactly 0..N-1.
df_train, df_test = df_all.iloc[:train_num], df_all.iloc[train_num:]

In [55]:
# Preview the first five rows of the training split.
df_train.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal,reg_mon,gender,...,id_city_346,id_city_347,id_city_348,id_city_349,id_city_350,id_city_351,id_city_352,id_city_353,id_city_354,id_city_355
0,748147,3163926,2018-04-25,2018-05-25,72.1167,9,7.2,630,2017-12-01,0,...,0,0,0,0,0,0,0,0,0,0
1,672952,3698760,2018-06-09,2018-07-09,258.7045,9,7.2,2260,2017-09-01,0,...,0,0,0,0,0,0,0,0,0,0
2,404196,2355665,2018-02-18,2018-03-18,307.927,9,7.2,2690,2017-02-01,0,...,0,0,0,0,0,0,0,0,0,0
3,342769,1994522,2018-01-13,2018-02-13,252.9809,9,7.2,2210,2016-12-01,0,...,0,0,0,0,0,0,0,0,0,0
4,828139,3602352,2018-06-01,2018-07-01,107.6503,6,8.6,630,2018-04-01,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# Attach the two targets to the training split:
#   clf_label  - days repaid early (-1 means overdue)
#   amt_labels - amount actually repaid
# assign() places the values by position and raises on a length mismatch,
# whereas an index-aligned concat could silently misalign. The label arrays
# are not rebound to DataFrames, so re-running this cell overwrites the same
# two columns instead of appending duplicates.
df_train = df_train.assign(
    clf_label=np.asarray(clf_labels).ravel(),
    amt_labels=np.asarray(amt_labels).ravel(),
)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Columns: 472 entries, user_id to amt_labels
dtypes: datetime64[ns](5), float32(1), float64(22), int32(4), int64(18), object(1), uint8(421)
memory usage: 779.2+ MB


In [57]:
# Preview the training split with both label columns attached.
df_train.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal,reg_mon,gender,...,id_city_348,id_city_349,id_city_350,id_city_351,id_city_352,id_city_353,id_city_354,id_city_355,clf_label,amt_labels
0,748147,3163926,2018-04-25,2018-05-25,72.1167,9,7.2,630,2017-12-01,0,...,0,0,0,0,0,0,0,0,0,72.116699
1,672952,3698760,2018-06-09,2018-07-09,258.7045,9,7.2,2260,2017-09-01,0,...,0,0,0,0,0,0,0,0,1,258.704498
2,404196,2355665,2018-02-18,2018-03-18,307.927,9,7.2,2690,2017-02-01,0,...,0,0,0,0,0,0,0,0,-1,0.0
3,342769,1994522,2018-01-13,2018-02-13,252.9809,9,7.2,2210,2016-12-01,0,...,0,0,0,0,0,0,0,0,0,252.980896
4,828139,3602352,2018-06-01,2018-07-01,107.6503,6,8.6,630,2018-04-01,0,...,0,0,0,0,0,0,0,0,6,107.650299


In [58]:
# Renumber the test rows from zero; after the split they still carry their
# positions from the combined frame. The old index is discarded.
df_test = df_test.reset_index(drop=True)
df_test.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal,reg_mon,gender,...,id_city_346,id_city_347,id_city_348,id_city_349,id_city_350,id_city_351,id_city_352,id_city_353,id_city_354,id_city_355
0,498765,5431438,2019-03-12,2019-04-12,138.5903,12,7.2,1600,2017-05-01,0,...,0,0,0,0,0,0,0,0,0,0
1,34524,5443211,2019-03-15,2019-04-15,208.0805,9,6.9,1820,2015-07-01,0,...,0,0,0,0,0,0,0,0,0,0
2,821741,5461707,2019-03-22,2019-04-22,421.2097,6,6.5,2480,2018-03-01,1,...,0,0,0,0,0,0,0,0,0,0
3,263534,5472320,2019-03-26,2019-04-26,212.6537,9,6.9,1860,2016-09-01,1,...,0,0,0,0,0,0,0,0,0,0
4,238853,5459750,2019-03-21,2019-04-21,817.4593,9,6.9,7150,2016-08-01,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
# Save the engineered training and test sets.
# Create the output directory first so a long run does not fail at the very
# last step when ./data/FE/ is missing.
os.makedirs(dpath + 'FE/', exist_ok=True)
df_train.to_csv(dpath + 'FE/' + 'train_FE.csv', index=False, header=True)
df_test.to_csv(dpath + 'FE/' + 'test_FE.csv', index=False, header=True)