# 数据探索

In [1]:
# 导入工具包
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## 一、train

### 1.1 读取数据

In [2]:
# 数据路径
dpath = './data/'
filename = "train.csv"

# 导入数据
train = pd.read_csv(dpath + filename)

In [3]:
# 查看前五行数据
train.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,repay_date,repay_amt
0,748147,3163926,2018-04-25,2018-05-25,72.1167,2018-05-25,72.1167
1,672952,3698760,2018-06-09,2018-07-09,258.7045,2018-07-08,258.7045
2,404196,2355665,2018-02-18,2018-03-18,307.927,\N,\N
3,342769,1994522,2018-01-13,2018-02-13,252.9809,2018-02-13,252.9809
4,828139,3602352,2018-06-01,2018-07-01,107.6503,2018-06-25,107.6503


In [4]:
# 检查数据规模和基本信息
print("Train:", train.shape)
train.info()

Train: (1000000, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
user_id          1000000 non-null int64
listing_id       1000000 non-null int64
auditing_date    1000000 non-null object
due_date         1000000 non-null object
due_amt          1000000 non-null float64
repay_date       1000000 non-null object
repay_amt        1000000 non-null object
dtypes: float64(1), int64(2), object(4)
memory usage: 53.4+ MB


### 1.2 数据分析

#### 离散型数据分布

In [5]:
# 统计日期分布，并查看id是否有重复
feat_names = train.columns.tolist()
feat_names.remove('due_amt')
feat_names.remove('repay_amt')
for col in feat_names:
    print('\n%s特征的不同取值和出现次数' % col)
    print(train[col].value_counts())


user_id特征的不同取值和出现次数
893965    13
79343     11
783346    11
832234     9
515666     9
839840     9
155666     9
571014     9
605998     9
116510     8
381970     8
340792     8
795492     8
416691     8
569132     8
505576     8
852364     8
35086      8
425630     8
725262     8
148451     7
343124     7
242477     7
733352     7
96686      7
240177     7
274177     7
228592     7
751361     7
452357     7
          ..
582550     1
588693     1
586644     1
574354     1
580497     1
578448     1
535439     1
594856     1
596905     1
590762     1
650167     1
633791     1
631742     1
637885     1
635836     1
625595     1
623546     1
629689     1
627640     1
654261     1
592811     1
652212     1
641971     1
639922     1
646065     1
644016     1
598958     1
605101     1
603052     1
2049       1
Name: user_id, Length: 823732, dtype: int64

listing_id特征的不同取值和出现次数
2101246    1
4034278    1
2716437    1
3950303    1
3935958    1
3938005    1
4984245    1
3942099    1
1885903    1
4

#### 数值型数据

In [6]:
train['repay_amt'] = train['repay_amt'].replace('\\N', 0)
train['repay_amt'] = train['repay_amt'].astype(np.float32)
#查看数值型特征的基本统计量
num_feats = ['due_amt','repay_amt']
train[num_feats].describe()

Unnamed: 0,due_amt,repay_amt
count,1000000.0,1000000.0
mean,452.026625,396.342102
std,514.202155,501.982025
min,17.4533,0.0
25%,160.4824,126.337196
50%,286.7401,245.065094
75%,546.0268,483.458405
max,18827.0184,18827.017578


## 二、listing_info

### 2.1 读取数据

In [7]:
# 数据路径
dpath = './data/'
filename = "listing_info.csv"

# 导入数据
listing_info = pd.read_csv(dpath + filename)

In [8]:
# 查看前五行数据
listing_info.head()

Unnamed: 0,user_id,listing_id,auditing_date,term,rate,principal
0,316610,1556649,2017-11-26,9,7.6,4800
1,62002,1556633,2017-11-26,6,7.6,4000
2,192135,1556629,2017-11-26,12,8.0,8660
3,487382,1556628,2017-11-26,9,7.6,4780
4,235186,1556627,2017-11-26,9,7.6,1480


In [9]:
# 检查数据规模和基本信息
print("Listing_info:", listing_info.shape)
listing_info.info()

Listing_info: (5484891, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5484891 entries, 0 to 5484890
Data columns (total 6 columns):
user_id          int64
listing_id       int64
auditing_date    object
term             int64
rate             float64
principal        int64
dtypes: float64(1), int64(4), object(1)
memory usage: 251.1+ MB


### 2.2 数据分析

#### 离散性数据

In [10]:
# 统计日期分布，并查看id是否有重复
feat_names = listing_info.columns.tolist()
feat_names.remove('term')
feat_names.remove('rate')
feat_names.remove('principal')
for col in feat_names:
    print('\n%s特征的不同取值和出现次数' % col)
    print(listing_info[col].value_counts())


user_id特征的不同取值和出现次数
712075    141
491698    109
35086      98
539106     93
531641     92
655008     88
441162     78
809262     77
196376     76
114491     75
399323     71
224381     69
341282     69
218560     67
210779     67
514308     66
288012     65
74810      64
627259     64
449599     62
79343      62
301729     61
350390     61
192276     60
333618     60
228976     59
333375     59
249503     58
637894     58
413045     56
         ... 
593357      1
139992      1
56567       1
130259      1
518136      1
244891      1
379989      1
709574      1
351303      1
359491      1
369192      1
287248      1
82611       1
299534      1
255937      1
260035      1
40190       1
44280       1
627625      1
191664      1
146602      1
607155      1
162978      1
158880      1
219095      1
248980      1
212106      1
220294      1
334926      1
187382      1
Name: user_id, Length: 928195, dtype: int64

listing_id特征的不同取值和出现次数
4196351    1
783403     1
771117     1
4971566    1
77521

#### 数值型数据

In [11]:
#查看数值型特征的基本统计量
num_feats = ['term','rate','principal']
listing_info[num_feats].describe()

Unnamed: 0,term,rate,principal
count,5484891.0,5484891.0,5484891.0
mean,7.985595,7.936558,3189.264
std,2.808216,0.7410161,2990.142
min,3.0,6.5,110.0
25%,6.0,7.2,1170.0
50%,6.0,8.0,2260.0
75%,12.0,8.6,3910.0
max,12.0,9.3,55810.0


数据正常，没有不合理数据

## 三、user_info

### 3.1 读取数据

In [12]:
# 数据路径
dpath = './data/'
filename = "user_info.csv"

# 导入数据
user_info = pd.read_csv(dpath + filename)

In [13]:
# 查看前五行数据
user_info.head()

Unnamed: 0,user_id,reg_mon,gender,age,cell_province,id_province,id_city,insertdate
0,483833,2017-04,男,19,c29,c26,c26241,2018-12-11
1,156772,2016-05,男,31,c11,c11,c11159,2018-02-13
2,173388,2016-05,男,34,c02,c02,c02182,2018-08-21
3,199107,2016-07,女,25,c09,c09,c09046,2018-06-05
4,122560,2016-03,男,23,c05,c05,c05193,2018-04-02


In [14]:
# 检查数据规模和基本信息
print("User_info:", user_info.shape)
user_info.info()

User_info: (954209, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954209 entries, 0 to 954208
Data columns (total 8 columns):
user_id          954209 non-null int64
reg_mon          954209 non-null object
gender           954209 non-null object
age              954209 non-null int64
cell_province    954209 non-null object
id_province      954209 non-null object
id_city          954209 non-null object
insertdate       954209 non-null object
dtypes: int64(2), object(6)
memory usage: 58.2+ MB


### 3.2 数据分析

#### 离散性数据

In [15]:
# 统计日期分布，并查看id是否有重复
feat_names = user_info.columns.tolist()
feat_names.remove('age')
for col in feat_names:
    print('\n%s特征的不同取值和出现次数' % col)
    print(user_info[col].value_counts())


user_id特征的不同取值和出现次数
670044    3
567844    3
799738    3
66268     3
336069    3
369333    3
315646    3
735926    3
216129    3
624751    3
594684    3
591491    3
507295    3
685862    2
37273     2
355901    2
849804    2
167772    2
456900    2
228513    2
33179     2
117004    2
460094    2
205364    2
662218    2
815613    2
619758    2
656073    2
833412    2
39320     2
         ..
263717    1
265764    1
314940    1
312893    1
319038    1
316991    1
499286    1
493141    1
495188    1
505427    1
507474    1
501329    1
503376    1
480847    1
482894    1
476749    1
478796    1
489035    1
491082    1
484937    1
486984    1
464455    1
466502    1
460357    1
462404    1
472643    1
474690    1
468545    1
470592    1
2049      1
Name: user_id, Length: 928195, dtype: int64

reg_mon特征的不同取值和出现次数
2017-08    40940
2017-03    38921
2017-09    38778
2017-07    38210
2017-05    37539
2017-06    37439
2017-04    35813
2017-10    35175
2017-01    31991
2016-11    30042
2017-11    3

#### 数值型数据

In [16]:
#查看数值型特征的基本统计量
num_feats = ['age']
user_info[num_feats].describe()

Unnamed: 0,age
count,954209.0
mean,29.473142
std,7.047088
min,18.0
25%,24.0
50%,28.0
75%,33.0
max,56.0


平均年龄29岁，最小18岁，最大56岁

## 四、user_taglist

### 4.1 读取数据

In [17]:
# 数据路径
dpath = './data/'
filename = "user_taglist.csv"

# 导入数据
user_taglist = pd.read_csv(dpath + filename)

In [18]:
user_taglist.head()

Unnamed: 0,user_id,taglist,insertdate
0,113401,4707|473|3498|4759|1654|298|2869|1164|212|1885...,2018-10-03
1,378358,751|2207|1100|2099|1832|1911|5347|2254|171|360...,2018-11-30
2,434838,877|3795|5628|70|2684|691|719|4228|631|1541|12...,2018-03-25
3,577061,2431|3242|340|1823|4020|4357|164|620|2168|1192...,2018-05-25
4,566753,3980|3125|1819|1333|1177|3972|621|5800|3632|16...,2018-12-02


In [19]:
# 检查数据规模和基本信息
print("user_taglist:", user_taglist.shape)
user_taglist.info()

user_taglist: (615160, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615160 entries, 0 to 615159
Data columns (total 3 columns):
user_id       615160 non-null int64
taglist       615160 non-null object
insertdate    615160 non-null object
dtypes: int64(1), object(2)
memory usage: 14.1+ MB


可以用tfidf做一个词频统计

### 4.2 数据分析

In [20]:
# 统计日期分布，并查看id是否有重复
feat_names = user_taglist.columns.tolist()
feat_names.remove('taglist')
for col in feat_names:
    print('\n%s特征的不同取值和出现次数' % col)
    print(user_taglist[col].value_counts())


user_id特征的不同取值和出现次数
515666    8
725262    8
340792    7
165520    7
416691    7
745034    7
461817    7
415706    6
87342     6
646213    6
228592    6
659898    6
402416    6
343124    6
644452    6
809262    6
410221    6
412676    6
666379    6
151004    6
30085     6
192923    6
441248    6
184419    6
208891    6
832234    6
366877    6
733352    6
685483    6
201241    6
         ..
401290    1
395145    1
130934    1
98150     1
96103     1
67433     1
73578     1
71531     1
75629     1
81774     1
79727     1
118640    1
122738    1
120691    1
126836    1
102264    1
397192    1
100217    1
106362    1
104315    1
110460    1
114558    1
112511    1
411521    1
417666    1
415619    1
421764    1
419717    1
423815    1
2049      1
Name: user_id, Length: 535380, dtype: int64

insertdate特征的不同取值和出现次数
2018-12-06    3225
2018-10-31    3204
2018-07-30    3178
2018-12-10    3040
2018-04-12    2854
2018-05-16    2721
2018-11-30    2685
2018-04-11    2652
2018-05-15    2613
2018-12-

## 五、user_behavior_logs

### 5.1 读取数据

In [21]:
# 数据路径
dpath = './data/'
filename = "user_behavior_logs.csv"

# 导入数据
user_behavior_logs = pd.read_csv(dpath + filename)

In [22]:
user_behavior_logs.head()

Unnamed: 0,user_id,behavior_time,behavior_type
0,842439,2018-09-13 23:17:21,3
1,842439,2018-09-13 23:17:21,3
2,905214,2018-09-13 15:19:30,3
3,905214,2018-09-13 15:19:30,3
4,842439,2018-09-13 23:17:21,3


In [23]:
# 检查数据规模和基本信息
print("user_behavior_logs:", user_behavior_logs.shape)
user_behavior_logs.info()

user_behavior_logs: (55781271, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55781271 entries, 0 to 55781270
Data columns (total 3 columns):
user_id          int64
behavior_time    object
behavior_type    int64
dtypes: int64(2), object(1)
memory usage: 1.2+ GB


### 5.2 数据分析

In [24]:
# 统计日期分布，并查看id是否有重复
feat_names = user_behavior_logs.columns.tolist()
for col in feat_names:
    print('\n%s特征的不同取值和出现次数' % col)
    print(user_behavior_logs[col].value_counts())


user_id特征的不同取值和出现次数
234311    40575
25799      8597
167141     8287
768034     7767
716991     6955
913109     5381
300045     4303
414157     3621
478684     3250
290479     3209
552003     2948
504933     2726
24892      2714
153072     2649
223856     2634
435365     2501
682007     2358
905608     2308
914964     2155
273557     2127
539106     2079
612504     2030
447696     1982
926351     1951
648124     1896
832390     1867
85596      1800
119648     1682
353162     1672
189804     1586
          ...  
33516         1
409068        1
313271        1
493492        1
83740         1
119239        1
717572        1
25776         1
448546        1
145585        1
687987        1
619337        1
39337         1
317538        1
542263        1
245550        1
292485        1
163678        1
70811         1
405885        1
449496        1
804554        1
134349        1
800222        1
131645        1
295661        1
296387        1
311723        1
112016        1
203071        1
Nam

## 六、user_repay_logs

### 6.1 读取数据

In [25]:
# 数据路径
dpath = './data/'
filename = "user_repay_logs.csv"

# 导入数据
user_repay_logs = pd.read_csv(dpath + filename)

In [26]:
user_repay_logs.head()

Unnamed: 0,user_id,listing_id,order_id,due_date,due_amt,repay_date,repay_amt
0,748483,1858122,6,2018-06-29,528.6365,2018-06-20,528.6365
1,748483,1858122,4,2018-04-29,528.6365,2200-01-01,528.6365
2,748483,1858122,7,2018-07-29,528.6365,2018-06-20,528.6365
3,748483,1858122,5,2018-05-29,528.6365,2018-05-29,528.6365
4,748483,1858122,1,2018-01-29,528.6365,2018-01-28,528.6365


In [27]:
# 检查数据规模和基本信息
print("user_repay_logs:", user_repay_logs.shape)
user_repay_logs.info()

user_repay_logs: (18001297, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18001297 entries, 0 to 18001296
Data columns (total 7 columns):
user_id       int64
listing_id    int64
order_id      int64
due_date      object
due_amt       float64
repay_date    object
repay_amt     float64
dtypes: float64(2), int64(3), object(2)
memory usage: 961.4+ MB


### 6.2 数据分析

#### 离散型数据

In [28]:
# 统计日期分布，并查看id是否有重复
feat_names = user_repay_logs.columns.tolist()
feat_names.remove('due_amt')
feat_names.remove('repay_amt')
for col in feat_names:
    print('\n%s特征的不同取值和出现次数' % col)
    print(user_repay_logs[col].value_counts())


user_id特征的不同取值和出现次数
531641    748
539106    723
712075    641
218560    563
35086     554
449599    550
491698    532
210779    531
74810     512
413045    512
655008    504
858506    492
274177    491
341282    463
114491    453
590583    451
200432    450
333375    448
79343     447
441496    443
129621    441
301729    439
809262    435
377856    435
734560    423
196376    422
368273    417
605998    414
297239    410
127537    409
         ... 
914846      1
432691      1
882062      1
627241      1
836278      1
823950      1
926647      1
908914      1
731941      1
145247      1
389171      1
692222      1
410386      1
522658      1
770109      1
914847      1
882063      1
581059      1
927112      1
700888      1
690531      1
259943      1
88820       1
405576      1
143560      1
18490       1
586128      1
215130      1
141558      1
367977      1
Name: user_id, Length: 874841, dtype: int64

listing_id特征的不同取值和出现次数
4793920    12
4454752    12
4259880    12
2004037    12
4

#### 数值型数据

In [29]:
#查看数值型特征的基本统计量
num_feats = ['due_amt','repay_amt']
user_repay_logs[num_feats].describe()

Unnamed: 0,due_amt,repay_amt
count,18001300.0,18001300.0
mean,428.1301,428.1301
std,419.6448,419.6448
min,17.4255,17.4255
25%,176.9331,176.9331
50%,293.4287,293.4287
75%,528.6365,528.6365
max,18827.02,18827.02


In [37]:
# 查看所有未还款标的
n = []
for index, row in user_repay_logs.iterrows():
    if row['repay_date'] == '2200-01-01':
        n.append(index)

In [39]:
user_repay_logs.iloc[n].head()

Unnamed: 0,user_id,listing_id,order_id,due_date,due_amt,repay_date,repay_amt
1,748483,1858122,4,2018-04-29,528.6365,2200-01-01,528.6365
46,237430,4058618,1,2018-08-23,212.525,2200-01-01,212.525
66,139770,959357,3,2017-12-12,190.3352,2200-01-01,190.3352
110,214405,1145618,2,2017-12-09,760.3875,2200-01-01,760.3875
132,495219,3766469,1,2018-07-17,92.5027,2200-01-01,92.5027


这些可能是逾期还款，而不是未还款。