In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [2]:
# Load the raw instalment-payment history (one row per recorded payment).
raw_csv_path = "installments_payments.csv"
instalments = pd.read_csv(raw_csv_path)

##### "Home Credit" 提供每位客戶「每筆」貸款的分期還款資料
##### 一位客戶可能有多筆貸款資料(1個SK_ID_CURR可能對應多個不同的SK_ID_PREV)
##### 欄位包含：貸款版本、當期期數(第幾期)、當期貸款應繳日期(距今多少天前)、當期貸款入帳日期(距今多少天前)、當期貸款應繳金額、當期貸款入帳金額

In [3]:
# Preview the first rows of the raw table.
instalments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [4]:
# (rows, columns) of the raw table.
instalments.shape

(13605401, 8)

In [5]:
# Share of missing values per column; two columns contain missing values.
instalments.isna().mean()

SK_ID_PREV                0.000000
SK_ID_CURR                0.000000
NUM_INSTALMENT_VERSION    0.000000
NUM_INSTALMENT_NUMBER     0.000000
DAYS_INSTALMENT           0.000000
DAYS_ENTRY_PAYMENT        0.000214
AMT_INSTALMENT            0.000000
AMT_PAYMENT               0.000214
dtype: float64

In [6]:
# 2905 rows contain null values.
instalments.isna().sum()

SK_ID_PREV                   0
SK_ID_CURR                   0
NUM_INSTALMENT_VERSION       0
NUM_INSTALMENT_NUMBER        0
DAYS_INSTALMENT              0
DAYS_ENTRY_PAYMENT        2905
AMT_INSTALMENT               0
AMT_PAYMENT               2905
dtype: int64

In [7]:
# Quick look at the data after ordering it by customer, loan and instalment number.
# The sorted frame is only displayed here, not stored.
pd.set_option('display.max_rows', None)
preview_order = ['SK_ID_CURR', 'SK_ID_PREV', 'NUM_INSTALMENT_NUMBER']
instalments.sort_values(by=preview_order).head(10)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
1478621,1369693,100001,1.0,1,-1709.0,-1715.0,3951.0,3951.0
2568722,1369693,100001,1.0,2,-1679.0,-1715.0,3951.0,3951.0
3458712,1369693,100001,1.0,3,-1649.0,-1660.0,3951.0,3951.0
2624024,1369693,100001,2.0,4,-1619.0,-1628.0,17397.9,17397.9
1761012,1851984,100001,1.0,2,-2916.0,-2916.0,3982.05,3982.05
3774071,1851984,100001,1.0,3,-2886.0,-2875.0,3982.05,3982.05
3435373,1851984,100001,1.0,4,-2856.0,-2856.0,3980.925,3980.925
2144879,1038818,100002,1.0,1,-565.0,-587.0,9251.775,9251.775
2163032,1038818,100002,1.0,2,-535.0,-562.0,9251.775,9251.775
1675768,1038818,100002,1.0,3,-505.0,-529.0,9251.775,9251.775


In [8]:
# Keep the table ordered per customer and loan, then by instalment number,
# due day and payment day, so that it is easier to read.
pd.set_option('display.max_rows', None)
ordering_keys = [
    'SK_ID_CURR',
    'SK_ID_PREV',
    'NUM_INSTALMENT_NUMBER',
    'DAYS_INSTALMENT',
    'DAYS_ENTRY_PAYMENT',
]
instalments = instalments.sort_values(by=ordering_keys)
instalments.head(10)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
1478621,1369693,100001,1.0,1,-1709.0,-1715.0,3951.0,3951.0
2568722,1369693,100001,1.0,2,-1679.0,-1715.0,3951.0,3951.0
3458712,1369693,100001,1.0,3,-1649.0,-1660.0,3951.0,3951.0
2624024,1369693,100001,2.0,4,-1619.0,-1628.0,17397.9,17397.9
1761012,1851984,100001,1.0,2,-2916.0,-2916.0,3982.05,3982.05
3774071,1851984,100001,1.0,3,-2886.0,-2875.0,3982.05,3982.05
3435373,1851984,100001,1.0,4,-2856.0,-2856.0,3980.925,3980.925
2144879,1038818,100002,1.0,1,-565.0,-587.0,9251.775,9251.775
2163032,1038818,100002,1.0,2,-535.0,-562.0,9251.775,9251.775
1675768,1038818,100002,1.0,3,-505.0,-529.0,9251.775,9251.775


In [9]:
# Days between the actual payment and the due date:
# positive = paid late, negative = paid early.
instalments['DAYS_ENTRY_DIFF'] = instalments['DAYS_ENTRY_PAYMENT'].sub(instalments['DAYS_INSTALMENT'])
# Amount paid minus amount due: positive = overpaid, negative = underpaid.
instalments['AMT_PAY_DIFF'] = instalments['AMT_PAYMENT'].sub(instalments['AMT_INSTALMENT'])
# Late-payment flag: 1 = paid after the due date, 0 = not late.
# NOTE(review): rows with a missing DAYS_ENTRY_PAYMENT have a NaN difference,
# and NaN > 0 is False, so they are flagged 0 here. Confirm this is intended.
instalments['DELAY'] = instalments['DAYS_ENTRY_DIFF'].gt(0).astype(int)
instalments.head(100)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,DAYS_ENTRY_DIFF,AMT_PAY_DIFF,DELAY
1478621,1369693,100001,1.0,1,-1709.0,-1715.0,3951.0,3951.0,-6.0,0.0,0
2568722,1369693,100001,1.0,2,-1679.0,-1715.0,3951.0,3951.0,-36.0,0.0,0
3458712,1369693,100001,1.0,3,-1649.0,-1660.0,3951.0,3951.0,-11.0,0.0,0
2624024,1369693,100001,2.0,4,-1619.0,-1628.0,17397.9,17397.9,-9.0,0.0,0
1761012,1851984,100001,1.0,2,-2916.0,-2916.0,3982.05,3982.05,0.0,0.0,0
3774071,1851984,100001,1.0,3,-2886.0,-2875.0,3982.05,3982.05,11.0,0.0,1
3435373,1851984,100001,1.0,4,-2856.0,-2856.0,3980.925,3980.925,0.0,0.0,0
2144879,1038818,100002,1.0,1,-565.0,-587.0,9251.775,9251.775,-22.0,0.0,0
2163032,1038818,100002,1.0,2,-535.0,-562.0,9251.775,9251.775,-27.0,0.0,0
1675768,1038818,100002,1.0,3,-505.0,-529.0,9251.775,9251.775,-24.0,0.0,0


In [10]:
# Shape after adding the three derived columns.
instalments.shape

(13605401, 11)

新增以下欄位
- 該筆貸款紀錄中的最大、最小還款期數 (原始資料可能有缺，最大還款期數待與previous_application內的資料驗證) (NUM_INSTALMENT_NUMBER_GROUP_MAX,NUM_INSTALMENT_NUMBER_GROUP_MIN)
- 該筆貸款最後還款日為何時 (DAYS_ENTRY_PAYMENT_MAX)
- 該筆貸款最早還款日為何時 (DAYS_ENTRY_PAYMENT_MIN)
- 每筆貸款最多延遲幾天 (DAYS_ENTRY_DIFF_MAX)
- 每筆貸款平均延遲幾天 (DAYS_ENTRY_DIFF_MEAN)
- 每筆貸款的每期還款金額中位數 (AMT_PAYMENT_MEDIAN)
- 每筆貸款的每期還款金額最大值 (AMT_PAYMENT_MAX)
- 延遲的期數 (DELAY_SUM)
- 延遲的期數占該筆貸款總期數多少比例 (DELAY%)

In [11]:
# "Unimportant repayment" ratio = amount actually paid on an instalment
# divided by the amount prescribed for that instalment.
# NOTE(review): a zero AMT_INSTALMENT would produce inf/NaN here. Confirm whether such rows exist.
instalments['UNIMPORTANT_REPAYMENT'] = instalments['AMT_PAYMENT'].div(instalments['AMT_INSTALMENT'])

In [12]:
# Restore only the display option this notebook changed earlier.
# pd.reset_option('all') would also reset every unrelated pandas option and
# makes pandas print deprecation warnings for its deprecated option names.
pd.reset_option('display.max_rows')

# Rows where less than 2% of the prescribed instalment amount was actually paid.
TINY_REPAYMENT_RATIO = 0.02
instalments[instalments['UNIMPORTANT_REPAYMENT'] < TINY_REPAYMENT_RATIO]

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,DAYS_ENTRY_DIFF,AMT_PAY_DIFF,DELAY,UNIMPORTANT_REPAYMENT
2328091,1940724,100007,1.0,1,-1076.0,-1106.0,22678.785,23.130,-30.0,-22655.655,0,0.001020
2656012,2462742,100007,1.0,10,-1466.0,-1497.0,3601.530,3.240,-31.0,-3598.290,0,0.000900
1757327,2462742,100007,1.0,12,-1406.0,-1436.0,3577.050,0.180,-30.0,-3576.870,0,0.000050
2358618,1907290,100008,1.0,10,-2029.0,-712.0,11986.155,227.160,1317.0,-11758.995,1,0.018952
1647439,2794143,100011,1.0,24,-469.0,-498.0,31295.250,461.565,-29.0,-30833.685,0,0.014749
...,...,...,...,...,...,...,...,...,...,...,...,...
12775776,2073384,456255,1.0,1,-618.0,-648.0,16400.610,34.965,-30.0,-16365.645,0,0.002132
13146798,2073384,456255,1.0,2,-588.0,-613.0,16400.610,81.495,-25.0,-16319.115,0,0.004969
13266002,2631384,456255,3.0,18,-246.0,-266.0,27489.690,110.880,-20.0,-27378.810,0,0.004034
12734779,2631384,456255,3.0,20,-186.0,-221.0,27489.690,447.255,-35.0,-27042.435,0,0.016270


In [13]:
# Collapse to one row per (customer, loan, instalment number). The largest
# delay in days represents the instalment, and DELAY becomes 1 if any of its
# payment rows was late.
# NOTE(review): when one instalment has several payment rows, AMT_PAYMENT is
# summarised with median/max rather than summed. Confirm this is intended.
per_instalment_spec = {
    'DAYS_ENTRY_PAYMENT': ['max', 'min'],
    'DAYS_ENTRY_DIFF': 'max',
    'AMT_PAYMENT': ['median', 'max'],
    'DELAY': 'max',
}
instalment_keys = ['SK_ID_CURR', 'SK_ID_PREV', 'NUM_INSTALMENT_NUMBER']
instalments = instalments.groupby(instalment_keys).agg(per_instalment_spec)
pd.set_option('display.max_rows', None)
instalments.head(200)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,DAYS_ENTRY_PAYMENT,DAYS_ENTRY_PAYMENT,DAYS_ENTRY_DIFF,AMT_PAYMENT,AMT_PAYMENT,DELAY
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,max,min,max,median,max,max
SK_ID_CURR,SK_ID_PREV,NUM_INSTALMENT_NUMBER,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
100001,1369693,1,-1715.0,-1715.0,-6.0,3951.0,3951.0,0
100001,1369693,2,-1715.0,-1715.0,-36.0,3951.0,3951.0,0
100001,1369693,3,-1660.0,-1660.0,-11.0,3951.0,3951.0,0
100001,1369693,4,-1628.0,-1628.0,-9.0,17397.9,17397.9,0
100001,1851984,2,-2916.0,-2916.0,0.0,3982.05,3982.05,0
100001,1851984,3,-2875.0,-2875.0,11.0,3982.05,3982.05,1
100001,1851984,4,-2856.0,-2856.0,0.0,3980.925,3980.925,0
100002,1038818,1,-587.0,-587.0,-22.0,9251.775,9251.775,0
100002,1038818,2,-562.0,-562.0,-27.0,9251.775,9251.775,0
100002,1038818,3,-529.0,-529.0,-24.0,9251.775,9251.775,0


In [14]:
# Move the instalment number out of the row index into an ordinary column;
# SK_ID_CURR and SK_ID_PREV remain as the index.
instalments = instalments.reset_index(level='NUM_INSTALMENT_NUMBER')

In [15]:
# Replace the two-level column header with flat names (same column order).
flat_names = [
    'NUM_INSTALMENT_NUMBER',
    'DAYS_ENTRY_PAYMENT_MAX',   # DAYS_ENTRY_PAYMENT / max
    'DAYS_ENTRY_PAYMENT_MIN',   # DAYS_ENTRY_PAYMENT / min
    'DAYS_ENTRY_DIFF',          # DAYS_ENTRY_DIFF / max
    'AMT_PAYMENT_MEDIAN',       # AMT_PAYMENT / median
    'AMT_PAYMENT_MAX',          # AMT_PAYMENT / max
    'DELAY',                    # DELAY / max
]
instalments.columns = flat_names

In [16]:
# Preview the per-instalment table with its flattened column names.
instalments.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,NUM_INSTALMENT_NUMBER,DAYS_ENTRY_PAYMENT_MAX,DAYS_ENTRY_PAYMENT_MIN,DAYS_ENTRY_DIFF,AMT_PAYMENT_MEDIAN,AMT_PAYMENT_MAX,DELAY
SK_ID_CURR,SK_ID_PREV,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100001,1369693,1,-1715.0,-1715.0,-6.0,3951.0,3951.0,0
100001,1369693,2,-1715.0,-1715.0,-36.0,3951.0,3951.0,0
100001,1369693,3,-1660.0,-1660.0,-11.0,3951.0,3951.0,0
100001,1369693,4,-1628.0,-1628.0,-9.0,17397.9,17397.9,0
100001,1851984,2,-2916.0,-2916.0,0.0,3982.05,3982.05,0


In [17]:
# Confirm the flattened column names.
instalments.columns

Index(['NUM_INSTALMENT_NUMBER', 'DAYS_ENTRY_PAYMENT_MAX',
       'DAYS_ENTRY_PAYMENT_MIN', 'DAYS_ENTRY_DIFF', 'AMT_PAYMENT_MEDIAN',
       'AMT_PAYMENT_MAX', 'DELAY'],
      dtype='object')

In [18]:
# Rename the instalment-number column before it is aggregated per loan.
instalments = instalments.rename(
    columns={'NUM_INSTALMENT_NUMBER': 'NUM_INSTALMENT_NUMBER_GROUP'}
)

In [19]:
# Confirm the rename took effect.
instalments.columns

Index(['NUM_INSTALMENT_NUMBER_GROUP', 'DAYS_ENTRY_PAYMENT_MAX',
       'DAYS_ENTRY_PAYMENT_MIN', 'DAYS_ENTRY_DIFF', 'AMT_PAYMENT_MEDIAN',
       'AMT_PAYMENT_MAX', 'DELAY'],
      dtype='object')

In [20]:
# Collapse to one row per (customer, loan): first/last instalment number seen,
# earliest/latest payment day, worst and mean delay, payment-amount summaries,
# and the number of instalments flagged as delayed.
loan_level_spec = {
    'NUM_INSTALMENT_NUMBER_GROUP': ['max', 'min'],
    'DAYS_ENTRY_PAYMENT_MAX': 'max',
    'DAYS_ENTRY_PAYMENT_MIN': 'min',
    'DAYS_ENTRY_DIFF': ['max', 'mean'],
    'AMT_PAYMENT_MEDIAN': 'median',
    'AMT_PAYMENT_MAX': 'max',
    'DELAY': 'sum',
}
instalments = instalments.groupby(['SK_ID_CURR', 'SK_ID_PREV']).agg(loan_level_spec)

In [21]:
# Inspect the two-level column index produced by the loan-level aggregation.
instalments.columns

MultiIndex([('NUM_INSTALMENT_NUMBER_GROUP',    'max'),
            ('NUM_INSTALMENT_NUMBER_GROUP',    'min'),
            (     'DAYS_ENTRY_PAYMENT_MAX',    'max'),
            (     'DAYS_ENTRY_PAYMENT_MIN',    'min'),
            (            'DAYS_ENTRY_DIFF',    'max'),
            (            'DAYS_ENTRY_DIFF',   'mean'),
            (         'AMT_PAYMENT_MEDIAN', 'median'),
            (            'AMT_PAYMENT_MAX',    'max'),
            (                      'DELAY',    'sum')],
           )

In [22]:
# Flat names for the loan-level columns, in the order the aggregation produced them.
loan_level_names = [
    'NUM_INSTALMENT_NUMBER_GROUP_MAX',  # NUM_INSTALMENT_NUMBER_GROUP / max
    'NUM_INSTALMENT_NUMBER_GROUP_MIN',  # NUM_INSTALMENT_NUMBER_GROUP / min
    'DAYS_ENTRY_PAYMENT_MAX',           # DAYS_ENTRY_PAYMENT_MAX / max
    'DAYS_ENTRY_PAYMENT_MIN',           # DAYS_ENTRY_PAYMENT_MIN / min
    'DAYS_ENTRY_DIFF_MAX',              # DAYS_ENTRY_DIFF / max
    'DAYS_ENTRY_DIFF_MEAN',             # DAYS_ENTRY_DIFF / mean
    'AMT_PAYMENT_MEDIAN',               # AMT_PAYMENT_MEDIAN / median
    'AMT_PAYMENT_MAX',                  # AMT_PAYMENT_MAX / max
    'DELAY_SUM',                        # DELAY / sum
]
instalments.columns = loan_level_names

In [23]:
# Turn the customer and loan identifiers back into ordinary columns.
instalments = instalments.reset_index(level=['SK_ID_CURR', 'SK_ID_PREV'])

In [24]:
# Fraction of a loan's instalments that were delayed. The denominator is the
# span between the smallest and largest instalment number observed.
# NOTE(review): if instalment numbers are missing inside that span, the
# denominator exceeds the number of instalments actually recorded. Confirm this is acceptable.
instalment_span = (
    instalments['NUM_INSTALMENT_NUMBER_GROUP_MAX']
    - instalments['NUM_INSTALMENT_NUMBER_GROUP_MIN']
    + 1
)
instalments['DELAY%'] = instalments['DELAY_SUM'] / instalment_span

In [25]:
# Preview the loan-level table (up to 100 rows).
instalments.head(100)

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,NUM_INSTALMENT_NUMBER_GROUP_MAX,NUM_INSTALMENT_NUMBER_GROUP_MIN,DAYS_ENTRY_PAYMENT_MAX,DAYS_ENTRY_PAYMENT_MIN,DAYS_ENTRY_DIFF_MAX,DAYS_ENTRY_DIFF_MEAN,AMT_PAYMENT_MEDIAN,AMT_PAYMENT_MAX,DELAY_SUM,DELAY%
0,100001,1369693,4,1,-1628.0,-1715.0,-6.0,-15.5,3951.0,17397.9,0,0.0
1,100001,1851984,4,2,-2856.0,-2916.0,11.0,3.666667,3982.05,3982.05,1,0.333333
2,100002,1038818,19,1,-49.0,-587.0,-12.0,-20.421053,9251.775,53093.745,0,0.0
3,100003,1810518,7,1,-544.0,-719.0,-3.0,-4.428571,98356.995,560835.36,0,0.0
4,100003,2396755,12,1,-1985.0,-2324.0,-1.0,-6.75,6737.31,6737.31,0,0.0
5,100003,2636178,6,1,-661.0,-806.0,-9.0,-11.166667,64567.665,64567.665,0,0.0
6,100004,1564014,3,1,-727.0,-795.0,-3.0,-7.666667,5357.25,10573.965,0,0.0
7,100005,2495675,9,1,-470.0,-736.0,1.0,-23.555556,4813.2,17656.245,1,0.111111
8,100006,2078043,1,1,-174.0,-174.0,-23.0,-23.0,691786.89,691786.89,0,0.0
9,100006,2190416,10,1,-12.0,-285.0,-1.0,-4.5,29027.52,29027.52,0,0.0


In [26]:
# Count missing values per column after the loan-level aggregation.
instalments.isna().sum()

SK_ID_CURR                          0
SK_ID_PREV                          0
NUM_INSTALMENT_NUMBER_GROUP_MAX     0
NUM_INSTALMENT_NUMBER_GROUP_MIN     0
DAYS_ENTRY_PAYMENT_MAX             78
DAYS_ENTRY_PAYMENT_MIN             78
DAYS_ENTRY_DIFF_MAX                78
DAYS_ENTRY_DIFF_MEAN               78
AMT_PAYMENT_MEDIAN                 78
AMT_PAYMENT_MAX                    78
DELAY_SUM                           0
DELAY%                              0
dtype: int64

In [27]:
# Loan-level columns that can still be missing after the aggregation.
columns_to_fill = [
    'DAYS_ENTRY_PAYMENT_MAX',
    'DAYS_ENTRY_PAYMENT_MIN',
    'DAYS_ENTRY_DIFF_MAX',
    'DAYS_ENTRY_DIFF_MEAN',
    'AMT_PAYMENT_MEDIAN',
    'AMT_PAYMENT_MAX',
]

# Replace the missing values in exactly those columns with 0.
# NOTE(review): for the day-based columns 0 reads as "paid today" / "paid on
# the due date". Confirm that this is the intended meaning for these loans.
zero_fill = dict.fromkeys(columns_to_fill, 0)
instalments = instalments.fillna(value=zero_fill)

In [28]:
# Verify that no missing values remain after the fill.
instalments.isna().sum()

SK_ID_CURR                         0
SK_ID_PREV                         0
NUM_INSTALMENT_NUMBER_GROUP_MAX    0
NUM_INSTALMENT_NUMBER_GROUP_MIN    0
DAYS_ENTRY_PAYMENT_MAX             0
DAYS_ENTRY_PAYMENT_MIN             0
DAYS_ENTRY_DIFF_MAX                0
DAYS_ENTRY_DIFF_MEAN               0
AMT_PAYMENT_MEDIAN                 0
AMT_PAYMENT_MAX                    0
DELAY_SUM                          0
DELAY%                             0
dtype: int64

In [29]:
# Shape of the loan-level table: one row per (SK_ID_CURR, SK_ID_PREV) pair.
instalments.shape

(997752, 12)

In [30]:
# List the loan-level column names before aggregating per customer.
print(instalments.columns)

Index(['SK_ID_CURR', 'SK_ID_PREV', 'NUM_INSTALMENT_NUMBER_GROUP_MAX',
       'NUM_INSTALMENT_NUMBER_GROUP_MIN', 'DAYS_ENTRY_PAYMENT_MAX',
       'DAYS_ENTRY_PAYMENT_MIN', 'DAYS_ENTRY_DIFF_MAX', 'DAYS_ENTRY_DIFF_MEAN',
       'AMT_PAYMENT_MEDIAN', 'AMT_PAYMENT_MAX', 'DELAY_SUM', 'DELAY%'],
      dtype='object')


In [40]:
# Loan-level features that are averaged across a customer's loans.
mean_columns = [
    'NUM_INSTALMENT_NUMBER_GROUP_MAX',
    'NUM_INSTALMENT_NUMBER_GROUP_MIN',
    'DAYS_ENTRY_PAYMENT_MAX',
    'DAYS_ENTRY_PAYMENT_MIN',
    'DAYS_ENTRY_DIFF_MAX',
    'DAYS_ENTRY_DIFF_MEAN',
    'AMT_PAYMENT_MEDIAN',
    'AMT_PAYMENT_MAX',
]

# Customer-level aggregation spec; key order fixes the output column order.
# NOTE(review): 'first' keeps only the first loan id found for each customer,
# so SK_ID_PREV no longer identifies every loan behind the row. Confirm
# that downstream code expects this.
aggregations = {'SK_ID_PREV': 'first'}
aggregations.update({column: 'mean' for column in mean_columns})
aggregations['DELAY_SUM'] = 'sum'   # total delayed instalments over all loans
aggregations['DELAY%'] = 'mean'     # average delayed share per loan

# One row per customer.
per_customer = instalments.groupby('SK_ID_CURR').agg(aggregations)
instalments = per_customer.reset_index()

instalments.head(10)

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,NUM_INSTALMENT_NUMBER_GROUP_MAX,NUM_INSTALMENT_NUMBER_GROUP_MIN,DAYS_ENTRY_PAYMENT_MAX,DAYS_ENTRY_PAYMENT_MIN,DAYS_ENTRY_DIFF_MAX,DAYS_ENTRY_DIFF_MEAN,AMT_PAYMENT_MEDIAN,AMT_PAYMENT_MAX,DELAY_SUM,DELAY%
0,100001,1369693,4.0,1.5,-2242.0,-2315.5,2.5,-5.916667,3966.525,10689.975,1,0.166667
1,100002,1038818,19.0,1.0,-49.0,-587.0,-12.0,-20.421053,9251.775,53093.745,0,0.0
2,100003,1810518,8.333333,1.0,-1063.333333,-1283.0,-4.333333,-7.448413,56553.99,210713.445,0,0.0
3,100004,1564014,3.0,1.0,-727.0,-795.0,-3.0,-7.666667,5357.25,10573.965,0,0.0
4,100005,2495675,9.0,1.0,-470.0,-736.0,1.0,-23.555556,4813.2,17656.245,1,0.111111
5,100006,2078043,5.333333,1.0,-208.0,-344.666667,-12.333333,-25.3,241099.11,245324.685,0,0.0
6,100007,1692033,12.6,1.0,-917.4,-1270.4,3.8,-1.597843,12132.369,12132.369,16,0.3
7,100008,1186888,8.5,1.0,-892.25,-1451.0,322.75,20.579167,15839.69625,117566.87625,1,0.025
8,100009,1112703,6.5,1.125,-815.375,-974.75,-5.125,-9.433333,9564.395625,9565.048125,1,0.03125
9,100010,2349489,10.0,1.0,-774.0,-1069.0,-5.0,-11.9,27463.41,27463.41,0,0.0


In [32]:
# Shape of the customer-level table: one row per SK_ID_CURR.
instalments.shape

(339587, 12)

In [37]:
# Write the customer-level feature table to CSV without the row index.
output_csv_path = 'instalments_payments_ETL_v2.csv'
instalments.to_csv(output_csv_path, index=False)