In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
pos_cash = pd.read_csv('POS_CASH_balance.csv')

In [3]:
pos_cash.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [4]:
pos_cash.shape

(10001358, 8)

In [5]:
# Total_Months --> 總還款期數
# MONTHS_BALANCE_start --> 貸款開始時間
# MONTHS_BALANCE_finish --> 貸款結束時間
# CNT_INSTALMENT_max --> 最大申請期數(擬定合約期數)
# CNT_INSTALMENT_min --> 最小申請期數(中間改合約)
# CNT_INSTALMENT_median --> 申請期數中位數
# Delay_Rate --> 遲繳比例(ex 共四期 有一期遲繳就是1/4)
# SK_DPD_max --> 貸款愈期最大天數
# SK_DPD_mean --> 貸款逾期平均數
# Contract_Change -- > 提早繳完 且 不在目前進行中的貸款
# Contract_Change_count --> 合約縮短了多少期
# CNT_INSTALMENT/Total_Months_rate --> 申請期數與總還款期數比例
# Contract_Change_rate(CNT_INSTALMENT) --> 縮短了多少期除以申請期數
# Contract_Change_rate(Total_Months) --> 縮短了多少期除以總還款期數

# 將有特殊狀態的欄位新增 (onehot)
categorical_columns = []
for name in pos_cash:
    if pos_cash[name].dtype=='object':
        categorical_columns.append(name)
pos_cash = pd.get_dummies(pos_cash, columns = categorical_columns)

pos_cash.rename(columns={'NAME_CONTRACT_STATUS_Active':'Active',
                     'NAME_CONTRACT_STATUS_Amortized debt':'Amortized debt',
                    'NAME_CONTRACT_STATUS_Approved':'Approved',
                    'NAME_CONTRACT_STATUS_Canceled':'Canceled',
                    'NAME_CONTRACT_STATUS_Completed':'Completed',
                    'NAME_CONTRACT_STATUS_Demand':'Demand',
                    'NAME_CONTRACT_STATUS_Returned to the store':'Returned to the store',
                    'NAME_CONTRACT_STATUS_Signed':'Signed',
                    'NAME_CONTRACT_STATUS_XNA':'XNA'},inplace=True)

# 新增欄位

pos_cash['Delay']=(pos_cash['SK_DPD']>0).replace(True,1) # 該期貸款有愈期，1為有延遲，0為沒延遲
pos_cash['SK_DPD_mean']=pos_cash['SK_DPD'] # 要算平均
pos_cash['CNT_INSTALMENT_min']=pos_cash['CNT_INSTALMENT'] # 申請期最小值
pos_cash['CNT_INSTALMENT_median']=pos_cash['CNT_INSTALMENT'] # 申請期的中位數
pos_cash['MONTHS_BALANCE_start']=pos_cash['MONTHS_BALANCE'] # 貸款開始時間
pos_cash['MONTHS_BALANCE_finish']=pos_cash['MONTHS_BALANCE'] # 貸款結束時間


# 創造新的欄位 
num_aggregations = {
        'MONTHS_BALANCE':'count',
        'MONTHS_BALANCE_start':'min',
        'MONTHS_BALANCE_finish':'max', 
        'CNT_INSTALMENT' : 'max',
        'CNT_INSTALMENT_min':'min',
        'CNT_INSTALMENT_median':'median',
        'Delay':'mean',
        'SK_DPD':'max',
        'SK_DPD_mean':'mean',
        'Completed':'max',
        'Active':'sum',
        'Signed':'sum',
        'Demand':'sum',
        'Returned to the store':'sum',
        'Approved':'sum',
        'Amortized debt':'sum',
        'Canceled':'sum',
        'XNA':'sum'
    }

pos_cash = pos_cash.groupby(['SK_ID_CURR',
                                    'SK_ID_PREV']).agg({**num_aggregations})
# 做更名
pos_cash.rename(columns={'MONTHS_BALANCE':'Total_Months',
                      'Delay':"Delay_Rate",
                      'SK_DPD':'SK_DPD_max',
                      'CNT_INSTALMENT':'CNT_INSTALMENT_max'},inplace=True)
# 重新排列
pos_cash.reset_index(level=('SK_ID_CURR',
                          'SK_ID_PREV'),inplace=True)

# 提早繳完 且 不在目前進行中的貸款
pos_cash['Contract_Change'] = ((pos_cash['Total_Months'] < pos_cash ['CNT_INSTALMENT_max']) & (pos_cash['Completed'] != 0)).replace(True,1)
# 合約縮短了多少期
pos_cash['Contract_Change_count'] = pos_cash['CNT_INSTALMENT_max']-pos_cash['CNT_INSTALMENT_min']
# 申請期數與總還款期數比例
pos_cash['CNT_INSTALMENT/Total_Months_rate'] = pos_cash['CNT_INSTALMENT_max']/pos_cash['Total_Months']
# 縮短了多少期除以申請期數
pos_cash['Contract_Change_rate(CNT_INSTALMENT)'] = pos_cash['Contract_Change_count']/pos_cash['CNT_INSTALMENT_max']
# 縮短了多少期除以總還款期數
pos_cash['Contract_Change_rate(Total_Months)'] = pos_cash['Contract_Change_count']/pos_cash['Total_Months']
# 狀態發生期數除以總還款期數
pos_cash['Active'] = pos_cash['Active']/pos_cash['Total_Months']
pos_cash['Signed'] = pos_cash['Signed']/pos_cash['Total_Months']
pos_cash['Demand'] = pos_cash['Demand']/pos_cash['Total_Months']
pos_cash['Returned to the store'] = pos_cash['Returned to the store']/pos_cash['Total_Months']
pos_cash['Approved'] = pos_cash['Approved']/pos_cash['Total_Months']
pos_cash['Amortized debt'] = pos_cash['Amortized debt']/pos_cash['Total_Months']
pos_cash['Canceled'] = pos_cash['Canceled']/pos_cash['Total_Months']
pos_cash['XNA'] = pos_cash['XNA']/pos_cash['Total_Months']
# POS_data_1 = POS_data_1.drop(['SK_ID_CURR'],axis=1)

In [10]:
pos_cash.tail(200)

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,Total_Months,MONTHS_BALANCE_start,MONTHS_BALANCE_finish,CNT_INSTALMENT_max,CNT_INSTALMENT_min,CNT_INSTALMENT_median,Delay_Rate,SK_DPD_max,...,Returned to the store,Approved,Amortized debt,Canceled,XNA,Contract_Change,Contract_Change_count,CNT_INSTALMENT/Total_Months_rate,Contract_Change_rate(CNT_INSTALMENT),Contract_Change_rate(Total_Months)
936125,456196,1940994,12,-76,-65,12.0,12.0,12.0,0.000000,0,...,0.0,0.0,0.0,0.0,0.0,False,0.0,1.000000,0.000000,0.000000
936126,456196,2732872,14,-15,-2,48.0,48.0,48.0,0.142857,4,...,0.0,0.0,0.0,0.0,0.0,False,0.0,3.428571,0.000000,0.000000
936127,456197,1028255,7,-26,-20,6.0,6.0,6.0,0.000000,0,...,0.0,0.0,0.0,0.0,0.0,False,0.0,0.857143,0.000000,0.000000
936128,456197,1285925,3,-4,-2,6.0,6.0,6.0,0.000000,0,...,0.0,0.0,0.0,0.0,0.0,False,0.0,2.000000,0.000000,0.000000
936129,456197,1292596,3,-96,-94,9.0,9.0,9.0,0.000000,0,...,0.0,0.0,0.0,0.0,0.0,1,0.0,3.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
936320,456255,1359084,9,-15,-7,12.0,8.0,12.0,0.000000,0,...,0.0,0.0,0.0,0.0,0.0,1,4.0,1.333333,0.333333,0.444444
936321,456255,1743609,11,-33,-23,12.0,10.0,12.0,0.090909,5,...,0.0,0.0,0.0,0.0,0.0,1,2.0,1.090909,0.166667,0.181818
936322,456255,2073384,5,-21,-17,24.0,3.0,24.0,0.000000,0,...,0.0,0.0,0.0,0.0,0.0,1,21.0,4.800000,0.875000,4.200000
936323,456255,2631384,25,-26,-2,36.0,24.0,36.0,0.000000,0,...,0.0,0.0,0.0,0.0,0.0,1,12.0,1.440000,0.333333,0.480000


In [7]:
pos_cash.isna().sum()

SK_ID_CURR                                0
SK_ID_PREV                                0
Total_Months                              0
MONTHS_BALANCE_start                      0
MONTHS_BALANCE_finish                     0
CNT_INSTALMENT_max                      890
CNT_INSTALMENT_min                      890
CNT_INSTALMENT_median                   890
Delay_Rate                                0
SK_DPD_max                                0
SK_DPD_mean                               0
Completed                                 0
Active                                    0
Signed                                    0
Demand                                    0
Returned to the store                     0
Approved                                  0
Amortized debt                            0
Canceled                                  0
XNA                                       0
Contract_Change                           0
Contract_Change_count                   890
CNT_INSTALMENT/Total_Months_rate

In [12]:
pos_cash.fillna(0, inplace=True)

In [13]:
pos_cash.isna().sum()

SK_ID_CURR                              0
SK_ID_PREV                              0
Total_Months                            0
MONTHS_BALANCE_start                    0
MONTHS_BALANCE_finish                   0
CNT_INSTALMENT_max                      0
CNT_INSTALMENT_min                      0
CNT_INSTALMENT_median                   0
Delay_Rate                              0
SK_DPD_max                              0
SK_DPD_mean                             0
Completed                               0
Active                                  0
Signed                                  0
Demand                                  0
Returned to the store                   0
Approved                                0
Amortized debt                          0
Canceled                                0
XNA                                     0
Contract_Change                         0
Contract_Change_count                   0
CNT_INSTALMENT/Total_Months_rate        0
Contract_Change_rate(CNT_INSTALMEN

In [14]:
pos_cash = pos_cash.groupby('SK_ID_CURR').mean().reset_index()

In [16]:
pos_cash.head(50)

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,Total_Months,MONTHS_BALANCE_start,MONTHS_BALANCE_finish,CNT_INSTALMENT_max,CNT_INSTALMENT_min,CNT_INSTALMENT_median,Delay_Rate,SK_DPD_max,...,Demand,Returned to the store,Approved,Amortized debt,Canceled,XNA,Contract_Change_count,CNT_INSTALMENT/Total_Months_rate,Contract_Change_rate(CNT_INSTALMENT),Contract_Change_rate(Total_Months)
0,100001,1610838.0,4.5,-76.5,-73.0,4.0,4.0,4.0,0.125,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.0,0.0
1,100002,1038818.0,19.0,-19.0,-1.0,24.0,24.0,24.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.263158,0.0,0.0
2,100003,2281150.0,9.333333,-43.333333,-35.0,10.0,8.333333,10.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.666667,1.083333,0.138889,0.208333
3,100004,1564014.0,4.0,-27.0,-24.0,4.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.25,0.25
4,100005,2495675.0,11.0,-25.0,-15.0,12.0,9.0,12.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.090909,0.25,0.272727
5,100006,2189263.0,7.0,-12.0,-6.0,24.0,6.0,8.333333,0.0,0.0,...,0.0,0.041667,0.0,0.0,0.0,0.0,18.0,6.233333,0.520833,5.513889
6,100007,2043343.0,13.2,-42.2,-30.0,15.2,15.0,15.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.2,1.169231,0.011111,0.011111
7,100008,1981561.0,20.75,-48.5,-28.75,14.0,8.5,14.0,0.195455,323.5,...,0.0,0.0,0.0,0.0,0.0,0.0,5.5,1.293561,0.183333,0.611111
8,100009,1816523.0,8.0,-32.375,-25.375,7.625,7.625,7.625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.345092,0.0,0.0
9,100010,2349489.0,11.0,-35.0,-25.0,10.0,10.0,10.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.909091,0.0,0.0


In [17]:
pos_cash = pos_cash.drop('SK_ID_PREV', axis=1)

In [18]:
pos_cash.head()

Unnamed: 0,SK_ID_CURR,Total_Months,MONTHS_BALANCE_start,MONTHS_BALANCE_finish,CNT_INSTALMENT_max,CNT_INSTALMENT_min,CNT_INSTALMENT_median,Delay_Rate,SK_DPD_max,SK_DPD_mean,...,Demand,Returned to the store,Approved,Amortized debt,Canceled,XNA,Contract_Change_count,CNT_INSTALMENT/Total_Months_rate,Contract_Change_rate(CNT_INSTALMENT),Contract_Change_rate(Total_Months)
0,100001,4.5,-76.5,-73.0,4.0,4.0,4.0,0.125,3.5,0.875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.0,0.0
1,100002,19.0,-19.0,-1.0,24.0,24.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.263158,0.0,0.0
2,100003,9.333333,-43.333333,-35.0,10.0,8.333333,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.666667,1.083333,0.138889,0.208333
3,100004,4.0,-27.0,-24.0,4.0,3.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.25,0.25
4,100005,11.0,-25.0,-15.0,12.0,9.0,12.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.090909,0.25,0.272727


In [19]:
pos_cash.to_csv('POS_CASH_balance_ETL_v1.csv', index=False)