In [1]:
# 合併 bureau.csv 和 average.csv(貸款狀況評分)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

head = ['userID', 'SK_ID_BUREAU', 'CA', 'CC', 'DC',
        'CDO', 'DE', 'DF', 'MO', 'CP', 'CS', 'CD', 'CL', 'CO', 'CT', 'CU', 'annuity']
df = pd.read_csv('./data/bureau.csv') 
bal_df = pd.read_csv('./data/average.csv')
br_df = df.copy(deep=True)
br_df.columns = head
br_df.drop(columns=['CC', 'DC', 'CDO', 'DE', 'DF',
                   'MO', 'CP', 'CL', 'CO', 'CU'], inplace=True)

br_df = br_df.merge(bal_df, on='SK_ID_BUREAU', how='left')
print(br_df.head())

   userID  SK_ID_BUREAU      CA         CS        CD               CT  \
0  215354       5714462  Closed    91323.0       0.0  Consumer credit   
1  215354       5714463  Active   225000.0  171342.0      Credit card   
2  215354       5714464  Active   464323.5       NaN  Consumer credit   
3  215354       5714465  Active    90000.0       NaN      Credit card   
4  215354       5714466  Active  2700000.0       NaN  Consumer credit   

   annuity  average  
0      NaN      NaN  
1      NaN      NaN  
2      NaN      NaN  
3      NaN      NaN  
4      NaN      NaN  


In [2]:
# Credit Type 名稱冗長，重新命名
print(f"命名前一共 {len(list(set(br_df['CT'])))} 個類別")
renameDict = {'Unknown type of loan' : 'Unknown type', 'Unknown type': 'Unknown type','Loan for business development':'business development',
               'Loan for working capital replenishment':'working capital replenishment','Loan for purchase of shares (margin lending)':'purchase of shares', 
              'Consumer credit':'Consumer credit', 'Mortgage':'Mortgage', 'Real estate loan' : 'Real estate loan', 'Another type of loan':'Another type of loan',
                'Loan for the purchase of equipment' :'purchase of equipment','business development':'business development',
              'Cash loan' : 'Cash loan', 'Car loan' : 'Car loan', 'Credit card':'Credit card', 'Interbank credit':'Interbank credit', 'Mobile operator loan' : 'Mobile operator loan',
               'Cash loan (non-earmarked)':'Cash loan', 'Microloan':'Microloan', 'working capital replenishment':'working capital replenishment',
              }

br_df['CT'].replace(renameDict, inplace=True)
loanType = list(set(br_df['CT']))
print(f"命名後一共 {len(set(br_df['CT']))} 個類別")
print(loanType)

命名前一共 15 個類別
命名後一共 15 個類別
['business development', 'working capital replenishment', 'Another type of loan', 'purchase of shares', 'Mobile operator loan', 'Car loan', 'Microloan', 'Cash loan', 'Consumer credit', 'Unknown type', 'Mortgage', 'Real estate loan', 'Interbank credit', 'Credit card', 'purchase of equipment']


In [5]:
CTdf = br_df.iloc[:,2:-1].groupby('CT')
CTMeandf = pd.DataFrame(columns=['CT','CDmean', 'CSmean', 'annuitymean'])
tempList = []
for i in loanType:
    # print(i,'的各項平均')
    tempList.append(i)
    for j in ['CD', 'CS', 'annuity']:
        tempList.append(round(np.mean(CTdf.get_group(i)[j]),2))
        # print(f'{j}的平均值: {np.mean(CTdf.get_group(i)[j])}')
tempArray = np.array(tempList).reshape(-1,4)

for i in range(len(tempArray.tolist())):
    CTMeandf.loc[len(CTMeandf)] = tempArray.tolist()[i]

# 貸款類型(CT) 在各欄位的平均表現
for i in ['CT','CDmean', 'CSmean', 'annuitymean']:
    if i == 'CT':
        CTMeandf[i] = CTMeandf[i]
    else:
        CTMeandf[i]  = pd.to_numeric(CTMeandf[i] ,errors='coerce')

# 有貸款類型的值不能計算平均，在此補 0
CTMeandf = CTMeandf.fillna(0.0)

values = br_df['CT'].value_counts(dropna=False).keys().tolist()
counts = br_df['CT'].value_counts(dropna=False).tolist()
value_dict = dict(zip(values, counts))

tempList = []
for i in range(len(CTMeandf['CT'])):
    tempList.append(value_dict[CTMeandf['CT'][i]])

CTMeandf['samples'] = tempList

tempList = []
for i in range(len(CTMeandf['samples'])):
    n = round(CTMeandf['samples'][i]/(CTMeandf['samples'].sum()),3)
    tempList.append(n)
CTMeandf['percentage'] = tempList

In [6]:
CTMeandf

Unnamed: 0,CT,CDmean,CSmean,annuitymean,samples,percentage
0,business development,239686.33,1462141.23,39319.37,1975,0.001
1,working capital replenishment,305191.47,1559215.49,17103.93,469,0.0
2,Another type of loan,102654.25,351663.17,30619.19,1017,0.001
3,purchase of shares,0.0,3334500.0,46868.49,4,0.0
4,Mobile operator loan,10178059.54,14850000.0,4054.5,1,0.0
5,Car loan,575797.22,1823915.54,33889.12,27690,0.016
6,Microloan,12026.26,52947.73,7966.02,12413,0.007
7,Cash loan,355250.96,541457.07,16102.72,56,0.0
8,Consumer credit,100011.84,306112.11,15389.3,1251615,0.729
9,Unknown type,186714.78,432345.18,20592.98,555,0.0


In [7]:
# 移除不到 1 % 的貸款類別(CT) (視為噪音)
rmCols = [ 'Mobile operator loan','Real estate loan','Cash loan','Interbank credit','purchase of equipment','purchase of shares']
for i in rmCols:
    br_df.drop(br_df.loc[br_df['CT']==i].index, inplace=True)
br_df = br_df.reset_index()
print(f"刪除後一共 {len(set(br_df['CT']))} 個類別")

刪除後一共 9 個類別


In [8]:
# 將債務金額(CD) 負值改成正值
br_df['CD'] = np.abs(br_df['CD'])
# print(br_df['CD'].min())

# 將極端值設為 NaN
for i in ['CD', 'CS', 'annuity']:
    if i == 'CD':
        br_df.loc[br_df[i] > 10000000 ,i] = np.nan
        br_df.loc[br_df[i] < 0, i] = 0
    elif i == 'CS':
        br_df.loc[br_df[i] > 20000000 ,i] = np.nan
    elif i == 'annuity':
        br_df.loc[br_df[i] > 10000000 ,i] = np.nan

print(br_df[['CD', 'CS', 'annuity', 'average']].describe())

                 CD            CS       annuity        average
count  1.457694e+06  1.716008e+06  4.895620e+05  694070.000000
mean   1.273508e+05  3.479523e+05  1.396244e+04       0.954128
std    5.072643e+05  8.190149e+05  9.334432e+04       0.251058
min    0.000000e+00  0.000000e+00  0.000000e+00      -5.000000
25%    0.000000e+00  5.129550e+04  0.000000e+00       1.000000
50%    0.000000e+00  1.254606e+05  0.000000e+00       1.000000
75%    3.976200e+04  3.150000e+05  1.350000e+04       1.000000
max    9.999716e+06  1.999800e+07  9.979880e+06       1.000000


In [9]:
# hot deck 補 NaN
# print(df.isna().any(axis = 0)) # 查看沒 NAN 的特徵有哪些，以選取可以預測 Annuity 的欄位 (不包括幣別)
withAnn = df[pd.isnull(df['AMT_ANNUITY']) == False]
withoutAnn = df[pd.isnull(df['AMT_ANNUITY'])]
CA_ohe = pd.get_dummies(withAnn['CREDIT_ACTIVE'],dtype=float) # 4 類
CT_ohe = pd.get_dummies(withAnn['CREDIT_TYPE'],dtype=float) # 13 類
CTType = list(set(df['CREDIT_TYPE']))
CTType.remove('Interbank credit')
CTType.remove('Mobile operator loan')
CAType = list(set(df['CREDIT_ACTIVE']))
withAnn = pd.concat([withAnn, CA_ohe, CT_ohe], axis = 1)
variables = ['DAYS_CREDIT','CREDIT_DAY_OVERDUE','CNT_CREDIT_PROLONG','AMT_CREDIT_SUM_OVERDUE','DAYS_CREDIT_UPDATE','AMT_ANNUITY'] + CAType + CTType # 23 cols
withAnn = withAnn[variables] # get_dum() 的 0 被轉成 NaN
withAnn = withAnn.fillna(0)

CA_ohe = pd.get_dummies(withoutAnn['CREDIT_ACTIVE'],dtype=float)
CT_ohe = pd.get_dummies(withoutAnn['CREDIT_TYPE'],dtype=float)
withoutAnn = pd.concat([withoutAnn, CA_ohe, CT_ohe], axis = 1)
withoutAnn = withoutAnn[variables]
withoutAnn = withoutAnn.fillna(0)

independentVariables = ['DAYS_CREDIT','CREDIT_DAY_OVERDUE','CNT_CREDIT_PROLONG','AMT_CREDIT_SUM_OVERDUE','DAYS_CREDIT_UPDATE'] + CAType + CTType

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# annuity NaN 預測並補值，eta: 4mins
trainX, trainY = withAnn[independentVariables], withAnn['AMT_ANNUITY']
rfModel_ann = RandomForestRegressor()
rfModel_ann.fit(trainX, trainY)
predictX = withoutAnn[independentVariables]
predictY = rfModel_ann.predict(predictX)
withoutAnn['AMT_ANNUITY'] = predictY.astype(int)
data = pd.concat([withAnn, withoutAnn])
data.reset_index(inplace=True)
data.drop('index',inplace=True,axis=1)
print(data.shape)
print(data['AMT_ANNUITY'].isna().sum())

(1716428, 23)
0


In [11]:
CTMeandf['ratio'] = round(CTMeandf['CDmean'] / CTMeandf['CSmean'], 2)
# NaN 值數量
# CS                  312
# CD               258626
# 有 194 筆兩欄(CS、CD) 同時是 NaN 的貸款
allNan = br_df[br_df['CS'].isna() & br_df['CD'].isna()]
print(allNan.shape[0]) # 194

194


In [12]:
# 以該筆貸款的貸款類別(CT)平均額度使用率，同時補給CS, CD 的 nan 一個比例值
for i in allNan['SK_ID_BUREAU'].to_list():
    br_df.loc[br_df['SK_ID_BUREAU'] == i, 'CS'] = 0
    br_df.loc[br_df['SK_ID_BUREAU'] == i, 'CD'] = 0
br_df = pd.merge(br_df, CTMeandf[['CT','ratio']], on = 'CT')
br_df['CS'].fillna(br_df['CD'] / br_df['ratio'], inplace = True)
br_df['CD'].fillna(br_df['CS'] * br_df['ratio'], inplace = True)

In [13]:
# average NaN 補 0，即視同沒紀錄
br_df['average'] = br_df['average'].fillna(0.0)

# annuity NaN 以 random forest 補
br_df['annuity'] = data['AMT_ANNUITY']

# 刪掉先前計算用的欄位
del br_df['ratio']
del br_df['index']
print(br_df.isna().sum())
br_df.to_csv('./data/fillna.csv')

userID          0
SK_ID_BUREAU    0
CA              0
CS              0
CD              0
CT              0
annuity         0
average         0
dtype: int64


In [14]:
# 賦予物理意義，eta: 4 mins
perUser = br_df.groupby(['userID'])
userList = list(set(br_df['userID']))

# 當月所需支付的年金總額
userSumAnn = []
# 信用額度使用率
userCD_CS_ratio = []
# 還款狀況
userStatus = []

# 因 CD 和 CS 都有 0 的值發生，所以產生出來的值須再補值一次
for i in userList:
    userSumAnn.append(np.sum(perUser.get_group(i)['annuity']))
    userCD_CS_ratio.append(np.sum(perUser.get_group(i)['CD'])/np.sum(perUser.get_group(i)['CS']))
    userStatus.append(np.average(perUser.get_group(i)['average']))

  userCD_CS_ratio.append(np.sum(perUser.get_group(i)['CD'])/np.sum(perUser.get_group(i)['CS']))
  userCD_CS_ratio.append(np.sum(perUser.get_group(i)['CD'])/np.sum(perUser.get_group(i)['CS']))


In [15]:
# 最終每位 User 的 df
final_df = pd.DataFrame(
     {'SK_ID_CURR': userList,'CDCS_ratio': userCD_CS_ratio, 'annuity': userSumAnn, 'status': userStatus})
# print(final_df.iloc[:, 1:].describe())

# 有 1224 人沒有信用額度使用率，猜測是之前為小白，現在卻開始貸款，所以無聯徵紀錄
# 而完全沒在聯徵資料出現的貸款者，猜測其當前仍然是小白
print('--- check isna() ---')
print(final_df.isna().sum())

# 將信用額度使用率 nan 補 0 ，即此人的過往聯徵是小白
final_df['CDCS_ratio'].fillna(0.0, inplace = True)
final_df[np.isinf(final_df['CDCS_ratio'])] = 0
print('--- Final version check isna() ---')
print(final_df.isna().sum())

final_df.to_csv('./data/bureau_final.csv')

--- check isna() ---
SK_ID_CURR       0
CDCS_ratio    1224
annuity          0
status           0
dtype: int64
--- Final version check isna() ---
SK_ID_CURR    0
CDCS_ratio    0
annuity       0
status        0
dtype: int64


被歸類在極端值的貸款人

In [17]:
# 極端值數量(1.5 * IQR)
# 信用額度使用率: 越高超過一理論上表示刷到爆掉，低信用
temp = final_df['CDCS_ratio'].unique()
temp = np.sort(temp)
temp = temp[::-1]
# print(f'信用額度使用率前五大值: {temp[:5]}')
print('=================信用額度使用率極端值=================')
print(f'Q1:{final_df["CDCS_ratio"].quantile(0.25)}, Q3: {final_df["CDCS_ratio"].quantile(0.75)}')
oIQR = final_df["CDCS_ratio"].quantile(0.75) - final_df["CDCS_ratio"].quantile(0.25)
print(f'Outlier 上下界: [{final_df["CDCS_ratio"].quantile(0.25)-1.5*oIQR},{final_df["CDCS_ratio"].quantile(0.75)+1.5*oIQR}]')
nOutliers = final_df.loc[final_df["CDCS_ratio"]>final_df["CDCS_ratio"].quantile(0.75)+1.5*oIQR].shape[0] + final_df.loc[final_df["CDCS_ratio"]<final_df["CDCS_ratio"].quantile(0.25)-1.5*oIQR].shape[0]
print(f'1.5 IQR 下的極端值共有: {nOutliers} 筆')
print()
print('=================年金極端值=================')
print(f'Q1:{final_df["annuity"].quantile(0.25)}, Q3: {final_df["annuity"].quantile(0.75)}')
oIQR = final_df["annuity"].quantile(0.75) - final_df["annuity"].quantile(0.25)
print(f'Outlier 上下界: [{final_df["annuity"].quantile(0.25)-1.5*oIQR},{final_df["annuity"].quantile(0.75)+1.5*oIQR}]')
nOutliers = final_df.loc[final_df["annuity"]>final_df["annuity"].quantile(0.75)+1.5*oIQR].shape[0] + final_df.loc[final_df["annuity"]<final_df["annuity"].quantile(0.25)-1.5*oIQR].shape[0]
print(f'1.5 IQR 下的極端值共有: {nOutliers} 筆')
print()
print('=================信用評分極端值=================')
print(f'Q1:{final_df["status"].quantile(0.25)}, Q3: {final_df["status"].quantile(0.75)}')
oIQR = final_df["status"].quantile(0.75) - final_df["status"].quantile(0.25)
print(f'Outlier 上下界: [{final_df["status"].quantile(0.25)-1.5*oIQR},{final_df["status"].quantile(0.75)+1.5*oIQR}]')
nOutliers = final_df.loc[final_df["status"]>final_df["status"].quantile(0.75)+1.5*oIQR].shape[0] + final_df.loc[final_df["status"]<final_df["status"].quantile(0.25)-1.5*oIQR].shape[0]
print(f'1.5 IQR 下的極端值共有: {nOutliers} 筆')

tempList = []
tempList1 = []
tempList2 = []
for i in ['CDCS_ratio', 'annuity', 'status']:
    temp = final_df.loc[final_df[i]>final_df[i].quantile(0.75)+1.5*(final_df[i].quantile(0.75) - final_df[i].quantile(0.25))]['SK_ID_CURR'].to_list()
    temp1 = final_df.loc[final_df[i]<final_df[i].quantile(0.25)-1.5*(final_df[i].quantile(0.75) - final_df[i].quantile(0.25))]['SK_ID_CURR'].to_list()
    if i == 'CDCS_ratio':
        tempList = temp + temp1
    elif i == 'annuity':
        tempList1 = temp + temp1
    else:
        tempList2 = temp + temp1
olList = tempList + tempList1 + tempList2
print()
print(f'總共有 {len(set(olList))} 人算極端值')

Q1:0.09367827361154718, Q3: 0.5149672048600837
Outlier 上下界: [-0.5382551232612576,1.1469006017328884]
1.5 IQR 下的極端值共有: 405 筆

Q1:18494.5, Q3: 95270.32
Outlier 上下界: [-96669.23000000001,210434.05000000002]
1.5 IQR 下的極端值共有: 20760 筆

Q1:0.0, Q3: 0.9583333333333334
Outlier 上下界: [-1.4375,2.3958333333333335]
1.5 IQR 下的極端值共有: 231 筆

總共有 21378 人算極端值
