In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import TruncatedSVD

warnings.filterwarnings('ignore')

# Upload data

In [2]:
data = pd.read_csv('data/fixed_data/data.csv')
print(data.shape)
data.head()

(307511, 119)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002.0,1.0,Cash loans,M,N,Y,0.0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003.0,0.0,Cash loans,F,N,N,0.0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004.0,0.0,Revolving loans,M,Y,Y,0.0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006.0,0.0,Cash loans,F,N,Y,0.0,135000.0,312682.5,29686.5,...,0.0,0.0,0.0,0.0,,,,,,
4,100007.0,0.0,Cash loans,M,N,Y,0.0,121500.0,513000.0,21865.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Info

In [3]:
data.info(verbose=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 119 columns):
SK_ID_CURR                      float64
TARGET                          float64
NAME_CONTRACT_TYPE              object
CODE_GENDER                     object
FLAG_OWN_CAR                    object
FLAG_OWN_REALTY                 object
CNT_CHILDREN                    float64
AMT_INCOME_TOTAL                float64
AMT_CREDIT                      float64
AMT_ANNUITY                     float64
AMT_GOODS_PRICE                 float64
NAME_TYPE_SUITE                 object
NAME_INCOME_TYPE                object
NAME_EDUCATION_TYPE             object
NAME_FAMILY_STATUS              object
NAME_HOUSING_TYPE               object
REGION_POPULATION_RELATIVE      float64
DAYS_BIRTH                      float64
DAYS_EMPLOYED                   float64
DAYS_REGISTRATION               float64
DAYS_ID_PUBLISH                 float64
OWN_CAR_AGE                     float64
FLAG_MOBIL       

# Fast statistical info

In [4]:
data.describe()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,307511.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,265992.0,265992.0,265992.0,265992.0,265992.0,265992.0
mean,278180.518577,0.080729,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,-16036.995067,63815.045904,...,0.00813,0.000595,0.000507,0.000335,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
std,102790.175348,0.272419,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,4363.988632,141275.766519,...,0.089798,0.024387,0.022518,0.018299,0.083849,0.110757,0.204685,0.916002,0.794056,1.869295
min,100002.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,367142.5,0.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,456255.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,...,1.0,1.0,1.0,1.0,4.0,9.0,8.0,27.0,261.0,25.0


# Numeric and Categorical data

In [5]:
# К-во уникальных значений во всех категориальных переменных
cols = data.columns
num_cols = data._get_numeric_data().columns
print(num_cols)

# Get categorical
categorical = list(set(cols) - set(num_cols))
print(categorical, len(categorical))

Index(['SK_ID_CURR', 'TARGET', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=103)
['FLAG_OWN_CAR', 'WALLSMATERIAL_MODE', 'NAME_CONTRACT_TYPE', 'OCCUPATION_TYPE', 'NAME_INCOME_TYPE', 'EMERGENCYSTATE_MODE', 'FLAG_OWN_REALTY', 'HOUSETYPE_MODE', 'NAME_TYPE_SUITE', 'NAME_HOUSING_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'NAME_FAMILY_STATUS', 'FONDKAPREMONT_MODE', 'ORGANIZATION_TYPE', 'CODE_GENDER', 'NAME_EDUCATION_TYPE'] 16


# UPLOAD main_test

In [6]:
main_test = pd.read_csv('data/fixed_data/main_test.csv')
print(main_test.shape)
main_test.head()

(48744, 118)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001.0,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005.0,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013.0,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028.0,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038.0,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


# Append main_test to data

In [7]:
data_and_test = pd.concat([data, main_test]).reset_index(drop=True)

In [8]:
data_and_test.shape

(356255, 119)

In [9]:
# data - [0; 307510]
# main_test - [307511; 356254]

# EXTRACT COMPONENTS FROM OTHER TABLES

# 1) Work with bureau.csv 

### Upload bureau.csv

In [10]:
bureau = pd.read_csv('data/bureau.csv')
print(bureau.shape)
bureau.head()

(1716428, 17)


Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


### Short view of table structure

In [11]:
print(bureau['SK_ID_CURR'].unique().shape)
print((bureau['SK_ID_CURR'] == 215354).sum())
s = np.intersect1d(data_and_test['SK_ID_CURR'].unique(), bureau['SK_ID_CURR'].unique())
s.shape

(305811,)
11


(305811,)

### Transform negative values into positive

In [12]:
bureau['DAYS_CREDIT'] = abs(bureau['DAYS_CREDIT'].values)
bureau['DAYS_CREDIT_ENDDATE'] = abs(bureau['DAYS_CREDIT_ENDDATE'].values)
bureau['DAYS_ENDDATE_FACT'] = abs(bureau['DAYS_ENDDATE_FACT'].values)
bureau['DAYS_CREDIT_UPDATE'] = abs(bureau['DAYS_CREDIT_UPDATE'].values)

### Handle bureau missing values  

In [13]:
missing_values = bureau.isnull().sum().sort_values(ascending=False)
percent_data = (bureau.isnull().sum() / bureau.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([missing_values, percent_data], axis=1, keys=['Total', 'Percent'])
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print('Missing data:\n', missing_data)

Missing data:
                           Total   Percent
AMT_ANNUITY             1226791  0.714735
AMT_CREDIT_MAX_OVERDUE  1124488  0.655133
DAYS_ENDDATE_FACT        633653  0.369170
AMT_CREDIT_SUM_LIMIT     591780  0.344774
AMT_CREDIT_SUM_DEBT      257669  0.150119
DAYS_CREDIT_ENDDATE      105553  0.061496
AMT_CREDIT_SUM               13  0.000008
CREDIT_TYPE                   0  0.000000
AMT_CREDIT_SUM_OVERDUE        0  0.000000
CNT_CREDIT_PROLONG            0  0.000000
DAYS_CREDIT_UPDATE            0  0.000000
CREDIT_DAY_OVERDUE            0  0.000000
DAYS_CREDIT                   0  0.000000
CREDIT_CURRENCY               0  0.000000
CREDIT_ACTIVE                 0  0.000000
SK_ID_BUREAU                  0  0.000000
SK_ID_CURR                    0  0.000000


In [14]:
bureau_removed = bureau.copy()

# Drop columns
bureau_removed = bureau_removed.drop(columns=['AMT_ANNUITY', 'AMT_CREDIT_MAX_OVERDUE'])

# Fill columns
cols_r =['DAYS_ENDDATE_FACT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_DEBT', 'DAYS_CREDIT_ENDDATE', 'AMT_CREDIT_SUM']

for column in cols_r:
    bureau_removed[column] = bureau_removed[column].fillna(bureau_removed[column].mode()[0])

In [15]:
# Check missing values
bureau_removed.isnull().sum().values.sum()

0

### One hot encoding

In [16]:
bureau_removed_dum = pd.get_dummies(bureau_removed)
bureau_removed_dum.shape

(1716428, 35)

### Perform normalization

In [17]:
scaler = MinMaxScaler()
bureau_removed_norm = pd.DataFrame(scaler.fit_transform(bureau_removed_dum), 
                                   columns=bureau_removed_dum.columns)
bureau_removed_norm.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,...,CREDIT_TYPE_Interbank credit,CREDIT_TYPE_Loan for business development,CREDIT_TYPE_Loan for purchase of shares (margin lending),CREDIT_TYPE_Loan for the purchase of equipment,CREDIT_TYPE_Loan for working capital replenishment,CREDIT_TYPE_Microloan,CREDIT_TYPE_Mobile operator loan,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Unknown type of loan
0,0.323794,0.387566,0.170089,0.0,0.003638,0.003641,0.0,0.000156,0.026919,0.11081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.323794,0.387567,0.071184,0.0,0.025559,0.007829,0.0,0.000385,0.027899,0.11081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.323794,0.387567,0.069473,0.0,0.012553,0.007829,0.0,0.000794,0.026919,0.11081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.323794,0.387568,0.069473,0.0,2.4e-05,0.007829,0.0,0.000154,0.026919,0.11081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.323794,0.387569,0.215264,0.0,0.028459,0.007829,0.0,0.004615,0.026919,0.11081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Finding indexes, columns, combining together and so on

In [18]:
# Relief id`s
bureau_features = bureau_removed_norm.copy()
del bureau_features['SK_ID_CURR']
del bureau_features['SK_ID_BUREAU']
bureau_features.shape

(1716428, 33)

### Perform Decomposition

In [19]:
svd = TruncatedSVD(n_components=1, n_iter=7)
bureau_component = svd.fit_transform(bureau_features)  
bureau_component.shape

(1716428, 1)

In [20]:
bureau_component = pd.DataFrame(bureau_component, columns=['COMPONENT'])
bureau_component['SK_ID_CURR'] = bureau_removed_dum['SK_ID_CURR'].values
bureau_component['SK_ID_BUREAU'] = bureau_removed_dum['SK_ID_BUREAU'].values
print(bureau_component.shape)
bureau_component.head()

(1716428, 3)


Unnamed: 0,COMPONENT,SK_ID_CURR,SK_ID_BUREAU
0,1.655994,215354,5714462
1,0.967804,215354,5714463
2,1.368491,215354,5714464
3,0.966734,215354,5714465
4,1.407849,215354,5714466


In [21]:
bureau_component = bureau_component.set_index('SK_ID_BUREAU')
bureau_component.head(10)

Unnamed: 0_level_0,COMPONENT,SK_ID_CURR
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1
5714462,1.655994,215354
5714463,0.967804,215354
5714464,1.368491,215354
5714465,0.966734,215354
5714466,1.407849,215354
5714467,0.989828,215354
5714468,1.353605,215354
5714469,1.785726,162297
5714470,1.716132,162297
5714471,1.053463,162297


### Calculate value for each row

In [22]:
bureau_comp = bureau_component.groupby('SK_ID_CURR', as_index=False)['COMPONENT'].median().rename(columns = {'COMPONENT': 'BUREAU_COMPONENT'})
bureau_comp.head()

Unnamed: 0,SK_ID_CURR,BUREAU_COMPONENT
0,100001,1.689255
1,100002,1.379206
2,100003,1.520511
3,100004,1.690253
4,100005,1.355368


### Merge data.csv and BUREAU_COMPONENT

In [23]:
# Join to the training dataframe
data_combined = data_and_test.merge(bureau_comp, on = 'SK_ID_CURR', how = 'left')
data_combined.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,TOTALAREA_MODE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,BUREAU_COMPONENT
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0149,"Stone, brick",WEDNESDAY,0.9722,0.9722,0.9722,0.6192,0.6243,0.6341,1.379206
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0714,Block,MONDAY,0.9851,0.9851,0.9851,0.796,0.7987,0.804,1.520511
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,MONDAY,,,,,,,1.690253
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,,,WEDNESDAY,,,,,,,
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,THURSDAY,,,,,,,1.71636


# 2) Work with installments_payments.csv 

### Upload installments_payments.csv

In [24]:
installments_payments = pd.read_csv('data/installments_payments.csv')
print(installments_payments.shape)
installments_payments.head()

(13605401, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


### Short view of table structure

In [25]:
print(installments_payments['SK_ID_CURR'].unique().shape)
print((installments_payments['SK_ID_CURR'] == 215354).sum())
s = np.intersect1d(data_and_test['SK_ID_CURR'].unique(), installments_payments['SK_ID_CURR'].unique())
s.shape

(339587,)
113


(339587,)

### Transform negative values into positive

In [26]:
installments_payments['DAYS_INSTALMENT'] = abs(installments_payments ['DAYS_INSTALMENT'].values)
installments_payments['DAYS_ENTRY_PAYMENT'] = abs(installments_payments ['DAYS_ENTRY_PAYMENT'].values)

### Handle installments_payments missing values

In [27]:
missing_values = installments_payments.isnull().sum().sort_values(ascending=False)
percent_data = (installments_payments.isnull().sum() / installments_payments.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([missing_values, percent_data], axis=1, keys=['Total', 'Percent'])
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print('Missing data:\n', missing_data)

Missing data:
                         Total   Percent
AMT_PAYMENT              2905  0.000214
DAYS_ENTRY_PAYMENT       2905  0.000214
AMT_INSTALMENT              0  0.000000
DAYS_INSTALMENT             0  0.000000
NUM_INSTALMENT_NUMBER       0  0.000000
NUM_INSTALMENT_VERSION      0  0.000000
SK_ID_CURR                  0  0.000000
SK_ID_PREV                  0  0.000000


In [28]:
installments_payments_removed = installments_payments.copy()

# Fill columns
cols_r =['AMT_PAYMENT', 'DAYS_ENTRY_PAYMENT']

for column in cols_r:
    installments_payments_removed[column] = installments_payments_removed[column].fillna(
                                            installments_payments_removed[column].median())

In [29]:
# Check missing values
installments_payments_removed.isnull().sum().values.sum()

0

### One hot encoding

In [30]:
installments_payments_dum = pd.get_dummies(installments_payments_removed)
installments_payments_dum.shape

(13605401, 8)

### Perform normalization

In [31]:
scaler = MinMaxScaler()
installments_payments_norm = pd.DataFrame(scaler.fit_transform(installments_payments_dum), 
                                          columns=installments_payments_dum.columns)
installments_payments_norm.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,0.029392,0.173115,0.005618,0.018116,0.403629,0.241057,0.001842,0.001842
1,0.179458,0.144947,0.0,0.119565,0.737761,0.438008,0.000455,0.000455
2,0.58868,0.261196,0.011236,0.0,0.021226,0.012602,0.006741,0.006741
3,0.787918,0.279845,0.005618,0.007246,0.827456,0.492886,0.006456,0.006456
4,0.930146,0.190187,0.005618,0.003623,0.473126,0.277439,0.000574,0.000573


### Finding indexes, columns, combining together and so on

In [32]:
# Relief id`s
installments_payments_features = installments_payments_norm.copy()
del installments_payments_features['SK_ID_CURR']
del installments_payments_features['SK_ID_PREV']
installments_payments_features.shape

(13605401, 6)

### Perform Decomposition

In [33]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=1, n_iter=7)
installments_payments_component = svd.fit_transform(installments_payments_features)  
installments_payments_component.shape

(13605401, 1)

In [34]:
installments_payments_component = pd.DataFrame(installments_payments_component, columns=['COMPONENT'])
installments_payments_component['SK_ID_CURR'] = installments_payments_dum['SK_ID_CURR'].values
installments_payments_component['SK_ID_PREV'] = installments_payments_dum['SK_ID_PREV'].values
print(installments_payments_component.shape)
installments_payments_component.head()

(13605401, 3)


Unnamed: 0,COMPONENT,SK_ID_CURR,SK_ID_PREV
0,0.469858,161674,1054186
1,0.865282,151639,1330831
2,0.024717,193053,2085231
3,0.959831,199697,2452527
4,0.546516,167756,2714724


In [35]:
installments_payments_component = installments_payments_component.set_index('SK_ID_PREV')
installments_payments_component.head(10)

Unnamed: 0_level_0,COMPONENT,SK_ID_CURR
SK_ID_PREV,Unnamed: 1_level_1,Unnamed: 2_level_1
1054186,0.469858,161674
1330831,0.865282,151639
2085231,0.024717,193053
2452527,0.959831,199697
2714724,0.546516,167756
1137312,0.555413,164489
2234264,0.141803,184693
1818599,0.387182,111420
2723183,0.082003,112102
1413990,0.230694,109741


### Calculate value for each row

In [36]:
installments_payments_comp = installments_payments_component.groupby('SK_ID_CURR', as_index=False)['COMPONENT'].median().rename(columns = {'COMPONENT': 'installments_payments_COMPONENT'})
installments_payments_comp.head()

Unnamed: 0,SK_ID_CURR,installments_payments_COMPONENT
0,100001,0.677821
1,100002,0.121356
2,100003,0.316717
3,100004,0.299847
4,100005,0.233195


### Merge with data_combined

In [37]:
# Join to the training dataframe
data_combined = data_combined.merge(installments_payments_comp, on = 'SK_ID_CURR', how = 'left')
data_combined.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,BUREAU_COMPONENT,installments_payments_COMPONENT
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,"Stone, brick",WEDNESDAY,0.9722,0.9722,0.9722,0.6192,0.6243,0.6341,1.379206,0.121356
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Block,MONDAY,0.9851,0.9851,0.9851,0.796,0.7987,0.804,1.520511,0.316717
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,MONDAY,,,,,,,1.690253,0.299847
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,,WEDNESDAY,,,,,,,,0.082729
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,THURSDAY,,,,,,,1.71636,0.339719


# 3) Work with POS_CASH_balance.csv

### Upload POS_CASH_balance.csv

In [38]:
POS_CASH_balance = pd.read_csv('data/POS_CASH_balance.csv')
print(POS_CASH_balance.shape)
POS_CASH_balance.head()

(10001358, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


### Short view of table structure

In [39]:
print(POS_CASH_balance['SK_ID_CURR'].unique().shape)
print((POS_CASH_balance['SK_ID_CURR'] == 215354).sum())
s = np.intersect1d(data_and_test['SK_ID_CURR'].unique(), POS_CASH_balance['SK_ID_CURR'].unique())
s.shape

(337252,)
53


(337252,)

### Transform negative values into positive

In [40]:
POS_CASH_balance['MONTHS_BALANCE'] = abs(POS_CASH_balance ['MONTHS_BALANCE'].values)

### Handle POS_CASH_balance missing values

In [41]:
missing_values = POS_CASH_balance.isnull().sum().sort_values(ascending=False)
percent_data = (POS_CASH_balance.isnull().sum() / POS_CASH_balance.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([missing_values, percent_data], axis=1, keys=['Total', 'Percent'])
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print('Missing data:\n', missing_data)

Missing data:
                        Total   Percent
CNT_INSTALMENT_FUTURE  26087  0.002608
CNT_INSTALMENT         26071  0.002607
SK_DPD_DEF                 0  0.000000
SK_DPD                     0  0.000000
NAME_CONTRACT_STATUS       0  0.000000
MONTHS_BALANCE             0  0.000000
SK_ID_CURR                 0  0.000000
SK_ID_PREV                 0  0.000000


In [42]:
POS_CASH_balance_removed = POS_CASH_balance.copy()

# Fill columns
cols_r =['CNT_INSTALMENT_FUTURE', 'CNT_INSTALMENT']

for column in cols_r:
    POS_CASH_balance_removed[column] = POS_CASH_balance_removed[column].fillna(
                                            POS_CASH_balance_removed[column].median())

In [43]:
# Check missing values
POS_CASH_balance_removed.isnull().sum().values.sum()

0

### One hot encoding

In [44]:
POS_CASH_balance_dum = pd.get_dummies(POS_CASH_balance_removed)
POS_CASH_balance_dum.shape

(10001358, 16)

### Perform normalization

In [45]:
scaler = MinMaxScaler()
POS_CASH_balance_norm = pd.DataFrame(scaler.fit_transform(POS_CASH_balance_dum), 
                                          columns=POS_CASH_balance_dum.columns)
POS_CASH_balance_norm.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD,SK_DPD_DEF,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Amortized debt,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Canceled,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Returned to the store,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_XNA
0,0.43569,0.232817,0.315789,0.516484,0.529412,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.388038,0.752241,0.336842,0.384615,0.411765,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.425751,0.834812,0.326316,0.120879,0.105882,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.489987,0.475009,0.357895,0.516484,0.494118,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.727445,0.657615,0.357895,0.384615,0.411765,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Finding indexes, columns, combining together and so on

In [46]:
# Relief id`s
POS_CASH_balance_features = POS_CASH_balance_norm.copy()
del POS_CASH_balance_features['SK_ID_CURR']
del POS_CASH_balance_features['SK_ID_PREV']
POS_CASH_balance_features.shape

(10001358, 14)

### Perform Decomposition

In [47]:
svd = TruncatedSVD(n_components=1, n_iter=7)
POS_CASH_balance_component = svd.fit_transform(POS_CASH_balance_features)  
POS_CASH_balance_component.shape

(10001358, 1)

In [48]:
POS_CASH_balance_component = pd.DataFrame(POS_CASH_balance_component, columns=['COMPONENT'])
POS_CASH_balance_component['SK_ID_CURR'] = POS_CASH_balance_dum['SK_ID_CURR'].values
POS_CASH_balance_component['SK_ID_PREV'] = POS_CASH_balance_dum['SK_ID_PREV'].values
print(POS_CASH_balance_component.shape)
POS_CASH_balance_component.head()

(10001358, 3)


Unnamed: 0,COMPONENT,SK_ID_CURR,SK_ID_PREV
0,1.175615,182943,1803195
1,1.146337,367990,1715348
2,1.060544,397406,1784872
3,1.186172,269225,1903291
4,1.153776,334279,2341044


In [49]:
POS_CASH_balance_component = POS_CASH_balance_component.set_index('SK_ID_PREV')
POS_CASH_balance_component.head(10)

Unnamed: 0_level_0,COMPONENT,SK_ID_CURR
SK_ID_PREV,Unnamed: 1_level_1,Unnamed: 2_level_1
1803195,1.175615,182943
1715348,1.146337,367990
1784872,1.060544,397406
1903291,1.186172,269225
2341044,1.153776,334279
2207092,1.064864,342166
1110516,1.19877,204376
1387235,1.155216,153211
1220500,1.061145,112740
2371489,1.092939,274851


### Calculate value for each row

In [50]:
POS_CASH_balance_comp = POS_CASH_balance_component.groupby('SK_ID_CURR', as_index=False)['COMPONENT'].median().rename(columns = {'COMPONENT': 'POS_CASH_balance_COMPONENT'})
POS_CASH_balance_comp.head()

Unnamed: 0,SK_ID_CURR,POS_CASH_balance_COMPONENT
0,100001,1.126294
1,100002,1.009671
2,100003,1.031928
3,100004,1.01213
4,100005,1.00931


### Merge data_combined.csv and POS_CASH_balance_COMPONENT

In [51]:
data_combined = data_combined.merge(POS_CASH_balance_comp, on=['SK_ID_CURR'], how='left')
print(data_combined.shape)
data_combined.head()

(356255, 122)


Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,BUREAU_COMPONENT,installments_payments_COMPONENT,POS_CASH_balance_COMPONENT
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,WEDNESDAY,0.9722,0.9722,0.9722,0.6192,0.6243,0.6341,1.379206,0.121356,1.009671
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,MONDAY,0.9851,0.9851,0.9851,0.796,0.7987,0.804,1.520511,0.316717,1.031928
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,MONDAY,,,,,,,1.690253,0.299847,1.01213
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,WEDNESDAY,,,,,,,,0.082729,0.967557
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,THURSDAY,,,,,,,1.71636,0.339719,1.046385


# 4) Work with previous_application.csv

### Upload previous_application.csv

In [52]:
previous_application = pd.read_csv('data/previous_application.csv')
print(previous_application.shape)
previous_application.head()

(1670214, 37)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


### Short view of table structure

In [53]:
print(previous_application['SK_ID_CURR'].unique().shape)
print((previous_application['SK_ID_CURR'] == 215354).sum())
s = np.intersect1d(data_and_test['SK_ID_CURR'].unique(), previous_application['SK_ID_CURR'].unique())
s.shape

(338857,)
5


(338857,)

### Transform negative values into positive

In [54]:
previous_application['DAYS_FIRST_DUE'] = abs(previous_application ['DAYS_FIRST_DUE'].values)
previous_application['DAYS_LAST_DUE_1ST_VERSION'] = abs(previous_application ['DAYS_LAST_DUE_1ST_VERSION'].values)
previous_application['DAYS_FIRST_DRAWING'] = abs(previous_application ['DAYS_FIRST_DRAWING'].values)
previous_application['DAYS_LAST_DUE'] = abs(previous_application ['DAYS_LAST_DUE'].values)
previous_application['DAYS_TERMINATION'] = abs(previous_application ['DAYS_TERMINATION'].values)

### Handle previous_application missing values

In [55]:
missing_values = previous_application.isnull().sum().sort_values(ascending=False)
percent_data = (previous_application.isnull().sum() / previous_application.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([missing_values, percent_data], axis=1, keys=['Total', 'Percent'])
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print('Missing data:\n', missing_data)

Missing data:
                                Total       Percent
RATE_INTEREST_PRIVILEGED     1664263  9.964370e-01
RATE_INTEREST_PRIMARY        1664263  9.964370e-01
RATE_DOWN_PAYMENT             895844  5.363648e-01
AMT_DOWN_PAYMENT              895844  5.363648e-01
NAME_TYPE_SUITE               820405  4.911975e-01
DAYS_TERMINATION              673065  4.029813e-01
NFLAG_INSURED_ON_APPROVAL     673065  4.029813e-01
DAYS_FIRST_DRAWING            673065  4.029813e-01
DAYS_FIRST_DUE                673065  4.029813e-01
DAYS_LAST_DUE_1ST_VERSION     673065  4.029813e-01
DAYS_LAST_DUE                 673065  4.029813e-01
AMT_GOODS_PRICE               385515  2.308177e-01
AMT_ANNUITY                   372235  2.228667e-01
CNT_PAYMENT                   372230  2.228637e-01
PRODUCT_COMBINATION              346  2.071591e-04
AMT_CREDIT                         1  5.987257e-07
SK_ID_CURR                         0  0.000000e+00
NAME_CONTRACT_TYPE                 0  0.000000e+00
WEEKDAY_APPR_PRO

In [56]:
previous_application = previous_application.drop(columns=['RATE_INTEREST_PRIVILEGED', 'RATE_INTEREST_PRIMARY'])

In [57]:
previous_application_removed = previous_application.copy()

# Fill columns
cols_r =['RATE_DOWN_PAYMENT', 'AMT_DOWN_PAYMENT', 'DAYS_TERMINATION',
        'NFLAG_INSURED_ON_APPROVAL', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE', 'AMT_GOODS_PRICE', 'AMT_ANNUITY', 'CNT_PAYMENT',
        'AMT_CREDIT']

cols_s = ['NAME_TYPE_SUITE', 'PRODUCT_COMBINATION', ]

for column in cols_r:
    previous_application_removed[column] = previous_application_removed[column].fillna(
                                           previous_application_removed[column].median())
    
for column in cols_s:
    previous_application_removed[column] = previous_application_removed[column].fillna(
                                           previous_application_removed[column].mode()[0])

In [58]:
# Check missing values
previous_application_removed.isnull().sum().values.sum()

0

### One hot encoding

In [59]:
previous_application_dum = pd.get_dummies(previous_application_removed)
previous_application_dum.shape

(1670214, 162)

### Perform normalization

In [60]:
scaler = MinMaxScaler()
previous_application_norm = pd.DataFrame(scaler.fit_transform(previous_application_dum), 
                                          columns=previous_application_dum.columns)
previous_application_norm.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,HOUR_APPR_PROCESS_START,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,...,PRODUCT_COMBINATION_Cash X-Sell: low,PRODUCT_COMBINATION_Cash X-Sell: middle,PRODUCT_COMBINATION_POS household with interest,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest
0,0.558418,0.482454,0.004139,0.002483,0.002483,2.941132e-07,0.002483,0.652174,1.0,1.5e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.976722,0.022815,0.060251,0.087978,0.098429,0.0005355802,0.087978,0.478261,1.0,0.051619,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.825556,0.061863,0.036025,0.016292,0.01976,0.0005355802,0.016292,0.478261,1.0,0.051619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.985835,0.213772,0.112523,0.065169,0.068179,0.0005355802,0.065169,0.304348,1.0,0.051619,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.424988,0.286461,0.076364,0.048876,0.058515,0.0005355802,0.048876,0.391304,1.0,0.051619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Finding indexes, columns, combining together and so on

In [61]:
# Relief id`s
previous_application_features = previous_application_norm.copy()
del previous_application_features['SK_ID_CURR']
del previous_application_features['SK_ID_PREV']
previous_application_features.shape

(1670214, 160)

### Perform Decomposition

In [62]:
svd = TruncatedSVD(n_components=1, n_iter=7)
previous_application_component = svd.fit_transform(previous_application_features)  
previous_application_component.shape

(1670214, 1)

In [63]:
previous_application_component = pd.DataFrame(previous_application_component, columns=['COMPONENT'])
previous_application_component['SK_ID_CURR'] = previous_application_dum['SK_ID_CURR'].values
previous_application_component['SK_ID_PREV'] = previous_application_dum['SK_ID_PREV'].values
print(previous_application_component.shape)
previous_application_component.head()

(1670214, 3)


Unnamed: 0,COMPONENT,SK_ID_CURR,SK_ID_PREV
0,3.391711,271877,2030495
1,3.409357,108129,2802425
2,3.374997,122040,2523466
3,3.514514,176158,2819243
4,2.849503,202054,1784265


In [64]:
previous_application_component = previous_application_component.set_index('SK_ID_PREV')
previous_application_component.head(10)

Unnamed: 0_level_0,COMPONENT,SK_ID_CURR
SK_ID_PREV,Unnamed: 1_level_1,Unnamed: 2_level_1
2030495,3.391711,271877
2802425,3.409357,108129
2523466,3.374997,122040
2819243,3.514514,176158
1784265,2.849503,202054
1383531,3.136376,199383
2315218,3.459066,175704
1656711,3.42842,296299
2367563,3.460101,342292
2579447,3.482154,334349


### Calculate value for each row

In [65]:
previous_application_comp = POS_CASH_balance_component.groupby('SK_ID_CURR', as_index=False)['COMPONENT'].median().rename(columns = {'COMPONENT': 'previous_application_COMPONENT'})
previous_application_comp.head()

Unnamed: 0,SK_ID_CURR,previous_application_COMPONENT
0,100001,1.126294
1,100002,1.009671
2,100003,1.031928
3,100004,1.01213
4,100005,1.00931


### Merge data_combined.csv and previous_application_COMPONENT

In [66]:
data_combined = data_combined.merge(previous_application_comp, on=['SK_ID_CURR'], how='left')
print(data_combined.shape)
data_combined.head()

(356255, 123)


Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,BUREAU_COMPONENT,installments_payments_COMPONENT,POS_CASH_balance_COMPONENT,previous_application_COMPONENT
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.9722,0.9722,0.9722,0.6192,0.6243,0.6341,1.379206,0.121356,1.009671,1.009671
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.9851,0.9851,0.9851,0.796,0.7987,0.804,1.520511,0.316717,1.031928,1.031928
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,1.690253,0.299847,1.01213,1.01213
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,,,,,,,,0.082729,0.967557,0.967557
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,1.71636,0.339719,1.046385,1.046385


# Save checkpoint

In [67]:
data_combined.to_csv('data/pre_preprocessed_data/data_combined_components.csv', index=False)