In [1]:
# Importing the desired packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)# To see all the columns of a dataframe
pd.set_option('display.max_rows', None)

In [2]:
# Function to reduce the memory usage of various Dataframes
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
       
        1. Iterate over every column
        2. Determine if the column is numeric
        3. Determine if the column can be represented by an integer
        4. Find the min and the max value
        5. Determine and apply the smallest datatype that can fit the range of values

    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


In [3]:
def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [12]:
# Reading the bureau dataset
df_bureau = import_data('New_bureau.csv')

Memory usage of dataframe is 170.24 MB
Memory usage after optimization is: 55.66 MB
Decreased by 67.3%


In [5]:
# Reading the bureau_balance dataset
df_bureau_bal = import_data('bureau_balance.csv')

Memory usage of dataframe is 624.85 MB
Memory usage after optimization is: 156.21 MB
Decreased by 75.0%


In [13]:
df_bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE
0,215354,5714462,Closed,currency 1,-497,0,-153.0,0,91323.0,0.0,0.0,Consumer credit,-131
1,215354,5714463,Active,currency 1,-208,0,1075.0,0,225000.0,171342.0,0.0,Credit card,-20
2,215354,5714464,Active,currency 1,-203,0,528.0,0,464323.5,,0.0,Consumer credit,-16
3,215354,5714465,Active,currency 1,-203,0,,0,90000.0,,0.0,Credit card,-16
4,215354,5714466,Active,currency 1,-629,0,1197.0,0,2700000.0,,0.0,Consumer credit,-21


In [14]:
# Visualizing the info of the bureau dataframe
df_bureau.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1716428 entries, 0 to 1716427
Data columns (total 13 columns):
SK_ID_CURR                int32
SK_ID_BUREAU              int32
CREDIT_ACTIVE             category
CREDIT_CURRENCY           category
DAYS_CREDIT               int16
CREDIT_DAY_OVERDUE        int16
DAYS_CREDIT_ENDDATE       float16
CNT_CREDIT_PROLONG        int8
AMT_CREDIT_SUM            float32
AMT_CREDIT_SUM_DEBT       float32
AMT_CREDIT_SUM_OVERDUE    float32
CREDIT_TYPE               category
DAYS_CREDIT_UPDATE        int32
dtypes: category(3), float16(1), float32(3), int16(2), int32(3), int8(1)
memory usage: 55.7 MB


In [16]:
# Visualizing the 'CREDIT_CURRENCY' categorical column of df_bureau
df_bureau['CREDIT_CURRENCY'].value_counts()

currency 1    1715020
currency 2       1224
currency 3        174
currency 4         10
Name: CREDIT_CURRENCY, dtype: int64

In [17]:
# Almost all of the rows have currency 1 as its value. Hence remove this column as it lacks variance
df_bureau.drop(columns='CREDIT_CURRENCY', inplace=True)

In [20]:
# Calculating the no. of previous loans information with credit bureaus for each applicant.
df_bureau['Prev_Bu_Loan_Cnt'] = df_bureau.groupby('SK_ID_CURR')['SK_ID_BUREAU'].transform('count')

In [21]:
df_bureau.head(5)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,Prev_Bu_Loan_Cnt
0,215354,5714462,Closed,-497,0,-153.0,0,91323.0,0.0,0.0,Consumer credit,-131,11
1,215354,5714463,Active,-208,0,1075.0,0,225000.0,171342.0,0.0,Credit card,-20,11
2,215354,5714464,Active,-203,0,528.0,0,464323.5,,0.0,Consumer credit,-16,11
3,215354,5714465,Active,-203,0,,0,90000.0,,0.0,Credit card,-16,11
4,215354,5714466,Active,-629,0,1197.0,0,2700000.0,,0.0,Consumer credit,-21,11


### Processing the bureau_bal dataframe

In [100]:
df_bureau_bal.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [23]:
# Visualizing the info of df_bureau_bal dataframe
df_bureau_bal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27299925 entries, 0 to 27299924
Data columns (total 3 columns):
SK_ID_BUREAU      int32
MONTHS_BALANCE    int8
STATUS            category
dtypes: category(1), int32(1), int8(1)
memory usage: 156.2 MB


In [24]:
# Visualizing the unique values of the df_bureau_bal['STATUS'] categorical column
df_bureau_bal['STATUS'].value_counts()

C    13646993
0     7499507
X     5810482
1      242347
5       62406
2       23419
3        8924
4        5847
Name: STATUS, dtype: int64

In [25]:
# Getting the dummies for categorical column of df_bureau_bal
df_bureau_bal = pd.get_dummies(df_bureau_bal)

In [26]:
# Collapsing the df_bureau_bal dataframe on 'SK_ID_BUREAU' column
df_bureau_bal=df_bureau_bal.groupby('SK_ID_BUREAU',as_index=False).mean()

In [27]:
df_bureau_bal.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS_0,STATUS_1,STATUS_2,STATUS_3,STATUS_4,STATUS_5,STATUS_C,STATUS_X
0,5001709,-48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.886598,0.113402
1,5001710,-41.0,0.060241,0.0,0.0,0.0,0.0,0.0,0.578313,0.361446
2,5001711,-1.5,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.25
3,5001712,-9.0,0.526316,0.0,0.0,0.0,0.0,0.0,0.473684,0.0
4,5001713,-10.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [28]:
# Dropping Columns from df_bureau_bal to prevent linear dependency
df_bureau_bal.drop(columns=['STATUS_4','STATUS_5','STATUS_2','STATUS_3','STATUS_1'],inplace=True)

In [30]:
# Pre-pending bu_bal to each column name of dataframe df_bureau_bal
ls = ['bu_bal_'+col for col in df_bureau_bal.columns if col!='SK_ID_BUREAU']
ls.insert(0,'SK_ID_BUREAU')
df_bureau_bal.columns=ls

In [31]:
# Visualizing the result
df_bureau_bal.head()

Unnamed: 0,SK_ID_BUREAU,bu_bal_MONTHS_BALANCE,bu_bal_STATUS_0,bu_bal_STATUS_C,bu_bal_STATUS_X
0,5001709,-48.0,0.0,0.886598,0.113402
1,5001710,-41.0,0.060241,0.578313,0.361446
2,5001711,-1.5,0.75,0.0,0.25
3,5001712,-9.0,0.526316,0.473684,0.0
4,5001713,-10.5,0.0,0.0,1.0


In [47]:
# Getting the info of df_bureau_bal before merging
df_bureau_bal.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 817395 entries, 0 to 817394
Data columns (total 5 columns):
SK_ID_BUREAU             817395 non-null int64
bu_bal_MONTHS_BALANCE    817395 non-null float64
bu_bal_STATUS_0          817395 non-null float64
bu_bal_STATUS_C          817395 non-null float64
bu_bal_STATUS_X          817395 non-null float64
dtypes: float64(4), int64(1)
memory usage: 37.4 MB


### Processing the bureau dataframe

In [33]:
# Visualizing the head of df_bureau
df_bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,Prev_Bu_Loan_Cnt
0,215354,5714462,Closed,-497,0,-153.0,0,91323.0,0.0,0.0,Consumer credit,-131,11
1,215354,5714463,Active,-208,0,1075.0,0,225000.0,171342.0,0.0,Credit card,-20,11
2,215354,5714464,Active,-203,0,528.0,0,464323.5,,0.0,Consumer credit,-16,11
3,215354,5714465,Active,-203,0,,0,90000.0,,0.0,Credit card,-16,11
4,215354,5714466,Active,-629,0,1197.0,0,2700000.0,,0.0,Consumer credit,-21,11


In [36]:
# Visualizing the unique values of SK_ID_BUREAU column
df_bureau['SK_ID_BUREAU'].value_counts().head()

5000822    1
6547158    1
6487797    1
6481654    1
6483703    1
Name: SK_ID_BUREAU, dtype: int64

In [37]:
# Visualizing the unique values of SK_ID_CURR column
df_bureau['SK_ID_CURR'].value_counts().head()

120860    116
169704     94
318065     78
251643     61
425396     60
Name: SK_ID_CURR, dtype: int64

In [38]:
# Visualizing the unique values of SK_ID_BUREAU column of df_bureau_bal
df_bureau_bal['SK_ID_BUREAU'].value_counts().head()

6293503    1
6593990    1
6602178    1
6772159    1
6780347    1
Name: SK_ID_BUREAU, dtype: int64

In [46]:
# Pre-pending bu to each column name of dataframe df_bureau
ls = ['bu_'+col for col in df_bureau.columns if col not in ['SK_ID_BUREAU','SK_ID_CURR']]
ls.insert(0,'SK_ID_CURR')
ls.insert(1,'SK_ID_BUREAU')
df_bureau.columns=ls

In [48]:
#Examining the results
df_bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,bu_CREDIT_ACTIVE,bu_DAYS_CREDIT,bu_CREDIT_DAY_OVERDUE,bu_DAYS_CREDIT_ENDDATE,bu_CNT_CREDIT_PROLONG,bu_AMT_CREDIT_SUM,bu_AMT_CREDIT_SUM_DEBT,bu_AMT_CREDIT_SUM_OVERDUE,bu_CREDIT_TYPE,bu_DAYS_CREDIT_UPDATE,bu_Prev_Bu_Loan_Cnt
0,215354,5714462,Closed,-497,0,-153.0,0,91323.0,0.0,0.0,Consumer credit,-131,11
1,215354,5714463,Active,-208,0,1075.0,0,225000.0,171342.0,0.0,Credit card,-20,11
2,215354,5714464,Active,-203,0,528.0,0,464323.5,,0.0,Consumer credit,-16,11
3,215354,5714465,Active,-203,0,,0,90000.0,,0.0,Credit card,-16,11
4,215354,5714466,Active,-629,0,1197.0,0,2700000.0,,0.0,Consumer credit,-21,11


### Merging dataframes df_bureau & df_bureau_bal & analyzing the merged dataframes

In [49]:
# Merging df_bureau & df_bureau_bal on 'SK_ID_BUREAU' Column 
bu_bal_merged = df_bureau.merge(df_bureau_bal, on='SK_ID_BUREAU', how='left')

In [55]:
bu_bal_merged.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,bu_CREDIT_ACTIVE,bu_DAYS_CREDIT,bu_CREDIT_DAY_OVERDUE,bu_DAYS_CREDIT_ENDDATE,bu_CNT_CREDIT_PROLONG,bu_AMT_CREDIT_SUM,bu_AMT_CREDIT_SUM_DEBT,bu_AMT_CREDIT_SUM_OVERDUE,bu_CREDIT_TYPE,bu_DAYS_CREDIT_UPDATE,bu_Prev_Bu_Loan_Cnt,bu_bal_MONTHS_BALANCE,bu_bal_STATUS_0,bu_bal_STATUS_C,bu_bal_STATUS_X
0,215354,5714462,Closed,-497,0,-153.0,0,91323.0,0.0,0.0,Consumer credit,-131,11,,,,
1,215354,5714463,Active,-208,0,1075.0,0,225000.0,171342.0,0.0,Credit card,-20,11,,,,
2,215354,5714464,Active,-203,0,528.0,0,464323.5,,0.0,Consumer credit,-16,11,,,,
3,215354,5714465,Active,-203,0,,0,90000.0,,0.0,Credit card,-16,11,,,,
4,215354,5714466,Active,-629,0,1197.0,0,2700000.0,,0.0,Consumer credit,-21,11,,,,


In [56]:
# Getting the Categorical columns of the merged dataframe bu_bal_merged
bu_bal_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1716428 entries, 0 to 1716427
Data columns (total 17 columns):
SK_ID_CURR                   int32
SK_ID_BUREAU                 int32
bu_CREDIT_ACTIVE             category
bu_DAYS_CREDIT               int16
bu_CREDIT_DAY_OVERDUE        int16
bu_DAYS_CREDIT_ENDDATE       float16
bu_CNT_CREDIT_PROLONG        int8
bu_AMT_CREDIT_SUM            float32
bu_AMT_CREDIT_SUM_DEBT       float32
bu_AMT_CREDIT_SUM_OVERDUE    float32
bu_CREDIT_TYPE               category
bu_DAYS_CREDIT_UPDATE        int32
bu_Prev_Bu_Loan_Cnt          int64
bu_bal_MONTHS_BALANCE        float64
bu_bal_STATUS_0              float64
bu_bal_STATUS_C              float64
bu_bal_STATUS_X              float64
dtypes: category(2), float16(1), float32(3), float64(4), int16(2), int32(3), int64(1), int8(1)
memory usage: 132.6 MB


In [57]:
# Checking the unique values of bu_CREDIT_ACTIVE
bu_bal_merged['bu_CREDIT_ACTIVE'].value_counts()

Closed      1079273
Active       630607
Sold           6527
Bad debt         21
Name: bu_CREDIT_ACTIVE, dtype: int64

In [58]:
# Checking the unique values of bu_CREDIT_TYPE
bu_bal_merged['bu_CREDIT_TYPE'].value_counts()

Consumer credit                                 1251615
Credit card                                      402195
Car loan                                          27690
Mortgage                                          18391
Microloan                                         12413
Loan for business development                      1975
Another type of loan                               1017
Unknown type of loan                                555
Loan for working capital replenishment              469
Cash loan (non-earmarked)                            56
Real estate loan                                     27
Loan for the purchase of equipment                   19
Loan for purchase of shares (margin lending)          4
Mobile operator loan                                  1
Interbank credit                                      1
Name: bu_CREDIT_TYPE, dtype: int64

In [60]:
# Reclassifying minority values in 'bu_CREDIT_TYPE' as other
ls = ['Consumer credit','Credit card']
bu_bal_merged['bu_CREDIT_TYPE'] = bu_bal_merged['bu_CREDIT_TYPE'].astype(str)
bu_bal_merged.loc[~(bu_bal_merged['bu_CREDIT_TYPE'].isin(ls))
             ,'bu_CREDIT_TYPE'] = 'other'
bu_bal_merged['bu_CREDIT_TYPE'] = bu_bal_merged['bu_CREDIT_TYPE'].astype('category')

In [61]:
# verifing the result
bu_bal_merged['bu_CREDIT_TYPE'].value_counts()

Consumer credit    1251615
Credit card         402195
other                62618
Name: bu_CREDIT_TYPE, dtype: int64

In [62]:
# Getting dummies for the merged Dataframes
bu_bal_merged=pd.get_dummies(bu_bal_merged)

In [63]:
bu_bal_merged.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,bu_DAYS_CREDIT,bu_CREDIT_DAY_OVERDUE,bu_DAYS_CREDIT_ENDDATE,bu_CNT_CREDIT_PROLONG,bu_AMT_CREDIT_SUM,bu_AMT_CREDIT_SUM_DEBT,bu_AMT_CREDIT_SUM_OVERDUE,bu_DAYS_CREDIT_UPDATE,bu_Prev_Bu_Loan_Cnt,bu_bal_MONTHS_BALANCE,bu_bal_STATUS_0,bu_bal_STATUS_C,bu_bal_STATUS_X,bu_CREDIT_ACTIVE_Active,bu_CREDIT_ACTIVE_Bad debt,bu_CREDIT_ACTIVE_Closed,bu_CREDIT_ACTIVE_Sold,bu_CREDIT_TYPE_Consumer credit,bu_CREDIT_TYPE_Credit card,bu_CREDIT_TYPE_other
0,215354,5714462,-497,0,-153.0,0,91323.0,0.0,0.0,-131,11,,,,,0,0,1,0,1,0,0
1,215354,5714463,-208,0,1075.0,0,225000.0,171342.0,0.0,-20,11,,,,,1,0,0,0,0,1,0
2,215354,5714464,-203,0,528.0,0,464323.5,,0.0,-16,11,,,,,1,0,0,0,1,0,0
3,215354,5714465,-203,0,,0,90000.0,,0.0,-16,11,,,,,1,0,0,0,0,1,0
4,215354,5714466,-629,0,1197.0,0,2700000.0,,0.0,-21,11,,,,,1,0,0,0,1,0,0


In [64]:
# Dropping redundant SK_ID_BUREAU column from the merged dataframe bu_bal_merged
bu_bal_merged.drop(columns='SK_ID_BUREAU',inplace=True)

In [65]:
bu_bal_merged.head()

Unnamed: 0,SK_ID_CURR,bu_DAYS_CREDIT,bu_CREDIT_DAY_OVERDUE,bu_DAYS_CREDIT_ENDDATE,bu_CNT_CREDIT_PROLONG,bu_AMT_CREDIT_SUM,bu_AMT_CREDIT_SUM_DEBT,bu_AMT_CREDIT_SUM_OVERDUE,bu_DAYS_CREDIT_UPDATE,bu_Prev_Bu_Loan_Cnt,bu_bal_MONTHS_BALANCE,bu_bal_STATUS_0,bu_bal_STATUS_C,bu_bal_STATUS_X,bu_CREDIT_ACTIVE_Active,bu_CREDIT_ACTIVE_Bad debt,bu_CREDIT_ACTIVE_Closed,bu_CREDIT_ACTIVE_Sold,bu_CREDIT_TYPE_Consumer credit,bu_CREDIT_TYPE_Credit card,bu_CREDIT_TYPE_other
0,215354,-497,0,-153.0,0,91323.0,0.0,0.0,-131,11,,,,,0,0,1,0,1,0,0
1,215354,-208,0,1075.0,0,225000.0,171342.0,0.0,-20,11,,,,,1,0,0,0,0,1,0
2,215354,-203,0,528.0,0,464323.5,,0.0,-16,11,,,,,1,0,0,0,1,0,0
3,215354,-203,0,,0,90000.0,,0.0,-16,11,,,,,1,0,0,0,0,1,0
4,215354,-629,0,1197.0,0,2700000.0,,0.0,-21,11,,,,,1,0,0,0,1,0,0


In [66]:
# Cpllapsing the bu_bal_merged dataframe on 'SK_ID_CURR'
bu_bal_final=bu_bal_merged.groupby('SK_ID_CURR',as_index=False).mean()

In [67]:
#verifying the result
bu_bal_final.head()

Unnamed: 0,SK_ID_CURR,bu_DAYS_CREDIT,bu_CREDIT_DAY_OVERDUE,bu_DAYS_CREDIT_ENDDATE,bu_CNT_CREDIT_PROLONG,bu_AMT_CREDIT_SUM,bu_AMT_CREDIT_SUM_DEBT,bu_AMT_CREDIT_SUM_OVERDUE,bu_DAYS_CREDIT_UPDATE,bu_Prev_Bu_Loan_Cnt,bu_bal_MONTHS_BALANCE,bu_bal_STATUS_0,bu_bal_STATUS_C,bu_bal_STATUS_X,bu_CREDIT_ACTIVE_Active,bu_CREDIT_ACTIVE_Bad debt,bu_CREDIT_ACTIVE_Closed,bu_CREDIT_ACTIVE_Sold,bu_CREDIT_TYPE_Consumer credit,bu_CREDIT_TYPE_Credit card,bu_CREDIT_TYPE_other
0,100001,-735.0,0.0,82.4375,0.0,207623.578125,85240.929688,0.0,-93.142857,7,-11.785714,0.336651,0.44124,0.21459,0.428571,0.0,0.571429,0.0,1.0,0.0,0.0
1,100002,-874.0,0.0,-349.0,0.0,108131.945312,49156.199219,0.0,-499.875,8,-21.875,0.40696,0.175426,0.161932,0.25,0.0,0.75,0.0,0.5,0.5,0.0
2,100003,-1400.75,0.0,-544.5,0.0,254350.125,0.0,0.0,-816.0,4,,,,,0.25,0.0,0.75,0.0,0.5,0.5,0.0
3,100004,-867.0,0.0,-488.5,0.0,94518.898438,0.0,0.0,-532.0,2,,,,,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,100005,-190.666667,0.0,439.25,0.0,219042.0,189469.5,0.0,-54.333333,3,-3.0,0.735043,0.128205,0.136752,0.666667,0.0,0.333333,0.0,0.666667,0.333333,0.0


In [70]:
# Redundant columns to drop from bu_bal_final to remove linear dependence
cols_list = ['bu_CREDIT_ACTIVE_Bad debt','bu_CREDIT_ACTIVE_Sold','bu_CREDIT_TYPE_other']

In [71]:
#Dropping Redundant columns from the bu_bal_final
bu_bal_final.drop(columns=cols_list,inplace=True)

In [73]:
bu_bal_final.head()

Unnamed: 0,SK_ID_CURR,bu_DAYS_CREDIT,bu_CREDIT_DAY_OVERDUE,bu_DAYS_CREDIT_ENDDATE,bu_CNT_CREDIT_PROLONG,bu_AMT_CREDIT_SUM,bu_AMT_CREDIT_SUM_DEBT,bu_AMT_CREDIT_SUM_OVERDUE,bu_DAYS_CREDIT_UPDATE,bu_Prev_Bu_Loan_Cnt,bu_bal_MONTHS_BALANCE,bu_bal_STATUS_0,bu_bal_STATUS_C,bu_bal_STATUS_X,bu_CREDIT_ACTIVE_Active,bu_CREDIT_ACTIVE_Closed,bu_CREDIT_TYPE_Consumer credit,bu_CREDIT_TYPE_Credit card
0,100001,-735.0,0.0,82.4375,0.0,207623.578125,85240.929688,0.0,-93.142857,7,-11.785714,0.336651,0.44124,0.21459,0.428571,0.571429,1.0,0.0
1,100002,-874.0,0.0,-349.0,0.0,108131.945312,49156.199219,0.0,-499.875,8,-21.875,0.40696,0.175426,0.161932,0.25,0.75,0.5,0.5
2,100003,-1400.75,0.0,-544.5,0.0,254350.125,0.0,0.0,-816.0,4,,,,,0.25,0.75,0.5,0.5
3,100004,-867.0,0.0,-488.5,0.0,94518.898438,0.0,0.0,-532.0,2,,,,,0.0,1.0,1.0,0.0
4,100005,-190.666667,0.0,439.25,0.0,219042.0,189469.5,0.0,-54.333333,3,-3.0,0.735043,0.128205,0.136752,0.666667,0.333333,0.666667,0.333333


In [74]:
# Getting the info of the final merged dataframe
bu_bal_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 305811 entries, 0 to 305810
Data columns (total 18 columns):
SK_ID_CURR                        305811 non-null int64
bu_DAYS_CREDIT                    305811 non-null float64
bu_CREDIT_DAY_OVERDUE             305811 non-null float64
bu_DAYS_CREDIT_ENDDATE            303226 non-null float16
bu_CNT_CREDIT_PROLONG             305811 non-null float64
bu_AMT_CREDIT_SUM                 305809 non-null float32
bu_AMT_CREDIT_SUM_DEBT            297439 non-null float32
bu_AMT_CREDIT_SUM_OVERDUE         305811 non-null float32
bu_DAYS_CREDIT_UPDATE             305811 non-null float64
bu_Prev_Bu_Loan_Cnt               305811 non-null int64
bu_bal_MONTHS_BALANCE             134542 non-null float64
bu_bal_STATUS_0                   134542 non-null float64
bu_bal_STATUS_C                   134542 non-null float64
bu_bal_STATUS_X                   134542 non-null float64
bu_CREDIT_ACTIVE_Active           305811 non-null float64
bu_CREDIT_ACTIVE_Closed

In [76]:
# Saving the final merged dataframe into a file.
bu_bal_final.to_csv('bureau&bu_bal_final.csv',index=False)