# 1. Loading libraries and import data

In [6]:
# import libraries and suppress memory
# importing Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

# define the function for reducing memory usage when importing data
def reduce_memory_usage(df):
  
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [7]:
#reading the csv file 
application_train = reduce_memory_usage(pd.read_csv("/Users/linhdo/Documents/University/University of Utah/Coursework/Spring 2025/Career Capstone/home-credit-default-risk/application_final.csv"))
bureau_df = reduce_memory_usage(pd.read_csv("/Users/linhdo/Documents/University/University of Utah/Coursework/Spring 2025/Career Capstone/home-credit-default-risk/bureau.csv"))
bureau_balance_df = reduce_memory_usage(pd.read_csv("/Users/linhdo/Documents/University/University of Utah/Coursework/Spring 2025/Career Capstone/home-credit-default-risk/bureau_balance.csv"))

Memory usage of dataframe is 17.05 MB
Memory usage after optimization is: 6.72 MB
Decreased by 60.6%
Memory usage of dataframe is 222.62 MB
Memory usage after optimization is: 112.95 MB
Decreased by 49.3%
Memory usage of dataframe is 624.85 MB
Memory usage after optimization is: 338.46 MB
Decreased by 45.8%


# 2. Data Exploration

## a. Application dataset (already downsample)

In [10]:
# getting a summary statistics and shape of the application dataset
application_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CAR_AGE_BIN_Factorized,EXT_SOURCE_1_Category,EXT_SOURCE_2_Category,EXT_SOURCE_3_Category
0,343403,0,Cash loans,M,N,Y,-0.577637,-0.332275,-0.359131,-0.119568,...,-0.076355,-0.063232,-0.167847,-0.291992,-0.334229,-0.481445,0,0,1,1
1,239949,0,Cash loans,F,N,N,-0.577637,-0.408203,-0.751953,-0.761719,...,-0.076355,-0.063232,-0.167847,0.799805,-0.334229,-1.016602,0,0,1,0
2,340073,0,Cash loans,F,N,N,-0.577637,-0.294434,-0.781738,-0.583984,...,-0.076355,-0.063232,-0.167847,-0.291992,-0.334229,0.053497,0,1,2,0
3,442204,0,Cash loans,F,N,Y,-0.577637,-0.237427,-0.722656,-0.708008,...,-0.076355,-0.063232,-0.167847,-0.291992,-0.334229,-1.016602,0,0,2,0
4,443323,0,Cash loans,F,Y,Y,-0.577637,1.185547,-0.237915,2.066406,...,-0.076355,-0.063232,-0.167847,-0.291992,-0.334229,-1.016602,3,1,2,1


In [11]:
application_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49650 entries, 0 to 49649
Data columns (total 45 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   SK_ID_CURR                   49650 non-null  int32  
 1   TARGET                       49650 non-null  int8   
 2   NAME_CONTRACT_TYPE           49650 non-null  object 
 3   CODE_GENDER                  49650 non-null  object 
 4   FLAG_OWN_CAR                 49650 non-null  object 
 5   FLAG_OWN_REALTY              49650 non-null  object 
 6   CNT_CHILDREN                 49650 non-null  float16
 7   AMT_INCOME_TOTAL             49650 non-null  float16
 8   AMT_CREDIT                   49650 non-null  float16
 9   AMT_ANNUITY                  49650 non-null  float16
 10  AMT_GOODS_PRICE              49650 non-null  float16
 11  NAME_INCOME_TYPE             49650 non-null  object 
 12  NAME_EDUCATION_TYPE          49650 non-null  object 
 13  NAME_FAMILY_STAT

## b. Bureau 

First, we will investigate the bureau data by looking at the first few rows. Then we will look further into how we can aggregate this data to join with application table

In [14]:
# Set display options to show all columns
pd.set_option('display.max_columns', None)

# view the first few rows  
bureau_df.head(15)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,
5,215354,5714467,Active,currency 1,-273,0,27456.0,,0.0,0,180000.0,71017.382812,108982.617188,0.0,Credit card,-31,
6,215354,5714468,Active,currency 1,-43,0,79.0,,0.0,0,42103.8,42103.800781,0.0,0.0,Consumer credit,-22,
7,162297,5714469,Closed,currency 1,-1896,0,-1684.0,-1710.0,14985.0,0,76878.45,0.0,0.0,0.0,Consumer credit,-1710,
8,162297,5714470,Closed,currency 1,-1146,0,-811.0,-840.0,0.0,0,103007.7,0.0,0.0,0.0,Consumer credit,-840,
9,162297,5714471,Active,currency 1,-1146,0,-484.0,,0.0,0,4500.0,0.0,0.0,0.0,Credit card,-690,


In [16]:
pd.set_option('display.max_rows', None)  # Show all rows (be cautious for large datasets)

bureau_df['CREDIT_CURRENCY'].value_counts()

bureau_df['CREDIT_DAY_OVERDUE'].value_counts()

bureau_df['CREDIT_TYPE'].value_counts()

CREDIT_TYPE
Consumer credit                                 1251615
Credit card                                      402195
Car loan                                          27690
Mortgage                                          18391
Microloan                                         12413
Loan for business development                      1975
Another type of loan                               1017
Unknown type of loan                                555
Loan for working capital replenishment              469
Cash loan (non-earmarked)                            56
Real estate loan                                     27
Loan for the purchase of equipment                   19
Loan for purchase of shares (margin lending)          4
Mobile operator loan                                  1
Interbank credit                                      1
Name: count, dtype: int64

In [17]:
bureau_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1716428 entries, 0 to 1716427
Data columns (total 17 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   SK_ID_CURR              int32  
 1   SK_ID_BUREAU            int32  
 2   CREDIT_ACTIVE           object 
 3   CREDIT_CURRENCY         object 
 4   DAYS_CREDIT             int16  
 5   CREDIT_DAY_OVERDUE      int16  
 6   DAYS_CREDIT_ENDDATE     float16
 7   DAYS_ENDDATE_FACT       float16
 8   AMT_CREDIT_MAX_OVERDUE  float32
 9   CNT_CREDIT_PROLONG      int8   
 10  AMT_CREDIT_SUM          float32
 11  AMT_CREDIT_SUM_DEBT     float32
 12  AMT_CREDIT_SUM_LIMIT    float32
 13  AMT_CREDIT_SUM_OVERDUE  float32
 14  CREDIT_TYPE             object 
 15  DAYS_CREDIT_UPDATE      int32  
 16  AMT_ANNUITY             float32
dtypes: float16(2), float32(6), int16(2), int32(3), int8(1), object(3)
memory usage: 112.9+ MB


In [47]:
# Number of closed and active credits
bureau_df['closed_credit'] = (bureau_df['CREDIT_ACTIVE'] == 'Closed').astype(int)
bureau_df['active_credit'] = (bureau_df['CREDIT_ACTIVE'] == 'Active').astype(int)

# Median days credit end date
median_enddate = bureau_df.groupby('SK_ID_CURR')['DAYS_CREDIT_ENDDATE'].median().reset_index()

# Median number of credit days overdue
median_overdue = bureau_df.groupby('SK_ID_CURR')['CREDIT_DAY_OVERDUE'].median().reset_index()

# Median duration of credit
bureau_df['credit_duration'] = bureau_df['DAYS_CREDIT_ENDDATE'] - bureau_df['DAYS_CREDIT']
median_duration = bureau_df.groupby('SK_ID_CURR')['credit_duration'].median().reset_index()

# Median AMT_CREDIT_SUM, AMT_CREDIT_SUM_DEBT, and AMT_ANNUITY
median_credit_sum = bureau_df.groupby('SK_ID_CURR')['AMT_CREDIT_SUM'].median().reset_index()
median_credit_debt = bureau_df.groupby('SK_ID_CURR')['AMT_CREDIT_SUM_DEBT'].median().reset_index()
median_amt_annuity = bureau_df.groupby('SK_ID_CURR')['AMT_ANNUITY'].median().reset_index()

# Aggregate closed and active credits
bureau_agg = bureau_df.groupby('SK_ID_CURR').agg(
    closed_credit_number =('closed_credit', 'sum'),
    active_credit_number =('active_credit', 'sum')
).reset_index()

# Merge all computed features
bureau_agg = bureau_agg.merge(median_enddate, on='SK_ID_CURR', how='left')
bureau_agg = bureau_agg.merge(median_overdue, on='SK_ID_CURR', how='left')
bureau_agg = bureau_agg.merge(median_duration, on='SK_ID_CURR', how='left')
bureau_agg = bureau_agg.merge(median_credit_sum, on='SK_ID_CURR', how='left')
bureau_agg = bureau_agg.merge(median_credit_debt, on='SK_ID_CURR', how='left')
bureau_agg = bureau_agg.merge(median_amt_annuity, on='SK_ID_CURR', how='left')

# Rename columns for clarity
bureau_agg.rename(columns={'DAYS_CREDIT_ENDDATE': 'median_DAYS_CREDIT_ENDDATE',
                         'CREDIT_DAY_OVERDUE': 'median_CREDIT_DAY_OVERDUE',
                         'credit_duration': 'median_DURATION_OF_CREDIT',
                         'AMT_CREDIT_SUM': 'median_AMT_CREDIT_SUM',
                         'AMT_CREDIT_SUM_DEBT': 'median_AMT_CREDIT_SUM_DEBT',
                         'AMT_ANNUITY': 'median_AMT_ANNUITY'}, inplace=True)

# Display first 5 rows
bureau_agg.head()


Unnamed: 0,SK_ID_CURR,closed_credit_number,active_credit_number,median_DAYS_CREDIT_ENDDATE,median_CREDIT_DAY_OVERDUE,median_DURATION_OF_CREDIT,median_AMT_CREDIT_SUM,median_AMT_CREDIT_SUM_DEBT,median_AMT_ANNUITY
0,100001,4,3,-179.0,0.0,730.0,168345.0,0.0,0.0
1,100002,6,2,-424.5,0.0,547.5,54130.5,0.0,0.0
2,100003,3,1,-480.0,0.0,725.5,92576.25,0.0,
3,100004,2,0,-488.5,0.0,378.5,94518.898438,0.0,
4,100005,1,2,122.0,0.0,245.0,58500.0,25321.5,0.0


In [67]:
bureau_agg.isna().sum()

# drop median_AMT_ANNUITY since it has over 50% NAs
bureau_agg.drop(columns=['median_AMT_ANNUITY'], inplace=True, errors = 'ignore')
bureau_agg.isna().sum()

SK_ID_CURR                       0
closed_credit_number             0
active_credit_number             0
median_DAYS_CREDIT_ENDDATE    2585
median_CREDIT_DAY_OVERDUE        0
median_DURATION_OF_CREDIT     2585
median_AMT_CREDIT_SUM            2
median_AMT_CREDIT_SUM_DEBT    8372
dtype: int64

In [73]:
# remove rows with NAs values
# Remove rows with missing values in specified columns
bureau_agg.dropna(subset=[
    'median_DAYS_CREDIT_ENDDATE',
    'median_CREDIT_DAY_OVERDUE',
    'median_DURATION_OF_CREDIT',
    'median_AMT_CREDIT_SUM', 
    'median_AMT_CREDIT_SUM_DEBT'
], inplace=True)

# Reset index after dropping rows
bureau_agg.reset_index(drop=True, inplace=True)

In [75]:
bureau_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295341 entries, 0 to 295340
Data columns (total 8 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   SK_ID_CURR                  295341 non-null  int32  
 1   closed_credit_number        295341 non-null  int64  
 2   active_credit_number        295341 non-null  int64  
 3   median_DAYS_CREDIT_ENDDATE  295341 non-null  float64
 4   median_CREDIT_DAY_OVERDUE   295341 non-null  float64
 5   median_DURATION_OF_CREDIT   295341 non-null  float32
 6   median_AMT_CREDIT_SUM       295341 non-null  float64
 7   median_AMT_CREDIT_SUM_DEBT  295341 non-null  float64
dtypes: float32(1), float64(4), int32(1), int64(2)
memory usage: 15.8 MB


## c. Bureau balance

In [95]:
bureau_balance_df.head(10)

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C
5,5715448,-5,C
6,5715448,-6,C
7,5715448,-7,C
8,5715448,-8,C
9,5715448,-9,0


In [97]:
bureau_balance_df['STATUS'].value_counts()

STATUS
C    13646993
0     7499507
X     5810482
1      242347
5       62406
2       23419
3        8924
4        5847
Name: count, dtype: int64

In [87]:
bureau_balance_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27299925 entries, 0 to 27299924
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   SK_ID_BUREAU    int32 
 1   MONTHS_BALANCE  int8  
 2   STATUS          object
dtypes: int32(1), int8(1), object(1)
memory usage: 338.5+ MB


In [89]:
bureau_balance_df.isna().sum()

SK_ID_BUREAU      0
MONTHS_BALANCE    0
STATUS            0
dtype: int64

Since this dataset only informs of the credit status and its duration, we decide not to use this dataset since we have all those data in our bureau_agg

# Join data

In [103]:
# Perform an inner join on SK_ID_CURR
merged_bureau = application_train.merge(bureau_agg, on='SK_ID_CURR', how='left')

# Display first 5 rows of the merged dataset
merged_bureau.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_CONT_MOBILE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,ORGANIZATION_TYPE,APARTMENTS_MODE,APARTMENTS_MEDI,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CAR_AGE_BIN_Factorized,EXT_SOURCE_1_Category,EXT_SOURCE_2_Category,EXT_SOURCE_3_Category,closed_credit_number,active_credit_number,median_DAYS_CREDIT_ENDDATE,median_CREDIT_DAY_OVERDUE,median_DURATION_OF_CREDIT,median_AMT_CREDIT_SUM,median_AMT_CREDIT_SUM_DEBT
0,343403,0,Cash loans,M,N,Y,-0.577637,-0.332275,-0.359131,-0.119568,-0.227051,Pensioner,Secondary / secondary special,Married,House / apartment,0.399414,-1.382812,2.132812,-1.961914,-0.717773,1,1,0,Others,-0.167603,2,-0.062683,0,XNA,0.021942,-0.021545,-0.175903,1.917969,-0.170288,2.484375,-0.076355,-0.063232,-0.167847,-0.291992,-0.334229,-0.481445,0,0,1,1,2.0,0.0,-1068.5,0.0,246.5,44646.75,0.0
1,239949,0,Cash loans,F,N,N,-0.577637,-0.408203,-0.751953,-0.761719,-0.848145,Working,Secondary / secondary special,Widow,House / apartment,1.079102,-0.617676,-0.455811,-1.335938,0.481201,1,1,0,Others,-1.265625,2,-0.062683,0,Other,-0.280029,-0.31665,-0.592285,-0.321045,-0.590332,-0.276123,-0.076355,-0.063232,-0.167847,0.799805,-0.334229,-1.016602,0,0,1,0,1.0,0.0,-2324.0,0.0,366.0,47475.0,0.0
2,340073,0,Cash loans,F,N,N,-0.577637,-0.294434,-0.781738,-0.583984,-0.848145,Working,Secondary / secondary special,Civil marriage,House / apartment,-0.994141,1.026367,-0.459717,0.460693,0.51416,1,1,0,Laborers,-0.167603,2,-0.062683,0,Industry: type 3,0.449951,0.398438,-0.592285,-0.321045,-0.590332,-0.276123,-0.076355,-0.063232,-0.167847,-0.291992,-0.334229,0.053497,0,1,2,0,8.0,3.0,-347.0,0.0,307.0,55350.0,0.0
3,442204,0,Cash loans,F,N,Y,-0.577637,-0.237427,-0.722656,-0.708008,-0.82373,State servant,Higher education,Married,With parents,0.310547,0.736816,-0.493652,1.34375,1.560547,1,1,0,Core staff,-0.167603,2,-0.062683,0,School,-0.280029,-0.31665,-0.175903,-0.321045,-0.170288,-0.276123,-0.076355,-0.063232,-0.167847,-0.291992,-0.334229,-1.016602,0,0,2,0,,,,,,,
4,443323,0,Cash loans,F,Y,Y,-0.577637,1.185547,-0.237915,2.066406,-0.202759,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-1.029297,0.898438,-0.452881,1.071289,1.658203,1,1,0,Accountants,-1.265625,2,-0.062683,0,Business Entity Type 3,-1.019531,-1.041992,0.657227,4.15625,0.669922,5.246094,-0.076355,-0.063232,-0.167847,-0.291992,-0.334229,-1.016602,3,1,2,1,2.0,1.0,-200.0,0.0,365.0,485959.5,0.0


In [105]:
merged_bureau.isna().sum()

SK_ID_CURR                        0
TARGET                            0
NAME_CONTRACT_TYPE                0
CODE_GENDER                       0
FLAG_OWN_CAR                      0
FLAG_OWN_REALTY                   0
CNT_CHILDREN                      0
AMT_INCOME_TOTAL                  0
AMT_CREDIT                        0
AMT_ANNUITY                       0
AMT_GOODS_PRICE                   0
NAME_INCOME_TYPE                  0
NAME_EDUCATION_TYPE               0
NAME_FAMILY_STATUS                0
NAME_HOUSING_TYPE                 0
REGION_POPULATION_RELATIVE        0
DAYS_BIRTH                        0
DAYS_EMPLOYED                     0
DAYS_REGISTRATION                 0
DAYS_ID_PUBLISH                   0
FLAG_MOBIL                        0
FLAG_CONT_MOBILE                  0
FLAG_EMAIL                        0
OCCUPATION_TYPE                   0
CNT_FAM_MEMBERS                   0
REGION_RATING_CLIENT              0
REGION_RATING_CLIENT_W_CITY       0
REG_CITY_NOT_LIVE_CITY      

It is noted that there are some customers (aka sk_id_curr) does not have previous credit record. 

In [108]:
merged_bureau.to_csv("merged_bureau.csv", index = False)