In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

# Summary

In this part, we merge all the different tables into a single dataset. To do so, we created one class per dataset that handles its preprocessing, data cleaning and feature engineering.

# Preprocessing

In [2]:
%reload_ext autoreload
%autoreload 2
import sys
from pathlib import Path
import warnings
from datetime import datetime
from pathlib import Path

import cufflinks as cf
import pandas as pd
import pendulum
from loguru import logger
from plotly.offline import init_notebook_mode

sys.path.append(str(Path.cwd().parent))
from settings.params import PARAMS, SEED
from src.load_all_dataset import load_all_dataset
from src.utils import reduce_memory_usage
from src.datasets import *


# Plotting: initialise plotly's notebook mode, then switch cufflinks offline.
init_notebook_mode(connected=True)
cf.go_offline()

# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# DataFrame display: no cap on columns, at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [3]:
# Loguru line layout: UTC timestamp | left-aligned level | name:function:line - message.
# Adjacent literals are concatenated into the single format string.
log_fmt = (
    "<green>{time:YYYY-MM-DD HH:mm:ss.SSS!UTC}</green> | "
    "<level>{level: <8}</level> | "
    "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
    "{message}"
)
# Install stderr as the logger's only configured handler, using the layout above.
logger.configure(handlers=[dict(sink=sys.stderr, format=log_fmt)])

# Moment this cell was run, expressed in UTC.
CURRENT_DATE = pendulum.now(tz="UTC")

In [4]:
# Input directory taken from the project PARAMS mapping; it is passed as
# `file_directory` to the preprocessing classes further down.
DIR = PARAMS['file_directory']

In [5]:
# NOTE(review): this reassignment replaces the DIR value read from
# PARAMS['file_directory'] in the previous cell with a hard-coded relative
# path. Confirm which of the two is intended and remove the other.
DIR = "data/home-credit-default-risk/"

In [6]:
# Second input directory from PARAMS; used below by the previous_application
# preprocessor and as `file_directory1` of the application train/test preprocessor.
CLEANED_DATA_DIR = PARAMS['cleaned_data_directory']

# Preprocessing of each dataset

## Bureau and Bureau_balance

In [7]:
# Run the bureau / bureau_balance preprocessing on files located in DIR.
bureau_aggregated = (
    preprocess_bureau_balance_and_bureau(file_directory=DIR)
    .main()
)

[32m2024-08-07 17:56:34.714[0m | [1mINFO    [0m | [36mbureau[0m:[36m__init__[0m:[36m47[0m - Preprocessing class initialized.
[32m2024-08-07 17:56:34.719[0m | [1mINFO    [0m | [36mbureau[0m:[36mpreprocess_bureau_balance[0m:[36m63[0m - #######################################################
[32m2024-08-07 17:56:34.724[0m | [1mINFO    [0m | [36mbureau[0m:[36mpreprocess_bureau_balance[0m:[36m64[0m - #          Pre-processing bureau_balance.csv          #
[32m2024-08-07 17:56:34.728[0m | [1mINFO    [0m | [36mbureau[0m:[36mpreprocess_bureau_balance[0m:[36m65[0m - #######################################################
[32m2024-08-07 17:56:34.730[0m | [1mINFO    [0m | [36mbureau[0m:[36mpreprocess_bureau_balance[0m:[36m66[0m - 
Loading the DataFrame, bureau_balance.csv, into memory...
[32m2024-08-07 17:56:50.785[0m | [1mINFO    [0m | [36mutils[0m:[36mreduce_memory_usage[0m:[36m25[0m - Memory usage of dataframe is 624.85 MB
[32m2024-0

## Previous Application

In [9]:
# Run the previous_application preprocessing. Unlike the other aggregated
# tables in this notebook, this one reads from CLEANED_DATA_DIR rather than DIR.
previous_aggregated = (
    preprocess_previous_application(file_directory=CLEANED_DATA_DIR)
    .main()
)

[32m2024-08-07 17:58:42.249[0m | [1mINFO    [0m | [36mprevious_application[0m:[36m__init__[0m:[36m43[0m - Preprocessing class initialized.
[32m2024-08-07 17:58:42.250[0m | [1mINFO    [0m | [36mprevious_application[0m:[36mload_dataframe[0m:[36m53[0m - ########################################################
[32m2024-08-07 17:58:42.251[0m | [1mINFO    [0m | [36mprevious_application[0m:[36mload_dataframe[0m:[36m54[0m - #        Pre-processing previous_application.csv        #
[32m2024-08-07 17:58:42.252[0m | [1mINFO    [0m | [36mprevious_application[0m:[36mload_dataframe[0m:[36m55[0m - ########################################################
[32m2024-08-07 17:58:42.253[0m | [1mINFO    [0m | [36mprevious_application[0m:[36mload_dataframe[0m:[36m56[0m - Loading the DataFrame, previous_application.csv, into memory...
[32m2024-08-07 17:58:56.840[0m | [1mINFO    [0m | [36mutils[0m:[36mreduce_memory_usage[0m:[36m25[0m - Memory usage of

## installments_payments

In [10]:
# Run the installments_payments preprocessing on files located in DIR.
installments_aggregated = (
    preprocess_installments_payments(file_directory=DIR)
    .main()
)

[32m2024-08-07 17:59:08.865[0m | [1mINFO    [0m | [36minstallements_payments[0m:[36m__init__[0m:[36m43[0m - Preprocessing class initialized.
[32m2024-08-07 17:59:08.865[0m | [1mINFO    [0m | [36minstallements_payments[0m:[36mload_dataframe[0m:[36m53[0m - ##########################################################
[32m2024-08-07 17:59:08.865[0m | [1mINFO    [0m | [36minstallements_payments[0m:[36mload_dataframe[0m:[36m54[0m - #        Pre-processing installments_payments.csv        #
[32m2024-08-07 17:59:08.865[0m | [1mINFO    [0m | [36minstallements_payments[0m:[36mload_dataframe[0m:[36m55[0m - ##########################################################
[32m2024-08-07 17:59:08.865[0m | [1mINFO    [0m | [36minstallements_payments[0m:[36mload_dataframe[0m:[36m56[0m - Loading the DataFrame, installments_payments.csv, into memory...
[32m2024-08-07 17:59:34.701[0m | [1mINFO    [0m | [36mutils[0m:[36mreduce_memory_usage[0m:[36m25[0m -

## POS_CASH_balance

In [11]:
# Run the POS_CASH_balance preprocessing on files located in DIR.
pos_aggregated = (
    preprocess_POS_CASH_balance(file_directory=DIR)
    .main()
)

[32m2024-08-07 17:59:49.552[0m | [1mINFO    [0m | [36mpos_cash[0m:[36m__init__[0m:[36m39[0m - Preprocessing class initialized.
[32m2024-08-07 17:59:49.568[0m | [1mINFO    [0m | [36mpos_cash[0m:[36mload_dataframe[0m:[36m49[0m - #########################################################
[32m2024-08-07 17:59:49.568[0m | [1mINFO    [0m | [36mpos_cash[0m:[36mload_dataframe[0m:[36m50[0m - #          Pre-processing POS_CASH_balance.csv          #
[32m2024-08-07 17:59:49.568[0m | [1mINFO    [0m | [36mpos_cash[0m:[36mload_dataframe[0m:[36m51[0m - #########################################################
[32m2024-08-07 17:59:49.592[0m | [1mINFO    [0m | [36mpos_cash[0m:[36mload_dataframe[0m:[36m52[0m - Loading the DataFrame, POS_CASH_balance.csv, into memory...
[32m2024-08-07 18:00:07.474[0m | [1mINFO    [0m | [36mutils[0m:[36mreduce_memory_usage[0m:[36m25[0m - Memory usage of dataframe is 610.43 MB
[32m2024-08-07 18:00:08.638[0m | [1

## credit_card_balance

In [12]:
# Run the credit_card_balance preprocessing on files located in DIR.
cc_aggregated = (
    preprocess_credit_card_balance(file_directory=DIR)
    .main()
)

[32m2024-08-07 18:00:25.357[0m | [1mINFO    [0m | [36mcredit_card_balance[0m:[36m__init__[0m:[36m37[0m - Preprocessing class initialized.
[32m2024-08-07 18:00:25.357[0m | [1mINFO    [0m | [36mcredit_card_balance[0m:[36mload_dataframe[0m:[36m47[0m - #########################################################
[32m2024-08-07 18:00:25.357[0m | [1mINFO    [0m | [36mcredit_card_balance[0m:[36mload_dataframe[0m:[36m48[0m - #        Pre-processing credit_card_balance.csv         #
[32m2024-08-07 18:00:25.357[0m | [1mINFO    [0m | [36mcredit_card_balance[0m:[36mload_dataframe[0m:[36m49[0m - #########################################################
[32m2024-08-07 18:00:25.370[0m | [1mINFO    [0m | [36mcredit_card_balance[0m:[36mload_dataframe[0m:[36m50[0m - Loading the DataFrame, credit_card_balance.csv, into memory...
[32m2024-08-07 18:00:44.984[0m | [1mINFO    [0m | [36mutils[0m:[36mreduce_memory_usage[0m:[36m25[0m - Memory usage of dat

## application_train and application_test

In [14]:
# The application preprocessor takes two directories: CLEANED_DATA_DIR as
# `file_directory1` and DIR as `file_directory2`. Its main() result is
# unpacked into the train and test frames.
application_directories = {
    "file_directory1": CLEANED_DATA_DIR,
    "file_directory2": DIR,
}
application_train, application_test = (
    preprocess_application_train_test(**application_directories).main()
)

[32m2024-08-07 18:02:57.776[0m | [1mINFO    [0m | [36mapplication[0m:[36mload_dataframes[0m:[36m56[0m - #######################################################
[32m2024-08-07 18:02:57.779[0m | [1mINFO    [0m | [36mapplication[0m:[36mload_dataframes[0m:[36m57[0m - #        Pre-processing application_train.csv         #
[32m2024-08-07 18:02:57.782[0m | [1mINFO    [0m | [36mapplication[0m:[36mload_dataframes[0m:[36m58[0m - #        Pre-processing application_test.csv          #
[32m2024-08-07 18:02:57.784[0m | [1mINFO    [0m | [36mapplication[0m:[36mload_dataframes[0m:[36m59[0m - #######################################################
[32m2024-08-07 18:02:57.787[0m | [1mINFO    [0m | [36mapplication[0m:[36mload_dataframes[0m:[36m60[0m - 
Loading the DataFrames into memory...
[32m2024-08-07 18:03:16.154[0m | [1mINFO    [0m | [36mutils[0m:[36mreduce_memory_usage[0m:[36m25[0m - Memory usage of dataframe is 288.57 MB
[32m2024-08-07 

In [15]:
# Aggregated side tables, in the positional order merge_all_tables is called with.
aggregated_tables = (
    bureau_aggregated,
    previous_aggregated,
    installments_aggregated,
    pos_aggregated,
    cc_aggregated,
)
# Combine the application train/test frames with every aggregated table.
train_data, test_data = merge_all_tables(
    application_train, application_test, *aggregated_tables
)

[32m2024-08-07 18:03:36.801[0m | [1mINFO    [0m | [36mmerge_all_tables[0m:[36mmerge_all_tables[0m:[36m34[0m - Merging application_train and application_test with aggregated tables.
[32m2024-08-07 18:03:37.811[0m | [1mINFO    [0m | [36mmerge_all_tables[0m:[36mmerge_all_tables[0m:[36m39[0m - Merged with bureau_aggregated.
[32m2024-08-07 18:03:41.023[0m | [1mINFO    [0m | [36mmerge_all_tables[0m:[36mmerge_all_tables[0m:[36m44[0m - Merged with previous_aggregated.
[32m2024-08-07 18:03:43.339[0m | [1mINFO    [0m | [36mmerge_all_tables[0m:[36mmerge_all_tables[0m:[36m49[0m - Merged with installments_aggregated.
[32m2024-08-07 18:03:45.168[0m | [1mINFO    [0m | [36mmerge_all_tables[0m:[36mmerge_all_tables[0m:[36m54[0m - Merged with pos_aggregated.
[32m2024-08-07 18:03:50.633[0m | [1mINFO    [0m | [36mmerge_all_tables[0m:[36mmerge_all_tables[0m:[36m63[0m - Filled missing values with 0.
[32m2024-08-07 18:03:52.269[0m | [1mINFO    [0

In [None]:
# Write the merged train/test sets into DIR, the same directory the inputs
# were read from.
# Paths are joined with pathlib instead of `DIR + name`, so the output location
# is correct whether or not DIR ends with a path separator (DIR may come from
# PARAMS['file_directory']).
output_dir = Path(DIR)
train_data.to_csv(output_dir / 'train_data_final.csv')
test_data.to_csv(output_dir / 'test_data_final.csv')