In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


# Summary

In this part, we merge all the different tables into a single dataset. To do so, we created one class per dataset that handles its preprocessing, data cleaning and feature engineering.

# Preprocessing

In [None]:
import gc
import os
import pickle
import pprint
import shutil
import sys
from pathlib import Path
import warnings
from datetime import datetime

import cufflinks as cf
import pandas as pd
import pendulum
from loguru import logger
from plotly.offline import init_notebook_mode

sys.path.append(str(Path.cwd().parent))
from settings.params import PARAMS, SEED
from src.load_all_dataset import load_all_dataset
from utils import reduce_memory_usage
from src.datasets import *


# Notebook-wide display configuration: plotting back-ends, warning filter,
# pandas display limits and estimator rendering.

# Initialise plotly's notebook mode, then switch cufflinks to offline mode.
init_notebook_mode(connected=True)
cf.go_offline()


# Silences every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and dtype warnings raised by the
# preprocessing classes; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# NOTE(review): `pio` is not imported in this cell; presumably it is plotly.io
# arriving through `from src.datasets import *` -- confirm, otherwise this line
# raises NameError on a fresh kernel. The renderer is hard-coded to "colab".
pio.renderers.default = "colab"

# Show every column when displaying a DataFrame, but cap displayed rows at 100.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
# NOTE(review): `set_config` is not imported in this cell either; presumably it
# is scikit-learn's set_config coming from the same star import -- confirm.
set_config(display="diagram", print_changed_only=False)



Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.




In [None]:
# Set logging format
log_fmt = "<green>{time:YYYY-MM-DD HH:mm:ss.SSS!UTC}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - {message}"
logger.configure(handlers=[{"sink": sys.stderr, "format": log_fmt}])

# current date
CURRENT_DATE = pendulum.now(tz="UTC")

In [None]:
# Data directory passed as `file_directory` to most preprocessing classes and
# used as the destination of the final CSV export; read from the project
# settings under the 'file_directory' key.
# NOTE(review): the CSV export builds paths as DIR + '<file>.csv', so DIR is
# presumably a str ending in a path separator -- confirm in settings/params.
DIR = PARAMS['file_directory']

In [None]:
# Directory of the cleaned tables, read from the project settings.
# Deliberately NOT chained with DIR (`CLEANED_DATA_DIR = DIR = ...`): a chained
# assignment rebinds DIR as well, so every later `file_directory=DIR` call would
# silently read from the cleaned-data directory instead of PARAMS['file_directory'].
# NOTE(review): the key contains a double underscore ('cleaned_data__directory');
# it is kept as-is -- confirm it matches the key defined in settings/params.
CLEANED_DATA_DIR = PARAMS['cleaned_data__directory']

In [None]:
# Run the project's dataset loader (imported from src.load_all_dataset).
# Any return value is discarded here.
# NOTE(review): presumably this fetches or materialises the tables that the
# preprocessing cells below read from disk -- confirm in src/load_all_dataset.py.
load_all_dataset()

# Preprocessing of each dataset

## Bureau and Bureau_balance

In [None]:
# Build the bureau / bureau_balance preprocessor on DIR and run its main()
# pipeline. The result is kept as `bureau_aggregated` and passed to
# merge_all_tables further down.
bureau_aggregated = preprocess_bureau_balance_and_bureau(file_directory=DIR).main()

[32m2024-08-06 22:33:27.246[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m39[0m - Preprocessing class initialized.
[32m2024-08-06 22:33:27.260[0m | [1mINFO    [0m | [36m__main__[0m:[36mpreprocess_bureau_balance[0m:[36m55[0m - #######################################################
[32m2024-08-06 22:33:27.262[0m | [1mINFO    [0m | [36m__main__[0m:[36mpreprocess_bureau_balance[0m:[36m56[0m - #          Pre-processing bureau_balance.csv          #
[32m2024-08-06 22:33:27.264[0m | [1mINFO    [0m | [36m__main__[0m:[36mpreprocess_bureau_balance[0m:[36m57[0m - #######################################################
[32m2024-08-06 22:33:27.268[0m | [1mINFO    [0m | [36m__main__[0m:[36mpreprocess_bureau_balance[0m:[36m58[0m - 
Loading the DataFrame, bureau_balance.csv, into memory...
[32m2024-08-06 22:33:47.543[0m | [1mINFO    [0m | [36m__main__[0m:[36mreduce_memory_usage[0m:[36m24[0m - Memory usage of dataframe is 624.85 M

## Previous Application

In [None]:
# Build the previous_application preprocessor and run its main() pipeline.
# Unlike most other tables, this one is read from CLEANED_DATA_DIR rather than DIR.
# The result is passed to merge_all_tables further down.
previous_aggregated = preprocess_previous_application(file_directory=CLEANED_DATA_DIR).main()

[32m2024-08-06 22:34:45.165[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m29[0m - Preprocessing class initialized.
[32m2024-08-06 22:34:45.167[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m39[0m - ########################################################
[32m2024-08-06 22:34:45.170[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m40[0m - #        Pre-processing previous_application.csv        #
[32m2024-08-06 22:34:45.172[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m41[0m - ########################################################
[32m2024-08-06 22:34:45.174[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m42[0m - Loading the DataFrame, previous_application.csv, into memory...
[32m2024-08-06 22:34:57.433[0m | [1mINFO    [0m | [36m__main__[0m:[36mreduce_memory_usage[0m:[36m24[0m - Memory usage of dataframe is 471.48 MB
[32m2024-08-06 22:34:58.127[0m 

## installments_payments

In [None]:
# Build the installments_payments preprocessor on DIR and run its main()
# pipeline. The result is passed to merge_all_tables further down.
installments_aggregated = preprocess_installments_payments(file_directory = DIR).main()

[32m2024-08-06 22:35:11.066[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m29[0m - Preprocessing class initialized.
[32m2024-08-06 22:35:11.070[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m39[0m - ##########################################################
[32m2024-08-06 22:35:11.073[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m40[0m - #        Pre-processing installments_payments.csv        #
[32m2024-08-06 22:35:11.074[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m41[0m - ##########################################################
[32m2024-08-06 22:35:11.076[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m42[0m - Loading the DataFrame, installments_payments.csv, into memory...
[32m2024-08-06 22:35:28.972[0m | [1mINFO    [0m | [36m__main__[0m:[36mreduce_memory_usage[0m:[36m24[0m - Memory usage of dataframe is 830.41 MB
[32m2024-08-06 22:35:30.08

## POS_CASH_balance

In [None]:
# Build the POS_CASH_balance preprocessor on DIR and run its main() pipeline.
# The result is passed to merge_all_tables further down.
pos_aggregated = preprocess_POS_CASH_balance(file_directory = DIR).main()

[32m2024-08-06 22:35:32.370[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m29[0m - Preprocessing class initialized.
[32m2024-08-06 22:35:32.373[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m39[0m - #########################################################
[32m2024-08-06 22:35:32.377[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m40[0m - #          Pre-processing POS_CASH_balance.csv          #
[32m2024-08-06 22:35:32.378[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m41[0m - #########################################################
[32m2024-08-06 22:35:32.381[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m42[0m - Loading the DataFrame, POS_CASH_balance.csv, into memory...
[32m2024-08-06 22:35:43.010[0m | [1mINFO    [0m | [36m__main__[0m:[36mreduce_memory_usage[0m:[36m24[0m - Memory usage of dataframe is 610.43 MB
[32m2024-08-06 22:35:43.624[0m | 

## credit_card_balance

In [None]:
# Build the credit_card_balance preprocessor on DIR and run its main() pipeline.
# The result is passed to merge_all_tables further down.
cc_aggregated = preprocess_credit_card_balance(file_directory=DIR).main()

[32m2024-08-06 22:35:49.704[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m29[0m - Preprocessing class initialized.
[32m2024-08-06 22:35:49.707[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m39[0m - #########################################################
[32m2024-08-06 22:35:49.709[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m40[0m - #        Pre-processing credit_card_balance.csv         #
[32m2024-08-06 22:35:49.711[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m41[0m - #########################################################
[32m2024-08-06 22:35:49.713[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m42[0m - Loading the DataFrame, credit_card_balance.csv, into memory...
[32m2024-08-06 22:36:03.884[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframe[0m:[36m48[0m - Loaded credit_card_balance.csv
[32m2024-08-06 22:36:03.886[0m | [1mINFO  

## application_train and application_test

In [None]:
# Preprocess the application tables; main() returns a (train, test) pair.
# Two directories are supplied: file_directory1 gets CLEANED_DATA_DIR and
# file_directory2 gets DIR.
# NOTE(review): which table is read from which directory is decided inside
# preprocess_application_train_test -- confirm against src.datasets.
application_train, application_test = preprocess_application_train_test(file_directory1=CLEANED_DATA_DIR, file_directory2=DIR).main()

[32m2024-08-06 22:36:07.141[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframes[0m:[36m40[0m - #######################################################
[32m2024-08-06 22:36:07.143[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframes[0m:[36m41[0m - #        Pre-processing application_train.csv         #
[32m2024-08-06 22:36:07.149[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframes[0m:[36m42[0m - #        Pre-processing application_test.csv          #
[32m2024-08-06 22:36:07.151[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframes[0m:[36m43[0m - #######################################################
[32m2024-08-06 22:36:07.153[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_dataframes[0m:[36m44[0m - 
Loading the DataFrames into memory...
[32m2024-08-06 22:36:14.047[0m | [1mINFO    [0m | [36m__main__[0m:[36mreduce_memory_usage[0m:[36m24[0m - Memory usage of dataframe is 286.23 MB
[32m2024-08-06 22:36:14.751

In [None]:
# Combine the application tables with the five aggregated auxiliary tables into
# the final train / test frames. Arguments are positional, so their order must
# match merge_all_tables' signature.
# NOTE(review): join keys and join type are defined inside merge_all_tables
# (not visible in this notebook) -- confirm there.
train_data, test_data = merge_all_tables(application_train, application_test, bureau_aggregated, previous_aggregated,
                    installments_aggregated, pos_aggregated, cc_aggregated)

[32m2024-08-06 22:48:55.333[0m | [1mINFO    [0m | [36m__main__[0m:[36mreduce_memory_usage[0m:[36m24[0m - Memory usage of dataframe is 650.17 MB
[32m2024-08-06 22:48:58.283[0m | [1mINFO    [0m | [36m__main__[0m:[36mreduce_memory_usage[0m:[36m55[0m - Memory usage after optimization is: 295.03 MB
[32m2024-08-06 22:48:58.286[0m | [1mINFO    [0m | [36m__main__[0m:[36mreduce_memory_usage[0m:[36m56[0m - Decreased by 54.6%
[32m2024-08-06 22:48:58.316[0m | [1mINFO    [0m | [36m__main__[0m:[36mreduce_memory_usage[0m:[36m24[0m - Memory usage of dataframe is 103.29 MB
[32m2024-08-06 22:48:59.116[0m | [1mINFO    [0m | [36m__main__[0m:[36mreduce_memory_usage[0m:[36m55[0m - Memory usage after optimization is: 47.00 MB
[32m2024-08-06 22:48:59.118[0m | [1mINFO    [0m | [36m__main__[0m:[36mreduce_memory_usage[0m:[36m56[0m - Decreased by 54.5%


In [None]:
# Persist the merged train / test tables as CSV files under DIR.
# os.path.join is used instead of `DIR + '<name>'` so the output path is correct
# whether or not DIR ends with a path separator (plain concatenation would
# otherwise write e.g. '<dir>train_data_final.csv' next to the directory).
# The DataFrame index is still written as the first column (pandas default),
# so the file layout is unchanged for downstream readers.
train_data.to_csv(os.path.join(DIR, 'train_data_final.csv'))
test_data.to_csv(os.path.join(DIR, 'test_data_final.csv'))