In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import warnings
import time

from matplotlib import pyplot as plt
from dotenv import load_dotenv

sys.path.append("../")

from scripts.preprocessing import preprocess_application, preprocess_bureau, preprocess_credit_card
from scripts.preprocessing import preprocess_installments, preprocess_pos, preprocess_previous_app
from scripts.preprocessing import prefixer

load_dotenv()

# Plot appearance. The matplotlib style sheet is applied first because it ships
# its own colour cycle; the seaborn palette is installed afterwards so that
# 'colorblind' ends up as the active default.
# sns.color_palette() only *returns* a palette (its result was discarded
# before), whereas sns.set_palette() actually installs it.
plt.style.use('Solarize_Light2')
sns.set_palette('colorblind')

# Default figure DPI: taken from the DPI environment variable when it is set to
# a valid integer, otherwise 100.

try:
    pc_dpi = int(os.getenv('DPI'))
except (TypeError, ValueError):
    # TypeError  -> DPI is not set, so os.getenv() returned None.
    # ValueError -> DPI is set but is not an integer literal (e.g. "" or "high").
    pc_dpi = 100


In [2]:
# Read the raw application tables (train and test splits) from disk.

application_train = "../data/application_train.csv"
application_test = "../data/application_test.csv"

df_train, df_test = (
    pd.read_csv(filepath_or_buffer=csv_path)
    for csv_path in (application_train, application_test)
)


In [3]:
# Stack the train and test rows into a single frame with a fresh 0..n-1 index.
df_application = pd.concat([df_train, df_test], ignore_index=True)

# Sanity check: the concatenation must neither drop nor duplicate rows.
assert len(df_application) == len(df_train) + len(df_test)

# no output = ok

df_application = df_application.copy()  # defrag


In [4]:
# Run the application-table preprocessing with all warnings silenced for the
# duration of the call; the previous warning filters are restored on exit.
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore')
    df_application = preprocess_application(dataframe=df_application)


In [5]:
# Rebind the name to a fresh deep copy of the preprocessed frame
# (intended as a "defrag" of the frame after preprocessing).

df_application = df_application.copy(deep=True)


In [6]:
# Relative paths of the auxiliary CSV tables consumed by the preprocessing step.
previous_app_csv = "../data/previous_application.csv"
installments_csv = "../data/installments_payments.csv"
pos_csv = "../data/POS_CASH_balance.csv"
credit_card_csv = "../data/credit_card_balance.csv"
bureau_csv = "../data/bureau.csv"


In [7]:
# Preprocess every auxiliary table straight from its CSV and time the batch.
t_z = time.perf_counter()  # start of the timed section

with warnings.catch_warnings():
    # Silence warnings only while the preprocessing helpers run.
    warnings.simplefilter(action='ignore')
    df_bureau = preprocess_bureau(bureau_path=bureau_csv)
    df_credit_card = preprocess_credit_card(credit_card_path=credit_card_csv)
    df_pos = preprocess_pos(pos_path=pos_csv)
    df_installments = preprocess_installments(installments_path=installments_csv)
    df_prev_app = preprocess_previous_app(previous_app_path=previous_app_csv)

t_f = time.perf_counter()  # end of the timed section

# Wall-clock duration of the five preprocessing calls.
print(f"Executed in {t_f - t_z} seconds")


Executed in 35.63497600000119 seconds


In [8]:
# Bureau features -> application frame.
# prefixer's return value is not used, so it presumably renames the columns of
# df_bureau in place (every column except the join key) — verify in
# scripts.preprocessing.
prefixer(dataframe=df_bureau, prefix="BUR", ignore=["SK_ID_CURR"])

# Left join: every application row is kept, matched through SK_ID_CURR.
df_application = df_application.join(other=df_bureau, on="SK_ID_CURR", how="left")


In [9]:
# Credit-card features -> application frame.
# prefixer's return value is not used, so it presumably renames the columns of
# df_credit_card in place (every column except the join key) — verify in
# scripts.preprocessing.
prefixer(dataframe=df_credit_card, prefix="CARD", ignore=["SK_ID_CURR"])

# Left join: every application row is kept, matched through SK_ID_CURR.
df_application = df_application.join(other=df_credit_card, on="SK_ID_CURR", how="left")


In [10]:
# POS features -> application frame.
# prefixer's return value is not used, so it presumably renames the columns of
# df_pos in place (every column except the join key) — verify in
# scripts.preprocessing.
prefixer(dataframe=df_pos, prefix="POS", ignore=["SK_ID_CURR"])

# Left join: every application row is kept, matched through SK_ID_CURR.
df_application = df_application.join(other=df_pos, on="SK_ID_CURR", how="left")


In [11]:
# Installment features -> application frame.
# prefixer's return value is not used, so it presumably renames the columns of
# df_installments in place (every column except the join key) — verify in
# scripts.preprocessing.
prefixer(dataframe=df_installments, prefix="INST", ignore=["SK_ID_CURR"])

# Left join: every application row is kept, matched through SK_ID_CURR.
df_application = df_application.join(other=df_installments, on="SK_ID_CURR", how="left")


In [12]:
# Previous-application features -> application frame.
# prefixer's return value is not used, so it presumably renames the columns of
# df_prev_app in place (every column except the join key) — verify in
# scripts.preprocessing.
prefixer(dataframe=df_prev_app, prefix="PREV", ignore=["SK_ID_CURR"])

# Left join: every application row is kept, matched through SK_ID_CURR.
df_application = df_application.join(other=df_prev_app, on="SK_ID_CURR", how="left")


In [30]:
# Load the exported dataset from CSV and display it as the cell output.
# NOTE(review): no cell visible here writes this file and the execution count
# jumps from 12 to 30 — confirm the export step still exists so that
# Restart & Run All reproduces this output.
pd.read_csv(filepath_or_buffer="../data/home_credit_data.csv")

Unnamed: 0,SK_ID_CURR,TARGET,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,PREV_PRODUCT_COMBINATION_POS household with interest_mean,PREV_PRODUCT_COMBINATION_POS household without interest_mean,PREV_PRODUCT_COMBINATION_POS industry with interest_mean,PREV_PRODUCT_COMBINATION_POS industry without interest_mean,PREV_PRODUCT_COMBINATION_POS mobile with interest_mean,PREV_PRODUCT_COMBINATION_POS mobile without interest_mean,PREV_PRODUCT_COMBINATION_POS other with interest_mean,PREV_PRODUCT_COMBINATION_POS others without interest_mean,PREV_PRODUCT_COMBINATION_nan_mean,PREV_number_applications
0,100002,0,0,0,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0.000000,0.00,0.000000,0.0,0.000000,0.0,1.0,0.0,0.0,1.0
1,100003,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,0.333333,0.00,0.333333,0.0,0.000000,0.0,0.0,0.0,0.0,3.0
2,100004,1,1,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0.000000,0.00,0.000000,0.0,0.000000,1.0,0.0,0.0,0.0,1.0
3,100006,1,0,0,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0.111111,0.00,0.111111,0.0,0.000000,0.0,0.0,0.0,0.0,9.0
4,100007,1,0,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,0.166667,0.00,0.000000,0.0,0.166667,0.0,0.0,0.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356250,456221,-1,0,0,0,121500.0,412560.0,17473.5,270000.0,0.002042,...,0.000000,0.00,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,1.0
356251,456222,-1,0,1,2,157500.0,622413.0,31909.5,495000.0,0.035792,...,0.250000,0.25,0.000000,0.0,0.250000,0.0,0.0,0.0,0.0,4.0
356252,456223,-1,1,0,1,202500.0,315000.0,33205.5,315000.0,0.026392,...,0.000000,0.00,0.500000,0.0,0.000000,0.5,0.0,0.0,0.0,2.0
356253,456224,-1,0,1,0,225000.0,450000.0,25128.0,450000.0,0.018850,...,0.000000,0.00,0.200000,0.0,0.400000,0.0,0.0,0.0,0.0,5.0


In [31]:
# Load the pickled dataset and display it as the cell output.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# were produced by this project.
pd.read_pickle("../data/home_credit_data.pkl")

Unnamed: 0,SK_ID_CURR,TARGET,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,PREV_PRODUCT_COMBINATION_POS household with interest_mean,PREV_PRODUCT_COMBINATION_POS household without interest_mean,PREV_PRODUCT_COMBINATION_POS industry with interest_mean,PREV_PRODUCT_COMBINATION_POS industry without interest_mean,PREV_PRODUCT_COMBINATION_POS mobile with interest_mean,PREV_PRODUCT_COMBINATION_POS mobile without interest_mean,PREV_PRODUCT_COMBINATION_POS other with interest_mean,PREV_PRODUCT_COMBINATION_POS others without interest_mean,PREV_PRODUCT_COMBINATION_nan_mean,PREV_number_applications
0,100002,0,0,0,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0.000000,0.00,0.000000,0.0,0.000000,0.0,1.0,0.0,0.0,1.0
1,100003,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,0.333333,0.00,0.333333,0.0,0.000000,0.0,0.0,0.0,0.0,3.0
2,100004,1,1,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0.000000,0.00,0.000000,0.0,0.000000,1.0,0.0,0.0,0.0,1.0
3,100006,1,0,0,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0.111111,0.00,0.111111,0.0,0.000000,0.0,0.0,0.0,0.0,9.0
4,100007,1,0,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,0.166667,0.00,0.000000,0.0,0.166667,0.0,0.0,0.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356250,456221,-1,0,0,0,121500.0,412560.0,17473.5,270000.0,0.002042,...,0.000000,0.00,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,1.0
356251,456222,-1,0,1,2,157500.0,622413.0,31909.5,495000.0,0.035792,...,0.250000,0.25,0.000000,0.0,0.250000,0.0,0.0,0.0,0.0,4.0
356252,456223,-1,1,0,1,202500.0,315000.0,33205.5,315000.0,0.026392,...,0.000000,0.00,0.500000,0.0,0.000000,0.5,0.0,0.0,0.0,2.0
356253,456224,-1,0,1,0,225000.0,450000.0,25128.0,450000.0,0.018850,...,0.000000,0.00,0.200000,0.0,0.400000,0.0,0.0,0.0,0.0,5.0
