In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

import functions

In [2]:
# Re-import the local `functions` module so that edits made to functions.py
# after the initial import are picked up without restarting the kernel.
import importlib

importlib.reload(functions)

<module 'functions' from 'c:\\Users\\balsevt\\python\\m3s4\\functions.py'>

In [13]:
# Raw input tables are read from parquet files under ./data.
# Paths are built with pathlib so the separator is chosen by the OS; a literal
# "data\..." string is only interpreted as "directory\file" on Windows.
DATA_DIR = Path("data")

df_app = pd.read_parquet(DATA_DIR / "application_train.parquet", engine="pyarrow")
df_bur = pd.read_parquet(DATA_DIR / "bureau.parquet", engine="pyarrow")
df_bur_bal = pd.read_parquet(DATA_DIR / "bureau_balance.parquet", engine="pyarrow")
df_prev_app = pd.read_parquet(DATA_DIR / "previous_application.parquet", engine="pyarrow")
df_cash = pd.read_parquet(DATA_DIR / "POS_CASH_balance.parquet", engine="pyarrow")
df_inst = pd.read_parquet(DATA_DIR / "installments_payments.parquet", engine="pyarrow")
df_cred = pd.read_parquet(DATA_DIR / "credit_card_balance.parquet", engine="pyarrow")

In [4]:
# Hold out 5% of the application rows as the test partition. The rows are
# shuffled before splitting and the seed is fixed so the split is repeatable.
df_app_train, df_app_test = train_test_split(
    df_app,
    test_size=0.05,
    random_state=42,
    shuffle=True,
)

### Handle Train Split on Other Tables

In [5]:
# SK_ID_CURR values of the applications that ended up in the train partition.
curr_ids = df_app_train["SK_ID_CURR"]

# Bureau rows are kept when their SK_ID_CURR is a train ID; bureau_balance
# rows are then kept when their SK_ID_BUREAU appears in the retained bureau rows.
df_bur_train = df_bur.loc[df_bur["SK_ID_CURR"].isin(curr_ids)]
bureau_ids = df_bur_train["SK_ID_BUREAU"]
df_bur_bal_train = df_bur_bal.loc[df_bur_bal["SK_ID_BUREAU"].isin(bureau_ids)]

# Route 1 ("_curr"): select cash / installments / credit-card rows directly
# by SK_ID_CURR.
df_cash_curr = df_cash.loc[df_cash["SK_ID_CURR"].isin(curr_ids)]
df_inst_curr = df_inst.loc[df_inst["SK_ID_CURR"].isin(curr_ids)]
df_cred_curr = df_cred.loc[df_cred["SK_ID_CURR"].isin(curr_ids)]

# Route 2 ("_prev"): first restrict previous applications to train IDs, then
# select cash / installments / credit-card rows by the resulting SK_ID_PREV.
df_prev_app_train = df_prev_app.loc[df_prev_app["SK_ID_CURR"].isin(curr_ids)]
prev_ids = df_prev_app_train["SK_ID_PREV"]
df_cash_prev = df_cash.loc[df_cash["SK_ID_PREV"].isin(prev_ids)]
df_inst_prev = df_inst.loc[df_inst["SK_ID_PREV"].isin(prev_ids)]
df_cred_prev = df_cred.loc[df_cred["SK_ID_PREV"].isin(prev_ids)]

In [6]:
# Compare the (rows, columns) of the cash table selected by SK_ID_CURR
# against the one selected by SK_ID_PREV.
shape_direct = df_cash_curr.shape
shape_indirect = df_cash_prev.shape
print("Shape of cash table when handled directly:", shape_direct)
print("Shape of cash table when handled indirectly:", shape_indirect)

Shape of cash table when handled directly: (8117603, 8)
Shape of cash table when handled indirectly: (7841360, 8)


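Filtering indirectly (through the `SK_ID_PREV` values found in the prev_app table) yields fewer rows than filtering directly on `SK_ID_CURR`, which implies that some previous applications referenced by the cash table are missing from the prev_app table.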

We can also check if the "_curr" tables are supersets of the "_prev" tables.

In [7]:
# Key columns handed to the superset check.
cols = ["SK_ID_CURR", "SK_ID_PREV"]

# One entry per table family: (directly filtered, indirectly filtered, labels).
superset_checks = [
    (df_cash_curr, df_cash_prev, "cash_curr", "cash_prev"),
    (df_inst_curr, df_inst_prev, "inst_curr", "inst_prev"),
    (df_cred_curr, df_cred_prev, "cred_curr", "cred_prev"),
]

# check_if_superset lives in the local `functions` module; it reports whether
# the first table is a superset of the second.
# NOTE(review): presumably compared on `cols` only - confirm in functions.py.
for curr_table, prev_table, curr_label, prev_label in superset_checks:
    functions.check_if_superset(curr_table, prev_table, cols, curr_label, prev_label)

cash_curr table is a superset of cash_prev
inst_curr table is a superset of inst_prev
cred_curr table is a superset of cred_prev


Since every "_curr" table is a superset of its "_prev" counterpart, we keep only the "_curr" tables as the train tables and discard the "_prev" tables.

In [8]:
# The directly filtered ("_curr") tables become the train tables. Each one is
# copied so the train frames are independent objects.
df_cash_train, df_inst_train, df_cred_train = (
    table.copy() for table in (df_cash_curr, df_inst_curr, df_cred_curr)
)

### Handle Test Split on Other Tables

In [9]:
# SK_ID_CURR values of the applications that ended up in the test partition.
curr_ids = df_app_test["SK_ID_CURR"]

# Bureau rows are kept when their SK_ID_CURR is a test ID; bureau_balance rows
# are then kept when their SK_ID_BUREAU appears in the retained bureau rows.
df_bur_test = df_bur.loc[df_bur["SK_ID_CURR"].isin(curr_ids)]
bureau_ids = df_bur_test["SK_ID_BUREAU"]
df_bur_bal_test = df_bur_bal.loc[df_bur_bal["SK_ID_BUREAU"].isin(bureau_ids)]

# Cash / installments / credit-card rows are selected directly by SK_ID_CURR.
df_cash_test = df_cash.loc[df_cash["SK_ID_CURR"].isin(curr_ids)]
df_inst_test = df_inst.loc[df_inst["SK_ID_CURR"].isin(curr_ids)]
df_cred_test = df_cred.loc[df_cred["SK_ID_CURR"].isin(curr_ids)]

# Previous applications belonging to the test IDs.
df_prev_app_test = df_prev_app.loc[df_prev_app["SK_ID_CURR"].isin(curr_ids)]

### Save as Parquet

In [11]:
# Write every table of both splits to data/<split>/<name>.parquet.
# Paths are built with pathlib so the separator is chosen by the OS; a literal
# "data\train\..." string is only interpreted as nested directories on Windows.
OUTPUT_ROOT = Path("data")

# split name -> {output file stem -> frame to write}
split_tables = {
    "train": {
        "df_app": df_app_train,
        "df_bur": df_bur_train,
        "df_bur_bal": df_bur_bal_train,
        "df_prev_app": df_prev_app_train,
        "df_cash": df_cash_train,
        "df_inst": df_inst_train,
        "df_cred": df_cred_train,
    },
    "test": {
        "df_app": df_app_test,
        "df_bur": df_bur_test,
        "df_bur_bal": df_bur_bal_test,
        "df_prev_app": df_prev_app_test,
        "df_cash": df_cash_test,
        "df_inst": df_inst_test,
        "df_cred": df_cred_test,
    },
}

for split_name, tables in split_tables.items():
    split_dir = OUTPUT_ROOT / split_name
    # Writing fails if the target folder is missing, so create it up front
    # (a no-op when the folder already exists).
    split_dir.mkdir(parents=True, exist_ok=True)
    for stem, table in tables.items():
        table.to_parquet(split_dir / f"{stem}.parquet", engine="pyarrow")