In [34]:
import pandas as pd
import os

### Application

In [8]:
df = pd.read_csv(r"data\application_test.csv")
df.shape

(48744, 121)

In [35]:
def csv_to_parquet(path: str) -> tuple[str, str]:
    """Transforms csv to parquet, returns read dataframe.

    Parameters
    ----------
    path : str
        Relative path to the csv file.
    """
    path_parquet = os.path.splitext(path)[0] + ".parquet"
    df = pd.read_csv(path, engine="pyarrow")
    df.to_parquet(path_parquet, engine="pyarrow")
    return df

In [36]:
path = r"data\application_train.csv"
df = csv_to_parquet(path)


df.shape

(307511, 122)

In [None]:
df.duplicated().sum()

In [None]:
df["SK_ID_CURR"].duplicated().sum()

### Bureau

In [37]:
path = r"data\bureau_balance.csv"
df = csv_to_parquet(path)

df.shape

(27299925, 3)

In [11]:
df

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C
...,...,...,...
27299920,5041336,-47,X
27299921,5041336,-48,X
27299922,5041336,-49,X
27299923,5041336,-50,X


In [38]:
path = r"data\bureau.csv"
df = csv_to_parquet(path)

df.shape

(1716428, 17)

In [13]:
df.iloc[:5, :7]

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE
0,215354,5714462,Closed,currency 1,-497,0,-153.0
1,215354,5714463,Active,currency 1,-208,0,1075.0
2,215354,5714464,Active,currency 1,-203,0,528.0
3,215354,5714465,Active,currency 1,-203,0,
4,215354,5714466,Active,currency 1,-629,0,1197.0


In [14]:
df.iloc[:5, 8:14]

Unnamed: 0,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE
0,,0,91323.0,0.0,,0.0
1,,0,225000.0,171342.0,,0.0
2,,0,464323.5,,,0.0
3,,0,90000.0,,,0.0
4,77674.5,0,2700000.0,,,0.0


In [15]:
df.iloc[:5, 15:20]

Unnamed: 0,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,-131,
1,-20,
2,-16,
3,-16,
4,-21,


In [16]:
df.iloc[:5, 21:]

0
1
2
3
4


### Credit card

In [39]:
path = r"data\credit_card_balance.csv"
df = csv_to_parquet(path)

df.shape

(3840312, 23)

In [40]:
df.iloc[:5, :7]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT
0,2562384,378907,-6,56.97,135000,0.0,877.5
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0
2,1740877,371185,-7,31815.225,450000,0.0,0.0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0


In [41]:
df.iloc[:5, 8:13]

Unnamed: 0,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL
0,877.5,1700.325,1800.0,1800.0,0.0
1,0.0,2250.0,2250.0,2250.0,60175.08
2,0.0,2250.0,2250.0,2250.0,26926.425
3,0.0,11795.76,11925.0,11925.0,224949.285
4,11547.0,22924.89,27000.0,27000.0,443044.395


In [20]:
df.iloc[:5, 14:19]

Unnamed: 0,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT
0,0.0,0.0,1,0.0,1.0
1,64875.555,1.0,1,0.0,0.0
2,31460.085,0.0,0,0.0,0.0
3,233048.97,1.0,1,0.0,0.0
4,453919.455,0.0,1,0.0,1.0


In [21]:
df.iloc[:5, 20:]

Unnamed: 0,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,Active,0,0
1,Active,0,0
2,Active,0,0
3,Active,0,0
4,Active,0,0


### Installments

In [42]:
path = r"data\installments_payments.csv"
df = csv_to_parquet(path)

df.shape

(13605401, 8)

In [43]:
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


### Balance

In [44]:
path = r"data\POS_CASH_balance.csv"
df = csv_to_parquet(path)

df.shape

(10001358, 8)

In [25]:
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


### Previous Application

In [45]:
path = r"data\previous_application.csv"
df = csv_to_parquet(path)

df.shape

(1670214, 37)

### Sample Submission

In [27]:
df = pd.read_csv(r"data\sample_submission.csv")
df.shape

(48744, 2)

In [28]:
df

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.5
1,100005,0.5
2,100013,0.5
3,100028,0.5
4,100038,0.5
...,...,...
48739,456221,0.5
48740,456222,0.5
48741,456223,0.5
48742,456224,0.5


This table together with application_test.csv implies that the acquired dataset was used for a competition. Therefore, we should disregard these two tables.