In [16]:
import numpy as np
import pandas as pd
import polars as pl
import os
import gc
from glob import glob
from pathlib import Path
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
import joblib
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
import tqdm

In [18]:
class Data_Pipeline:
    def data_dtypes_adj(df): # set_table_dtypes
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.Utf8))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date))
                
        return df

    def date_adj(df): #  handle_dates
        for col in df.columns:
            if col[-1] in ("D"):
                df = df.with_columns(pl.col(col) - pl.col("date_decision"))
                df = df.with_columns(pl.col(col).dt.total_days())

        df = df.drop("date_decision", "MONTH")

        return df

    def col_filter(df): # filter_cols
        for col in df.columns:
            if col not in ["target", "case_id", "WEEK_NUM"]:
                if df[col].dtype != pl.String:
                    isnull = df[col].is_null().mean()
                    if isnull > 0.90:
                        df = df.drop(col)
                else:
                    freq = df[col].n_unique()
                    if (freq == 1) | (freq > 200):
                        df = df.drop(col)
        return df
    
    def reduce_mem_usage(df, int_cast=True, obj_to_category=False, subset=None):
        start_mem = df.memory_usage().sum() / 1024 ** 2;
        gc.collect()
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

        cols = subset if subset is not None else df.columns.tolist()

        for col in cols:
            col_type = df[col].dtype

            if col_type != object and col_type.name != 'category' and 'datetime' not in col_type.name:
                df[col] = df[col].fillna(-888)
                c_min = df[col].min()
                c_max = df[col].max()
                treat_as_int = True
                if treat_as_int:
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                        df[col] = df[col].astype(np.uint8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                        df[col] = df[col].astype(np.uint16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                        df[col] = df[col].astype(np.uint32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                    elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                        df[col] = df[col].astype(np.uint64)
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
            elif 'datetime' not in col_type.name and obj_to_category:
                df[col] = df[col].fillna('Mis')
                df[col] = df[col].astype('category')
        gc.collect()
        end_mem = df.memory_usage().sum() / 1024 ** 2
        print('Memory usage after optimization is: {:.3f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

        return df


In [11]:
class Aggregator:
    def num_expr(df):
        cols = [col for col in df.columns if col[-1] in ("P", "A")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return expr_max

    def date_expr(df):
        cols = [col for col in df.columns if col[-1] in ("D",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return expr_max

    def str_expr(df):
        cols = [col for col in df.columns if col[-1] in ("M",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return expr_max

    def other_expr(df):
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return expr_max

    def count_expr(df):
        cols = [col for col in df.columns if "num_group" in col]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return expr_max

    def get_exprs(df):
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)

        return exprs

In [12]:
def feature_eng(df_base, depth_0, depth_1, depth_2):
    df_base = (
        df_base
        .with_columns(
            month_decision = pl.col("date_decision").dt.month(),
            weekday_decision = pl.col("date_decision").dt.weekday(),
        )
    )

    for i, df in enumerate(depth_0 + depth_1 + depth_2):
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")

    df_base = df_base.pipe(Data_Pipeline.date_adj)

    return df_base

In [13]:
def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas()

    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)

    df_data[cat_cols] = df_data[cat_cols].astype("category")

    return df_data, cat_cols

In [19]:
def read_file(path, depth=None):
    df = pd.read_parquet(path)
    df = df.pipe(Data_Pipeline.reduce_mem_usage)
    df = pl.from_pandas(df)
    df = df.pipe(Data_Pipeline.data_dtypes_adj)
    if depth in [1, 2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df))

    return df

def read_files(regex_path, depth=None):
    chunks = []
    for path in glob(str(regex_path)):
        chunks.append(pl.from_pandas(pd.read_parquet(path).pipe(Data_Pipeline.reduce_mem_usage)).pipe(Data_Pipeline.data_dtypes_adj))

    df = pl.concat(chunks, how="vertical_relaxed")
    if depth in [1, 2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df))

    return df

In [31]:
data_store_train = {
    "df_base": read_file("train_base.parquet"),
    "depth_0": [
        read_file("train_static_cb_0.parquet"),
        read_files("train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files("train_applprev_1_*.parquet", 1),
        read_file("train_tax_registry_a_1.parquet", 1),
        read_file("train_tax_registry_b_1.parquet", 1),
        read_file("train_tax_registry_c_1.parquet", 1),
        read_file("train_credit_bureau_b_1.parquet", 1),
        read_files("train_credit_bureau_a_1_*.parquet", 1),
        read_file("train_other_1.parquet", 1),
        read_file("train_person_1.parquet", 1),
        read_file("train_deposit_1.parquet", 1),
        read_file("train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file("train_credit_bureau_b_2.parquet", 2),
        read_file('train_person_2.parquet', 2),
        read_files('train_credit_bureau_a_2_*.parquet', 2),
        read_file('train_applprev_2.parquet', 2)
    ]
}

data_store_test = {
    "df_base": read_file("test_base.parquet"),
    "depth_0": [
        read_file("test_static_cb_0.parquet"),
        read_files("test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files("test_applprev_1_*.parquet", 1),
        read_file("test_tax_registry_a_1.parquet", 1),
        read_file("test_tax_registry_b_1.parquet", 1),
        read_file("test_tax_registry_c_1.parquet", 1),
        read_file("test_credit_bureau_b_1.parquet", 1),
        read_files("test_credit_bureau_a_1_*.parquet", 1),
        read_file("test_other_1.parquet", 1),
        read_file("test_person_1.parquet", 1),
        read_file("test_deposit_1.parquet", 1),
        read_file("test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file("test_credit_bureau_b_2.parquet", 2),
        read_file('test_person_2.parquet', 2),
        read_files('test_credit_bureau_a_2_*.parquet', 2),
        read_file('test_applprev_2.parquet', 2)
    ]
}

Memory usage of dataframe is 58.24 MB
Memory usage after optimization is: 26.207 MB
Decreased by 55.0%
Memory usage of dataframe is 606.73 MB
Memory usage after optimization is: 306.227 MB
Decreased by 49.5%
Memory usage of dataframe is 1279.85 MB
Memory usage after optimization is: 572.440 MB
Decreased by 55.3%
Memory usage of dataframe is 666.73 MB
Memory usage after optimization is: 300.703 MB
Decreased by 54.9%
Memory usage of dataframe is 1216.09 MB
Memory usage after optimization is: 863.867 MB
Decreased by 29.0%
Memory usage of dataframe is 825.27 MB
Memory usage after optimization is: 586.245 MB
Decreased by 29.0%
Memory usage of dataframe is 124.96 MB
Memory usage after optimization is: 78.101 MB
Decreased by 37.5%
Memory usage of dataframe is 42.26 MB
Memory usage after optimization is: 26.415 MB
Decreased by 37.5%
Memory usage of dataframe is 127.56 MB
Memory usage after optimization is: 79.723 MB
Decreased by 37.5%
Memory usage of dataframe is 29.45 MB
Memory usage after op

In [32]:
df_train = feature_eng(**data_store_train)
print("train data shape:\t", df_train.shape)
del data_store_train
gc.collect()

train data shape:	 (1526659, 487)


1635

In [34]:
df_test = feature_eng(**data_store_test)
print("test data shape:\t", df_test.shape)
del data_store_test
gc.collect()

test data shape:	 (10, 486)


In [37]:
df_train = df_train.pipe(Data_Pipeline.col_filter)
df_test = df_test.select([col for col in df_train.columns if col != "target"])

print("train data shape:\t", df_train.shape)
print("test data shape:\t", df_test.shape)

train data shape:	 (1526659, 403)
test data shape:	 (10, 402)


In [38]:
df_train, cat_cols = to_pandas(df_train)
df_test, cat_cols = to_pandas(df_test, cat_cols)