In [1]:
# %%capture
!python /kaggle/usr/lib/pipeline1_final/pipeline1_final.py

train data shape:	 (1526659, 861)
389
['max_collater_typofvalofguarant_298M', 'last_familystate_726L', 'max_financialinstitution_382M', 'max_incometype_1044T', 'max_remitter_829L', 'paytype_783L', 'max_subjectrole_93M', 'last_purposeofcred_874M', 'opencred_647L', 'last_classificationofcontr_13M', 'last_language1_981M', 'last_contaddr_smempladdr_334L', 'last_education_927M', 'last_subjectrole_93M', 'max_empladdr_district_926M', 'max_rejectreason_755M', 'last_role_1084L', 'last_collaterals_typeofguarante_359M', 'max_conts_type_509L', 'max_cancelreason_3545846M', 'max_conts_role_79M', 'max_inittransactioncode_279L', 'max_classificationofcontr_13M', 'last_relationshiptoclient_642T', 'max_description_351M', 'last_sex_738L', 'last_collater_typofvalofguarant_407M', 'last_subjectroles_name_541M', 'max_education_1138M', 'last_purposeofcred_426M', 'maritalst_385M', 'last_cancelreason_3545846M', 'max_purposeofcred_426M', 'last_collater_typofvalofguarant_298M', 'max_contaddr_matchlist_1032L', 'las

In [2]:
import gc
import lightgbm as lgb  # type: ignore
import numpy as np  # type: ignore
import pandas as pd  # type: ignore
import polars as pl  # type: ignore
import warnings

from catboost import CatBoostClassifier, Pool  # type: ignore
from glob import glob
from IPython.display import display  # type: ignore
from pathlib import Path
from sklearn.base import BaseEstimator, ClassifierMixin  # type: ignore
from sklearn.metrics import roc_auc_score  # type: ignore
from sklearn.model_selection import StratifiedGroupKFold  # type: ignore
from typing import Any
import joblib
from sklearn.decomposition import PCA
# Silence every warning for the rest of the session (this also hides library
# deprecation notices, so upgrade problems will not be visible in the output).
warnings.filterwarnings("ignore")

# Dataset root and the train / test parquet directories beneath it.
ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR = ROOT / "parquet_files" / "train"
TEST_DIR = ROOT / "parquet_files" / "test"

In [3]:
class Utility:
    """Stateless helpers for inspecting feature metadata and shrinking DataFrames."""

    @staticmethod
    def get_feat_defs(ending_with: str) -> None:
        """
        Prints the feature definitions whose variable name ends with a suffix.

        Args:
        - ending_with (str): Suffix used to filter the "Variable" column.

        Returns:
        - None: The filtered definitions are printed, not returned.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")

        # Native string predicate instead of a per-row Python lambda.
        filtered_feats: pl.DataFrame = feat_defs.filter(
            pl.col("Variable").str.ends_with(ending_with)
        )

        # Widen the display limits so long descriptions and all rows are shown.
        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)

    @staticmethod
    def find_index(lst: list[Any], item: Any) -> int | None:
        """
        Finds the index of an item in a list.

        Args:
        - lst (list): List to search.
        - item (Any): Item to find in the list.

        Returns:
        - int | None: Index of the first occurrence if found, otherwise None.
        """
        try:
            return lst.index(item)
        except ValueError:
            return None

    @staticmethod
    def dtype_to_str(dtype: pl.DataType) -> str:
        """
        Converts Polars data type to string representation.

        Args:
        - dtype (pl.DataType): Polars data type (class or instance).

        Returns:
        - str: Name of the data type. Types missing from the lookup table fall
          back to their base type's name and finally to ``str(dtype)``, so the
          result is never None.
        """
        dtype_map = {
            pl.Decimal: "Decimal",
            pl.Float32: "Float32",
            pl.Float64: "Float64",
            pl.UInt8: "UInt8",
            pl.UInt16: "UInt16",
            pl.UInt32: "UInt32",
            pl.UInt64: "UInt64",
            pl.Int8: "Int8",
            pl.Int16: "Int16",
            pl.Int32: "Int32",
            pl.Int64: "Int64",
            pl.Date: "Date",
            pl.Datetime: "Datetime",
            pl.Duration: "Duration",
            pl.Time: "Time",
            pl.Array: "Array",
            pl.List: "List",
            pl.Struct: "Struct",
            pl.String: "String",
            pl.Categorical: "Categorical",
            pl.Enum: "Enum",
            pl.Utf8: "Utf8",
            pl.Binary: "Binary",
            pl.Boolean: "Boolean",
            pl.Null: "Null",
            pl.Object: "Object",
            pl.Unknown: "Unknown",
        }

        name = dtype_map.get(dtype)

        if name is None:
            # NOTE(review): parametrised dtype instances (e.g. a Datetime with a
            # time unit) may not hash like their class, so retry with the base
            # type when the dtype exposes one - confirm against the polars version.
            base_type = getattr(dtype, "base_type", None)
            if callable(base_type):
                name = dtype_map.get(base_type())

        return name if name is not None else str(dtype)

    @staticmethod
    def find_feat_occur(regex_path: str, ending_with: str) -> pl.DataFrame:
        """
        Finds occurrences of features ending with a specific string in Parquet files.

        Args:
        - regex_path (str): Glob pattern (despite the name) matching Parquet file paths.
        - ending_with (str): Ending to filter feature names.

        Returns:
        - pl.DataFrame: Feature definitions sorted by "Variable", with two extra
          list columns: "Data_Type(s)" and "File_Loc(s)".
        """
        # sort() returns a new frame; it must be assigned so that the rows line
        # up with the sorted `feats` list used to index `occurrences` below.
        feat_defs: pl.DataFrame = (
            pl.read_csv(ROOT / "feature_definitions.csv")
            .filter(pl.col("Variable").str.ends_with(ending_with))
            .sort(by=["Variable"])
        )

        feats: list[str] = feat_defs["Variable"].to_list()

        # Map each feature to its first row position (same result as list.index).
        feat_index: dict[str, int] = {}
        for i, feat in enumerate(feats):
            feat_index.setdefault(feat, i)

        # Per feature: [set of dtype names, set of file stems].
        occurrences: list[list[set]] = [[set(), set()] for _ in range(len(feats))]

        for path in glob(str(regex_path)):
            df_schema: dict = pl.read_parquet_schema(path)
            file_stem = Path(path).stem

            for feat, dtype in df_schema.items():
                index = feat_index.get(feat)
                if index is not None:
                    occurrences[index][0].add(Utility.dtype_to_str(dtype))
                    occurrences[index][1].add(file_stem)

        # Sorted lists give a reproducible output regardless of set ordering.
        data_types: list[list[str]] = [sorted(types) for types, _ in occurrences]
        file_locs: list[list[str]] = [sorted(files) for _, files in occurrences]

        # An explicit dtype keeps the columns List(String) even when every list is empty.
        list_of_str = pl.List(pl.String)
        feat_defs = feat_defs.with_columns(
            pl.Series("Data_Type(s)", data_types, dtype=list_of_str),
            pl.Series("File_Loc(s)", file_locs, dtype=list_of_str),
        )

        return feat_defs

    @staticmethod
    def reduce_memory_usage(df: pl.DataFrame, name) -> pl.DataFrame:
        """
        Reduces memory usage of a DataFrame by converting column types.

        Integer columns are cast to the narrowest unsigned type (when the
        minimum is non-negative) or signed type that holds their min/max.
        Float columns are cast to Float32 when their min/max lie strictly
        inside the Float32 range; note that this cast can lose precision.

        Args:
        - df (pl.DataFrame): DataFrame to optimize.
        - name (str): Name of the DataFrame, used only in the printed report.

        Returns:
        - pl.DataFrame: Optimized DataFrame.
        """
        print(
            f"Memory usage of dataframe \"{name}\" is {round(df.estimated_size('mb'), 4)} MB."
        )

        # Candidate target types, narrowest first.
        unsigned_ladder = [
            (pl.UInt8, np.uint8),
            (pl.UInt16, np.uint16),
            (pl.UInt32, np.uint32),
            (pl.UInt64, np.uint64),
        ]
        signed_ladder = [
            (pl.Int8, np.int8),
            (pl.Int16, np.int16),
            (pl.Int32, np.int32),
            (pl.Int64, np.int64),
        ]
        int_types = [pl_type for pl_type, _ in signed_ladder + unsigned_ladder]
        float_types = [pl.Float32, pl.Float64]

        for col in df.columns:
            col_type = df[col].dtype
            is_int = col_type in int_types
            if not is_int and col_type not in float_types:
                continue

            c_min = df[col].min()
            c_max = df[col].max()

            # min()/max() are None for empty or all-null columns: nothing to size.
            if c_min is None or c_max is None:
                continue

            if is_int:
                ladder = unsigned_ladder if c_min >= 0 else signed_ladder
                for pl_type, np_type in ladder:
                    bounds = np.iinfo(np_type)
                    if c_min >= bounds.min and c_max <= bounds.max:
                        df = df.with_columns(df[col].cast(pl_type))
                        break
            elif (
                c_min > np.finfo(np.float32).min
                and c_max < np.finfo(np.float32).max
            ):
                df = df.with_columns(df[col].cast(pl.Float32))

        print(
            f"Memory usage of dataframe \"{name}\" became {round(df.estimated_size('mb'), 4)} MB."
        )

        return df

    @staticmethod
    def to_pandas(
        df: pl.DataFrame, cat_cols: list[str] | None = None
    ) -> tuple[pd.DataFrame, list[str]]:  # type: ignore
        """
        Converts a Polars DataFrame to a Pandas DataFrame.

        Args:
        - df (pl.DataFrame): Polars DataFrame to convert.
        - cat_cols (list[str]): List of categorical columns. When None, every
          column with pandas dtype "object" is used.

        Returns:
        - (pd.DataFrame, list[str]): Tuple containing the converted Pandas
          DataFrame (with the categorical columns cast to "str") and the
          categorical column names.
        """
        pandas_df: pd.DataFrame = df.to_pandas()

        if cat_cols is None:
            cat_cols = list(pandas_df.select_dtypes("object").columns)

        # Skip the assignment entirely when there is nothing to convert.
        if len(cat_cols) > 0:
            pandas_df[cat_cols] = pandas_df[cat_cols].astype("str")

        return pandas_df, cat_cols

In [4]:
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "P")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "M")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "A")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "D")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "T")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "L")
# feat_defs:pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")
# with pl.Config(fmt_str_lengths=1000, tbl_rows=-1, tbl_width_chars=180):
#     print(feat_defs)

In [5]:
class Aggregator:
    """Builds lists of Polars aggregation expressions, selected by column-name suffix."""

    # Suffixes eligible for max / last / min aggregation.
    _ALL_SUFFIXES = ("P", "M", "A", "D", "T", "L")
    # Suffixes eligible for mean / variance aggregation.
    _NUMERIC_SUFFIXES = ("P", "A", "D")

    @staticmethod
    def _select_cols(
        df: pl.LazyFrame, suffixes, include_num_group: bool = False
    ) -> list[str]:
        """Returns columns ending with one of `suffixes` (optionally also any column containing "num_group")."""
        # endswith() is used instead of col[-1] so an empty column name cannot raise.
        return [
            col
            for col in df.columns
            if col.endswith(suffixes) or (include_num_group and "num_group" in col)
        ]

    @staticmethod
    def max_expr(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Generates expressions for calculating maximum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Expr]: One `max_<col>` expression per selected column.
        """
        cols = Aggregator._select_cols(df, Aggregator._ALL_SUFFIXES, True)
        return [pl.col(col).max().alias(f"max_{col}") for col in cols]

    @staticmethod
    def last_expr(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Generates expressions for calculating last values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Expr]: One `last_<col>` expression per selected column.
        """
        cols = Aggregator._select_cols(df, Aggregator._ALL_SUFFIXES, True)
        return [pl.col(col).last().alias(f"last_{col}") for col in cols]

    @staticmethod
    def min_expr(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Generates expressions for calculating minimum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Expr]: One `min_<col>` expression per selected column.
        """
        cols = Aggregator._select_cols(df, Aggregator._ALL_SUFFIXES, True)
        return [pl.col(col).min().alias(f"min_{col}") for col in cols]

    @staticmethod
    def mean_expr(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Generates expressions for calculating mean values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Expr]: One `mean_<col>` expression per column ending in P, A or D.
        """
        cols = Aggregator._select_cols(df, Aggregator._NUMERIC_SUFFIXES)
        return [pl.col(col).mean().alias(f"mean_{col}") for col in cols]

    @staticmethod
    def var_expr(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Generates expressions for calculating variance for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Expr]: One `var_<col>` expression per column ending in P, A or D.
        """
        cols = Aggregator._select_cols(df, Aggregator._NUMERIC_SUFFIXES)
        return [pl.col(col).var().alias(f"var_{col}") for col in cols]

    @staticmethod
    def mode_expr(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Generates expressions for calculating mode values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Expr]: One `mode_<col>` expression per column ending in M.
        """
        cols = Aggregator._select_cols(df, ("M",))
        # Nulls are dropped first; first() picks a single value when several modes tie.
        return [
            pl.col(col).drop_nulls().mode().first().alias(f"mode_{col}") for col in cols
        ]

    @staticmethod
    def get_exprs(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Combines expressions for maximum, mean, and variance calculations.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Expr]: max expressions, then mean, then variance.
        """
        return (
            Aggregator.max_expr(df) + Aggregator.mean_expr(df) + Aggregator.var_expr(df)
        )

In [6]:
class SchemaGen:
    """Lazily loads the Parquet tables, normalises their dtypes and joins them by case_id."""

    @staticmethod
    def change_dtypes(df: pl.LazyFrame) -> pl.LazyFrame:
        """
        Changes the data types of columns in the DataFrame.

        Rules, checked in this order per column name:
        - "case_id" -> UInt32
        - "WEEK_NUM", "num_group1", "num_group2" -> UInt16
        - "date_decision" or names ending in "D" -> Date
        - names ending in "P" or "A" -> Float64
        - names ending in "M" -> String
        Other columns are left unchanged.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - pl.LazyFrame: LazyFrame with modified data types.
        """
        casts: list[pl.Expr] = []

        for col in df.columns:
            if col == "case_id":
                casts.append(pl.col(col).cast(pl.UInt32))
            elif col in ("WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.UInt16))
            elif col == "date_decision" or col.endswith("D"):
                casts.append(pl.col(col).cast(pl.Date))
            elif col.endswith(("P", "A")):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col.endswith("M"):
                casts.append(pl.col(col).cast(pl.String))

        # One with_columns call keeps the lazy plan shallow; each cast targets a
        # distinct column, so the result equals applying them one at a time.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def scan_files(glob_path: str, depth: int | None = None) -> pl.LazyFrame:
        """
        Scans Parquet files matching the glob pattern and combines them into a LazyFrame.

        Args:
        - glob_path (str): Glob pattern to match Parquet files.
        - depth (int, optional): When 1 or 2, each file is aggregated to one
          row per case_id with Aggregator.get_exprs before concatenation.
          Defaults to None (no aggregation).

        Returns:
        - pl.LazyFrame: Combined LazyFrame with unique case_id rows.

        Raises:
        - ValueError: If no file matches `glob_path`.
        """
        # Sorted so the processing (and log) order does not depend on the filesystem.
        paths: list[str] = sorted(glob(str(glob_path)))
        if not paths:
            raise ValueError(f"No Parquet files match pattern: {glob_path}")

        chunks: list[pl.LazyFrame] = []
        for path in paths:
            df: pl.LazyFrame = pl.scan_parquet(
                path, low_memory=True, rechunk=True
            ).pipe(SchemaGen.change_dtypes)
            # The scan is lazy: the message marks registration of the file, the
            # data itself is only read when the plan is collected.
            print(f"File {Path(path).stem} loaded into memory.")

            if depth in (1, 2):
                # NOTE(review): aggregation happens per file; if one case_id can
                # appear in several files, unique() below keeps only one of its
                # partial aggregates - confirm the files partition case_id.
                df = df.group_by("case_id").agg(Aggregator.get_exprs(df))

            chunks.append(df)

        # "vertical_relaxed" lets columns with differing dtypes be coerced to a common type.
        combined: pl.LazyFrame = pl.concat(chunks, how="vertical_relaxed")

        return combined.unique(subset=["case_id"])

    @staticmethod
    def join_dataframes(
        df_base: pl.LazyFrame,
        depth_0: list[pl.LazyFrame],
        depth_1: list[pl.LazyFrame],
        depth_2: list[pl.LazyFrame],
        name: str = "df_train",
    ) -> pl.DataFrame:
        """
        Joins multiple LazyFrames with a base LazyFrame.

        Args:
        - df_base (pl.LazyFrame): Base LazyFrame.
        - depth_0 (list[pl.LazyFrame]): List of LazyFrames for depth 0.
        - depth_1 (list[pl.LazyFrame]): List of LazyFrames for depth 1.
        - depth_2 (list[pl.LazyFrame]): List of LazyFrames for depth 2.
        - name (str): Label used in the memory-usage report. Defaults to
          "df_train", the value that was previously hard-coded.

        Returns:
        - pl.DataFrame: Joined, collected and memory-reduced DataFrame.
        """
        # The position of each frame becomes the suffix of any clashing column
        # name, so the order of the input lists determines the output column names.
        for i, df in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")

        return df_base.collect().pipe(Utility.reduce_memory_usage, name)

In [7]:
def filter_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Drops sparse columns and uninformative string columns.

    Two passes are made, both skipping the protected key/target columns:
    1. columns whose null fraction exceeds 0.95 are dropped;
    2. of the remaining String columns, those with exactly one distinct
       value or more than 200 distinct values are dropped.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with filtered columns.
    """
    protected = ("case_id", "year", "month", "week_num", "target")

    # Pass 1: columns that are almost entirely null.
    mostly_null = [
        name
        for name in df.columns
        if name not in protected and df[name].is_null().mean() > 0.95
    ]
    df = df.drop(mostly_null)

    # Pass 2: string columns that are constant or have too many categories.
    bad_strings = []
    for name in df.columns:
        if name in protected or not (df[name].dtype == pl.String):
            continue

        n_distinct = df[name].n_unique()
        if n_distinct == 1 or n_distinct > 200:
            bad_strings.append(name)

    return df.drop(bad_strings)


def transform_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transforms columns in the DataFrame according to predefined rules.

    Currently one rule: when "riskassesment_302T" is present it is replaced by
    two derived columns, "riskassesment_302T_rng" (high - low) and
    "riskassesment_302T_mean" ((low + high) / 2), and the raw column is dropped.
    The raw text is split on " - " and stripped of "%" before being parsed.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed columns.
    """
    source_col = "riskassesment_302T"
    if source_col not in df.columns:
        return df

    raw: pl.Series = df[source_col]

    if raw.dtype == pl.Null:
        # No data at all: emit typed all-null placeholders so both derived columns exist.
        # NOTE(review): the mean placeholder is UInt8 here but Float32 in the
        # parsed branch below - kept as in the original, confirm it is intended.
        derived = [
            raw.cast(pl.UInt8).alias("riskassesment_302T_rng"),
            raw.cast(pl.UInt8).alias("riskassesment_302T_mean"),
        ]
    else:
        # Native string ops replace the deprecated per-row Python lambdas.
        # split_exact(" - ", 1) yields a struct with field_0 (low) and field_1 (high).
        parts: pl.Series = raw.str.split_exact(" - ", 1)
        pct_low: pl.Series = (
            parts.struct.field("field_0").str.replace_all("%", "").cast(pl.UInt8)
        )
        pct_high: pl.Series = (
            parts.struct.field("field_1").str.replace_all("%", "").cast(pl.UInt8)
        )

        derived = [
            (pct_high - pct_low).alias("riskassesment_302T_rng"),
            ((pct_low + pct_high) / 2)
            .cast(pl.Float32)
            .alias("riskassesment_302T_mean"),
        ]

    # drop() returns a new frame; the original discarded it, leaving the raw column in place.
    return df.with_columns(derived).drop(source_col)


def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
    """
    Re-expresses date columns relative to the decision date.

    Every column whose name ends in "D" becomes the signed number of days
    between it and "date_decision" (Int32). "MONTH" and "WEEK_NUM" are renamed
    to lower case, "year" and "day" are derived from "date_decision", and
    "date_decision" itself is removed.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed date columns.
    """
    decision_date = pl.col("date_decision")

    # Offsets in whole days from the decision date; each expression keeps its column's name.
    day_offsets = [
        (pl.col(name) - decision_date).dt.total_days().cast(pl.Int32)
        for name in df.columns
        if name.endswith("D")
    ]
    with_offsets = df.with_columns(day_offsets)

    renamed = with_offsets.rename({"MONTH": "month", "WEEK_NUM": "week_num"})

    with_calendar = renamed.with_columns(
        [
            decision_date.dt.year().cast(pl.Int16).alias("year"),
            decision_date.dt.day().cast(pl.UInt8).alias("day"),
        ]
    )

    return with_calendar.drop("date_decision")

In [8]:
# Lazy scans of every training table, keyed by the parameter names of
# SchemaGen.join_dataframes. Tables scanned with a depth argument (1 or 2) are
# aggregated per case_id inside scan_files. The order of the entries matters:
# join_dataframes uses each frame's position as the suffix for clashing column names.
data_store: dict = {
    "df_base": SchemaGen.scan_files(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TRAIN_DIR / "train_static_cb_0.parquet"),
        SchemaGen.scan_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_other_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_person_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_deposit_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ],
}

# join_dataframes collects the lazy plan, so everything from here on is an
# eager DataFrame: filter columns, derive features, encode dates, then downcast.
df_train: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(filter_cols)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .pipe(Utility.reduce_memory_usage, "df_train")
)

# Drop the references to the lazy frames now that the joined frame exists.
del data_store
gc.collect()

print(f"Train data shape: {df_train.shape}")
# display(df_train.head(10))

# df_train.write_parquet("train_final.parquet", compression="lz4")

File train_base loaded into memory.
File train_static_cb_0 loaded into memory.
File train_static_0_0 loaded into memory.
File train_static_0_1 loaded into memory.
File train_applprev_1_1 loaded into memory.
File train_applprev_1_0 loaded into memory.
File train_tax_registry_a_1 loaded into memory.
File train_tax_registry_b_1 loaded into memory.
File train_tax_registry_c_1 loaded into memory.
File train_credit_bureau_a_1_3 loaded into memory.
File train_credit_bureau_a_1_2 loaded into memory.
File train_credit_bureau_a_1_0 loaded into memory.
File train_credit_bureau_a_1_1 loaded into memory.
File train_credit_bureau_b_1 loaded into memory.
File train_other_1 loaded into memory.
File train_person_1 loaded into memory.
File train_deposit_1 loaded into memory.
File train_debitcard_1 loaded into memory.
File train_credit_bureau_a_2_6 loaded into memory.
File train_credit_bureau_a_2_1 loaded into memory.
File train_credit_bureau_a_2_0 loaded into memory.
File train_credit_bureau_a_2_7 loade

In [9]:
# Lazy scans of every test table, mirroring the training layout. The order of
# the entries matters: join_dataframes uses each frame's position as the suffix
# for clashing column names, so it must match the order used for training.
data_store: dict = {
    "df_base": SchemaGen.scan_files(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TEST_DIR / "test_static_cb_0.parquet"),
        SchemaGen.scan_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_other_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_person_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_deposit_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ],
}

# filter_cols is not applied here; instead the select() keeps exactly the
# columns present in df_train (minus "target"), so this cell requires df_train
# from the training cell to exist already.
df_test: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .select([col for col in df_train.columns if col != "target"])
    .pipe(Utility.reduce_memory_usage, "df_test")
)

# Drop the references to the lazy frames now that the joined frame exists.
del data_store
gc.collect()

print(f"Test data shape: {df_test.shape}")

# df_test.write_parquet("test_final.parquet", compression="lz4")

File test_base loaded into memory.
File test_static_cb_0 loaded into memory.
File test_static_0_0 loaded into memory.
File test_static_0_2 loaded into memory.
File test_static_0_1 loaded into memory.
File test_applprev_1_2 loaded into memory.
File test_applprev_1_0 loaded into memory.
File test_applprev_1_1 loaded into memory.
File test_tax_registry_a_1 loaded into memory.
File test_tax_registry_b_1 loaded into memory.
File test_tax_registry_c_1 loaded into memory.
File test_credit_bureau_a_1_3 loaded into memory.
File test_credit_bureau_a_1_2 loaded into memory.
File test_credit_bureau_a_1_1 loaded into memory.
File test_credit_bureau_a_1_4 loaded into memory.
File test_credit_bureau_a_1_0 loaded into memory.
File test_credit_bureau_b_1 loaded into memory.
File test_other_1 loaded into memory.
File test_person_1 loaded into memory.
File test_deposit_1 loaded into memory.
File test_debitcard_1 loaded into memory.
File test_credit_bureau_a_2_3 loaded into memory.
File test_credit_bureau

In [10]:
num_cols = ['assignmentdate_238D', 'assignmentdate_4527235D', 'birthdate_574D', 'contractssum_5085716L', 'dateofbirth_337D', 'days120_123L', 'days180_256L', 'days30_165L', 'days360_512L', 'days90_310L', 'firstquarter_103L', 'fourthquarter_440L', 'numberofqueries_373L', 'pmtaverage_3A', 'pmtaverage_4527227A', 'pmtcount_4527229L', 'pmtcount_693L', 'pmtscount_423L', 'pmtssum_45A', 'responsedate_1012D', 'responsedate_4527233D', 'responsedate_4917613D', 'secondquarter_766L', 'thirdquarter_1082L', 'actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_629L', 'applicationscnt_867L', 'avgdbddpdlast24m_3658932P', 'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgmaxdpdlast9m_3716943P', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'clientscnt12m_3712952L', 'clientscnt3m_3712950L', 'clientscnt6m_3712949L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'cntincpaycont9m_3716944L', 'cntpmts24_3658933L', 'commnoinclast6m_3546845L', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'datefirstoffer_1144D', 'datelastinstal40dpd_247D', 'datelastunpaid_3546854D', 'daysoverduetolerancedd_3976961L', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'downpmt_116A', 'dtlastpmtallstes_4499206D', 'eir_270L', 'firstclxcampaign_1125D', 'firstdatedue_489D', 'homephncnt_628L', 'inittransactionamount_650A', 'interestrate_311L', 'lastactivateddate_801D', 'lastapplicationdate_877D', 'lastapprcredamount_781A', 'lastapprdate_640D', 'lastdelinqdate_224D', 'lastrejectcredamount_222A', 'lastrejectdate_50D', 'maininc_215A', 
'mastercontrelectronic_519L', 'mastercontrexist_109L', 'maxannuity_159A', 'maxdbddpdlast1m_3658939P', 'maxdbddpdtollast12m_3658940P', 'maxdbddpdtollast6m_4187119P', 'maxdebt4_972A', 'maxdpdfrom6mto36m_3546853P', 'maxdpdinstldate_3546855D', 'maxdpdinstlnum_3546846P', 'maxdpdlast12m_727P', 'maxdpdlast24m_143P', 'maxdpdlast3m_392P', 'maxdpdlast6m_474P', 'maxdpdlast9m_1059P', 'maxdpdtolerance_374P', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'mindbddpdlast24m_3658935P', 'mindbdtollast24m_4525191P', 'mobilephncnt_593L', 'monthsannuity_845L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'numcontrs3months_479L', 'numincomingpmts_3546848L', 'numinstlallpaidearly3d_817L', 'numinstls_657L', 'numinstlsallpaid_934L', 'numinstlswithdpd10_728L', 'numinstlswithdpd5_4187116L', 'numinstlswithoutdpd_562L', 'numinstmatpaidtearly2d_4499204L', 'numinstpaid_4499208L', 'numinstpaidearly3d_3546850L', 'numinstpaidearly3dest_4493216L', 'numinstpaidearly5d_1087L', 'numinstpaidearly5dest_4493211L', 'numinstpaidearly5dobd_4499205L', 'numinstpaidearly_338L', 'numinstpaidearlyest_4493214L', 'numinstpaidlastcontr_4325080L', 'numinstpaidlate1d_3546852L', 'numinstregularpaid_973L', 'numinstregularpaidest_4493210L', 'numinsttopaygr_769L', 'numinsttopaygrest_4493213L', 'numinstunpaidmax_3546851L', 'numinstunpaidmaxest_4493212L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'numrejects9m_859L', 'pctinstlsallpaidearl3d_427L', 'pctinstlsallpaidlat10d_839L', 'pctinstlsallpaidlate1d_3546856L', 'pctinstlsallpaidlate4d_3546849L', 'pctinstlsallpaidlate6d_3546844L', 'pmtnum_254L', 'posfpd10lastmonth_333P', 'posfpd30lastmonth_3976960P', 'posfstqpd30lastmonth_3976962P', 'price_1097A', 'sellerplacecnt_915L', 'sellerplacescnt_216L', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallast1m_4525188A', 'validfrom_1069D', 'max_actualdpd_943P', 
'max_annuity_853A', 'max_approvaldate_319D', 'max_byoccupationinc_3656910L', 'max_childnum_21L', 'max_creationdate_885D', 'max_credacc_actualbalance_314A', 'max_credacc_credlmt_575A', 'max_credacc_maxhisbal_375A', 'max_credacc_minhisbal_90A', 'max_credacc_transactions_402L', 'max_credamount_590A', 'max_currdebt_94A', 'max_dateactivated_425D', 'max_downpmt_134A', 'max_dtlastpmt_581D', 'max_dtlastpmtallstes_3545839D', 'max_employedfrom_700D', 'max_firstnonzeroinstldate_307D', 'max_mainoccupationinc_437A', 'max_maxdpdtolerance_577P', 'max_num_group1', 'max_outstandingdebt_522A', 'max_pmtnum_8L', 'max_revolvingaccount_394A', 'max_tenor_203L', 'mean_actualdpd_943P', 'mean_annuity_853A', 'mean_approvaldate_319D', 'mean_creationdate_885D', 'mean_credacc_actualbalance_314A', 'mean_credacc_credlmt_575A', 'mean_credacc_maxhisbal_375A', 'mean_credacc_minhisbal_90A', 'mean_credamount_590A', 'mean_currdebt_94A', 'mean_dateactivated_425D', 'mean_downpmt_134A', 'mean_dtlastpmt_581D', 'mean_dtlastpmtallstes_3545839D', 'mean_employedfrom_700D', 'mean_firstnonzeroinstldate_307D', 'mean_mainoccupationinc_437A', 'mean_maxdpdtolerance_577P', 'mean_outstandingdebt_522A', 'mean_revolvingaccount_394A', 'var_actualdpd_943P', 'var_annuity_853A', 'var_credacc_credlmt_575A', 'var_credamount_590A', 'var_currdebt_94A', 'var_downpmt_134A', 'var_mainoccupationinc_437A', 'var_maxdpdtolerance_577P', 'var_outstandingdebt_522A', 'max_amount_4527230A', 'max_num_group1_3', 'max_recorddate_4527225D', 'mean_amount_4527230A', 'mean_recorddate_4527225D', 'var_amount_4527230A', 'max_amount_4917619A', 'max_deductiondate_4917603D', 'max_num_group1_4', 'mean_amount_4917619A', 'mean_deductiondate_4917603D', 'var_amount_4917619A', 'max_num_group1_5', 'max_pmtamount_36A', 'max_processingdate_168D', 'mean_pmtamount_36A', 'mean_processingdate_168D', 'var_pmtamount_36A', 'max_annualeffectiverate_199L', 'max_annualeffectiverate_63L', 'max_contractsum_5085717L', 'max_credlmt_230A', 'max_credlmt_935A', 
'max_dateofcredend_289D', 'max_dateofcredend_353D', 'max_dateofcredstart_181D', 'max_dateofcredstart_739D', 'max_dateofrealrepmt_138D', 'max_debtoutstand_525A', 'max_debtoverdue_47A', 'max_dpdmax_139P', 'max_dpdmax_757P', 'max_dpdmaxdatemonth_442T', 'max_dpdmaxdatemonth_89T', 'max_dpdmaxdateyear_596T', 'max_dpdmaxdateyear_896T', 'max_instlamount_768A', 'max_instlamount_852A', 'max_lastupdate_1112D', 'max_lastupdate_388D', 'max_monthlyinstlamount_332A', 'max_monthlyinstlamount_674A', 'max_nominalrate_281L', 'max_nominalrate_498L', 'max_num_group1_6', 'max_numberofcontrsvalue_258L', 'max_numberofcontrsvalue_358L', 'max_numberofinstls_229L', 'max_numberofinstls_320L', 'max_numberofoutstandinstls_520L', 'max_numberofoutstandinstls_59L', 'max_numberofoverdueinstlmax_1039L', 'max_numberofoverdueinstlmax_1151L', 'max_numberofoverdueinstlmaxdat_148D', 'max_numberofoverdueinstlmaxdat_641D', 'max_numberofoverdueinstls_725L', 'max_numberofoverdueinstls_834L', 'max_outstandingamount_354A', 'max_outstandingamount_362A', 'max_overdueamount_31A', 'max_overdueamount_659A', 'max_overdueamountmax2_14A', 'max_overdueamountmax2_398A', 'max_overdueamountmax2date_1002D', 'max_overdueamountmax2date_1142D', 'max_overdueamountmax_155A', 'max_overdueamountmax_35A', 'max_overdueamountmaxdatemonth_284T', 'max_overdueamountmaxdatemonth_365T', 'max_overdueamountmaxdateyear_2T', 'max_overdueamountmaxdateyear_994T', 'max_periodicityofpmts_1102L', 'max_periodicityofpmts_837L', 'max_prolongationcount_1120L', 'max_refreshdate_3813885D', 'max_residualamount_488A', 'max_residualamount_856A', 'max_totalamount_6A', 'max_totalamount_996A', 'max_totaldebtoverduevalue_178A', 'max_totaldebtoverduevalue_718A', 'max_totaloutstanddebtvalue_39A', 'max_totaloutstanddebtvalue_668A', 'mean_credlmt_230A', 'mean_credlmt_935A', 'mean_dateofcredend_289D', 'mean_dateofcredend_353D', 'mean_dateofcredstart_181D', 'mean_dateofcredstart_739D', 'mean_dateofrealrepmt_138D', 'mean_debtoutstand_525A', 'mean_debtoverdue_47A', 
'mean_dpdmax_139P', 'mean_dpdmax_757P', 'mean_instlamount_768A', 'mean_instlamount_852A', 'mean_lastupdate_1112D', 'mean_lastupdate_388D', 'mean_monthlyinstlamount_332A', 'mean_monthlyinstlamount_674A', 'mean_numberofoverdueinstlmaxdat_148D', 'mean_numberofoverdueinstlmaxdat_641D', 'mean_outstandingamount_354A', 'mean_outstandingamount_362A', 'mean_overdueamount_31A', 'mean_overdueamount_659A', 'mean_overdueamountmax2_14A', 'mean_overdueamountmax2_398A', 'mean_overdueamountmax2date_1002D', 'mean_overdueamountmax2date_1142D', 'mean_overdueamountmax_155A', 'mean_overdueamountmax_35A', 'mean_refreshdate_3813885D', 'mean_residualamount_488A', 'mean_residualamount_856A', 'mean_totalamount_6A', 'mean_totalamount_996A', 'mean_totaldebtoverduevalue_178A', 'mean_totaldebtoverduevalue_718A', 'mean_totaloutstanddebtvalue_39A', 'mean_totaloutstanddebtvalue_668A', 'var_credlmt_230A', 'var_credlmt_935A', 'var_dpdmax_139P', 'var_dpdmax_757P', 'var_instlamount_768A', 'var_instlamount_852A', 'var_monthlyinstlamount_332A', 'var_monthlyinstlamount_674A', 'var_outstandingamount_354A', 'var_outstandingamount_362A', 'var_overdueamount_31A', 'var_overdueamount_659A', 'var_overdueamountmax2_14A', 'var_overdueamountmax2_398A', 'var_overdueamountmax_155A', 'var_overdueamountmax_35A', 'var_residualamount_488A', 'var_residualamount_856A', 'var_totalamount_6A', 'var_totalamount_996A', 'max_birth_259D', 'max_empl_employedfrom_271D', 'max_mainoccupationinc_384A', 'max_num_group1_9', 'max_personindex_1023L', 'max_persontype_1072L', 'max_persontype_792L', 'mean_birth_259D', 'mean_empl_employedfrom_271D', 'mean_mainoccupationinc_384A', 'max_amount_416A', 'max_num_group1_10', 'max_openingdate_313D', 'mean_amount_416A', 'mean_openingdate_313D', 'max_num_group1_11', 'max_openingdate_857D', 'mean_openingdate_857D', 'max_collater_valueofguarantee_1124L', 'max_collater_valueofguarantee_876L', 'max_num_group1_12', 'max_num_group2', 'max_pmts_dpd_1073P', 'max_pmts_dpd_303P', 'max_pmts_month_158T', 
'max_pmts_month_706T', 'max_pmts_overdue_1140A', 'max_pmts_overdue_1152A', 'max_pmts_year_1139T', 'max_pmts_year_507T', 'mean_pmts_dpd_1073P', 'mean_pmts_dpd_303P', 'mean_pmts_overdue_1140A', 'mean_pmts_overdue_1152A', 'var_pmts_dpd_1073P', 'var_pmts_dpd_303P', 'var_pmts_overdue_1140A', 'var_pmts_overdue_1152A', 'day']
# except "case_id", "year", "month", "week_num", "target"
# Replace missing values in every column listed in `num_cols` with -0.1,
# for both the train and the test frame.
# One `with_columns` call per frame handles all columns at once instead of
# rebuilding the DataFrame once per column.
# `dict.fromkeys` removes repeated names (order preserved) so the multi-column
# selector never asks for the same column twice.
fill_cols = list(dict.fromkeys(num_cols))
df_train = df_train.with_columns(pl.col(fill_cols).fill_null(-0.1))
df_test = df_test.with_columns(pl.col(fill_cols).fill_null(-0.1))

In [11]:
# Run both frames through Utility.to_pandas (a helper on the notebook's Utility class).
# The call on the training frame returns `cat_cols`, which is then passed into the
# call on the test frame — presumably so both frames get the same categorical
# columns; verify against Utility.to_pandas.
df_train, cat_cols = Utility.to_pandas(df_train)
df_test, cat_cols = Utility.to_pandas(df_test, cat_cols)

In [12]:
# Report the dimensions of both frames after the Utility.to_pandas step.
for shape_label, frame_shape in (
    ("train data shape:\t", df_train.shape),
    ("test data shape:\t", df_test.shape),
):
    print(shape_label, frame_shape)

train data shape:	 (1526659, 472)
test data shape:	 (10, 471)


# PCA

In [13]:
# Load the pre-fitted PCA object from the attached input dataset.
# `PCA` itself is imported in the notebook's import cell, so no import is needed here.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts that come from a trusted source.
pca = joblib.load('/kaggle/input/pipe2final/pca28.joblib')
# One label per fitted component (instead of a hard-coded 28), so the column names
# always match the width of `pca.transform(...)`.
pca_columns = [f'PCA_int_{i+1}' for i in range(pca.n_components_)]

In [14]:
# Project the `num_cols` columns of the training frame through the loaded PCA
# and append the resulting components as extra columns.
principal_components_train = pca.transform(df_train[num_cols])
# Reuse df_train's index so the column-wise concat pairs the rows one-to-one.
df_pca_train = pd.DataFrame(principal_components_train, columns=pca_columns, index=df_train.index)
df_train = pd.concat([df_train, df_pca_train], axis=1)
# Left disabled: writes the PCA object to "pca28.joblib" when uncommented.
# joblib.dump(pca, "pca28.joblib")

In [15]:
# Apply the same PCA object to the `num_cols` columns of the test frame and
# append the resulting components as extra columns.
principal_components_test = pca.transform(df_test[num_cols])
# Reuse df_test's index so the column-wise concat pairs the rows one-to-one.
df_pca_test = pd.DataFrame(principal_components_test, columns=pca_columns, index=df_test.index)
df_test = pd.concat([df_test, df_pca_test], axis=1)

In [16]:
# The PCA columns now live in df_train / df_test, so drop the intermediate
# arrays and frames, then run a garbage-collection pass.
del principal_components_train, df_pca_train
del principal_components_test, df_pca_test
gc.collect()

0

In [17]:
# Report the dimensions of both frames after the PCA columns were appended.
for shape_label, frame_shape in (
    ("train data shape:\t", df_train.shape),
    ("test data shape:\t", df_test.shape),
):
    print(shape_label, frame_shape)

train data shape:	 (1526659, 500)
test data shape:	 (10, 499)


# Model training and prediction

In [18]:
class VotingModel(BaseEstimator, ClassifierMixin):
    """Ensemble wrapper that averages the outputs of the estimators it is given.

    The wrapper never trains anything itself: ``fit`` is a no-op, so the
    estimators passed to the constructor are used exactly as supplied.
    """

    def __init__(self, estimators: list[BaseEstimator]):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def _average(self, method_name: str, X):
        """Call ``method_name(X)`` on every estimator and average the results along axis 0."""
        outputs = [getattr(member, method_name)(X) for member in self.estimators]
        return np.mean(outputs, axis=0)

    def predict(self, X):
        """Return the mean of every estimator's ``predict(X)`` output."""
        return self._average("predict", X)

    def predict_proba(self, X):
        """Return the mean of every estimator's ``predict_proba(X)`` output."""
        return self._average("predict_proba", X)

In [19]:
# Load the sample submission and key it by case_id.
df_subm: pd.DataFrame = pd.read_csv(ROOT / "sample_submission.csv").set_index("case_id")

device: str = "gpu"
# A sample submission with exactly 10 rows switches the notebook into dry-run
# mode, in which only the first 50,000 training rows are kept.
DRY_RUN = len(df_subm) == 10
if DRY_RUN:
    est_cnt: int = 600
    df_train = df_train.iloc[:50000]
print(device)

gpu


In [20]:
# Report the dimensions of both frames after the optional dry-run truncation.
for shape_label, frame_shape in (
    ("train data shape:\t", df_train.shape),
    ("test data shape:\t", df_test.shape),
):
    print(shape_label, frame_shape)

train data shape:	 (50000, 500)
test data shape:	 (10, 499)


In [21]:
# Split the training frame into features, target and the CV grouping key.
X = df_train.drop(columns=["target", "case_id", "week_num"])
y = df_train["target"]

weeks = df_train["week_num"]

del df_train
gc.collect()

# Folds are stratified on the target and grouped by week_num, so all rows of a
# given week land in the same fold.
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

# Two LightGBM configurations; the folds alternate between them (see the loop).
# Categorical columns are recognised through their pandas `category` dtype, which is
# LightGBM's default ('auto') behaviour, so no categorical_feature entry is needed.
# (A 'categorical_feature ' key with a trailing space is not a LightGBM parameter.)
params1 = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 2000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 8268,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees": True,
    "num_leaves": 64,
    "device": device,  # taken from the notebook-level `device` setting
    "max_bin": 245,
}

params2 = {
    "boosting_type": "gbdt",
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "device": device,  # taken from the notebook-level `device` setting
    "extra_trees": True,
    "learning_rate": 0.03,
    "l1_regularization": 0.1,
    "l2_regularization": 10,
    "max_depth": 16,
    "metric": "auc",
    "n_estimators": 2000,
    "num_leaves": 72,
    "objective": "binary",
    "random_state": 9217,
    "verbose": -1,
    "max_bin": 245,
}

fitted_models_cat = []
fitted_models_lgb = []

# cv_scores_lgb receives one validation AUC per fold below; cv_scores_cat stays
# empty because the CatBoost models are loaded pre-trained rather than fitted here.
cv_scores_cat = []
cv_scores_lgb = []

for iter_cnt, (idx_train, idx_valid) in enumerate(cv.split(X, y, groups=weeks)):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")

    # Even folds use params1, odd folds use params2.
    fold_params = params1 if iter_cnt % 2 == 0 else params2
    lgb_model = lgb.LGBMClassifier(**fold_params)

    lgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(200), lgb.early_stopping(100)],
    )
    fitted_models_lgb.append(lgb_model)
    # Record the validation AUC at the best iteration for this fold.
    cv_scores_lgb.append(lgb_model.best_score_["valid_0"]["auc"])
#     joblib.dump(lgb_model, f"lgb_{iter_cnt}.joblib")


# Load the five pre-trained CatBoost models from the attached input dataset.
# NOTE(review): joblib.load unpickles the files, which can execute arbitrary code —
# only load artifacts that come from a trusted source.
for i in range(5):
    cat_model = joblib.load(f'/kaggle/input/pipe2final/cat_{i}.joblib')
    fitted_models_cat.append(cat_model)

# The ensemble averages the LightGBM and CatBoost outputs.
model = VotingModel(fitted_models_lgb + fitted_models_cat)


# Release the full feature matrix and the last fold's slices, which would
# otherwise stay alive as notebook globals.
del X, y, X_train, X_valid, y_train, y_valid
gc.collect()

model

Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.815111
Early stopping, best iteration is:
[160]	valid_0's auc: 0.815579
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.834263
Early stopping, best iteration is:
[278]	valid_0's auc: 0.836904
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.8334
Early stopping, best iteration is:
[299]	valid_0's auc: 0.835816
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.826546
Early stopping, best iteration is:
[216]	valid_0's auc: 0.82792
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.838375
Early stopping, best iteration is:
[290]	valid_0's auc: 0.841265


In [22]:
# Build the test feature matrix: case_id becomes the index and week_num is
# dropped, as it was from the training features.
X_test: pd.DataFrame = df_test.set_index("case_id").drop(columns=["week_num"])

X_test[cat_cols] = X_test[cat_cols].astype("category")

# Column 1 of the averaged predict_proba output is used as the score.
test_proba = model.predict_proba(X_test)
y_pred: pd.Series = pd.Series(test_proba[:, 1], index=X_test.index)

# Both sides are indexed by case_id, so the assignment aligns row by row.
df_subm["score"] = y_pred

display(df_subm)

del X_test, y_pred, test_proba
gc.collect()

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.008539
57549,0.043683
57551,0.002333
57552,0.026517
57569,0.141214
57630,0.011883
57631,0.037557
57632,0.011924
57633,0.032879
57634,0.024221


0

In [23]:
# Read the scores stored in sub1.csv (presumably written by the pipeline-1 script
# run in the first cell — verify) and attach them as a second score column,
# aligned on case_id.
pipe1_scores = pd.read_csv('sub1.csv').set_index('case_id')['score']
df_subm['score2'] = pipe1_scores

display(df_subm)

del pipe1_scores
gc.collect()

Unnamed: 0_level_0,score,score2
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1
57543,0.008539,0.015884
57549,0.043683,0.032191
57551,0.002333,0.011103
57552,0.026517,0.052811
57569,0.141214,0.050107
57630,0.011883,0.0352
57631,0.037557,0.073221
57632,0.011924,0.045776
57633,0.032879,0.027966
57634,0.024221,0.061248


0

In [24]:
# Blend the two score columns with equal weight (row-wise mean) and keep only
# the blended result under the name 'score'.
blended_score = df_subm[['score', 'score2']].mean(axis=1)
df_subm = df_subm.assign(score=blended_score).drop(columns=['score2'])
display(df_subm)

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.012212
57549,0.037937
57551,0.006718
57552,0.039664
57569,0.095661
57630,0.023542
57631,0.055389
57632,0.02885
57633,0.030423
57634,0.042735


In [25]:
# Write the final scores to submission.csv; the case_id index is written as the first column.
df_subm.to_csv("submission.csv")