# Reference 
[(LGB + Cat ensemble) + Stacking](https://www.kaggle.com/code/harrychan123/lgb-cat-ensemble-stacking)

# Let's get to work!

## Imports 

In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

ROOT = '/kaggle/input/home-credit-credit-risk-model-stability'

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder

In [2]:
class Pipeline:
    """Stateless helpers that normalise the raw Polars tables.

    Every method is a ``@staticmethod`` that takes a Polars DataFrame and
    returns a new one, so it works as ``Pipeline.method(df)``, through
    ``df.pipe(Pipeline.method)`` and on an instance.
    """

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to a dtype chosen from the column name.

        Rules, first match wins:
          * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> Int64
          * ``date_decision`` -> Date
          * name ends with ``P`` or ``A`` -> Float64
          * name ends with ``M`` -> String
          * name ends with ``D`` -> Date (non-strict cast)

        Columns matching no rule are left untouched.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision":
                casts.append(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col[-1] == "M":
                casts.append(pl.col(col).cast(pl.String))
            elif col[-1] == "D":
                casts.append(pl.col(col).cast(pl.Date, strict=False))
        # Apply every cast in a single pass instead of rebuilding the frame
        # once per column.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Replace each ``*D`` column by its distance in days to ``date_decision``.

        For every column whose name ends with ``D`` the value becomes
        ``date_decision - column`` expressed in whole days, with nulls then
        filled with NaN. ``date_decision`` itself is dropped from the result.
        """
        date_cols = [col for col in df.columns if col.endswith("D")]
        if date_cols:
            # Each expression only reads `date_decision` and its own column,
            # so all of them can be evaluated in one `with_columns` call.
            df = df.with_columns(
                [
                    (pl.col("date_decision") - pl.col(col))
                    .dt.total_days()
                    .fill_null(np.nan)
                    .alias(col)
                    for col in date_cols
                ]
            )
        return df.drop("date_decision")

    @staticmethod
    def filter_cols(df, base_df=None, test=False):
        """Drop columns that are almost entirely null or have unusable cardinality.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped. Of the
        remaining columns:
          1. those whose null fraction exceeds 0.97 are removed;
          2. String columns with exactly 1 or more than 200 distinct values
             are removed.

        ``base_df`` and ``test`` are not used by this implementation; they are
        kept so that existing call sites continue to work.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        mostly_null = [
            col
            for col in df.columns
            if col not in protected and df[col].is_null().mean() > 0.97
        ]
        df = df.drop(mostly_null)

        bad_cardinality = []
        for col in df.columns:
            if col not in protected and df[col].dtype == pl.String:
                freq = df[col].n_unique()
                if freq == 1 or freq > 200:
                    bad_cardinality.append(col)

        return df.drop(bad_cardinality)


class Aggregator:
    """Builds the Polars aggregation expressions used to collapse depth-1/2 tables.

    Each ``*_expr`` method picks a family of columns by name and returns, for
    those columns, all ``max_<col>`` expressions followed by all
    ``mean_<col>`` expressions. Add or remove aggregations as needed, but be
    aware that too many features take up a lot of memory.
    """

    @staticmethod
    def _max_and_mean(cols):
        """Return ``max_<col>`` expressions followed by ``mean_<col>`` expressions."""
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return expr_max + expr_mean

    @staticmethod
    def num_expr(df):
        """Aggregations for columns whose name ends with ``P`` or ``A``."""
        return Aggregator._max_and_mean(
            [col for col in df.columns if col[-1] in ("P", "A")]
        )

    @staticmethod
    def date_expr(df):
        """Aggregations for columns whose name ends with ``D``."""
        return Aggregator._max_and_mean(
            [col for col in df.columns if col[-1] == "D"]
        )

    @staticmethod
    def str_expr(df):
        """Aggregations for columns whose name ends with ``M``.

        NOTE(review): these columns are cast to String by
        ``Pipeline.set_table_dtypes``; a mean over strings looks unintended.
        It is kept because downstream feature lists may reference the
        resulting column names -- confirm before removing.
        """
        return Aggregator._max_and_mean(
            [col for col in df.columns if col[-1] == "M"]
        )

    @staticmethod
    def other_expr(df):
        """Aggregations for columns whose name ends with ``T`` or ``L``."""
        return Aggregator._max_and_mean(
            [col for col in df.columns if col[-1] in ("T", "L")]
        )

    @staticmethod
    def count_expr(df):
        """Aggregations for columns whose name contains ``num_group``."""
        return Aggregator._max_and_mean(
            [col for col in df.columns if "num_group" in col]
        )

    @staticmethod
    def get_exprs(df):
        """Concatenate every expression family in a fixed order.

        Order: numeric, date, string, other, count.
        """
        return (
            Aggregator.num_expr(df)
            + Aggregator.date_expr(df)
            + Aggregator.str_expr(df)
            + Aggregator.other_expr(df)
            + Aggregator.count_expr(df)
        )
    
def read_file(path, depth=None):
    """Load one parquet table and normalise its dtypes.

    When ``depth`` is 1 or 2 the table is additionally collapsed to one row
    per ``case_id`` using the expressions from ``Aggregator.get_exprs``.
    For any other ``depth`` the table is returned as read (after casting).
    """
    table = Pipeline.set_table_dtypes(pl.read_parquet(path))
    if depth not in (1, 2):
        return table
    return table.group_by("case_id").agg(Aggregator.get_exprs(table))

def read_files(regex_path, depth=None):
    """Load every parquet file matching a glob pattern as a single table.

    Each matching file is loaded through ``read_file`` (dtype casting plus,
    for ``depth`` 1 or 2, per-``case_id`` aggregation). The chunks are stacked
    with ``how="vertical_relaxed"`` and reduced to one row per ``case_id``.

    Args:
        regex_path: glob pattern (str or Path) selecting the parquet chunks.
        depth: forwarded to ``read_file``.

    Raises:
        ValueError: if no file matches ``regex_path``.

    NOTE(review): aggregation happens per chunk before ``unique``; if one
    ``case_id`` can appear in several chunks only one of its partial
    aggregates survives -- confirm against the data layout.
    """
    # Sort so the chunks are always stacked in the same order, independent of
    # the order in which the filesystem lists them.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        # Fail with an actionable message instead of an opaque concat error.
        raise ValueError(f"No parquet files match pattern: {regex_path}")

    chunks = [read_file(path, depth) for path in paths]
    df = pl.concat(chunks, how="vertical_relaxed")
    return df.unique(subset=["case_id"])

def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Assemble the modelling frame from the base table and all feature tables.

    Steps:
      1. add ``month_decision`` and ``weekday_decision`` taken from
         ``date_decision``;
      2. left-join every table from ``depth_0``, ``depth_1`` and ``depth_2``
         (in that order) on ``case_id``, suffixing clashing names with the
         table's position;
      3. re-apply ``Pipeline.set_table_dtypes`` and ``Pipeline.handle_dates``.
    """
    decision_date = pl.col("date_decision")
    merged = df_base.with_columns(
        month_decision=decision_date.dt.month(),
        weekday_decision=decision_date.dt.weekday(),
    )

    feature_tables = depth_0 + depth_1 + depth_2
    for idx, table in enumerate(feature_tables):
        merged = merged.join(table, how="left", on="case_id", suffix=f"_{idx}")

    merged = Pipeline.set_table_dtypes(merged)
    return Pipeline.handle_dates(merged)

def to_pandas(df_data, cat_cols=None):
    """Convert to pandas and turn the categorical columns into ``category`` dtype.

    Args:
        df_data: object exposing ``to_pandas()`` (a Polars DataFrame here).
        cat_cols: names of the columns to convert. When ``None``, every
            column of ``object`` dtype in the converted frame is used.

    Returns:
        ``(frame, cat_cols)`` -- the pandas frame and the list of columns
        that were converted.
    """
    frame = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = frame.select_dtypes("object").columns.tolist()
    frame[cat_cols] = frame[cat_cols].astype("category")
    return frame, cat_cols

def reduce_mem_usage(df):
    """Downcast numeric columns in place to reduce memory usage.

    For every column of ``df``:
      * ``category`` and ``object`` columns are skipped;
      * columns whose dtype name starts with ``int`` are cast to the first of
        int8/int16/int32/int64 whose range strictly contains the column's
        min and max;
      * any other column is treated as floating point and cast to float16,
        float32 or float64 using the same strict range test. Columns whose
        values cannot be compared with a float (e.g. datetimes) are left
        unchanged.

    Memory usage before and after is printed. ``df`` is modified in place and
    also returned.

    NOTE(review): float16 has roughly three significant decimal digits; the
    downcast is kept as-is because it changes feature values -- confirm it
    matches what the models were trained on before altering it.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type) == "category" or col_type == object:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Smallest integer type whose range strictly contains the data.
            for int_type in int_types:
                bounds = np.iinfo(int_type)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(int_type)
                    break
            continue

        try:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        except (TypeError, ValueError):
            # Non-numeric dtype (datetime, timedelta, ...): leave it untouched.
            # Only conversion/comparison errors are tolerated so that
            # KeyboardInterrupt and genuine bugs are no longer swallowed.
            continue

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

## Processing functions

Little Testing

## Load pretrained models and test data

In [3]:
import joblib

In [4]:
# Load the metadata dict and the list of fitted LightGBM models from the
# attached dataset (presumably exported by the training notebook).
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files
# from a source you trust.
lgb_notebook_info = joblib.load('/kaggle/input/home-credit-lgb555/notebook_info.joblib')
lgb_models = joblib.load('/kaggle/input/home-credit-lgb555/lgb_models.joblib')

# Feature columns and their categorical subset as stored in the metadata;
# both are reused further down to shape the test frame.
cols, cat_cols = lgb_notebook_info['cols'], lgb_notebook_info['cat_cols']

print(f"- [lgb] notebook_start_time: {lgb_notebook_info['notebook_start_time']}")
print(f"- [lgb] description: {lgb_notebook_info['description']}")
print(f"- [lgb] len(cols): {len(cols)}")
print(f"- [lgb] len(cat_cols): {len(cat_cols)}")

lgb_models

- [lgb] notebook_start_time: 2024-05-24 22:44:32.349464
- [lgb] description: Add notebook info dict to store cols and cat_cols
- [lgb] len(cols): 452
- [lgb] len(cat_cols): 79


[LGBMClassifier(colsample_bynode=0.8, colsample_bytree=0.8, device='gpu',
                extra_trees=True, learning_rate=0.05, max_depth=10, metric='auc',
                n_estimators=2000, num_leaves=64, objective='binary',
                random_state=42, reg_alpha=0.1, reg_lambda=10,
                sample_weight='balanced', verbose=-1),
 LGBMClassifier(colsample_bynode=0.8, colsample_bytree=0.8, device='gpu',
                extra_trees=True, learning_rate=0.05, max_depth=10, metric='auc',
                n_estimators=2000, num_leaves=64, objective='binary',
                random_state=42, reg_alpha=0.1, reg_lambda=10,
                sample_weight='balanced', verbose=-1),
 LGBMClassifier(colsample_bynode=0.8, colsample_bytree=0.8, device='gpu',
                extra_trees=True, learning_rate=0.05, max_depth=10, metric='auc',
                n_estimators=2000, num_leaves=64, objective='binary',
                random_state=42, reg_alpha=0.1, reg_lambda=10,
                sample_

In [5]:
# Load the metadata dict and the list of fitted CatBoost models from the
# attached dataset.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files
# from a source you trust.
cat_notebook_info = joblib.load('/kaggle/input/home-credit-cat555/notebook_info.joblib')
cat_models = joblib.load('/kaggle/input/home-credit-cat555/cat_models.joblib')

print(f"- [cat] notebook_start_time: {cat_notebook_info['notebook_start_time']}")
print(f"- [cat] description: {cat_notebook_info['description']}")

cat_models

- [cat] notebook_start_time: 2024-05-25 17:33:28.027389
- [cat] description: Add notebook info dict to store cols and cat_cols


[<catboost.core.CatBoostClassifier at 0x7e65c2a01090>,
 <catboost.core.CatBoostClassifier at 0x7e65c1ee31c0>,
 <catboost.core.CatBoostClassifier at 0x7e65ba8c8730>,
 <catboost.core.CatBoostClassifier at 0x7e65b8ab1c90>,
 <catboost.core.CatBoostClassifier at 0x7e65b821f1c0>]

In [6]:
ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TEST_DIR = ROOT / "parquet_files" / "test"

# Every test table, grouped by depth. Depth-1 and depth-2 tables are passed
# `depth=` so that they are aggregated to one row per case_id while loading;
# tables split across several files go through `read_files`.
data_store = {
    "df_base": read_file(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", depth=1),
        read_file(TEST_DIR / "test_tax_registry_a_1.parquet", depth=1),
        read_file(TEST_DIR / "test_tax_registry_b_1.parquet", depth=1),
        read_file(TEST_DIR / "test_tax_registry_c_1.parquet", depth=1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", depth=1),
        read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", depth=1),
        read_file(TEST_DIR / "test_other_1.parquet", depth=1),
        read_file(TEST_DIR / "test_person_1.parquet", depth=1),
        read_file(TEST_DIR / "test_deposit_1.parquet", depth=1),
        read_file(TEST_DIR / "test_debitcard_1.parquet", depth=1),
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", depth=2),
        read_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", depth=2),
        read_file(TEST_DIR / "test_applprev_2.parquet", depth=2),
        read_file(TEST_DIR / "test_person_2.parquet", depth=2),
    ],
}

In [7]:
# Join all loaded tables onto the base table and derive the date features.
df_test = feature_eng(
    df_base=data_store["df_base"],
    depth_0=data_store["depth_0"],
    depth_1=data_store["depth_1"],
    depth_2=data_store["depth_2"],
)
print("test data shape:\t", df_test.shape)

# The individual tables are no longer needed once they have been joined.
del data_store
gc.collect()


test data shape:	 (10, 750)


0

In [8]:
# Count-encode every String / Boolean / Categorical column: each value is
# replaced by the number of rows of df_test that carry it and stored in a new
# `<col>_cnt` column. Nulls are encoded as -2; -1 is the default for values
# absent from the mapping.
cnt_encoding_cols = df_test.select(
    pl.selectors.by_dtype([pl.String, pl.Boolean, pl.Categorical])
).columns

# One (value, count) table per encoded column, computed on df_test itself.
mappings = {col: df_test.group_by(col).len() for col in cnt_encoding_cols}

df_test_lazy = df_test.select(mappings.keys()).lazy()

for col, mapping in mappings.items():
    remapping = dict(mapping.rows())
    remapping[None] = -2
    df_test_lazy = df_test_lazy.with_columns(
        pl.col(col).replace(remapping, default=-1).alias(col + '_cnt')
    )
    del col, mapping, remapping
del mappings
transformed_test = df_test_lazy.collect()

# Append only the freshly created count columns to the feature frame.
df_test = pl.concat([df_test, transformed_test.select("^*cnt$")], how='horizontal')
del transformed_test, cnt_encoding_cols

In [9]:
# Print the column names present in df_test
print(df_test.columns)

# Check that every column listed in `cols` is also present in df_test
available_cols = df_test.columns
missing_cols = [name for name in cols if name not in available_cols]
print("缺失的列:", missing_cols)


['case_id', 'MONTH', 'WEEK_NUM', 'month_decision', 'weekday_decision', 'assignmentdate_238D', 'assignmentdate_4527235D', 'assignmentdate_4955616D', 'birthdate_574D', 'contractssum_5085716L', 'dateofbirth_337D', 'dateofbirth_342D', 'days120_123L', 'days180_256L', 'days30_165L', 'days360_512L', 'days90_310L', 'description_5085714M', 'education_1103M', 'education_88M', 'firstquarter_103L', 'for3years_128L', 'for3years_504L', 'for3years_584L', 'formonth_118L', 'formonth_206L', 'formonth_535L', 'forquarter_1017L', 'forquarter_462L', 'forquarter_634L', 'fortoday_1092L', 'forweek_1077L', 'forweek_528L', 'forweek_601L', 'foryear_618L', 'foryear_818L', 'foryear_850L', 'fourthquarter_440L', 'maritalst_385M', 'maritalst_893M', 'numberofqueries_373L', 'pmtaverage_3A', 'pmtaverage_4527227A', 'pmtaverage_4955615A', 'pmtcount_4527229L', 'pmtcount_4955617L', 'pmtcount_693L', 'pmtscount_423L', 'pmtssum_45A', 'requesttype_4525192L', 'responsedate_1012D', 'responsedate_4527233D', 'responsedate_4917613D',

In [10]:
# Backfill 'riskassesment_302T_cnt' with the mean of 'riskassesment_940T', but
# only when the count-encoding step did not produce that column. Running this
# unconditionally would overwrite a genuinely count-encoded feature with a
# constant whenever the test data does yield the column.
if 'riskassesment_302T_cnt' not in df_test.columns:
    df_test = df_test.with_columns(
        pl.col('riskassesment_940T').mean().alias('riskassesment_302T_cnt')
    )

# Keep only the model features (plus the id), then convert the Polars frame to
# pandas, downcast it and index it by case_id.
df_test = df_test.select(['case_id'] + cols)
df_test, cat_cols = to_pandas(df_test, cat_cols)
df_test = reduce_mem_usage(df_test)
df_test = df_test.set_index('case_id')
print("test data shape:\t", df_test.shape)

gc.collect()


Memory usage of dataframe is 0.04 MB
Memory usage after optimization is: 0.02 MB
Decreased by 48.6%
test data shape:	 (10, 452)


0

In [11]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Averages the predictions of a list of already-fitted estimators.

    ``predict_proba`` assumes the layout used in this notebook: the first
    five estimators are the LightGBM models and the last five are the
    CatBoost models. It also reads the module-level ``cat_cols`` list to know
    which columns must be converted to ``str`` for the CatBoost models.
    """

    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """No-op: the wrapped estimators are expected to be fitted already."""
        return self

    def predict(self, X):
        """Return the element-wise mean of every estimator's ``predict(X)``."""
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def predict_proba(self, X):
        """Return the mean of every estimator's ``predict_proba`` output.

        ``X`` is left untouched: the string conversion needed by the CatBoost
        models is applied to a copy. The previous implementation converted
        the caller's frame in place, so a second call handed string columns
        to the LightGBM models.
        """
        # lgb
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators[:5]]

        # cat -- work on a copy so that the caller's frame keeps its dtypes
        X_cat = X.copy()
        X_cat[cat_cols] = X_cat[cat_cols].astype(str)
        y_preds += [estimator.predict_proba(X_cat) for estimator in self.estimators[-5:]]

        return np.mean(y_preds, axis=0)

In [12]:
# Ensemble order matters: LightGBM models first, CatBoost models last
# (VotingModel.predict_proba slices the list by position).
model = VotingModel(estimators=lgb_models + cat_models)
len(model.estimators)

10

In [13]:
# Column 1 of the averaged predict_proba output (presumably the positive
# class), indexed by case_id.
positive_proba = model.predict_proba(df_test)[:, 1]
y_pred = pd.Series(positive_proba, index=df_test.index)

# Fill the sample submission; the assignment aligns on the case_id index.
df_subm = pd.read_csv(ROOT / "sample_submission.csv").set_index("case_id")
df_subm["score"] = y_pred
df_subm.to_csv("submission.csv")
df_subm

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.019098
57549,0.103251
57551,0.003748
57552,0.018622
57569,0.276062
57630,0.021469
57631,0.082208
57632,0.021138
57633,0.098788
57634,0.077353
