In [1]:
import polars as pl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # last letter of column name will help you determine the type
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))

    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:  
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df

In [3]:
train_basetable = pl.read_csv("csv_files/train/train_base.csv")
train_static = pl.concat(
    [
        pl.read_csv("csv_files/train/train_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv("csv_files/train/train_static_0_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
train_static_cb = pl.read_csv("csv_files/train/train_static_cb_0.csv").pipe(set_table_dtypes)
train_person_1 = pl.read_csv("csv_files/train/train_person_1.csv").pipe(set_table_dtypes) 
train_credit_bureau_b_2 = pl.read_csv("csv_files/train/train_credit_bureau_b_2.csv").pipe(set_table_dtypes) 

In [4]:
test_basetable = pl.read_csv("csv_files/test/test_base.csv")
test_static = pl.concat(
    [
        pl.read_csv("csv_files/test/test_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv("csv_files/test/test_static_0_1.csv").pipe(set_table_dtypes),
        pl.read_csv("csv_files/test/test_static_0_2.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv("csv_files/test/test_static_cb_0.csv").pipe(set_table_dtypes)
test_person_1 = pl.read_csv("csv_files/test/test_person_1.csv").pipe(set_table_dtypes) 
test_credit_bureau_b_2 = pl.read_csv("csv_files/test/test_credit_bureau_b_2.csv").pipe(set_table_dtypes) 

In [5]:
# Convert Polars DataFrames to Pandas for easier manipulation in later stages (optional)
train_basetable = train_basetable.to_pandas()
train_static = train_static.to_pandas()
train_static_cb = train_static_cb.to_pandas()
train_person_1 = train_person_1.to_pandas()
test_basetable = test_basetable.to_pandas()
test_static = test_static.to_pandas()
test_static_cb = test_static_cb.to_pandas()
test_person_1 = test_person_1.to_pandas()

# Example of a simple feature engineering function
def create_features(base_table, static, static_cb, person_1):
    # Join your tables here. Example:
    df = pd.merge(base_table, static, on="case_id", how="left")
    df = pd.merge(df, static_cb, on="case_id", how="left")
    df = pd.merge(df, person_1, on="case_id", how="left")
    
    # Assuming `df` is your DataFrame after conversion to Pandas
    df['annuity_income_ratio'] = df['annuity_780A'] / df['maininc_215A']
    df['avg_outstanding_balance_income_ratio'] = df['avgoutstandbalancel6m_4187114A'] / df['maininc_215A']
    df['payment_income_ratio_12m'] = df['avgpmtlast12m_4525200A'] / df['maininc_215A']
    df['debt_credit_ratio'] = df['totaldebt_9A'] / df['credamount_770A']
    
    df['bankacctype_710L'] = df['bankacctype_710L'].astype('category')
    # Assuming 'df' is your DataFrame
    # Select only numeric columns for median calculation
    numeric_df = df.select_dtypes(include=[np.number])

    # Calculate the median only for numeric columns
    numeric_medians = numeric_df.median()

    # Fill missing values in numeric columns with their respective medians
    df.fillna(numeric_medians, inplace=True)
        
    # Convert categorical variables to type 'category' for LightGBM, XGBoost, and CatBoost
    df = convert_strings(df)
    
    return df


train_df = create_features(train_basetable, train_static, train_static_cb, train_person_1)
test_df = create_features(test_basetable, test_static, test_static_cb, test_person_1)


In [6]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')

    for col in df.columns:
        col_type = df[col].dtype

        if col_type not in ['object', 'category']:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            # Skip min/max operations for categorical or object columns
            continue  # You can also convert to ordered categorical here if applicable

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    print(f'Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%')

    return df


train_df = reduce_mem_usage(train_df)
train_df.replace([np.inf, -np.inf], np.nan, inplace=True)

for col in train_df.select_dtypes(include=['float16', 'float32', 'float64']).columns:
    train_df[col] = train_df[col].fillna(train_df[col].median())

# Now proceed with your train-test split and model training as before
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(['target', 'case_id'], axis=1),
    train_df['target'], test_size=0.2, random_state=42
)
X_test = test_df.drop(['case_id'], axis=1)

lgb_model = lgb.LGBMClassifier(
    n_estimators=5,
    learning_rate=0.05,
    random_state=42,
    force_col_wise=True
)

# Fit the LightGBM model with early stopping
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True)]  # This is now correctly placed within fit_params
)



xgb_model = xgb.XGBClassifier(
    n_estimators=5, 
    learning_rate=0.05, 
    random_state=42, 
    use_label_encoder=False, 
    eval_metric='logloss',
    early_stopping_rounds=50,  # Place early_stopping_rounds here
    enable_categorical=True  # Ensure your XGBoost version supports this
)

# Fit the XGBoost model with early stopping
xgb_model.fit(
    X_train, y_train, 
    eval_set=[(X_val, y_val)],
    verbose=True
)
cat_features = ['date_decision', 'bankacctype_710L','cardtype_51L','clientscnt_136L', 'credtype_322L', 'datefirstoffer_1144D', 'datelastinstal40dpd_247D', 'datelastunpaid_3546854D', 'deferredmnthsnum_166L', 'disbursementtype_67L', 'dtlastpmtallstes_4499206D', 'equalitydataagreement_891L', 'equalityempfrom_62L', 'firstclxcampaign_1125D', 'firstdatedue_489D', 'inittransactioncode_186L', 'interestrategrace_34L', 'isbidproductrequest_292L', 'isdebitcard_729L', 'lastactivateddate_801D', 'lastapplicationdate_877D', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprdate_640D', 'lastcancelreason_561M', 'lastdelinqdate_224D', 'lastdependentsnum_448L', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectdate_50D', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'lastrepayingdate_696D', 'lastst_736L', 'maxdpdinstldate_3546855D', 'numinstlswithdpd5_4187116L', 'numinstmatpaidtearly2d_4499204L', 'numinstpaid_4499208L', 'numinstpaidearly3dest_4493216L', 'numinstpaidearly5dest_4493211L', 'numinstpaidearly5dobd_4499205L', 'numinstpaidearlyest_4493214L', 'numinstpaidlastcontr_4325080L', 'numinstregularpaidest_4493210L', 'numinsttopaygrest_4493213L', 'numinstunpaidmaxest_4493212L', 'opencred_647L', 'paytype1st_925L', 'paytype_783L', 'payvacationpostpone_4187118D', 'previouscontdistrict_112M', 'twobodfilling_608L', 'typesuite_864L', 'validfrom_1069D', 'assignmentdate_238D', 'assignmentdate_4527235D', 'assignmentdate_4955616D', 'birthdate_574D', 'contractssum_5085716L', 'dateofbirth_337D', 'dateofbirth_342D', 'description_5085714M', 'education_1103M', 'education_88M', 'maritalst_385M', 'maritalst_893M', 'pmtcount_4527229L', 'pmtcount_4955617L', 'requesttype_4525192L', 'responsedate_1012D', 'responsedate_4527233D', 'responsedate_4917613D', 'riskassesment_302T', 'birth_259D', 'birthdate_87D', 'contaddr_district_15M', 'contaddr_matchlist_1032L', 'contaddr_smempladdr_334L', 'contaddr_zipcode_807M', 'education_927M', 'empl_employedfrom_271D', 'empl_employedtotal_800L', 'empl_industry_691L', 'empladdr_district_926M', 'empladdr_zipcode_114M', 'familystate_447L', 'gender_992L', 'housetype_905L', 'housingtype_772L', 'incometype_1044T', 'isreference_387L', 'language1_981M', 'maritalst_703L', 'registaddr_district_1083M', 'registaddr_zipcode_184M', 'relationshiptoclient_415T', 'relationshiptoclient_642T', 'remitter_829L', 'role_1084L', 'role_993L', 'safeguarantyflag_411L', 'sex_738L', 'type_25L']
print(train_df[cat_features].isna().sum())  # Check for remaining NaN values
print(train_df[cat_features].dtypes) 

for col in cat_features:
    # For the training set
    train_df[col] = train_df[col].astype(str).replace('nan', 'missing').fillna('missing')
    X_val[col] = X_val[col].astype(str).replace('nan', 'missing').fillna('missing')
for col in cat_features:
    X_train[col] = X_train[col].astype(str).fillna('missing')
    X_val[col] = X_val[col].astype(str).fillna('missing')    

print(train_df[cat_features].isna().sum())  # Check for remaining NaN values
print(train_df[cat_features].dtypes) 

    

cbt_model = cbt.CatBoostClassifier(
    iterations=5, 
    learning_rate=0.05, 
    random_state=42, 
    verbose=100,
    early_stopping_rounds=50,
    cat_features = cat_features
)
# Fit the CatBoost model
cbt_model.fit(
    X_train, y_train, 
    eval_set=[(X_val, y_val)],
    verbose=100
)

Memory usage of dataframe is 4104.14 MB
Memory usage after optimization is: 1602.59 MB
Decreased by 61.0%
[LightGBM] [Info] Number of positive: 79567, number of negative: 2299625
[LightGBM] [Info] Total Bins 30567
[LightGBM] [Info] Number of data points in the train set: 2379192, number of used features: 252
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.033443 -> initscore=-3.363902
[LightGBM] [Info] Start training from score -3.363902
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5]	valid_0's auc: 0.727821	valid_0's binary_logloss: 0.141313
[0]	validation_0-logloss:0.20140
[1]	validation_0-logloss:0.19703
[2]	validation_0-logloss:0.19294
[3]	validation_0-logloss:0.18910
[4]	validation_0-logloss:0.18550
date_decision                  0
bankacctype_710L         2114985
cardtype_51L             2523887
clientscnt_136L          2972123
credtype_322L                  3
                          ...   
role_1084L           

<catboost.core.CatBoostClassifier at 0x1a5939bd3d0>

   case_id  score
0    57543    0.5
1    57549    0.5
2    57551    0.5
3    57552    0.5
4    57569    0.5
