In [1]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 
import gc

# Root directory of the competition dataset; every CSV path below is built as dataPath + "csv_files/...".
dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

In [2]:
# Run a garbage-collection pass before any data is loaded; the cell output is the value gc.collect() returns.
gc.collect()

38

In [3]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Cast every column that can be represented as a float to ``pl.Float64``.

    Columns whose name ends in "P" or "A" are cast unconditionally, so a failed
    cast on one of them propagates to the caller. Every other column is cast on
    a best-effort basis: when the cast fails the column is left untouched.

    Args:
        df: Table to convert.

    Returns:
        A frame with the same columns, where each successfully cast column is
        now ``pl.Float64``.
    """
    for col in df.columns:
        as_float = pl.col(col).cast(pl.Float64).alias(col)
        # `endswith` (instead of col[-1]) also tolerates an empty column name.
        if col.endswith(("P", "A")):
            df = df.with_columns(as_float)
        else:
            try:
                df = df.with_columns(as_float)
            except Exception:
                # Not convertible to float: keep the column as it is.
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit through.
                continue
    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Turn every object/string column into an ordered categorical with an "Unknown" level.

    Each column whose dtype name is ``object`` or ``string`` is converted to the
    pandas string dtype and then to a categorical. The category list is the
    observed categories plus "Unknown"; "Unknown" is appended only when it is
    not already an observed value, because duplicate categories make
    ``pd.CategoricalDtype`` raise ``ValueError``.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object, for chaining.
    """
    for col in df.columns:
        if df[col].dtype.name not in ('object', 'string'):
            continue
        as_category = df[col].astype("string").astype('category')
        categories = as_category.cat.categories.to_list()
        if "Unknown" not in categories:
            categories.append("Unknown")
        ordered_dtype = pd.CategoricalDtype(categories=categories, ordered=True)
        df[col] = as_category.astype(ordered_dtype)
    return df

In [4]:
from datetime import datetime
def date_encoding(df: pl.DataFrame, reference_date=None) -> pl.DataFrame:
    """Replace "%Y-%m-%d" string columns ending in "D" with a whole-day offset.

    For every column whose name ends in "D", the strings are parsed as dates
    and replaced by ``-(date - reference_date)`` in whole days, i.e. the number
    of days from the date up to ``reference_date``. Columns that cannot be
    processed (for example because they are not string columns) are skipped.

    Args:
        df: Table to encode.
        reference_date: ``datetime`` the offsets are measured against. Defaults
            to ``datetime.now()``, which makes the encoded values depend on
            when the function is called; pass a fixed value for reproducible
            features.

    Returns:
        A frame with the same columns, date columns replaced by day offsets.
    """
    if reference_date is None:
        reference_date = datetime.now()
    for col in df.columns:
        if not col.endswith("D"):
            continue
        try:
            df = df.with_columns(
                pl.col(col).str.to_datetime(format="%Y-%m-%d")
            )
            df = df.with_columns(
                pl.col(col).sub(reference_date).dt.total_days().mul(-1).alias(col)
            )
        except Exception:
            # Best effort: skip columns that fail to parse or convert.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            continue
    return df

In [5]:
from sklearn.preprocessing import LabelEncoder
def label_encoding(df: pl.DataFrame)-> pl.DataFrame:
    """Replace every String column with integer codes from a fresh ``LabelEncoder``.

    A new encoder is fitted per column on each call, so the codes are only
    meaningful within the frame that was passed in.
    NOTE(review): this notebook calls the function separately on train and test
    tables, so the same string may map to different codes in the two frames -
    confirm whether a shared encoder is wanted.

    Args:
        df: Table to encode.

    Returns:
        A frame with the same column names and order, String columns replaced
        by their encoded values.
    """
    for col in df.columns:
        if str(df[col].dtype) != 'String':
            continue
        codes = LabelEncoder().fit_transform(df[col].to_numpy())
        # Overwrite the column directly by name. The earlier version went
        # through a scratch column called "a", which clobbered and then dropped
        # any real column with that name.
        df = df.with_columns(pl.Series(col, codes))
    return df

In [6]:
# def aggregate(df: pl.DataFrame) -> pl.DataFrame:
#     agg_data = None  # Initialize variable to hold aggregated data
#     for col in df.columns:
#         if col[-1] == 'A':
#             if agg_data is None:
#                 # If this is the first aggregation, assign it to aggregated_data
#                 agg_data = df.group_by('case_id').agg(pl.col(col).sum())
#             else:
#                 # Otherwise, join the aggregated data with the new aggregation
#                 agg_data = agg_data.join(df.group_by('case_id').agg(pl.col(col).sum()), on='case_id')
                
#         elif col[-1] in ['P', 'D']:
#             if agg_data is None:
#                 # If this is the first aggregation, assign it to aggregated_data
#                 agg_data = df.group_by('case_id').agg(pl.col(col).max())
#             else:
#                 # Otherwise, join the aggregated data with the new aggregation
#                 agg_data = agg_data.join(df.group_by('case_id').agg(pl.col(col).max()), on='case_id')
                
#         elif col[-1] in ['M', 'T', 'L']:
#             if agg_data is None:
#                 # If this is the first aggregation, assign it to aggregated_data
#                 agg_data = df.group_by('case_id').agg(pl.col(col).mode().max())
#             else:
#                 # Otherwise, join the aggregated data with the new aggregation
#                 agg_data = agg_data.join(df.group_by('case_id').agg(pl.col(col).mode().max()), on='case_id')
#         else:
#             continue
                
#     return agg_data

In [7]:
def aggregate(df: pl.DataFrame) -> pl.DataFrame:
    """Collapse a table to one row per ``case_id`` using suffix-driven statistics.

    The last character of each column name selects the statistics:

    * "A": sum (keeps the original column name), ``_min``, ``_max``, ``_mean``
    * "P" / "D": ``_max``, ``_min``, ``_mean``
    * "M" / "T" / "L": ``_max`` (largest of the modes) and ``_mean``

    Columns with any other suffix, including ``case_id`` itself, are ignored.

    All statistics are computed in a single ``group_by`` instead of one
    group_by plus join per statistic; output column names and order are the
    same as before.
    NOTE(review): a null ``case_id`` now forms its own group, whereas the
    join-based version presumably dropped it (null join keys do not match by
    default) - confirm this is irrelevant for the downstream left joins.

    Args:
        df: Table containing a ``case_id`` column.

    Returns:
        One row per ``case_id`` with the aggregated columns. When no column has
        a recognised suffix, the distinct ``case_id`` values only (previously
        ``None``, which broke the subsequent ``.join``).
    """
    exprs = []
    for col in df.columns:
        suffix = col[-1:]  # slice, so an empty column name cannot raise
        if suffix == 'A':
            exprs += [
                pl.col(col).sum(),
                pl.col(col).min().alias(col + '_min'),
                pl.col(col).max().alias(col + '_max'),
                pl.col(col).mean().alias(col + '_mean'),
            ]
        elif suffix in ('P', 'D'):
            exprs += [
                pl.col(col).max().alias(col + '_max'),
                pl.col(col).min().alias(col + '_min'),
                pl.col(col).mean().alias(col + '_mean'),
            ]
        elif suffix in ('M', 'T', 'L'):
            exprs += [
                pl.col(col).mode().max().alias(col + '_max'),
                pl.col(col).mean().alias(col + '_mean'),
            ]

    if not exprs:
        return df.select('case_id').unique()
    return df.group_by('case_id').agg(exprs)

In [8]:
# Load the train base table; the trailing "D" in the new name makes date_encoding pick the column up.
train_basetable = (
    pl.read_csv(dataPath + "csv_files/train/train_base.csv")
    .rename({'date_decision': 'date_decisionD'})
)

In [9]:
# Load the test base table; the trailing "D" in the new name makes date_encoding pick the column up.
test_basetable = (
    pl.read_csv(dataPath + "csv_files/test/test_base.csv")
    .rename({'date_decision': 'date_decisionD'})
)

In [10]:
# Convert the "D" columns of both base tables to day offsets.
train_basetable = train_basetable.pipe(date_encoding)

test_basetable = test_basetable.pipe(date_encoding)

In [11]:
# static_0 is split over several CSV shards: coerce dtypes per shard, stack them,
# then cast case_id to Int64 (non-strict).
train_static_0 = pl.concat(
    [
        pl.read_csv(dataPath + shard_path).pipe(set_table_dtypes)
        for shard_path in (
            "csv_files/train/train_static_0_0.csv",
            "csv_files/train/train_static_0_1.csv",
        )
    ],
    how="vertical_relaxed",
).with_columns(pl.col('case_id').cast(pl.Int64, strict=False))

test_static_0 = pl.concat(
    [
        pl.read_csv(dataPath + shard_path).pipe(set_table_dtypes)
        for shard_path in (
            "csv_files/test/test_static_0_0.csv",
            "csv_files/test/test_static_0_1.csv",
            "csv_files/test/test_static_0_2.csv",
        )
    ],
    how="vertical_relaxed",
).with_columns(pl.col('case_id').cast(pl.Int64, strict=False))

In [12]:
# Encode dates as day offsets, then strings as integer labels (no aggregation for static_0).
train_static_0 = train_static_0.pipe(date_encoding).pipe(label_encoding)

test_static_0 = test_static_0.pipe(date_encoding).pipe(label_encoding)

In [13]:
# Start the modelling frames: base table left-joined with static_0, then drop the inputs.
train_df = train_basetable.join(train_static_0, how="left", on="case_id")
del train_basetable, train_static_0

test_df = test_basetable.join(test_static_0, how="left", on="case_id")
del test_basetable, test_static_0

gc.collect()

0

In [14]:
# static_cb_0: read, coerce dtypes, cast case_id to Int64 (non-strict).
train_static_cb_0 = (
    pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
)

test_static_cb_0 = (
    pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
)

In [15]:
# Encode dates as day offsets, then strings as integer labels (no aggregation for static_cb_0).
train_static_cb_0 = train_static_cb_0.pipe(date_encoding).pipe(label_encoding)

test_static_cb_0 = test_static_cb_0.pipe(date_encoding).pipe(label_encoding)

In [16]:
# Left-join static_cb_0 onto the modelling frames and release each table right after its join.
train_df = train_df.join(train_static_cb_0, how="left", on="case_id")
del train_static_cb_0

test_df = test_df.join(test_static_cb_0, how="left", on="case_id")
del test_static_cb_0

gc.collect()

0

In [17]:
# applprev_1 is split over several CSV shards: coerce dtypes per shard, stack them,
# then cast case_id to Int64 (non-strict).
train_applprev_1 = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/train/train_applprev_1_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/train/train_applprev_1_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)

train_applprev_1 = train_applprev_1.with_columns(pl.col('case_id').cast(pl.Int64, strict=False))

test_applprev_1 = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/test/test_applprev_1_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/test/test_applprev_1_1.csv").pipe(set_table_dtypes),
        # Third shard. This entry used to re-read test_applprev_1_1.csv, which duplicated
        # that shard's rows and skewed the per-case sums and means computed later.
        # NOTE(review): assumes test_applprev_1_2.csv exists, mirroring the three-shard
        # test_static_0 layout - confirm against the dataset listing.
        pl.read_csv(dataPath + "csv_files/test/test_applprev_1_2.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)

test_applprev_1 = test_applprev_1.with_columns(pl.col('case_id').cast(pl.Int64, strict=False))

In [18]:
# Encode dates and strings, then collapse to one row per case_id.
train_applprev_1 = (
    train_applprev_1
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

test_applprev_1 = (
    test_applprev_1
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

In [19]:
# Left-join the applprev_1 aggregates and release each table right after its join.
train_df = train_df.join(train_applprev_1, how="left", on="case_id")
del train_applprev_1

test_df = test_df.join(test_applprev_1, how="left", on="case_id")
del test_applprev_1

gc.collect()

0

In [20]:
# other_1: read -> float casts -> Int64 case_id -> date offsets -> label codes -> per-case_id aggregates.
train_other_1 = (
    pl.read_csv(dataPath + "csv_files/train/train_other_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

test_other_1 = (
    pl.read_csv(dataPath + "csv_files/test/test_other_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

In [21]:
# Left-join the other_1 aggregates and release each table right after its join.
train_df = train_df.join(train_other_1, how="left", on="case_id")
del train_other_1

test_df = test_df.join(test_other_1, how="left", on="case_id")
del test_other_1

gc.collect()

0

In [22]:
# tax_registry_a_1: read -> float casts -> Int64 case_id -> date offsets -> label codes -> per-case_id aggregates.
train_tax_registry_a_1 = (
    pl.read_csv(dataPath + "csv_files/train/train_tax_registry_a_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

test_tax_registry_a_1 = (
    pl.read_csv(dataPath + "csv_files/test/test_tax_registry_a_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

In [23]:
# Left-join the tax_registry_a_1 aggregates and release each table right after its join.
train_df = train_df.join(train_tax_registry_a_1, how="left", on="case_id")
del train_tax_registry_a_1

test_df = test_df.join(test_tax_registry_a_1, how="left", on="case_id")
del test_tax_registry_a_1

gc.collect()

0

In [24]:
# tax_registry_b_1: read -> float casts -> Int64 case_id -> date offsets -> label codes -> per-case_id aggregates.
train_tax_registry_b_1 = (
    pl.read_csv(dataPath + "csv_files/train/train_tax_registry_b_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

test_tax_registry_b_1 = (
    pl.read_csv(dataPath + "csv_files/test/test_tax_registry_b_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

In [25]:
# Left-join the tax_registry_b_1 aggregates and release each table right after its join.
train_df = train_df.join(train_tax_registry_b_1, how="left", on="case_id")
del train_tax_registry_b_1

test_df = test_df.join(test_tax_registry_b_1, how="left", on="case_id")
del test_tax_registry_b_1

gc.collect()

0

In [26]:
# tax_registry_c_1: read -> float casts -> Int64 case_id -> date offsets -> label codes -> per-case_id aggregates.
train_tax_registry_c_1 = (
    pl.read_csv(dataPath + "csv_files/train/train_tax_registry_c_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

test_tax_registry_c_1 = (
    pl.read_csv(dataPath + "csv_files/test/test_tax_registry_c_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

In [27]:
# Left-join the tax_registry_c_1 aggregates and release each table right after its join.
train_df = train_df.join(train_tax_registry_c_1, how="left", on="case_id")
del train_tax_registry_c_1

test_df = test_df.join(test_tax_registry_c_1, how="left", on="case_id")
del test_tax_registry_c_1

gc.collect()

0

In [28]:
# credit_bureau_b_1: read -> float casts -> Int64 case_id -> date offsets -> label codes -> per-case_id aggregates.
train_credit_bureau_b_1 = (
    pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

test_credit_bureau_b_1 = (
    pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

In [29]:
# Left-join the credit_bureau_b_1 aggregates and release each table right after its join.
train_df = train_df.join(train_credit_bureau_b_1, how="left", on="case_id")
del train_credit_bureau_b_1

test_df = test_df.join(test_credit_bureau_b_1, how="left", on="case_id")
del test_credit_bureau_b_1

gc.collect()

0

In [30]:
# train_credit_bureau_a_1_0 = pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_a_1_0.csv").pipe(set_table_dtypes)
# train_credit_bureau_a_1_0 = train_credit_bureau_a_1_0.with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
# train_credit_bureau_a_1_0 = date_encoding(train_credit_bureau_a_1_0)
# train_credit_bureau_a_1_0 = label_encoding(train_credit_bureau_a_1_0)
# train_credit_bureau_a_1_0 = aggregate(train_credit_bureau_a_1_0)
# #
# test_credit_bureau_a_1_0 = pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_a_1_0.csv").pipe(set_table_dtypes)
# test_credit_bureau_a_1_0 = test_credit_bureau_a_1_0.with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
# test_credit_bureau_a_1_0 = date_encoding(test_credit_bureau_a_1_0)
# test_credit_bureau_a_1_0 = label_encoding(test_credit_bureau_a_1_0)
# test_credit_bureau_a_1_0 = aggregate(test_credit_bureau_a_1_0)

In [31]:
# train_credit_bureau_a_1_1 = pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_a_1_1.csv").pipe(set_table_dtypes)
# train_credit_bureau_a_1_1 = train_credit_bureau_a_1_1.with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
# train_credit_bureau_a_1_1 = date_encoding(train_credit_bureau_a_1_1)
# train_credit_bureau_a_1_1 = label_encoding(train_credit_bureau_a_1_1)
# train_credit_bureau_a_1_1 = aggregate(train_credit_bureau_a_1_1)

# test_credit_bureau_a_1_1 = pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_a_1_1.csv").pipe(set_table_dtypes)
# test_credit_bureau_a_1_1 = test_credit_bureau_a_1_1.with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
# test_credit_bureau_a_1_1 = date_encoding(test_credit_bureau_a_1_1)
# test_credit_bureau_a_1_1 = label_encoding(test_credit_bureau_a_1_1)
# test_credit_bureau_a_1_1 = aggregate(test_credit_bureau_a_1_1)

In [32]:
# train_credit_bureau_a_1_2 = pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_a_1_2.csv").pipe(set_table_dtypes)
# train_credit_bureau_a_1_2 = train_credit_bureau_a_1_2.with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
# train_credit_bureau_a_1_2 = date_encoding(train_credit_bureau_a_1_2)
# train_credit_bureau_a_1_2 = label_encoding(train_credit_bureau_a_1_2)
# train_credit_bureau_a_1_2 = aggregate(train_credit_bureau_a_1_2)

# test_credit_bureau_a_1_2 = pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_a_1_2.csv").pipe(set_table_dtypes)
# test_credit_bureau_a_1_2 = test_credit_bureau_a_1_2.with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
# test_credit_bureau_a_1_2 = date_encoding(test_credit_bureau_a_1_2)
# test_credit_bureau_a_1_2 = label_encoding(test_credit_bureau_a_1_2)
# test_credit_bureau_a_1_2 = aggregate(test_credit_bureau_a_1_2)

In [33]:
# train_credit_bureau_a_1_3 = pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_a_1_3.csv").pipe(set_table_dtypes)
# train_credit_bureau_a_1_3 = train_credit_bureau_a_1_3.with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
# train_credit_bureau_a_1_3 = date_encoding(train_credit_bureau_a_1_3)
# train_credit_bureau_a_1_3 = label_encoding(train_credit_bureau_a_1_3)
# train_credit_bureau_a_1_3 = aggregate(train_credit_bureau_a_1_3)

# test_credit_bureau_a_1_3 = pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_a_1_3.csv").pipe(set_table_dtypes)
# test_credit_bureau_a_1_3 = test_credit_bureau_a_1_3.with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
# test_credit_bureau_a_1_3 = date_encoding(test_credit_bureau_a_1_3)
# test_credit_bureau_a_1_3 = label_encoding(test_credit_bureau_a_1_3)
# test_credit_bureau_a_1_3 = aggregate(test_credit_bureau_a_1_3)

In [34]:
# train_credit_bureau_a_1 = pl.concat([train_credit_bureau_a_1_0,train_credit_bureau_a_1_1,train_credit_bureau_a_1_2,train_credit_bureau_a_1_3])

# test_credit_bureau_a_1 = pl.concat([test_credit_bureau_a_1_0,test_credit_bureau_a_1_1,test_credit_bureau_a_1_2,test_credit_bureau_a_1_3])

In [35]:
# train_df = train_df.join(train_credit_bureau_a_1, on="case_id", how="left")
# del train_credit_bureau_a_1
# del train_credit_bureau_a_1_0
# del train_credit_bureau_a_1_1
# del train_credit_bureau_a_1_2
# del train_credit_bureau_a_1_3
# gc.collect()

In [36]:
# test_df = test_df.join(test_credit_bureau_a_1, on="case_id", how="left")
# del test_credit_bureau_a_1
# del test_credit_bureau_a_1_0
# del test_credit_bureau_a_1_1
# del test_credit_bureau_a_1_2
# del test_credit_bureau_a_1_3
# gc.collect()

In [37]:
# deposit_1: read -> float casts -> Int64 case_id -> date offsets -> label codes -> per-case_id aggregates.
train_deposit_1 = (
    pl.read_csv(dataPath + "csv_files/train/train_deposit_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

test_deposit_1 = (
    pl.read_csv(dataPath + "csv_files/test/test_deposit_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

In [38]:
# Left-join the deposit_1 aggregates and release each table right after its join.
train_df = train_df.join(train_deposit_1, how="left", on="case_id")
del train_deposit_1

test_df = test_df.join(test_deposit_1, how="left", on="case_id")
del test_deposit_1

gc.collect()

0

In [39]:
# person_1: read -> float casts -> Int64 case_id -> date offsets -> label codes -> per-case_id aggregates.
train_person_1 = (
    pl.read_csv(dataPath + "csv_files/train/train_person_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

test_person_1 = (
    pl.read_csv(dataPath + "csv_files/test/test_person_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

In [40]:
# Left-join the person_1 aggregates and release each table right after its join.
train_df = train_df.join(train_person_1, how="left", on="case_id")
del train_person_1

test_df = test_df.join(test_person_1, how="left", on="case_id")
del test_person_1

gc.collect()

0

In [41]:
# debitcard_1: read -> float casts -> Int64 case_id -> date offsets -> label codes -> per-case_id aggregates.
train_debitcard_1 = (
    pl.read_csv(dataPath + "csv_files/train/train_debitcard_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

test_debitcard_1 = (
    pl.read_csv(dataPath + "csv_files/test/test_debitcard_1.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

In [42]:
# Left-join the debitcard_1 aggregates and release each table right after its join.
train_df = train_df.join(train_debitcard_1, how="left", on="case_id")
del train_debitcard_1

test_df = test_df.join(test_debitcard_1, how="left", on="case_id")
del test_debitcard_1

gc.collect()

0

In [43]:
# applprev_2: read -> float casts -> Int64 case_id -> date offsets -> label codes -> per-case_id aggregates.
train_applprev_2 = (
    pl.read_csv(dataPath + "csv_files/train/train_applprev_2.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

test_applprev_2 = (
    pl.read_csv(dataPath + "csv_files/test/test_applprev_2.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

In [44]:
# Left-join the applprev_2 aggregates and release each table right after its join.
train_df = train_df.join(train_applprev_2, how="left", on="case_id")
del train_applprev_2

test_df = test_df.join(test_applprev_2, how="left", on="case_id")
del test_applprev_2

gc.collect()

0

In [45]:
# person_2: read -> float casts -> Int64 case_id -> date offsets -> label codes -> per-case_id aggregates.
train_person_2 = (
    pl.read_csv(dataPath + "csv_files/train/train_person_2.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

test_person_2 = (
    pl.read_csv(dataPath + "csv_files/test/test_person_2.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

In [46]:
# Left-join the person_2 aggregates and release each table right after its join.
train_df = train_df.join(train_person_2, how="left", on="case_id")
del train_person_2

test_df = test_df.join(test_person_2, how="left", on="case_id")
del test_person_2

gc.collect()

0

In [47]:
# credit_bureau_b_2: read -> float casts -> Int64 case_id -> date offsets -> label codes -> per-case_id aggregates.
train_credit_bureau_b_2 = (
    pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_2.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

test_credit_bureau_b_2 = (
    pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv")
    .pipe(set_table_dtypes)
    .with_columns(pl.col('case_id').cast(pl.Int64, strict=False))
    .pipe(date_encoding)
    .pipe(label_encoding)
    .pipe(aggregate)
)

In [48]:
# Left-join the credit_bureau_b_2 aggregates and release each table right after its join.
train_df = train_df.join(train_credit_bureau_b_2, how="left", on="case_id")
del train_credit_bureau_b_2

test_df = test_df.join(test_credit_bureau_b_2, how="left", on="case_id")
del test_credit_bureau_b_2

gc.collect()

0

In [49]:
# train_df = train_basetable.join(train_static_0, on="case_id", how="left").join(train_static_cb_0, on="case_id", how="left").join(train_applprev_1, on="case_id", how="left").join(train_other_1, on="case_id", how="left").join(train_tax_registry_a_1, on="case_id", how="left").join(train_tax_registry_b_1, on="case_id", how="left").join(train_tax_registry_c_1, on="case_id", how="left").join(train_credit_bureau_b_1, on="case_id", how="left").join(train_deposit_1, on="case_id", how="left").join(train_person_1, on="case_id", how="left").join(train_debitcard_1, on="case_id", how="left")

# test_df = test_basetable.join(test_static_0, on="case_id", how="left").join(test_static_cb_0, on="case_id", how="left").join(test_applprev_1, on="case_id", how="left").join(test_other_1, on="case_id", how="left").join(test_tax_registry_a_1, on="case_id", how="left").join(test_tax_registry_b_1, on="case_id", how="left").join(test_tax_registry_c_1, on="case_id", how="left").join(test_credit_bureau_b_1, on="case_id", how="left").join(test_deposit_1, on="case_id", how="left").join(test_person_1, on="case_id", how="left").join(test_debitcard_1, on="case_id", how="left")

In [50]:
# Replace every remaining null in both frames with the sentinel -1.
train_df = train_df.fill_null(value=-1)

test_df = test_df.fill_null(value=-1)

In [51]:
train_df = train_df.drop(['case_id', 'date_decisionD', 'MONTH', 'WEEK_NUM', 'applicationcnt_361L', 'applicationscnt_1086L', 'applicationscnt_629L', 'applicationscnt_867L', 'avginstallast24m_3658937A', 'bankacctype_710L', 'cardtype_51L', 'clientscnt_1071L', 'clientscnt_136L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_946L', 'cntincpaycont9m_3716944L', 'credtype_322L', 'disbursementtype_67L', 'firstdatedue_489D', 'inittransactionamount_650A', 'inittransactioncode_186L', 'interestrategrace_34L', 'isbidproduct_1095L', 'isbidproductrequest_292L', 'isdebitcard_729L', 'lastapplicationdate_877D', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastdependentsnum_448L', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommodtypec_5251769M', 'maininc_215A', 'maxannuity_4075009A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'numactivecreds_622L', 'numactiverelcontr_750L', 'numinstpaidlastcontr_4325080L', 'numpmtchanneldd_318L', 'paytype1st_925L', 'paytype_783L', 'payvacationpostpone_4187118D', 'previouscontdistrict_112M', 'sellerplacescnt_216L', 'validfrom_1069D', 'education_1103M', 'education_88M', 'maritalst_893M', 'riskassesment_940T', 'approvaldate_319D_max', 'approvaldate_319D_mean', 'byoccupationinc_3656910L_max', 'byoccupationinc_3656910L_mean', 'childnum_21L_mean', 'creationdate_885D_max', 'creationdate_885D_min', 'creationdate_885D_mean', 'dateactivated_425D_max', 'dateactivated_425D_mean', 'dtlastpmt_581D_min', 'dtlastpmtallstes_3545839D_min', 'dtlastpmtallstes_3545839D_mean', 'employedfrom_700D_max', 'employedfrom_700D_mean', 'firstnonzeroinstldate_307D_max', 'firstnonzeroinstldate_307D_mean', 'isdebitcard_527L_mean', 'amtdebitincoming_4809443A', 'amtdebitincoming_4809443A_min', 'amtdebitincoming_4809443A_max', 'amtdebitincoming_4809443A_mean', 'amtdebitoutgoing_4809440A', 'amtdebitoutgoing_4809440A_min', 
'amtdebitoutgoing_4809440A_max', 'amtdebitoutgoing_4809440A_mean', 'amtdepositbalance_4809441A', 'amtdepositbalance_4809441A_min', 'amtdepositbalance_4809441A_max', 'amtdepositbalance_4809441A_mean', 'amtdepositincoming_4809444A', 'amtdepositincoming_4809444A_min', 'amtdepositincoming_4809444A_max', 'amtdepositincoming_4809444A_mean', 'amtdepositoutgoing_4809442A', 'amtdepositoutgoing_4809442A_min', 'amtdepositoutgoing_4809442A_max', 'amtdepositoutgoing_4809442A_mean', 'deductiondate_4917603D_max', 'deductiondate_4917603D_min', 'deductiondate_4917603D_mean', 'name_4917606M_max', 'name_4917606M_mean', 'employername_160M_max', 'employername_160M_mean', 'processingdate_168D_min', 'processingdate_168D_mean', 'interesteffectiverate_369L_max', 'birthdate_87D_max', 'birthdate_87D_min', 'birthdate_87D_mean', 'childnum_185L_max', 'childnum_185L_mean', 'contaddr_district_15M_max', 'contaddr_district_15M_mean', 'contaddr_matchlist_1032L_mean', 'contaddr_smempladdr_334L_mean', 'contaddr_zipcode_807M_max', 'contaddr_zipcode_807M_mean', 'education_927M_max', 'empl_employedtotal_800L_max', 'empl_industry_691L_max', 'empladdr_district_926M_max', 'empladdr_zipcode_114M_max', 'familystate_447L_max', 'familystate_447L_mean', 'gender_992L_max', 'gender_992L_mean', 'housingtype_772L_mean', 'incometype_1044T_max', 'incometype_1044T_mean', 'isreference_387L_max', 'isreference_387L_mean', 'language1_981M_max', 'language1_981M_mean', 'mainoccupationinc_384A', 'mainoccupationinc_384A_min', 'mainoccupationinc_384A_max', 'mainoccupationinc_384A_mean', 'maritalst_703L_max', 'maritalst_703L_mean', 'persontype_1072L_max', 'registaddr_district_1083M_max', 'registaddr_district_1083M_mean', 'registaddr_zipcode_184M_max', 'registaddr_zipcode_184M_mean', 'relationshiptoclient_415T_max', 'relationshiptoclient_642T_max', 'remitter_829L_max', 'role_993L_max', 'role_993L_mean', 'safeguarantyflag_411L_mean', 'type_25L_max', 'type_25L_mean', 'last180dayaveragebalance_704A_min', 
'last180dayaveragebalance_704A_max', 'last180dayaveragebalance_704A_mean', 'last180dayturnover_1134A_min', 'last180dayturnover_1134A_max', 'last180dayturnover_1134A_mean', 'last30dayturnover_651A_min', 'last30dayturnover_651A_max', 'last30dayturnover_651A_mean', 'annualeffectiverate_63L_max', 'classificationofcontr_13M_mean', 'contractst_545M_max', 'contractst_545M_mean', 'contractsum_5085717L_max', 'credlmt_230A', 'credlmt_230A_min', 'credlmt_230A_max', 'credlmt_230A_mean', 'credlmt_935A', 'credlmt_935A_min', 'credlmt_935A_max', 'credlmt_935A_mean', 'dateofcredend_289D_max', 'dateofcredend_289D_mean', 'dateofcredend_353D_min', 'dateofcredstart_181D_min', 'dateofrealrepmt_138D_min', 'debtoutstand_525A', 'debtoutstand_525A_min', 'debtoutstand_525A_max', 'debtoutstand_525A_mean', 'description_351M_max', 'dpdmaxdatemonth_442T_mean', 'dpdmaxdatemonth_89T_max', 'dpdmaxdatemonth_89T_mean', 'dpdmaxdateyear_596T_max', 'interestrate_508L_max', 'lastupdate_1112D_max', 'lastupdate_1112D_min', 'lastupdate_1112D_mean', 'lastupdate_388D_min', 'monthlyinstlamount_332A', 'monthlyinstlamount_332A_max', 'monthlyinstlamount_332A_mean', 'monthlyinstlamount_674A', 'monthlyinstlamount_674A_min', 'monthlyinstlamount_674A_max', 'monthlyinstlamount_674A_mean', 'nominalrate_281L_max', 'nominalrate_498L_max', 'numberofcontrsvalue_258L_mean', 'numberofinstls_320L_max', 'numberofoutstandinstls_520L_mean', 'numberofoutstandinstls_59L_max', 'numberofoverdueinstlmax_1039L_max', 'numberofoverdueinstlmaxdat_148D_min', 'numberofoverdueinstlmaxdat_148D_mean', 'numberofoverdueinstls_725L_max', 'numberofoverdueinstls_834L_mean', 'outstandingamount_354A', 'outstandingamount_354A_min', 'outstandingamount_354A_max', 'outstandingamount_354A_mean', 'outstandingamount_362A', 'outstandingamount_362A_min', 'outstandingamount_362A_max', 'outstandingamount_362A_mean', 'overdueamount_31A', 'overdueamount_31A_min', 'overdueamount_31A_max', 'overdueamount_31A_mean', 'overdueamount_659A_min', 
'overdueamountmaxdatemonth_284T_mean', 'overdueamountmaxdatemonth_365T_max', 'overdueamountmaxdatemonth_365T_mean', 'overdueamountmaxdateyear_2T_max', 'periodicityofpmts_837L_max', 'periodicityofpmts_837L_mean', 'prolongationcount_599L_max', 'purposeofcred_426M_max', 'purposeofcred_426M_mean', 'refreshdate_3813885D_max', 'refreshdate_3813885D_min', 'refreshdate_3813885D_mean', 'residualamount_488A', 'residualamount_488A_min', 'residualamount_488A_max', 'residualamount_488A_mean', 'residualamount_856A_max', 'residualamount_856A_mean', 'totaldebtoverduevalue_718A', 'totaldebtoverduevalue_718A_min', 'totaldebtoverduevalue_718A_max', 'totaldebtoverduevalue_718A_mean', 'totaloutstanddebtvalue_39A', 'totaloutstanddebtvalue_39A_min', 'totaloutstanddebtvalue_39A_max', 'totaloutstanddebtvalue_39A_mean', 'totaloutstanddebtvalue_668A', 'addres_district_368M_max', 'addres_district_368M_mean', 'addres_role_871L_max', 'addres_role_871L_mean', 'addres_zip_823M_max', 'addres_zip_823M_mean', 'conts_role_79M_max', 'conts_role_79M_mean', 'empls_economicalst_849M_max', 'empls_economicalst_849M_mean', 'empls_employedfrom_796D_max', 'empls_employedfrom_796D_min', 'empls_employedfrom_796D_mean', 'empls_employer_name_740M_max', 'empls_employer_name_740M_mean', 'relatedpersons_role_762T_max', 'relatedpersons_role_762T_mean'])
test_df = test_df.drop(['case_id', 'date_decisionD', 'MONTH', 'WEEK_NUM', 'applicationcnt_361L', 'applicationscnt_1086L', 'applicationscnt_629L', 'applicationscnt_867L', 'avginstallast24m_3658937A', 'bankacctype_710L', 'cardtype_51L', 'clientscnt_1071L', 'clientscnt_136L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_946L', 'cntincpaycont9m_3716944L', 'credtype_322L', 'disbursementtype_67L', 'firstdatedue_489D', 'inittransactionamount_650A', 'inittransactioncode_186L', 'interestrategrace_34L', 'isbidproduct_1095L', 'isbidproductrequest_292L', 'isdebitcard_729L', 'lastapplicationdate_877D', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastdependentsnum_448L', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommodtypec_5251769M', 'maininc_215A', 'maxannuity_4075009A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'numactivecreds_622L', 'numactiverelcontr_750L', 'numinstpaidlastcontr_4325080L', 'numpmtchanneldd_318L', 'paytype1st_925L', 'paytype_783L', 'payvacationpostpone_4187118D', 'previouscontdistrict_112M', 'sellerplacescnt_216L', 'validfrom_1069D', 'education_1103M', 'education_88M', 'maritalst_893M', 'riskassesment_940T', 'approvaldate_319D_max', 'approvaldate_319D_mean', 'byoccupationinc_3656910L_max', 'byoccupationinc_3656910L_mean', 'childnum_21L_mean', 'creationdate_885D_max', 'creationdate_885D_min', 'creationdate_885D_mean', 'dateactivated_425D_max', 'dateactivated_425D_mean', 'dtlastpmt_581D_min', 'dtlastpmtallstes_3545839D_min', 'dtlastpmtallstes_3545839D_mean', 'employedfrom_700D_max', 'employedfrom_700D_mean', 'firstnonzeroinstldate_307D_max', 'firstnonzeroinstldate_307D_mean', 'isdebitcard_527L_mean', 'amtdebitincoming_4809443A', 'amtdebitincoming_4809443A_min', 'amtdebitincoming_4809443A_max', 'amtdebitincoming_4809443A_mean', 'amtdebitoutgoing_4809440A', 'amtdebitoutgoing_4809440A_min', 
'amtdebitoutgoing_4809440A_max', 'amtdebitoutgoing_4809440A_mean', 'amtdepositbalance_4809441A', 'amtdepositbalance_4809441A_min', 'amtdepositbalance_4809441A_max', 'amtdepositbalance_4809441A_mean', 'amtdepositincoming_4809444A', 'amtdepositincoming_4809444A_min', 'amtdepositincoming_4809444A_max', 'amtdepositincoming_4809444A_mean', 'amtdepositoutgoing_4809442A', 'amtdepositoutgoing_4809442A_min', 'amtdepositoutgoing_4809442A_max', 'amtdepositoutgoing_4809442A_mean', 'deductiondate_4917603D_max', 'deductiondate_4917603D_min', 'deductiondate_4917603D_mean', 'name_4917606M_max', 'name_4917606M_mean', 'employername_160M_max', 'employername_160M_mean', 'processingdate_168D_min', 'processingdate_168D_mean', 'interesteffectiverate_369L_max', 'birthdate_87D_max', 'birthdate_87D_min', 'birthdate_87D_mean', 'childnum_185L_max', 'childnum_185L_mean', 'contaddr_district_15M_max', 'contaddr_district_15M_mean', 'contaddr_matchlist_1032L_mean', 'contaddr_smempladdr_334L_mean', 'contaddr_zipcode_807M_max', 'contaddr_zipcode_807M_mean', 'education_927M_max', 'empl_employedtotal_800L_max', 'empl_industry_691L_max', 'empladdr_district_926M_max', 'empladdr_zipcode_114M_max', 'familystate_447L_max', 'familystate_447L_mean', 'gender_992L_max', 'gender_992L_mean', 'housingtype_772L_mean', 'incometype_1044T_max', 'incometype_1044T_mean', 'isreference_387L_max', 'isreference_387L_mean', 'language1_981M_max', 'language1_981M_mean', 'mainoccupationinc_384A', 'mainoccupationinc_384A_min', 'mainoccupationinc_384A_max', 'mainoccupationinc_384A_mean', 'maritalst_703L_max', 'maritalst_703L_mean', 'persontype_1072L_max', 'registaddr_district_1083M_max', 'registaddr_district_1083M_mean', 'registaddr_zipcode_184M_max', 'registaddr_zipcode_184M_mean', 'relationshiptoclient_415T_max', 'relationshiptoclient_642T_max', 'remitter_829L_max', 'role_993L_max', 'role_993L_mean', 'safeguarantyflag_411L_mean', 'type_25L_max', 'type_25L_mean', 'last180dayaveragebalance_704A_min', 
'last180dayaveragebalance_704A_max', 'last180dayaveragebalance_704A_mean', 'last180dayturnover_1134A_min', 'last180dayturnover_1134A_max', 'last180dayturnover_1134A_mean', 'last30dayturnover_651A_min', 'last30dayturnover_651A_max', 'last30dayturnover_651A_mean', 'annualeffectiverate_63L_max', 'classificationofcontr_13M_mean', 'contractst_545M_max', 'contractst_545M_mean', 'contractsum_5085717L_max', 'credlmt_230A', 'credlmt_230A_min', 'credlmt_230A_max', 'credlmt_230A_mean', 'credlmt_935A', 'credlmt_935A_min', 'credlmt_935A_max', 'credlmt_935A_mean', 'dateofcredend_289D_max', 'dateofcredend_289D_mean', 'dateofcredend_353D_min', 'dateofcredstart_181D_min', 'dateofrealrepmt_138D_min', 'debtoutstand_525A', 'debtoutstand_525A_min', 'debtoutstand_525A_max', 'debtoutstand_525A_mean', 'description_351M_max', 'dpdmaxdatemonth_442T_mean', 'dpdmaxdatemonth_89T_max', 'dpdmaxdatemonth_89T_mean', 'dpdmaxdateyear_596T_max', 'interestrate_508L_max', 'lastupdate_1112D_max', 'lastupdate_1112D_min', 'lastupdate_1112D_mean', 'lastupdate_388D_min', 'monthlyinstlamount_332A', 'monthlyinstlamount_332A_max', 'monthlyinstlamount_332A_mean', 'monthlyinstlamount_674A', 'monthlyinstlamount_674A_min', 'monthlyinstlamount_674A_max', 'monthlyinstlamount_674A_mean', 'nominalrate_281L_max', 'nominalrate_498L_max', 'numberofcontrsvalue_258L_mean', 'numberofinstls_320L_max', 'numberofoutstandinstls_520L_mean', 'numberofoutstandinstls_59L_max', 'numberofoverdueinstlmax_1039L_max', 'numberofoverdueinstlmaxdat_148D_min', 'numberofoverdueinstlmaxdat_148D_mean', 'numberofoverdueinstls_725L_max', 'numberofoverdueinstls_834L_mean', 'outstandingamount_354A', 'outstandingamount_354A_min', 'outstandingamount_354A_max', 'outstandingamount_354A_mean', 'outstandingamount_362A', 'outstandingamount_362A_min', 'outstandingamount_362A_max', 'outstandingamount_362A_mean', 'overdueamount_31A', 'overdueamount_31A_min', 'overdueamount_31A_max', 'overdueamount_31A_mean', 'overdueamount_659A_min', 
'overdueamountmaxdatemonth_284T_mean', 'overdueamountmaxdatemonth_365T_max', 'overdueamountmaxdatemonth_365T_mean', 'overdueamountmaxdateyear_2T_max', 'periodicityofpmts_837L_max', 'periodicityofpmts_837L_mean', 'prolongationcount_599L_max', 'purposeofcred_426M_max', 'purposeofcred_426M_mean', 'refreshdate_3813885D_max', 'refreshdate_3813885D_min', 'refreshdate_3813885D_mean', 'residualamount_488A', 'residualamount_488A_min', 'residualamount_488A_max', 'residualamount_488A_mean', 'residualamount_856A_max', 'residualamount_856A_mean', 'totaldebtoverduevalue_718A', 'totaldebtoverduevalue_718A_min', 'totaldebtoverduevalue_718A_max', 'totaldebtoverduevalue_718A_mean', 'totaloutstanddebtvalue_39A', 'totaloutstanddebtvalue_39A_min', 'totaloutstanddebtvalue_39A_max', 'totaloutstanddebtvalue_39A_mean', 'totaloutstanddebtvalue_668A', 'addres_district_368M_max', 'addres_district_368M_mean', 'addres_role_871L_max', 'addres_role_871L_mean', 'addres_zip_823M_max', 'addres_zip_823M_mean', 'conts_role_79M_max', 'conts_role_79M_mean', 'empls_economicalst_849M_max', 'empls_economicalst_849M_mean', 'empls_employedfrom_796D_max', 'empls_employedfrom_796D_min', 'empls_employedfrom_796D_mean', 'empls_employer_name_740M_max', 'empls_employer_name_740M_mean', 'relatedpersons_role_762T_max', 'relatedpersons_role_762T_mean'])

In [52]:
# train_df = train_df.drop(['actualdpdtolerance_344P', 'annuity_780A', 'annuitynextmonth_57A', 'applicationscnt_464L', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'clientscnt3m_3712950L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1130L', 'clientscnt_533L', 'clientscnt_887L', 'downpmt_116A', 'homephncnt_628L', 'lastapprdate_640D', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxpmtlast3m_4525190A', 'numactivecredschannel_414L', 'numinstpaid_4499208L', 'numinstregularpaidest_4493210L', 'posfpd10lastmonth_333P', 'posfpd30lastmonth_3976960P', 'posfstqpd30lastmonth_3976962P', 'totinstallast1m_4525188A', 'twobodfilling_608L', 'assignmentdate_4527235D', 'maritalst_385M', 'pmtcount_4527229L', 'approvaldate_319D_min', 'credacc_maxhisbal_375A_min', 'credacc_maxhisbal_375A_max', 'credacc_maxhisbal_375A_mean', 'credacc_minhisbal_90A_min', 'credacc_minhisbal_90A_max', 'credacc_minhisbal_90A_mean', 'credacc_transactions_402L_max', 'credacc_transactions_402L_mean', 'credtype_587L_max', 'credtype_587L_mean', 'downpmt_134A_mean', 'dtlastpmt_581D_mean', 'dtlastpmtallstes_3545839D_max', 'employedfrom_700D_min', 'firstnonzeroinstldate_307D_min', 'inittransactioncode_279L_max', 'inittransactioncode_279L_mean', 'isdebitcard_527L_max', 'revolvingaccount_394A', 'revolvingaccount_394A_min', 'revolvingaccount_394A_max', 'revolvingaccount_394A_mean', 'amount_4917619A_min', 'amount_4917619A_max', 'amount_4917619A_mean', 'processingdate_168D_max', 'interesteffectiverate_369L_mean', 'amount_416A', 'amount_416A_min', 'amount_416A_max', 'amount_416A_mean', 'contractenddate_991D_max', 'contractenddate_991D_min', 'contractenddate_991D_mean', 'openingdate_313D_max', 'openingdate_313D_min', 'openingdate_313D_mean', 'education_927M_mean', 'empl_employedfrom_271D_max', 'empl_employedfrom_271D_min', 'empl_employedfrom_271D_mean', 'housetype_905L_max', 'personindex_1023L_max', 'persontype_1072L_mean', 'persontype_792L_max', 'relationshiptoclient_415T_mean', 
'relationshiptoclient_642T_mean', 'remitter_829L_mean', 'role_1084L_max', 'role_1084L_mean', 'last180dayaveragebalance_704A', 'last180dayturnover_1134A', 'last30dayturnover_651A', 'openingdate_857D_max', 'openingdate_857D_min', 'openingdate_857D_mean', 'annualeffectiverate_199L_max', 'classificationofcontr_13M_max', 'classificationofcontr_400M_mean', 'dateofcredend_289D_min', 'description_351M_mean', 'dpdmaxdateyear_896T_max', 'instlamount_768A', 'instlamount_768A_max', 'instlamount_768A_mean', 'instlamount_852A', 'instlamount_852A_max', 'instlamount_852A_mean', 'interestrate_508L_mean', 'nominalrate_281L_mean', 'nominalrate_498L_mean', 'numberofinstls_229L_max', 'numberofoutstandinstls_59L_mean', 'overdueamountmax2date_1002D_min', 'overdueamountmax2date_1002D_mean', 'overdueamountmaxdateyear_994T_max', 'periodicityofpmts_1102L_mean', 'residualamount_856A', 'totalamount_6A_min', 'totalamount_996A', 'totalamount_996A_min', 'totalamount_996A_max', 'totaloutstanddebtvalue_668A_min', 'totaloutstanddebtvalue_668A_max', 'totaloutstanddebtvalue_668A_mean'])
# test_df = test_df.drop(['actualdpdtolerance_344P', 'annuity_780A', 'annuitynextmonth_57A', 'applicationscnt_464L', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'clientscnt3m_3712950L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1130L', 'clientscnt_533L', 'clientscnt_887L', 'downpmt_116A', 'homephncnt_628L', 'lastapprdate_640D', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxpmtlast3m_4525190A', 'numactivecredschannel_414L', 'numinstpaid_4499208L', 'numinstregularpaidest_4493210L', 'posfpd10lastmonth_333P', 'posfpd30lastmonth_3976960P', 'posfstqpd30lastmonth_3976962P', 'totinstallast1m_4525188A', 'twobodfilling_608L', 'assignmentdate_4527235D', 'maritalst_385M', 'pmtcount_4527229L', 'approvaldate_319D_min', 'credacc_maxhisbal_375A_min', 'credacc_maxhisbal_375A_max', 'credacc_maxhisbal_375A_mean', 'credacc_minhisbal_90A_min', 'credacc_minhisbal_90A_max', 'credacc_minhisbal_90A_mean', 'credacc_transactions_402L_max', 'credacc_transactions_402L_mean', 'credtype_587L_max', 'credtype_587L_mean', 'downpmt_134A_mean', 'dtlastpmt_581D_mean', 'dtlastpmtallstes_3545839D_max', 'employedfrom_700D_min', 'firstnonzeroinstldate_307D_min', 'inittransactioncode_279L_max', 'inittransactioncode_279L_mean', 'isdebitcard_527L_max', 'revolvingaccount_394A', 'revolvingaccount_394A_min', 'revolvingaccount_394A_max', 'revolvingaccount_394A_mean', 'amount_4917619A_min', 'amount_4917619A_max', 'amount_4917619A_mean', 'processingdate_168D_max', 'interesteffectiverate_369L_mean', 'amount_416A', 'amount_416A_min', 'amount_416A_max', 'amount_416A_mean', 'contractenddate_991D_max', 'contractenddate_991D_min', 'contractenddate_991D_mean', 'openingdate_313D_max', 'openingdate_313D_min', 'openingdate_313D_mean', 'education_927M_mean', 'empl_employedfrom_271D_max', 'empl_employedfrom_271D_min', 'empl_employedfrom_271D_mean', 'housetype_905L_max', 'personindex_1023L_max', 'persontype_1072L_mean', 'persontype_792L_max', 'relationshiptoclient_415T_mean', 
'relationshiptoclient_642T_mean', 'remitter_829L_mean', 'role_1084L_max', 'role_1084L_mean', 'last180dayaveragebalance_704A', 'last180dayturnover_1134A', 'last30dayturnover_651A', 'openingdate_857D_max', 'openingdate_857D_min', 'openingdate_857D_mean', 'annualeffectiverate_199L_max', 'classificationofcontr_13M_max', 'classificationofcontr_400M_mean', 'dateofcredend_289D_min', 'description_351M_mean', 'dpdmaxdateyear_896T_max', 'instlamount_768A', 'instlamount_768A_max', 'instlamount_768A_mean', 'instlamount_852A', 'instlamount_852A_max', 'instlamount_852A_mean', 'interestrate_508L_mean', 'nominalrate_281L_mean', 'nominalrate_498L_mean', 'numberofinstls_229L_max', 'numberofoutstandinstls_59L_mean', 'overdueamountmax2date_1002D_min', 'overdueamountmax2date_1002D_mean', 'overdueamountmaxdateyear_994T_max', 'periodicityofpmts_1102L_mean', 'residualamount_856A', 'totalamount_6A_min', 'totalamount_996A', 'totalamount_996A_min', 'totalamount_996A_max', 'totaloutstanddebtvalue_668A_min', 'totaloutstanddebtvalue_668A_max', 'totaloutstanddebtvalue_668A_mean'])

In [53]:
# Run a full garbage-collection pass; the call's return value is shown as the cell output.
gc.collect()

0

In [54]:
from sklearn.utils import resample

# Balance the classes by undersampling: split the training frame by label,
# then keep only as many target == 0 rows as there are target == 1 rows.
majority_class = train_df.filter(pl.col('target') == 0)
minority_class = train_df.filter(pl.col('target') == 1)

majority_downsampled = resample(
                                majority_class,
                                replace=False,                  # draw without replacement
                                n_samples=len(minority_class),  # match the target == 1 row count
                                random_state=42
                               )

train_d_df = pl.concat([majority_downsampled, minority_class])

# Shuffle the rows so the two classes are interleaved. `shuffle=True` is
# required here: with the default shuffle=False polars does not guarantee a
# random row order for the sample, so the concatenated frame could stay as
# "all target == 0 rows, then all target == 1 rows".
train_d_df = train_d_df.sample(fraction=1, shuffle=True, seed=42)

In [55]:
# The per-class frames are no longer needed once train_d_df has been built;
# drop both names so their memory can be reclaimed.
del majority_class, minority_class

In [56]:
# Run a full garbage-collection pass after deleting the per-class frames.
gc.collect()

0

In [57]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# No hold-out split is made here: the whole balanced frame is used for
# fitting. The label is the 'target' column; every other column is a feature.
X_train = train_d_df.drop('target')
y_train = train_d_df['target']
X_test = test_df

# Binary-logistic gradient-boosted classifier evaluated with log loss; all
# other hyper-parameters are left at the library defaults.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
)
model = xgb.XGBClassifier(**xgb_params)

# Fit on the full feature set (the fitted estimator is the cell's output).
model.fit(X_train, y_train)

In [58]:
# Number of highest-ranked features to keep; change this value as needed.
top_n = 50

# Pair each training column with the importance the fitted model assigned to it.
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
)

# Most important features first.
sorted_features = feature_importance_df.sort_values(by='Importance', ascending=False)

# Names of the top_n most important features, in ranked order.
top_features = sorted_features['Feature'].iloc[:top_n].tolist()

In [59]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Refit on a reduced design matrix: same label column as before, but both the
# training and the test frame are narrowed to the columns in top_features.
X_train = train_d_df.drop('target').select(top_features)
X_test = test_df.select(top_features)
y_train = train_d_df['target']

# Same classifier configuration as the full-feature fit: binary-logistic
# objective, log-loss evaluation metric, library defaults otherwise.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
)
model = xgb.XGBClassifier(**xgb_params)

# Fit on the selected features (the fitted estimator is the cell's output).
model.fit(X_train, y_train)

In [60]:
# Probability estimates from the fitted XGBoost model for the test rows;
# column index 1 of this array is what the averaging cell consumes.
y_pred1 = model.predict_proba(X_test)

In [61]:
from sklearn.ensemble import RandomForestClassifier

# Second model for the ensemble: a 300-tree random forest fitted on the
# current X_train / y_train.
# random_state pins the forest's internal randomness so that re-running the
# notebook reproduces the same predictions; n_jobs=-1 builds the trees on all
# available cores and does not affect the result.
clf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)

# Fit the forest (the fitted estimator is the cell's output).
clf.fit(X_train, y_train)

In [62]:
# Probability estimates from the fitted random forest for the same test rows;
# column index 1 of this array is what the averaging cell consumes.
y_pred2 = clf.predict_proba(X_test)

In [63]:
# Blend the two models: element-wise mean of column index 1 of the XGBoost
# probabilities (y_pred1) and of the random-forest probabilities (y_pred2).
# Done as one vectorised expression so each column is sliced once rather than
# once per row; the result is a 1-D array with one score per test row.
y_pred = (y_pred1[:, 1] + y_pred2[:, 1]) / 2

In [64]:
# label_probabilities = y_pred[:, 1]

In [65]:
# majority_downsampled = resample(
#                                 majority_class,
#                                 replace=False,
#                                 n_samples=len(minority_class),
#                                 random_state=13
#                                )

# train_d_df = pl.concat([majority_downsampled, minority_class])

# train_d_df = train_d_df.sample(fraction=1, seed=42)

# model = xgb.XGBClassifier(
#     objective='binary:logistic',  # for binary classification
#     eval_metric='logloss'       # Evaluation metric
# )

# # Training the classifier
# model.fit(X_train, y_train)

# y_pred = model.predict_proba(X_test)
# label_probabilities2 = y_pred[:, 1]

In [66]:
# majority_downsampled = resample(
#                                 majority_class,
#                                 replace=False,
#                                 n_samples=len(minority_class),
#                                 random_state=0
#                                )

# train_d_df = pl.concat([majority_downsampled, minority_class])

# train_d_df = train_d_df.sample(fraction=1, seed=42)

# model = xgb.XGBClassifier(
#     objective='binary:logistic',  # for binary classification
#     eval_metric='logloss'       # Evaluation metric
# )

# # Training the classifier
# model.fit(X_train, y_train)

# y_pred = model.predict_proba(X_test)
# label_probabilities3 = y_pred[:, 1]

In [67]:
# majority_downsampled = resample(
#                                 majority_class,
#                                 replace=False,
#                                 n_samples=len(minority_class),
#                                 random_state=89
#                                )

# train_d_df = pl.concat([majority_downsampled, minority_class])

# train_d_df = train_d_df.sample(fraction=1, seed=42)

# model = xgb.XGBClassifier(
#     objective='binary:logistic',  # for binary classification
#     eval_metric='logloss'       # Evaluation metric
# )

# # Training the classifier
# model.fit(X_train, y_train)

# y_pred = model.predict_proba(X_test)
# label_probabilities4 = y_pred[:, 1]

In [68]:
# majority_downsampled = resample(
#                                 majority_class,
#                                 replace=False,
#                                 n_samples=len(minority_class),
#                                 random_state=25
#                                )

# train_d_df = pl.concat([majority_downsampled, minority_class])

# train_d_df = train_d_df.sample(fraction=1, seed=42)

# model = xgb.XGBClassifier(
#     objective='binary:logistic',  # for binary classification
#     eval_metric='logloss'       # Evaluation metric
# )

# # Training the classifier
# model.fit(X_train, y_train)

# y_pred = model.predict_proba(X_test)
# label_probabilities5 = y_pred[:, 1]

In [69]:
# label_probabilities = []
# for i in range(len(label_probabilities1)):
#     if label_probabilities1[i] > 0.4 and label_probabilities1[i]<0.9:
#         label_probabilities.append(label_probabilities1[i]+0.1)
#     else:
#         label_probabilities.append(label_probabilities1[i])


In [70]:
# label_probabilities = []
# for i in range(len(label_probabilities1)):
#     label_probabilities.append(label_probabilities1[i]+label_probabilities2[i]+label_probabilities3[i]+label_probabilities4[i]+label_probabilities5[i])

In [71]:
# y_pred = model.predict(X_test)

In [72]:
# Start from the competition's sample submission, index it by case_id, and
# set its `score` column to the blended predictions.
# NOTE(review): the scores are assigned by position — presumably the rows of
# test_df are in the same order as sample_submission.csv; confirm.
df = (
    pd.read_csv(dataPath + "sample_submission.csv")
    .set_index("case_id")
    .assign(score=y_pred)
)

In [73]:
# Write the submission frame (including its index) to submission.csv in the working directory.
df.to_csv("./submission.csv")