In [1]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
import os
import zipfile
import time
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the CSV files: the parent of the current working directory.
# This line may need to be adjusted depending on where the csv files are
# stored on your local env.
data_dir = os.path.join(os.getcwd(), '..')


def _read_table(filename):
    """Read one CSV file located in ``data_dir`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(data_dir, filename))


application = _read_table("application_train.csv")
bureau = _read_table("bureau.csv")
bureau_balance = _read_table("bureau_balance.csv")
previous_application = _read_table("previous_application.csv")
POS_CASH_balance = _read_table("POS_CASH_balance.csv")
installments_payments = _read_table("installments_payments.csv")
credit_card_balance = _read_table("credit_card_balance.csv")

# `app_train` holds the same table as `application`. Copy the frame that is
# already in memory instead of parsing application_train.csv a second time.
app_train = application.copy()

# Separate the label column from the features, then hold out 20% of the rows
# as a test split (fixed random_state so the split is repeatable).
y = application["TARGET"]
X = application.drop("TARGET", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Column-selection transformer: picks the requested columns out of a
# DataFrame so that a pipeline branch can work on numerical or categorical
# columns only.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts a fixed set of DataFrame columns.

    Parameters
    ----------
    attribute_names
        Column labels (e.g. a list of names) to select from the input frame.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned unchanged."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` through the frame's ``.values``."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values
# Numeric features we wish to consider.
num_attribs = [
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'DAYS_EMPLOYED',
    'DAYS_BIRTH',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
]

# Numeric branch: select the columns, fill gaps with the column mean,
# then standardise.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler()),
])

# Categorical features we wish to consider.
cat_attribs = [
    'CODE_GENDER',
    'FLAG_OWN_REALTY',
    'FLAG_OWN_CAR',
    'NAME_CONTRACT_TYPE',
    'NAME_EDUCATION_TYPE',
    'OCCUPATION_TYPE',
    'NAME_INCOME_TYPE',
]

# Categorical branch: select the columns, replace gaps with the literal
# category 'missing', then one-hot encode.
# handle_unknown="ignore" makes the encoder ignore values from the
# validation/test data that do NOT occur in the training set.
# Alternatives that were tried and left disabled: 'most_frequent' imputation,
# and OneHotEncoder(sparse=False, handle_unknown="ignore").
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attribs)),
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown="ignore")),
])

# Both branches run on the same input; their outputs are concatenated
# numeric-first.
data_prep_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [5]:
# Train five classifiers, each behind its own copy of the preprocessing steps.
# Optional installs if the boosting libraries are missing:
#   %pip install xgboost lightgbm catboost
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import catboost

# Positive-class weight used by XGBoost and LightGBM because of class imbalance.
# NOTE(review): the LightGBM training log of this notebook reports 19876
# positives and 226132 negatives (ratio ~11.38), so 11.47 is not derived from
# the current training split -- confirm where the value comes from.
_POS_CLASS_WEIGHT = 11.47


def _fit_model_pipeline(step_name, estimator):
    """Fit a preprocessing + estimator pipeline on ``X_train`` / ``y_train``.

    Parameters
    ----------
    step_name : str
        Name given to the estimator step inside the pipeline.
    estimator
        Unfitted classifier placed after the "preparation" step.

    Returns
    -------
    Pipeline
        The fitted pipeline (``Pipeline.fit`` returns the pipeline itself).

    Notes
    -----
    Every pipeline receives its own ``clone`` of ``data_prep_pipeline``.
    ``Pipeline.fit`` fits its steps in place, so sharing one preprocessing
    object between all models would mean that refitting any single model
    silently changes the preprocessing used by every other model.
    ``data_prep_pipeline`` itself stays an unfitted template.
    """
    np.random.seed(42)
    pipeline = Pipeline([
        ("preparation", clone(data_prep_pipeline)),
        (step_name, estimator),
    ])
    return pipeline.fit(X_train, y_train)


# Logistic Regression Model Pipeline
# n_jobs is not passed: it is ignored when solver='liblinear'.
model_logistic = _fit_model_pipeline(
    "logistic",
    LogisticRegression(penalty='l2', class_weight='balanced', random_state=42,
                       C=0.1, solver='liblinear'),
)

# Random Forest Model Pipeline
model_rf = _fit_model_pipeline(
    "random_forest",
    RandomForestClassifier(class_weight='balanced', random_state=42),
)

# XGBoost Model Pipeline
# NOTE(review): `use_label_encoder` is presumably unnecessary on recent
# xgboost releases -- confirm against the installed version before removing.
model_xgb = _fit_model_pipeline(
    "xgb",
    XGBClassifier(random_state=42,
                  use_label_encoder=False,
                  eval_metric='logloss',
                  scale_pos_weight=_POS_CLASS_WEIGHT),
)

# LightGBM Model Pipeline
model_lgbm = _fit_model_pipeline(
    "lgbm",
    LGBMClassifier(scale_pos_weight=_POS_CLASS_WEIGHT, random_state=42),
)

# CatBoost Model Pipeline
model_cat = _fit_model_pipeline(
    "catboost",
    catboost.CatBoostClassifier(verbose=0, auto_class_weights='Balanced'),
)

# This name used to be rebound for every model and ended the cell pointing at
# the CatBoost pipeline; keep that final binding for any later cell using it.
full_pipeline_with_predictor = model_cat

[LightGBM] [Info] Number of positive: 19876, number of negative: 226132
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009952 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1801
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080794 -> initscore=-2.431606
[LightGBM] [Info] Start training from score -2.431606


In [6]:
def aggregate_dataset(df, group_col, prefix, agg_dict):
    """Aggregate ``df`` by ``group_col`` and flatten the result's column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to aggregate.
    group_col
        Column label (or labels) passed to ``DataFrame.groupby``.
    prefix : str
        Text placed in front of every aggregated column name.
    agg_dict : dict
        Maps a column name to the aggregation(s) to compute for it. A value
        may be a list of aggregations or a single aggregation such as
        ``'sum'``.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the group key(s) restored as regular columns
        and the aggregates named ``{prefix}_{column}_{function}`` (spaces
        replaced by underscores).
    """
    # Wrap single specs in a list. Otherwise a bare string such as 'sum'
    # would be split into characters when the column names are built, and
    # the number of names would not match the number of result columns.
    normalized_spec = {
        col: [funcs] if isinstance(funcs, str) or callable(funcs) else list(funcs)
        for col, funcs in agg_dict.items()
    }
    agg_df = df.groupby(group_col).agg(normalized_spec)
    # With list-valued specs the result has two-level (column, function)
    # labels; build the flat names from those labels so that names and data
    # cannot get out of step.
    agg_df.columns = [
        f"{prefix}_{col}_{func}".replace(" ", "_")
        for col, func in agg_df.columns
    ]
    return agg_df.reset_index()

In [None]:
def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise and turn +/-inf results into NaN."""
    return (numerator / denominator).replace([np.inf, -np.inf], np.nan)


# --- Aggregate Bureau Balance first (one row per SK_ID_BUREAU) ---
if 'bureau_balance' in globals():
    bb_agg = aggregate_dataset(
        bureau_balance,
        group_col='SK_ID_BUREAU',
        prefix='BB',
        agg_dict={
            'MONTHS_BALANCE': ['min', 'max', 'mean'],
            'STATUS': ['nunique']
        }
    )
    # Remove BB_* columns left over from an earlier run of this cell, so that
    # re-running it does not produce duplicated *_x / *_y columns.
    _stale_bb_cols = [
        col for col in bb_agg.columns
        if col != 'SK_ID_BUREAU' and col in bureau.columns
    ]
    bureau = bureau.drop(columns=_stale_bb_cols).merge(
        bb_agg, on='SK_ID_BUREAU', how='left'
    )
    # NOTE(review): none of the BB_* columns is listed in the bureau agg_dict
    # below, so they do not reach bureau_agg -- confirm whether they should.

# --- Bureau aggregate by SK_ID_CURR ---
bureau_agg = aggregate_dataset(
    bureau,
    group_col='SK_ID_CURR',
    prefix='BUREAU',
    agg_dict={
        'DAYS_CREDIT': ['min', 'max', 'mean', 'std'],
        'AMT_CREDIT_SUM': ['sum', 'mean', 'max'],
        'AMT_CREDIT_SUM_DEBT': ['sum', 'mean'],
        'AMT_CREDIT_SUM_OVERDUE': ['sum', 'max'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'CREDIT_DAY_OVERDUE': ['max'],
        'SK_ID_BUREAU': ['count'],  # number of credit lines
    }
)

# Ratio feature inside bureau: total debt over total credit
bureau_agg['BUREAU_DEBT_RATIO'] = _safe_ratio(
    bureau_agg['BUREAU_AMT_CREDIT_SUM_DEBT_sum'],
    bureau_agg['BUREAU_AMT_CREDIT_SUM_sum'],
)

# --- Previous applications aggregate by SK_ID_CURR ---
# NOTE(review): DAYS_FIRST_DRAWING presumably uses a large placeholder value
# for "not available" in this dataset; if so it distorts min/max/mean --
# TODO confirm against the data dictionary.
prev_agg = aggregate_dataset(
    previous_application,
    group_col='SK_ID_CURR',
    prefix='PREV',
    agg_dict={
        'AMT_APPLICATION': ['mean', 'min', 'max'],
        'AMT_CREDIT': ['mean', 'min', 'max'],
        'AMT_DOWN_PAYMENT': ['mean', 'min', 'max'],
        'DAYS_FIRST_DRAWING': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['mean'],
        'NAME_CONTRACT_STATUS': ['nunique'],
        'SK_ID_PREV': ['count']  # number of previous apps
    }
)

# Simple derived features: mean applied-for amount vs. mean credit amount
prev_agg['PREV_APPLICATION_CREDIT_DIFF_mean'] = (
    prev_agg['PREV_AMT_APPLICATION_mean'] - prev_agg['PREV_AMT_CREDIT_mean']
)
# Guarded like the other ratios: a zero mean credit would otherwise put
# +/-inf into the feature, which a later fillna does not remove.
prev_agg['PREV_APPLICATION_CREDIT_RATIO_mean'] = _safe_ratio(
    prev_agg['PREV_AMT_APPLICATION_mean'],
    prev_agg['PREV_AMT_CREDIT_mean'],
)

# --- POS / cash balance aggregate by SK_ID_CURR ---
pos_agg = aggregate_dataset(
    POS_CASH_balance,
    group_col='SK_ID_CURR',
    prefix='POS',
    agg_dict={
        'MONTHS_BALANCE': ['min', 'max', 'mean'],
        'SK_ID_PREV': ['nunique'],
        'CNT_INSTALMENT': ['sum', 'mean'],
        'CNT_INSTALMENT_FUTURE': ['sum', 'mean']
    }
)

# --- Installment payments aggregate by SK_ID_CURR ---
install_agg = aggregate_dataset(
    installments_payments,
    group_col='SK_ID_CURR',
    prefix='INST',
    agg_dict={
        'AMT_INSTALMENT': ['sum', 'mean', 'max'],
        'AMT_PAYMENT': ['sum', 'mean', 'max'],
        'DAYS_ENTRY_PAYMENT': ['min', 'max', 'mean']
    }
)

# Repayment ratio feature: total paid over total installment amount
install_agg['INST_PAYMENT_RATIO'] = _safe_ratio(
    install_agg['INST_AMT_PAYMENT_sum'],
    install_agg['INST_AMT_INSTALMENT_sum'],
)

# --- Credit card balance aggregate by SK_ID_CURR ---
ccb_agg = aggregate_dataset(
    credit_card_balance,
    group_col='SK_ID_CURR',
    prefix='CC',
    agg_dict={
        'AMT_BALANCE': ['mean', 'max'],
        'AMT_CREDIT_LIMIT_ACTUAL': ['mean', 'max'],
        'AMT_DRAWINGS_ATM_CURRENT': ['sum', 'mean'],
        'AMT_PAYMENT_TOTAL_CURRENT': ['sum', 'mean'],
        'MONTHS_BALANCE': ['min', 'max', 'mean']
    }
)

# Balance ratio: mean balance over mean credit limit
ccb_agg['CC_BALANCE_TO_CREDIT_LIMIT'] = _safe_ratio(
    ccb_agg['CC_AMT_BALANCE_mean'],
    ccb_agg['CC_AMT_CREDIT_LIMIT_ACTUAL_mean'],
)


In [9]:
# Left-join every per-applicant aggregate table onto the application table,
# in this order, keeping every application row.
engineered_cols = []
for _agg_frame in (bureau_agg, prev_agg, pos_agg, install_agg, ccb_agg):
    _feature_cols = [col for col in _agg_frame.columns if col != 'SK_ID_CURR']
    # Drop copies left by an earlier run of this cell so that re-running it
    # does not create duplicated *_x / *_y columns.
    _stale_cols = [col for col in _feature_cols if col in application.columns]
    application = application.drop(columns=_stale_cols).merge(
        _agg_frame, on='SK_ID_CURR', how='left'
    )
    engineered_cols.extend(_feature_cols)

# Fill NaNs (optional) -- only in the columns merged in above. Filling the
# whole frame would also write the integer 0 into the string categorical
# columns and bypass the imputers defined in the preprocessing pipelines.
application[engineered_cols] = application[engineered_cols].fillna(0)

# Quick sanity check
print("Final application shape:", application.shape)
application.head()


Final application shape: (307511, 222)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,CC_AMT_CREDIT_LIMIT_ACTUAL_mean,CC_AMT_CREDIT_LIMIT_ACTUAL_max,CC_AMT_DRAWINGS_ATM_CURRENT_sum,CC_AMT_DRAWINGS_ATM_CURRENT_mean,CC_AMT_PAYMENT_TOTAL_CURRENT_sum,CC_AMT_PAYMENT_TOTAL_CURRENT_mean,CC_MONTHS_BALANCE_min,CC_MONTHS_BALANCE_max,CC_MONTHS_BALANCE_mean,CC_BALANCE_TO_CREDIT_LIMIT
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,270000.0,270000.0,0.0,0.0,0.0,0.0,-6.0,-1.0,-3.5,0.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Report the number of columns of each aggregate table, one line per table.
print(
    *(
        f"{label} {len(frame.columns)}"
        for label, frame in (
            ("Bureau agg cols:", bureau_agg),
            ("Prev agg cols:", prev_agg),
            ("POS agg cols:", pos_agg),
            ("Installment agg cols:", install_agg),
            ("CC agg cols:", ccb_agg),
        )
    ),
    sep="\n",
)

# Write the engineered application table to the working directory,
# without the row index.
application.to_csv("application_train_engineered.csv", index=False)

Bureau agg cols: 16
Prev agg cols: 21
POS agg cols: 9
Installment agg cols: 11
CC agg cols: 13
