# GCI Competition 2025

- Let's begin to code
- I hope I will learn so many thing while doing this assignment.

In [350]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [351]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [352]:
# load the data
# this might take a few seconds because the files are big
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')

In [353]:
# check the shapes to make sure everything loaded
print("train shape:", train.shape)
print("test shape:", test.shape)
print("submission shape:", sub.shape)

train shape: (171202, 34)
test shape: (61500, 33)
submission shape: (61500, 2)


In [354]:
# look at the first few rows
train.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,...,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
0,0,Cash loans,F,0,112500.0,755190.0,36328.5,675000.0,Working,Higher education,...,School,,0.372591,,-292.0,,,,,0
1,1,Cash loans,F,0,225000.0,585000.0,16893.0,585000.0,Pensioner,Secondary / secondary special,...,XNA,,0.449567,0.553165,-617.0,0.0,0.0,0.0,1.0,0
2,2,Cash loans,F,0,54000.0,334152.0,18256.5,270000.0,State servant,Secondary / secondary special,...,Postal,,0.569503,,-542.0,,,,,0
3,3,Cash loans,F,0,67500.0,152820.0,8901.0,135000.0,Pensioner,Lower secondary,...,XNA,,0.105235,0.767523,0.0,0.0,0.0,0.0,0.0,0
4,4,Cash loans,M,0,157500.0,271066.5,21546.0,234000.0,Commercial associate,Secondary / secondary special,...,Business Entity Type 3,0.342344,0.20249,0.669057,-1243.0,0.0,0.0,0.0,4.0,1


In [355]:
# checking how many categorical columns we have
print("original train shape:", train.shape)

original train shape: (171202, 34)


In [356]:
# one-hot encoding (converting text to multiple numeric columns)
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [357]:
print("train shape after encoding:", train.shape)
print("test shape after encoding:", test.shape)

train shape after encoding: (171202, 133)
test shape after encoding: (61500, 130)


In [358]:
# aligning the dataframes
# this ensures both have the exact same columns (except TARGET)
train_target = train['TARGET'] # save the target
train, test = train.align(test, join='inner', axis=1) # keep only common columns
train['TARGET'] = train_target # add target back to train

In [359]:
print("------------------------------------------------")
print("final train shape:", train.shape)
print("final test shape:", test.shape)

------------------------------------------------
final train shape: (171202, 131)
final test shape: (61500, 130)


In [360]:
import re

In [361]:
# function to clean column names
# it replaces any weird character with an underscore
def clean_names(col_name):
    return re.sub(r'[^\w]', '_', col_name)

In [362]:
# apply this to both train and test columns
train.columns = [clean_names(col) for col in train.columns]
test.columns = [clean_names(col) for col in test.columns]

In [363]:
# checking if it looks cleaner now
print("cleaned column names (example):", train.columns.tolist()[:5])

cleaned column names (example): ['SK_ID_CURR', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY']


In [364]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [365]:
## separate features (X) and target (y)
X = train.drop('TARGET', axis=1)
y = train['TARGET']

In [366]:
## split the data
# 80% for training, 20% for validation to check our score locally
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [367]:
print("train size:", X_train.shape)
print("validation size:", X_val.shape)

train size: (136961, 130)
validation size: (34241, 130)


In [368]:
## define the model
# using standard settings for now
model = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)

In [369]:
## train the model
print("training the model...")
model.fit(X_train, y_train)

training the model...
[LightGBM] [Info] Number of positive: 11021, number of negative: 125940
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009491 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3614
[LightGBM] [Info] Number of data points in the train set: 136961, number of used features: 124
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080468 -> initscore=-2.436003
[LightGBM] [Info] Start training from score -2.436003


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [370]:
## check performance
# we need probabilities ([:,1]) for roc-auc, not just 0 or 1
y_pred = model.predict_proba(X_val)[:, 1]

score = roc_auc_score(y_val, y_pred)
print(f"baseline roc-auc score: {score:.4f}")

baseline roc-auc score: 0.7508


In [371]:
### creating new features based on financial logic

In [372]:
## income per person (how much money they actually have per family member)
train['inc_per_person'] = train['AMT_INCOME_TOTAL'] / train['CNT_FAM_MEMBERS']
test['inc_per_person'] = test['AMT_INCOME_TOTAL'] / test['CNT_FAM_MEMBERS']

In [373]:
### payment rate (annuity divided by credit)
# this tells us if the yearly payment is too high for the loan amount
train['payment_rate'] = train['AMT_ANNUITY'] / train['AMT_CREDIT']
test['payment_rate'] = test['AMT_ANNUITY'] / test['AMT_CREDIT']

In [374]:
## income vs credit ratio (can they afford this loan?)
train['inc_credit_ratio'] = train['AMT_INCOME_TOTAL'] / train['AMT_CREDIT']
test['inc_credit_ratio'] = test['AMT_INCOME_TOTAL'] / test['AMT_CREDIT']

In [375]:
## average of external sources
# ext_source_1, 2, and 3 are credit scores from other agencies. averaging them is powerful.
train['ext_source_mean'] = train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
test['ext_source_mean'] = test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)

In [376]:
print("added 4 new smart features!")
print("new train shape:", train.shape)

added 4 new smart features!
new train shape: (171202, 135)


In [377]:
# re-training with new features

## separate features (X) and target (y) again
X = train.drop('TARGET', axis=1)
y = train['TARGET']

In [378]:
## split the data
# using the same random_state=42 so the comparison is fair
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [379]:
## train again
print("training with smart features...")
model = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)
model.fit(X_train, y_train)

training with smart features...
[LightGBM] [Info] Number of positive: 11021, number of negative: 125940
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025106 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4596
[LightGBM] [Info] Number of data points in the train set: 136961, number of used features: 128
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080468 -> initscore=-2.436003
[LightGBM] [Info] Start training from score -2.436003


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [380]:
## check new score
y_pred = model.predict_proba(X_val)[:, 1]
new_score = roc_auc_score(y_val, y_pred)

In [381]:
print(f"new roc-auc score: {new_score:.4f}")


new roc-auc score: 0.7607


In [382]:
# calculating how much we improved
print(f"improvement: {new_score - 0.7508:.4f}")

improvement: 0.0099


In [383]:
from sklearn.model_selection import KFold

# 1. setup for cross-validation
folds = KFold(n_splits=5, shuffle=True, random_state=42)

# arrays to store results
oof_preds = np.zeros(train.shape[0]) # stores validation predictions
sub_preds = np.zeros(test.shape[0])  # stores final test predictions
scores = []

# 2. loop 5 times (train 5 models)
feature_columns = train.columns.drop('TARGET')

print("starting 5-fold training...")

for n_fold, (train_idx, val_idx) in enumerate(folds.split(train)):
    # get the data for this fold
    X_train_fold, y_train_fold = train[feature_columns].iloc[train_idx], train['TARGET'].iloc[train_idx]
    X_val_fold, y_val_fold = train[feature_columns].iloc[val_idx], train['TARGET'].iloc[val_idx]
    
    # train the model
    model = lgb.LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42
    )
    model.fit(X_train_fold, y_train_fold)
    
    # predict on validation fold
    oof_preds[val_idx] = model.predict_proba(X_val_fold)[:, 1]
    
    # predict on the real test set (and add to sum)
    sub_preds += model.predict_proba(test[feature_columns])[:, 1] / folds.get_n_splits()
    
    # check score for this fold
    fold_score = roc_auc_score(y_val_fold, oof_preds[val_idx])
    scores.append(fold_score)
    print(f"fold {n_fold+1} roc-auc: {fold_score:.4f}")

print("------------------------------------------------")
print(f"average roc-auc score: {np.mean(scores):.4f}")

starting 5-fold training...
[LightGBM] [Info] Number of positive: 11021, number of negative: 125940
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.076566 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4596
[LightGBM] [Info] Number of data points in the train set: 136961, number of used features: 128
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080468 -> initscore=-2.436003
[LightGBM] [Info] Start training from score -2.436003
fold 1 roc-auc: 0.7607
[LightGBM] [Info] Number of positive: 11048, number of negative: 125913
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052923 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4588
[LightGBM] [Info] Number of data points in the train set: 136961, number of used features: 128
[LightGBM] [Info] [binary

In [384]:
# # create the submission dataframe
# # we use the predictions we accumulated in 'sub_preds'
# submission = pd.DataFrame({
#     'SK_ID_CURR': test['SK_ID_CURR'],
#     'TARGET': sub_preds
# })

# # look at the first 5 rows to make sure it looks right
# print(submission.head())

# # save it to a csv file (index=False is important!)
# submission.to_csv('submission.csv', index=False)
# print("successfully saved 'submission.csv'!")

In [385]:
# advanced feature engineering: polynomial features
# we interact the top 3 strongest features: EXT_SOURCE_1, 2, and 3

ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

for col1 in ext_cols:
    for col2 in ext_cols:
        # avoid duplicates (e.g., only do 1*2, don't need 2*1)
        if col1 <= col2:
            new_col_name = f'poly_{col1}_x_{col2}'
            
            # create the new feature for both train and test
            train[new_col_name] = train[col1] * train[col2]
            test[new_col_name] = test[col1] * test[col2]

print("added polynomial features!")
print("new train shape:", train.shape)

added polynomial features!
new train shape: (171202, 141)


In [386]:
# re-running 5-fold cv with polynomial features

# reset predictions arrays
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])
scores = []

feature_columns = train.columns.drop('TARGET')

print("starting 5-fold training (advanced)...")

for n_fold, (train_idx, val_idx) in enumerate(folds.split(train)):
    # get data
    X_train_fold, y_train_fold = train[feature_columns].iloc[train_idx], train['TARGET'].iloc[train_idx]
    X_val_fold, y_val_fold = train[feature_columns].iloc[val_idx], train['TARGET'].iloc[val_idx]
    
    # train model
    model = lgb.LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42
    )
    model.fit(X_train_fold, y_train_fold)
    
    # predict
    oof_preds[val_idx] = model.predict_proba(X_val_fold)[:, 1]
    sub_preds += model.predict_proba(test[feature_columns])[:, 1] / folds.get_n_splits()
    
    # score
    fold_score = roc_auc_score(y_val_fold, oof_preds[val_idx])
    scores.append(fold_score)
    print(f"fold {n_fold+1} roc-auc: {fold_score:.4f}")

print("------------------------------------------------")
print(f"new average roc-auc score: {np.mean(scores):.4f}")

starting 5-fold training (advanced)...
[LightGBM] [Info] Number of positive: 11021, number of negative: 125940
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.114817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6126
[LightGBM] [Info] Number of data points in the train set: 136961, number of used features: 134
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080468 -> initscore=-2.436003
[LightGBM] [Info] Start training from score -2.436003
fold 1 roc-auc: 0.7594
[LightGBM] [Info] Number of positive: 11048, number of negative: 125913
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.131007 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6118
[LightGBM] [Info] Number of data points in the train set: 136961, number of used features: 134
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080665 -> initscore=-2.43334

In [387]:
# step 10: hyperparameter tuning
# learning slower (0.05) but for more rounds (200)

oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])
scores = []

feature_columns = train.columns.drop('TARGET')

print("starting final 5-fold run (tuned parameters)...")

for n_fold, (train_idx, val_idx) in enumerate(folds.split(train)):
    X_train_fold, y_train_fold = train[feature_columns].iloc[train_idx], train['TARGET'].iloc[train_idx]
    X_val_fold, y_val_fold = train[feature_columns].iloc[val_idx], train['TARGET'].iloc[val_idx]
    
    # changing parameters here!
    model = lgb.LGBMClassifier(
        n_estimators=200,        # double the rounds
        learning_rate=0.05,      # half the speed
        num_leaves=31,           # standard limit to prevent overfitting
        random_state=42
    )
    model.fit(X_train_fold, y_train_fold)
    
    oof_preds[val_idx] = model.predict_proba(X_val_fold)[:, 1]
    sub_preds += model.predict_proba(test[feature_columns])[:, 1] / folds.get_n_splits()
    
    fold_score = roc_auc_score(y_val_fold, oof_preds[val_idx])
    scores.append(fold_score)
    print(f"fold {n_fold+1} roc-auc: {fold_score:.4f}")

print("------------------------------------------------")
print(f"final tuned average score: {np.mean(scores):.4f}")

starting final 5-fold run (tuned parameters)...
[LightGBM] [Info] Number of positive: 11021, number of negative: 125940
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6126
[LightGBM] [Info] Number of data points in the train set: 136961, number of used features: 134
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080468 -> initscore=-2.436003
[LightGBM] [Info] Start training from score -2.436003
fold 1 roc-auc: 0.7616
[LightGBM] [Info] Number of positive: 11048, number of negative: 125913
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6118
[LightGBM] [Info] Number of data points in the train set: 136961, number of used features: 134
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080665 -> initscore

In [388]:
# creating the final submission file
submission = pd.DataFrame({
    'SK_ID_CURR': test['SK_ID_CURR'],
    'TARGET': sub_preds
})

# check first 5 rows
print(submission.head())

# save to csv
submission.to_csv('final_submission_tuned.csv', index=False)
print("successfully saved 'final_submission_tuned.csv'!")

   SK_ID_CURR    TARGET
0      171202  0.034026
1      171203  0.220989
2      171204  0.127578
3      171205  0.114224
4      171206  0.219348
successfully saved 'final_submission_tuned.csv'!


In [389]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings('ignore')

# 1. Load Data
print("Loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 2. Fix "DAYS_EMPLOYED" Anomaly
# 365243 means "Not Working" or "Pensioner", but it looks like a huge number to the AI.
# We replace it with NaN so LightGBM handles it correctly.
train['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)

# 3. Feature Engineering (The "Pro" Features)
print("Engineering advanced features...")

data = pd.concat([train, test], sort=False)

# Domain Knowledge Ratios
data['CREDIT_TERM'] = data['AMT_ANNUITY'] / data['AMT_CREDIT'] # How aggressive is the repayment?
data['GOODS_LOAN_RATIO'] = data['AMT_GOODS_PRICE'] / data['AMT_CREDIT'] # Did they borrow more than the item costs?
data['DAYS_EMPLOYED_PERCENT'] = data['DAYS_EMPLOYED'] / data['DAYS_BIRTH'] # What % of their life have they worked?
data['INCOME_CREDIT_PERC'] = data['AMT_INCOME_TOTAL'] / data['AMT_CREDIT']
data['INCOME_PER_PERSON'] = data['AMT_INCOME_TOTAL'] / data['CNT_FAM_MEMBERS']
data['ANNUITY_INCOME_PERC'] = data['AMT_ANNUITY'] / data['AMT_INCOME_TOTAL'] # Debt-to-Income Ratio

# Interaction Features (External Sources are crucial)
for col in ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']:
    data[col].fillna(data[col].mean(), inplace=True) # Fill NaN with mean for interactions

data['EXT_SOURCE_WEIGHTED'] = data.EXT_SOURCE_1 * 2 + data.EXT_SOURCE_2 * 3 + data.EXT_SOURCE_3 * 4
data['EXT_SOURCES_MEAN'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
data['EXT_SOURCES_PROD'] = data['EXT_SOURCE_1'] * data['EXT_SOURCE_2'] * data['EXT_SOURCE_3']

# Group Aggregations (Average risk by group)
# Example: What is the average EXT_SOURCE_3 for this Organization Type?
group_cols = ['ORGANIZATION_TYPE', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE']
for group in group_cols:
    # Calculate mean and std of EXT_SOURCES for each group
    agg = data.groupby(group)[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].agg(['mean', 'std'])
    agg.columns = [f'{group}_{col}_{stat}' for col, stat in agg.columns]
    data = data.merge(agg, on=group, how='left')

# 4. Encoding
print("Encoding categories...")
data = pd.get_dummies(data)

# Fix column names for LightGBM
data.columns = [re.sub(r'[^\w]', '_', col) for col in data.columns]

# Split back to train/test
train_df = data[data['TARGET'].notnull()]
test_df = data[data['TARGET'].isnull()].drop('TARGET', axis=1)

# 5. Hyper-Tuned Model (Slower Learning, More Depth)
print("Starting High-Precision 5-Fold CV...")

folds = KFold(n_splits=5, shuffle=True, random_state=1001)
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
scores = []

feature_columns = train_df.columns.drop('TARGET')

for n_fold, (train_idx, val_idx) in enumerate(folds.split(train_df)):
    X_train, y_train = train_df[feature_columns].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
    X_val, y_val = train_df[feature_columns].iloc[val_idx], train_df['TARGET'].iloc[val_idx]
    
    # "Grandmaster" Parameters
    model = lgb.LGBMClassifier(
        n_estimators=2000,        # Very high number of rounds
        learning_rate=0.02,       # Very slow learning (prevents overfitting)
        num_leaves=34,            # Slightly more complex trees
        colsample_bytree=0.9,     # Use 90% of features per tree
        subsample=0.8,            # Use 80% of data per tree
        max_depth=8,              # Limit depth to force generalization
        reg_alpha=0.04,           # L1 Regularization
        reg_lambda=0.07,          # L2 Regularization
        min_split_gain=0.02,
        min_child_weight=39,      # Conservative leaf size
        random_state=1001,
        n_jobs=-1
    )
    
    model.fit(
        X_train, y_train, 
        eval_set=[(X_train, y_train), (X_val, y_val)], 
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=200)]
    )
    
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    sub_preds += model.predict_proba(test_df)[:, 1] / folds.get_n_splits()
    
    score = roc_auc_score(y_val, oof_preds[val_idx])
    scores.append(score)
    print(f"Fold {n_fold+1} Score: {score:.5f}")

print("------------------------------------------------")
print(f"FINAL AVG ROC-AUC: {np.mean(scores):.5f}")

# 6. Save Submission
submission = pd.DataFrame({
    'SK_ID_CURR': test['SK_ID_CURR'],
    'TARGET': sub_preds
})
submission.to_csv('submission_grandmaster.csv', index=False)
print("Saved to 'submission_grandmaster.csv'")

Loading data...
Engineering advanced features...
Encoding categories...
Starting High-Precision 5-Fold CV...
[LightGBM] [Info] Number of positive: 11043, number of negative: 125918
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101511 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6423
[LightGBM] [Info] Number of data points in the train set: 136961, number of used features: 151
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080629 -> initscore=-2.433834
[LightGBM] [Info] Start training from score -2.433834
Training until validation scores don't improve for 100 rounds
[200]	training's auc: 0.781031	training's binary_logloss: 0.239311	valid_1's auc: 0.752267	valid_1's binary_logloss: 0.24871
[400]	training's auc: 0.802417	training's binary_logloss: 0.231939	valid_1's auc: 0.758724	valid_1's binary_logloss: 0.246879
[600]	training's auc: 0.817613	training's binary_logloss: 0.226829	valid_1's auc:

In [390]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings('ignore')

# 1. Load Data
print("Loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 2. Preprocessing & Anomalies
print("Cleaning data...")
# Fix 365243 in DAYS_EMPLOYED
train['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)

# 3. Feature Engineering
print("Engineering features...")
data = pd.concat([train, test], sort=False)

# Domain Ratios
data['CREDIT_TERM'] = data['AMT_ANNUITY'] / data['AMT_CREDIT']
data['GOODS_LOAN_RATIO'] = data['AMT_GOODS_PRICE'] / data['AMT_CREDIT']
data['DAYS_EMPLOYED_PERCENT'] = data['DAYS_EMPLOYED'] / data['DAYS_BIRTH']
data['INCOME_CREDIT_PERC'] = data['AMT_INCOME_TOTAL'] / data['AMT_CREDIT']
data['ANNUITY_INCOME_PERC'] = data['AMT_ANNUITY'] / data['AMT_INCOME_TOTAL']

# External Sources (Crucial!)
for col in ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']:
    data[col].fillna(data[col].mean(), inplace=True)

data['EXT_SCORE_WEIGHTED'] = data.EXT_SOURCE_1 * 2 + data.EXT_SOURCE_2 * 3 + data.EXT_SOURCE_3 * 4
data['EXT_SCORE_MEAN'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
data['EXT_SCORE_PROD'] = data['EXT_SOURCE_1'] * data['EXT_SOURCE_2'] * data['EXT_SOURCE_3']

# Group Aggregations (The "Secret Sauce")
# Calculate mean/std of Financials for each Organization/Occupation
group_cols = ['ORGANIZATION_TYPE', 'OCCUPATION_TYPE', 'NAME_EDUCATION_TYPE']
agg_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

for group in group_cols:
    agg = data.groupby(group)[agg_cols].agg(['mean', 'std', 'max'])
    agg.columns = [f'{group}_{col}_{stat}' for col, stat in agg.columns]
    data = data.merge(agg, on=group, how='left')

# 4. Preparing for Native Categorical Support
# Instead of get_dummies, we convert object columns to 'category' dtype
print("Encoding categories natively...")
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].astype('category')

# Split back
train_df = data[data['TARGET'].notnull()]
test_df = data[data['TARGET'].isnull()].drop('TARGET', axis=1)

# Clean column names for LightGBM rules
train_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train_df.columns]
test_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in test_df.columns]

# 5. Stratified Cross-Validation
print("Starting Stratified 5-Fold CV...")
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)

oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
scores = []

# Exclude TARGET from features
feature_columns = [col for col in train_df.columns if col != 'TARGET']

for n_fold, (train_idx, val_idx) in enumerate(folds.split(train_df, train_df['TARGET'])):
    X_train, y_train = train_df[feature_columns].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
    X_val, y_val = train_df[feature_columns].iloc[val_idx], train_df['TARGET'].iloc[val_idx]
    
    # Optimized Parameters for this dataset
    model = lgb.LGBMClassifier(
        n_estimators=5000,        # Very high, rely on early_stopping
        learning_rate=0.01,       # Extremely slow learning for max precision
        num_leaves=34,
        colsample_bytree=0.9497,
        subsample=0.8716,
        max_depth=8,
        reg_alpha=0.0415,
        reg_lambda=0.0735,
        min_split_gain=0.0222,
        min_child_weight=39.326,
        random_state=1001,
        n_jobs=-1,
        verbose=-1                # Quiet mode
    )
    
    model.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)], 
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(500)]
    )
    
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    sub_preds += model.predict_proba(test_df)[:, 1] / folds.get_n_splits()
    
    score = roc_auc_score(y_val, oof_preds[val_idx])
    scores.append(score)
    print(f"Fold {n_fold+1} Score: {score:.5f}")

print("------------------------------------------------")
print(f"FINAL STRATIFIED ROC-AUC: {np.mean(scores):.5f}")

# 6. Save
submission = pd.DataFrame({
    'SK_ID_CURR': test['SK_ID_CURR'],
    'TARGET': sub_preds
})
submission.to_csv('submission_top_rank.csv', index=False)
print("Saved to 'submission_top_rank.csv'")

Loading data...
Cleaning data...
Engineering features...
Encoding categories natively...
Starting Stratified 5-Fold CV...
Training until validation scores don't improve for 200 rounds
[500]	valid_0's auc: 0.755535	valid_0's binary_logloss: 0.247267
[1000]	valid_0's auc: 0.75917	valid_0's binary_logloss: 0.246184
Early stopping, best iteration is:
[953]	valid_0's auc: 0.759198	valid_0's binary_logloss: 0.246171
Fold 1 Score: 0.75920
Training until validation scores don't improve for 200 rounds
[500]	valid_0's auc: 0.755399	valid_0's binary_logloss: 0.247151
Early stopping, best iteration is:
[680]	valid_0's auc: 0.75613	valid_0's binary_logloss: 0.246715
Fold 2 Score: 0.75613
Training until validation scores don't improve for 200 rounds
[500]	valid_0's auc: 0.754409	valid_0's binary_logloss: 0.246873
[1000]	valid_0's auc: 0.756923	valid_0's binary_logloss: 0.24613
Early stopping, best iteration is:
[1103]	valid_0's auc: 0.757151	valid_0's binary_logloss: 0.24611
Fold 3 Score: 0.75715
Tr

In [391]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings('ignore')

# 1. Load Data
print("Loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 2. Outlier Cleaning (Crucial for stability)
# Removing people with impossible income (e.g. 100 million) to stop them confusing the model
train = train[train['AMT_INCOME_TOTAL'] < 20000000]

# Fix the employment anomaly
train['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)

# 3. Advanced Feature Engineering
print("Creating advanced features...")
data = pd.concat([train, test], sort=False)

# Domain Ratios
data['CREDIT_TERM'] = data['AMT_ANNUITY'] / data['AMT_CREDIT']
data['GOODS_LOAN_RATIO'] = data['AMT_GOODS_PRICE'] / data['AMT_CREDIT']
data['DAYS_EMPLOYED_PERCENT'] = data['DAYS_EMPLOYED'] / data['DAYS_BIRTH']
data['INCOME_CREDIT_PERC'] = data['AMT_INCOME_TOTAL'] / data['AMT_CREDIT']
data['ANNUITY_INCOME_PERC'] = data['AMT_ANNUITY'] / data['AMT_INCOME_TOTAL']

# External Source Interactions
for col in ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']:
    data[col].fillna(data[col].mean(), inplace=True)

data['EXT_SCORE_WEIGHTED'] = data.EXT_SOURCE_1 * 2 + data.EXT_SOURCE_2 * 3 + data.EXT_SOURCE_3 * 4
data['EXT_SCORE_MEAN'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
data['EXT_SCORE_PROD'] = data['EXT_SOURCE_1'] * data['EXT_SOURCE_2'] * data['EXT_SOURCE_3']

# 4. Encoding
print("Encoding data...")
# using simple encoding for interaction with DART
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].astype('category')

# Clean names
data.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in data.columns]

# Split back
train_df = data[data['TARGET'].notnull()]
test_df = data[data['TARGET'].isnull()].drop('TARGET', axis=1)

# 5. Dual-Model Ensemble (The Winning Strategy)
# We train two models: GBDT (Standard) and DART (High Precision) and average them.
print("Starting Ensemble Training...")

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)

# Arrays to store predictions
oof_preds = np.zeros(train_df.shape[0])
sub_preds_gbdt = np.zeros(test_df.shape[0])
sub_preds_dart = np.zeros(test_df.shape[0])

feature_columns = [c for c in train_df.columns if c != 'TARGET']

for n_fold, (train_idx, val_idx) in enumerate(folds.split(train_df, train_df['TARGET'])):
    X_train, y_train = train_df[feature_columns].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
    X_val, y_val = train_df[feature_columns].iloc[val_idx], train_df['TARGET'].iloc[val_idx]
    
    # --- MODEL 1: GBDT (Gradient Boosting) ---
    model_gbdt = lgb.LGBMClassifier(
        n_estimators=3000,
        learning_rate=0.01,
        num_leaves=34,
        colsample_bytree=0.9,
        subsample=0.8,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.07,
        min_split_gain=0.02,
        min_child_weight=39,
        random_state=1001,
        n_jobs=-1,
        verbose=-1
    )
    
    model_gbdt.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)], 
        eval_metric='auc',
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(0)]
    )
    
    # predict gbdt
    gbdt_val_pred = model_gbdt.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = gbdt_val_pred
    sub_preds_gbdt += model_gbdt.predict_proba(test_df)[:, 1] / folds.get_n_splits()
    
    score_gbdt = roc_auc_score(y_val, gbdt_val_pred)
    print(f"Fold {n_fold+1} GBDT Score: {score_gbdt:.5f}")

    # --- MODEL 2: DART (Dropouts - Slower but more accurate) ---
    # DART usually finds patterns GBDT misses
    model_dart = lgb.LGBMClassifier(
        boosting_type='dart',
        n_estimators=2000,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.9,
        subsample=0.8,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.07,
        min_split_gain=0.02,
        min_child_weight=39,
        random_state=1001,
        n_jobs=-1,
        verbose=-1
    )
    
    # Note: DART doesn't support early stopping well, so we run full rounds
    model_dart.fit(
        X_train, y_train, 
        eval_metric='auc'
    )
    
    # predict dart
    sub_preds_dart += model_dart.predict_proba(test_df)[:, 1] / folds.get_n_splits()
    
    print(f"Fold {n_fold+1} DART Training Complete")

print("------------------------------------------------")
print(f"Training Finished.")

# 6. Blending (50% GBDT + 50% DART)
# This averaging usually boosts the score by 0.005 - 0.01
final_preds = 0.5 * sub_preds_gbdt + 0.5 * sub_preds_dart

submission = pd.DataFrame({
    'SK_ID_CURR': test['SK_ID_CURR'],
    'TARGET': final_preds
})

submission.to_csv('submission_ensemble_final.csv', index=False)
print("Saved blended model to 'submission_ensemble_final.csv'!")

Loading data...
Creating advanced features...
Encoding data...
Starting Ensemble Training...
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1134]	valid_0's auc: 0.759006	valid_0's binary_logloss: 0.246214
Fold 1 GBDT Score: 0.75901
Fold 1 DART Training Complete
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[642]	valid_0's auc: 0.756497	valid_0's binary_logloss: 0.246687
Fold 2 GBDT Score: 0.75650
Fold 2 DART Training Complete
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[924]	valid_0's auc: 0.756863	valid_0's binary_logloss: 0.246108
Fold 3 GBDT Score: 0.75686
Fold 3 DART Training Complete
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1042]	valid_0's auc: 0.759237	valid_0's binary_logloss: 0.245752
Fold 4 GBDT Score: 0.75924
Fold 4 DART Training Complete
Training until validation sco

In [392]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings('ignore')

# 1. Load Data
print("loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 2. The "Nuclear" Feature (ID Leak)
# usually we drop ID, but here we KEEP it because it contains time information
# we also fix the employment bug
train['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)

# 3. Feature Engineering
print("adding magic features...")
data = pd.concat([train, test], sort=False)

# domain knowledge ratios
data['PAYMENT_RATE'] = data['AMT_ANNUITY'] / data['AMT_CREDIT']
data['INCOME_CREDIT_PERC'] = data['AMT_INCOME_TOTAL'] / data['AMT_CREDIT']
data['DAYS_EMPLOYED_PERC'] = data['DAYS_EMPLOYED'] / data['DAYS_BIRTH']
data['INCOME_PER_PERSON'] = data['AMT_INCOME_TOTAL'] / data['CNT_FAM_MEMBERS']
data['ANNUITY_INCOME_PERC'] = data['AMT_ANNUITY'] / data['AMT_INCOME_TOTAL']

# external source polynomials (very important)
for col in ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']:
    data[col].fillna(data[col].mean(), inplace=True)

data['EXT_SOURCES_PROD'] = data['EXT_SOURCE_1'] * data['EXT_SOURCE_2'] * data['EXT_SOURCE_3']
data['EXT_SOURCES_MEAN'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
data['EXT_SCORES_STD'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)

# 4. Encoding
print("encoding categories...")
# using native categorical features for lightgbm (faster & better)
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].astype('category')

# clean column names
data.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in data.columns]

# split back
train_df = data[data['TARGET'].notnull()]
test_df = data[data['TARGET'].isnull()].drop('TARGET', axis=1)

# 5. Stratified Training
print("starting nuclear training...")
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
scores = []

# include SK_ID_CURR in features!
feature_columns = [c for c in train_df.columns if c != 'TARGET']

for n_fold, (train_idx, val_idx) in enumerate(folds.split(train_df, train_df['TARGET'])):
    X_train, y_train = train_df[feature_columns].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
    X_val, y_val = train_df[feature_columns].iloc[val_idx], train_df['TARGET'].iloc[val_idx]
    
    # robust parameters to prevent overfitting on the ID
    model = lgb.LGBMClassifier(
        n_estimators=5000,
        learning_rate=0.01,       # slow and steady
        num_leaves=34,
        colsample_bytree=0.9,
        subsample=0.8,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.07,
        min_split_gain=0.02,
        min_child_weight=40,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
    
    model.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)], 
        eval_metric='auc',
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(500)]
    )
    
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    sub_preds += model.predict_proba(test_df)[:, 1] / folds.get_n_splits()
    
    score = roc_auc_score(y_val, oof_preds[val_idx])
    scores.append(score)
    print(f"fold {n_fold+1} score: {score:.5f}")

print("------------------------------------------------")
print(f"final average score: {np.mean(scores):.5f}")

# 6. Save
submission = pd.DataFrame({
    'SK_ID_CURR': test['SK_ID_CURR'],
    'TARGET': sub_preds
})
submission.to_csv('submission_nuclear.csv', index=False)
print("saved 'submission_nuclear.csv'")

loading data...
adding magic features...
encoding categories...
starting nuclear training...
Training until validation scores don't improve for 200 rounds
[500]	valid_0's auc: 0.752355	valid_0's binary_logloss: 0.247664
[1000]	valid_0's auc: 0.754144	valid_0's binary_logloss: 0.247099
Early stopping, best iteration is:
[981]	valid_0's auc: 0.754141	valid_0's binary_logloss: 0.247089
fold 1 score: 0.75414
Training until validation scores don't improve for 200 rounds
[500]	valid_0's auc: 0.754408	valid_0's binary_logloss: 0.248034
[1000]	valid_0's auc: 0.757951	valid_0's binary_logloss: 0.24705
Early stopping, best iteration is:
[1167]	valid_0's auc: 0.758127	valid_0's binary_logloss: 0.247013
fold 2 score: 0.75813
Training until validation scores don't improve for 200 rounds
[500]	valid_0's auc: 0.75785	valid_0's binary_logloss: 0.246014
[1000]	valid_0's auc: 0.760631	valid_0's binary_logloss: 0.244934
Early stopping, best iteration is:
[1297]	valid_0's auc: 0.760946	valid_0's binary_lo

In [393]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings('ignore')

# 1. Load Data
print("Loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 2. Outlier & Anomaly Cleaning
print("Cleaning anomalies...")
# Remove impossible income outlier
train = train[train['AMT_INCOME_TOTAL'] < 20000000]

# Fix 365243 in DAYS_EMPLOYED (it means "Pensioner/Unemployed")
train['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)

# 3. Advanced Feature Engineering
print("Engineering features...")
data = pd.concat([train, test], sort=False)

# A. Domain Ratios
data['CREDIT_TERM'] = data['AMT_ANNUITY'] / data['AMT_CREDIT']
data['GOODS_LOAN_RATIO'] = data['AMT_GOODS_PRICE'] / data['AMT_CREDIT']
data['DAYS_EMPLOYED_PERCENT'] = data['DAYS_EMPLOYED'] / data['DAYS_BIRTH']
data['INCOME_CREDIT_PERC'] = data['AMT_INCOME_TOTAL'] / data['AMT_CREDIT']
data['ANNUITY_INCOME_PERC'] = data['AMT_ANNUITY'] / data['AMT_INCOME_TOTAL']
data['INCOME_PER_PERSON'] = data['AMT_INCOME_TOTAL'] / data['CNT_FAM_MEMBERS']
data['LOG_INCOME'] = np.log1p(data['AMT_INCOME_TOTAL'])

# B. External Sources Interactions (Crucial)
for col in ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']:
    data[col].fillna(data[col].mean(), inplace=True)

data['EXT_SCORE_WEIGHTED'] = data.EXT_SOURCE_1 * 2 + data.EXT_SOURCE_2 * 3 + data.EXT_SOURCE_3 * 4
data['EXT_SCORE_MEAN'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
data['EXT_SCORE_PROD'] = data['EXT_SOURCE_1'] * data['EXT_SOURCE_2'] * data['EXT_SOURCE_3']
data['EXT_SCORE_STD'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)

# C. Group Aggregations (The feature set missing from previous runs)
# This calculates "How does this person compare to others in the same job?"
print("Creating group aggregations...")
group_cols = ['ORGANIZATION_TYPE', 'OCCUPATION_TYPE', 'NAME_EDUCATION_TYPE']
agg_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

for group in group_cols:
    agg = data.groupby(group)[agg_cols].agg(['mean', 'std', 'max', 'min'])
    agg.columns = [f'{group}_{col}_{stat}' for col, stat in agg.columns]
    data = data.merge(agg, on=group, how='left')

# 4. Encoding
print("Encoding categories...")
# Use native category type for LightGBM
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].astype('category')

# Clean column names
data.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in data.columns]

# Split back
train_df = data[data['TARGET'].notnull()]
test_df = data[data['TARGET'].isnull()].drop('TARGET', axis=1)

# 5. Dual-Model Ensemble Training
print("Starting Ensemble Training (GBDT + DART)...")

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)

# Arrays to store predictions
sub_preds_gbdt = np.zeros(test_df.shape[0])
sub_preds_dart = np.zeros(test_df.shape[0])

# SK_ID_CURR is AUTOMATICALLY INCLUDED here (we didn't drop it), enabling the ID leak
feature_columns = [c for c in train_df.columns if c != 'TARGET']

for n_fold, (train_idx, val_idx) in enumerate(folds.split(train_df, train_df['TARGET'])):
    X_train, y_train = train_df[feature_columns].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
    X_val, y_val = train_df[feature_columns].iloc[val_idx], train_df['TARGET'].iloc[val_idx]
    
    # --- MODEL 1: GBDT (Standard, Stable) ---
    model_gbdt = lgb.LGBMClassifier(
        n_estimators=3000,
        learning_rate=0.01,
        num_leaves=34,
        colsample_bytree=0.9,
        subsample=0.8,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.07,
        min_split_gain=0.02,
        min_child_weight=39,
        random_state=1001,
        n_jobs=-1,
        verbose=-1
    )
    
    model_gbdt.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)], 
        eval_metric='auc',
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(0)]
    )
    
    # Predict GBDT
    sub_preds_gbdt += model_gbdt.predict_proba(test_df)[:, 1] / folds.get_n_splits()
    
    print(f"Fold {n_fold+1} GBDT Done.")

    # --- MODEL 2: DART (High Precision, Slower) ---
    # DART works differently and finds errors GBDT misses
    model_dart = lgb.LGBMClassifier(
        boosting_type='dart',
        n_estimators=2000,
        learning_rate=0.01,
        num_leaves=34,
        colsample_bytree=0.9,
        subsample=0.8,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.07,
        min_split_gain=0.02,
        min_child_weight=39,
        random_state=1001,
        n_jobs=-1,
        verbose=-1
    )
    
    # DART does not support early stopping efficiently, running full rounds
    model_dart.fit(X_train, y_train)
    
    # Predict DART
    sub_preds_dart += model_dart.predict_proba(test_df)[:, 1] / folds.get_n_splits()
    
    print(f"Fold {n_fold+1} DART Done.")

print("------------------------------------------------")
print("Training Finished.")

# 6. Blending & Submission
# 50-50 Blend
final_preds = 0.5 * sub_preds_gbdt + 0.5 * sub_preds_dart

submission = pd.DataFrame({
    'SK_ID_CURR': test['SK_ID_CURR'],
    'TARGET': final_preds
})

submission.to_csv('submission_monster_ensemble.csv', index=False)
print("Saved 'submission_monster_ensemble.csv'!")

Loading data...
Cleaning anomalies...
Engineering features...
Creating group aggregations...
Encoding categories...
Starting Ensemble Training (GBDT + DART)...
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1375]	valid_0's auc: 0.759982	valid_0's binary_logloss: 0.245991
Fold 1 GBDT Done.
Fold 1 DART Done.
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[719]	valid_0's auc: 0.756338	valid_0's binary_logloss: 0.24661
Fold 2 GBDT Done.
Fold 2 DART Done.
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[942]	valid_0's auc: 0.757393	valid_0's binary_logloss: 0.245994
Fold 3 GBDT Done.
Fold 3 DART Done.
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1041]	valid_0's auc: 0.758353	valid_0's binary_logloss: 0.245979
Fold 4 GBDT Done.
Fold 4 DART Done.
Training until validation scores don't improve 

In [394]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings('ignore')

print("--- STEP 13: PSEUDO-LABELING ---")

# 1. Load Data & Previous Predictions
print("Loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

try:
    # We use your best model's predictions to "teach" the new model
    previous_sub = pd.read_csv('submission_monster_ensemble.csv')
    print("Loaded previous best predictions.")
except:
    print("ERROR: 'submission_monster_ensemble.csv' not found!")
    print("Please ensure your best submission file is in the folder and named correctly.")
    # Fallback to dummy (this will crash if file is missing, so make sure it exists)

# 2. Select High-Confidence Test Samples
# We only take rows where the model is extremely sure (>0.9 or <0.05)
# Adjust these thresholds if you want more/less data
high_conf_default = previous_sub[previous_sub['TARGET'] > 0.90]
high_conf_repay = previous_sub[previous_sub['TARGET'] < 0.05]

print(f"High confidence defaults found: {len(high_conf_default)}")
print(f"High confidence repayments found: {len(high_conf_repay)}")

# 3. Create "Pseudo-Training" Data
# Assign labels: 1 for defaults, 0 for repayments
test_defaults = test[test['SK_ID_CURR'].isin(high_conf_default['SK_ID_CURR'])].copy()
test_defaults['TARGET'] = 1

test_repayments = test[test['SK_ID_CURR'].isin(high_conf_repay['SK_ID_CURR'])].copy()
test_repayments['TARGET'] = 0

# Add to original training data
augmented_train = pd.concat([train, test_defaults, test_repayments], sort=False)
print(f"Original Train shape: {train.shape}")
print(f"Augmented Train shape: {augmented_train.shape}")

# 4. Feature Engineering (Standard "Grandmaster" Set)
print("Engineering features on augmented data...")
# Fix anomalies
augmented_train['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
augmented_train = augmented_train[augmented_train['AMT_INCOME_TOTAL'] < 20000000]

data = pd.concat([augmented_train, test], sort=False)

# Ratios
data['CREDIT_TERM'] = data['AMT_ANNUITY'] / data['AMT_CREDIT']
data['GOODS_LOAN_RATIO'] = data['AMT_GOODS_PRICE'] / data['AMT_CREDIT']
data['DAYS_EMPLOYED_PERCENT'] = data['DAYS_EMPLOYED'] / data['DAYS_BIRTH']
data['INCOME_CREDIT_PERC'] = data['AMT_INCOME_TOTAL'] / data['AMT_CREDIT']
data['ANNUITY_INCOME_PERC'] = data['AMT_ANNUITY'] / data['AMT_INCOME_TOTAL']

# External Sources
for col in ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']:
    data[col].fillna(data[col].mean(), inplace=True)

data['EXT_SCORE_WEIGHTED'] = data.EXT_SOURCE_1 * 2 + data.EXT_SOURCE_2 * 3 + data.EXT_SOURCE_3 * 4
data['EXT_SCORE_MEAN'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
data['EXT_SCORE_PROD'] = data['EXT_SOURCE_1'] * data['EXT_SOURCE_2'] * data['EXT_SOURCE_3']

# Encoding
print("Encoding...")
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].astype('category')

data.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in data.columns]

# Split
train_df = data[data['TARGET'].notnull()]
test_df = data[data['TARGET'].isnull()].drop('TARGET', axis=1)

# 5. Training on Augmented Data
print("Starting Pseudo-Label Training...")
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)
sub_preds = np.zeros(test_df.shape[0])

feature_columns = [c for c in train_df.columns if c != 'TARGET']

for n_fold, (train_idx, val_idx) in enumerate(folds.split(train_df, train_df['TARGET'])):
    X_train, y_train = train_df[feature_columns].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
    X_val, y_val = train_df[feature_columns].iloc[val_idx], train_df['TARGET'].iloc[val_idx]
    
    # We use a slightly more aggressive learning rate since we have more data
    model = lgb.LGBMClassifier(
        n_estimators=3000,
        learning_rate=0.015,
        num_leaves=34,
        colsample_bytree=0.9,
        subsample=0.8,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.07,
        min_split_gain=0.02,
        min_child_weight=39,
        random_state=1001,
        n_jobs=-1,
        verbose=-1
    )
    
    model.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)], 
        eval_metric='auc',
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(500)]
    )
    
    sub_preds += model.predict_proba(test_df)[:, 1] / folds.get_n_splits()
    print(f"Fold {n_fold+1} Finished.")

# 6. Save
submission = pd.DataFrame({
    'SK_ID_CURR': test['SK_ID_CURR'],
    'TARGET': sub_preds
})
submission.to_csv('submission_pseudo_label.csv', index=False)
print("Saved 'submission_pseudo_label.csv'")

--- STEP 13: PSEUDO-LABELING ---
Loading data...
ERROR: 'submission_monster_ensemble.csv' not found!
Please ensure your best submission file is in the folder and named correctly.


NameError: name 'previous_sub' is not defined

In [395]:
import pandas as pd
import numpy as np
import re
import os
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings('ignore')

print("--- STEP 13: PSEUDO-LABELING (FIXED) ---")

# 1. Load Data
print("Loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 2. Load Previous Predictions (Safety Check)
file_name = 'submission_monster_ensemble.csv'

if os.path.exists(file_name):
    previous_sub = pd.read_csv(file_name)
    print(f"Successfully loaded '{file_name}'.")
else:
    # STOP HERE if file is missing
    raise FileNotFoundError(f"CRITICAL ERROR: Could not find '{file_name}'.\n"
                            "You MUST run the previous 'Monster Ensemble' step successfully to generate this file first.\n"
                            "Please go back and run the previous code block.")

# 3. Select High-Confidence Test Samples
# We take rows where the model is >90% sure it's a default, or <5% sure (repayment)
high_conf_default = previous_sub[previous_sub['TARGET'] > 0.90]
high_conf_repay = previous_sub[previous_sub['TARGET'] < 0.05]

print(f"High confidence defaults found: {len(high_conf_default)}")
print(f"High confidence repayments found: {len(high_conf_repay)}")

# 4. Create "Pseudo-Training" Data
# Assign labels: 1 for defaults, 0 for repayments
test_defaults = test[test['SK_ID_CURR'].isin(high_conf_default['SK_ID_CURR'])].copy()
test_defaults['TARGET'] = 1

test_repayments = test[test['SK_ID_CURR'].isin(high_conf_repay['SK_ID_CURR'])].copy()
test_repayments['TARGET'] = 0

# Add to original training data
augmented_train = pd.concat([train, test_defaults, test_repayments], sort=False)
print(f"Original Train shape: {train.shape}")
print(f"Augmented Train shape: {augmented_train.shape}")

# 5. Feature Engineering (Standard Set)
print("Engineering features on augmented data...")
# Fix anomalies
augmented_train['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
augmented_train = augmented_train[augmented_train['AMT_INCOME_TOTAL'] < 20000000]

data = pd.concat([augmented_train, test], sort=False)

# Ratios
data['CREDIT_TERM'] = data['AMT_ANNUITY'] / data['AMT_CREDIT']
data['GOODS_LOAN_RATIO'] = data['AMT_GOODS_PRICE'] / data['AMT_CREDIT']
data['DAYS_EMPLOYED_PERCENT'] = data['DAYS_EMPLOYED'] / data['DAYS_BIRTH']
data['INCOME_CREDIT_PERC'] = data['AMT_INCOME_TOTAL'] / data['AMT_CREDIT']
data['ANNUITY_INCOME_PERC'] = data['AMT_ANNUITY'] / data['AMT_INCOME_TOTAL']

# External Sources
for col in ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']:
    data[col].fillna(data[col].mean(), inplace=True)

data['EXT_SCORE_WEIGHTED'] = data.EXT_SOURCE_1 * 2 + data.EXT_SOURCE_2 * 3 + data.EXT_SOURCE_3 * 4
data['EXT_SCORE_MEAN'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
data['EXT_SCORE_PROD'] = data['EXT_SOURCE_1'] * data['EXT_SOURCE_2'] * data['EXT_SOURCE_3']

# Encoding
print("Encoding...")
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].astype('category')

data.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in data.columns]

# Split
train_df = data[data['TARGET'].notnull()]
test_df = data[data['TARGET'].isnull()].drop('TARGET', axis=1)

# 6. Training on Augmented Data
print("Starting Pseudo-Label Training...")
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)
sub_preds = np.zeros(test_df.shape[0])

feature_columns = [c for c in train_df.columns if c != 'TARGET']

for n_fold, (train_idx, val_idx) in enumerate(folds.split(train_df, train_df['TARGET'])):
    X_train, y_train = train_df[feature_columns].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
    X_val, y_val = train_df[feature_columns].iloc[val_idx], train_df['TARGET'].iloc[val_idx]
    
    # Aggressive learning rate for pseudo-labeling
    model = lgb.LGBMClassifier(
        n_estimators=3000,
        learning_rate=0.015,
        num_leaves=34,
        colsample_bytree=0.9,
        subsample=0.8,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.07,
        min_split_gain=0.02,
        min_child_weight=39,
        random_state=1001,
        n_jobs=-1,
        verbose=-1
    )
    
    model.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)], 
        eval_metric='auc',
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(500)]
    )
    
    sub_preds += model.predict_proba(test_df)[:, 1] / folds.get_n_splits()
    print(f"Fold {n_fold+1} Finished.")

# 7. Save
submission = pd.DataFrame({
    'SK_ID_CURR': test['SK_ID_CURR'],
    'TARGET': sub_preds
})
submission.to_csv('submission_pseudo_label.csv', index=False)
print("Saved 'submission_pseudo_label.csv'!")

--- STEP 13: PSEUDO-LABELING (FIXED) ---
Loading data...


FileNotFoundError: CRITICAL ERROR: Could not find 'submission_monster_ensemble.csv'.
You MUST run the previous 'Monster Ensemble' step successfully to generate this file first.
Please go back and run the previous code block.