In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.combine import SMOTETomek
import shap

In [2]:
df = pd.read_csv("HACKATHON_TRAINING_DATA.CSV")

In [3]:
flag_columns = ['SI_FLG', 'LOCKER_HLDR_IND', 'UID_FLG', 'KYC_FLG', 'INB_FLG', 'EKYC_FLG']
df[flag_columns] = df[flag_columns].replace({'Y': 1, 'N': 0})
df[flag_columns] = df[flag_columns].astype(float)

  df[flag_columns] = df[flag_columns].replace({'Y': 1, 'N': 0})


In [4]:
def convert_to_months(s):
    if pd.isna(s): return np.nan
    s = str(s).lower().strip()
    pattern = r'(?:(\d+)\s*yrs?)?\s*(?:(\d+)\s*(?:months|mon))?'
    match = re.match(pattern, s)
    if match:
        years = int(match.group(1)) if match.group(1) else 0
        months = int(match.group(2)) if match.group(2) else 0
        return years * 12 + months
    return np.nan

df['CREDIT_HISTORY_LENGTH1'] = df['CREDIT_HISTORY_LENGTH1'].apply(convert_to_months)
df['AVERAGE_ACCT_AGE1'] = df['AVERAGE_ACCT_AGE1'].apply(convert_to_months)

In [5]:
income_band_mapping = {
    'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7,
    'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'EX05': 14
}
df['INCOME_BAND1'] = df['INCOME_BAND1'].map(lambda x: income_band_mapping.get(x, np.nan))

In [6]:
if 'ONEMNTHCR' in df.columns:
    df.rename(columns={'ONEMNTHCR': 'ONEMNTHSCR'}, inplace=True)

In [7]:
agreg_group_mapping = {
    '#Total Auto Loan': 1,
    '#Total Xpress Credit': 2,
    '#Housing Loan': 3,
    '#Education Loan Total': 4
}
df['AGREG_GROUP'] = df['AGREG_GROUP'].map(lambda x: agreg_group_mapping.get(x, np.nan))

In [8]:
product_type_mapping = {
    'AUTO LOAN': 1,
    'PERSONAL LOAN': 2,
    'HOME LOAN': 3,
    'EDUCATION LOAN': 4
}
df['PRODUCT_TYPE'] = df['PRODUCT_TYPE'].map(lambda x: product_type_mapping.get(x, np.nan))

In [9]:
month_map = {
    'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06',
    'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'
}

def convert_time_period(val):
    if pd.isna(val): return np.nan
    match = re.match(r'([A-Z]{3})(\d{2})', str(val).upper())
    if match:
        month = month_map.get(match.group(1), '00')
        year = '20' + match.group(2)
        return int(year + month)
    return np.nan

df['TIME_PERIOD'] = df['TIME_PERIOD'].apply(convert_time_period)

In [10]:
rg_columns = [
    'LAST_1_YR_RG4', 'LAST_3_YR_RG4', 'LAST_1_YR_RG3', 'LAST_1_YR_RG2', 'LAST_1_YR_RG1',
    'FIRST_NPA_TENURE', 'CUST_NO_OF_TIMES_NPA', 'LATEST_NPA_TENURE', 'NO_YRS_NPA',
    'LATEST_RG3_TENURE', 'NO_YRS_RG3', 'TOT_IRAC_CHNG', 'TIMES_IRAC_SLIP', 'TIMES_IRAC_UPR',
    'NO_ENQ', 'CRIFF_11', 'CRIFF_22', 'CRIFF_33', 'CRIFF_44', 'CRIFF_55', 'CRIFF_66', 'TOTAL_CRIFF1'
]
df[rg_columns] = df[rg_columns].fillna(0)

In [11]:
sdr_cols = [f'{i}MNTHSDR' for i in [
    'ONE','TWO','THREE','FOUR','FIVE','SIX','SEVEN','EIGHT','NINE','TEN','ELEVEN','TWELVE'
]]
df[sdr_cols] = df[sdr_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
df['ALL_LON_LIMIT'] = pd.to_numeric(df['ALL_LON_LIMIT'], errors='coerce').fillna(0)
df[sdr_cols] = df[sdr_cols].abs()

In [12]:
monthly_limit = df['ALL_LON_LIMIT'] / 12
df_overspend = df[sdr_cols].sub(monthly_limit, axis=0)
total_spend = df[sdr_cols].sum(axis=1)
total_overspend = df_overspend.clip(lower=0).sum(axis=1)
df['overspend_ratio'] = total_overspend / (total_spend + 1e-6)

In [13]:
limit_matrix = pd.DataFrame(
    np.tile(monthly_limit.values[:, None], (1, len(sdr_cols))),
    columns=sdr_cols, index=df.index
)
overspend_flags = df[sdr_cols] > limit_matrix

def max_consecutive_true(arr):
    max_streak = streak = 0
    for val in arr:
        if val:
            streak += 1
            max_streak = max(max_streak, streak)
        else:
            streak = 0
    return max_streak

df['max_consec_overspend'] = overspend_flags.apply(max_consecutive_true, axis=1)

In [14]:
out_cols = [f'{i}MNTHOUTSTANGBAL' for i in [
    'TWELVE','ELEVEN','TEN','NINE','EIGHT','SEVEN',
    'SIX','FIVE','FOUR','THREE','TWO','ONE'
]]
df[out_cols] = df[out_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

In [15]:
def calc_slope(row):
    x = np.arange(12)
    y = row.values
    return np.polyfit(x, y, 1)[0]

df['outbal_slope'] = df[out_cols].apply(calc_slope, axis=1)
df['outbal_is_declining'] = (df['outbal_slope'] < 0).astype(int)

In [16]:
term_debit_cols = [f"{i}MNTHAVGMTD" for i in [
    'ONE','TWO','THREE','FOUR','FIVE','SIX','SEVEN',
    'EIGHT','NINE','TEN','ELEVEN','TWELVE'
]]
df[term_debit_cols] = df[term_debit_cols].fillna(0)
df['slope_MTD'] = df[term_debit_cols].apply(calc_slope, axis=1)
df['is_debit_declining_MTD'] = (df['slope_MTD'] < 0).astype(int)

In [17]:
keywords_to_remove = ['SDR', 'SCR', 'OUTSTANGBAL', 'AVGMTD', 'AVGQTD', 'AVGYTD']
exceptions = ['KYC_SCR']
cols_to_drop = [
    col for col in df.columns
    if any(kw in col for kw in keywords_to_remove) and col not in exceptions
]
df.drop(columns=cols_to_drop, inplace=True)
print(f"\nDropped {len(cols_to_drop)} columns: {cols_to_drop}")


Dropped 72 columns: ['ONEMNTHSCR', 'ONEMNTHSDR', 'ONEMNTHOUTSTANGBAL', 'ONEMNTHAVGMTD', 'ONEMNTHAVGQTD', 'ONEMNTHAVGYTD', 'TWOMNTHSCR', 'TWOMNTHSDR', 'TWOMNTHOUTSTANGBAL', 'TWOMNTHAVGMTD', 'TWOMNTHAVGQTD', 'TWOMNTHAVGYTD', 'THREEMNTHSCR', 'THREEMNTHSDR', 'THREEMNTHOUTSTANGBAL', 'THREEMNTHAVGMTD', 'THREEMNTHAVGQTD', 'THREEMNTHAVGYTD', 'FOURMNTHSCR', 'FOURMNTHSDR', 'FOURMNTHOUTSTANGBAL', 'FOURMNTHAVGMTD', 'FOURMNTHAVGQTD', 'FOURMNTHAVGYTD', 'FIVEMNTHSCR', 'FIVEMNTHSDR', 'FIVEMNTHOUTSTANGBAL', 'FIVEMNTHAVGMTD', 'FIVEMNTHAVGQTD', 'FIVEMNTHAVGYTD', 'SIXMNTHSCR', 'SIXMNTHSDR', 'SIXMNTHOUTSTANGBAL', 'SIXMNTHAVGMTD', 'SIXMNTHAVGQTD', 'SIXMNTHAVGYTD', 'SEVENMNTHSCR', 'SEVENMNTHSDR', 'SEVENMNTHOUTSTANGBAL', 'SEVENMNTHAVGMTD', 'SEVENMNTHAVGQTD', 'SEVENMNTHAVGYTD', 'EIGHTMNTHSCR', 'EIGHTMNTHSDR', 'EIGHTMNTHOUTSTANGBAL', 'EIGHTMNTHAVGMTD', 'EIGHTMNTHAVGQTD', 'EIGHTMNTHAVGYTD', 'NINEMNTHSCR', 'NINEMNTHSDR', 'NINEMNTHOUTSTANGBAL', 'NINEMNTHAVGMTD', 'NINEMNTHAVGQTD', 'NINEMNTHAVGYTD', 'TENMNTHSCR', '

In [18]:
df.to_csv("cleaned_data.csv", index=False)

In [19]:
X = df.drop(columns=['TARGET'])
y = df['TARGET']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [21]:
imputer = SimpleImputer(strategy='median')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [22]:
numeric_cols = X_train_imputed.select_dtypes(include=[np.number]).columns
yeo = PowerTransformer(method='yeo-johnson', standardize=False)
X_train_imputed[numeric_cols] = yeo.fit_transform(X_train_imputed[numeric_cols])
X_test_imputed[numeric_cols] = yeo.transform(X_test_imputed[numeric_cols])

In [23]:
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_imputed), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_imputed), columns=X_test.columns)

In [24]:
smote_tomek = SMOTETomek(random_state=42)
X_res, y_res = smote_tomek.fit_resample(X_train_scaled, y_train)

In [25]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_res, y_res)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [26]:
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_res)
shap_importance = np.abs(shap_values.values).mean(axis=0)
shap_feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': shap_importance
}).sort_values(by='importance', ascending=False)
top_features = shap_feature_importance['feature'].head(30).tolist()

In [27]:
X_train_shap = X_res[top_features]
X_test_shap = X_test_scaled[top_features]

In [30]:
lgbm = LGBMClassifier(
    objective='binary',
    class_weight='balanced',
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,
    reg_lambda=1.0,
    min_child_weight=10,
    random_state=42
)
lgbm.fit(X_train_shap, y_res)

[LightGBM] [Info] Number of positive: 233811, number of negative: 233811
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7642
[LightGBM] [Info] Number of data points in the train set: 467622, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [31]:
y_probs = lgbm.predict_proba(X_test_shap)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-6)
best_thresh = thresholds[np.argmax(f1_scores)]
y_pred = (y_probs >= best_thresh).astype(int)

In [32]:
print("\n🔷 Balanced LightGBM Results")
print(f"Best Threshold: {best_thresh:.2f}, F1: {max(f1_scores):.4f}")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


🔷 Balanced LightGBM Results
Best Threshold: 0.32, F1: 0.6050
Accuracy: 0.913316755404354
Confusion Matrix:
 [[55516  2945]
 [ 2737  4351]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95     58461
           1       0.60      0.61      0.60      7088

    accuracy                           0.91     65549
   macro avg       0.77      0.78      0.78     65549
weighted avg       0.91      0.91      0.91     65549

