In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('../Data/raw/train.csv')

In [3]:
df.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [4]:
df['contact'].unique()

array(['cellular', 'unknown', 'telephone'], dtype=object)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         750000 non-null  int64 
 1   age        750000 non-null  int64 
 2   job        750000 non-null  object
 3   marital    750000 non-null  object
 4   education  750000 non-null  object
 5   default    750000 non-null  object
 6   balance    750000 non-null  int64 
 7   housing    750000 non-null  object
 8   loan       750000 non-null  object
 9   contact    750000 non-null  object
 10  day        750000 non-null  int64 
 11  month      750000 non-null  object
 12  duration   750000 non-null  int64 
 13  campaign   750000 non-null  int64 
 14  pdays      750000 non-null  int64 
 15  previous   750000 non-null  int64 
 16  poutcome   750000 non-null  object
 17  y          750000 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 103.0+ MB


In [6]:
# Merge rare categories
rare_jobs = ['self-employed', 'entrepreneur', 'housemaid','unemployed', 'student', 'unknown']

df['job'] = df['job'].replace(rare_jobs, 'other')

In [7]:
df['balance_bin'] = pd.qcut(
    df['balance'],
    q=[0, 0.1, 0.3, 0.7, 0.9, 1],  # Adjust quantiles based on distribution
    labels=["Negative", "Low", "Medium", "High", "Very High"]
)

In [8]:
age_bins = [18, 25, 35, 50, 65, 120]
df['age_bin'] = pd.cut(
    df['age'],
    bins=age_bins,
    labels=["18-25", "26-35", "36-50", "51-65", "65+"]
)

In [9]:
df['balance_shifted'] = df['balance'] - df['balance'].min() + 1 

df['log_balance'] = np.log(df['balance_shifted'])

In [10]:
# SAFE FEATURES (no leakage)
# 1. Client contact sensitivity (using previous campaign data)
df['contact_sensitivity'] = df['previous'] / (df['pdays'].replace(999, 0) + 1e-6)

# 2. Previous campaign engagement
df['prev_campaign_engaged'] = (df['poutcome'] == 'success').astype(int)

# 3. Historical contact responsiveness
df['responsiveness'] = np.select(
    [
        df['previous'] == 0,
        (df['previous'] > 0) & (df['poutcome'] == 'success'),
        (df['previous'] > 0) & (df['poutcome'] != 'success')
    ],
    ['new', 'responsive', 'unresponsive'],
    default='unknown'
)

# 4. Time since last contact (using pdays)
df['contact_recency'] = np.select(
    [
        df['pdays'] == 999,
        df['pdays'] <= 30,
        df['pdays'] <= 90,
        df['pdays'] > 90
    ],
    ['never', 'recent', 'medium', 'long'],
    default='missing'
)

# UNSAFE - EXCLUDE THESE:
# campaign, duration

In [11]:
df.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,...,poutcome,y,balance_bin,age_bin,balance_shifted,log_balance,contact_sensitivity,prev_campaign_engaged,responsiveness,contact_recency
0,0,42,technician,married,secondary,no,7,no,no,cellular,...,unknown,0,Low,36-50,8027,8.990566,-0.0,0,new,recent
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,...,unknown,0,Medium,36-50,8534,9.051813,-0.0,0,new,recent
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,...,unknown,0,Medium,36-50,8622,9.062072,-0.0,0,new,recent
3,3,27,other,single,secondary,no,34,yes,no,unknown,...,unknown,0,Medium,26-35,8054,8.993924,-0.0,0,new,recent
4,4,26,technician,married,secondary,no,889,yes,no,cellular,...,unknown,1,Medium,26-35,8909,9.094817,-0.0,0,new,recent


In [12]:
df.drop(['id','day','month','age_bin','balance_bin','balance','balance_shifted'],axis=1,inplace=True)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   age                    750000 non-null  int64  
 1   job                    750000 non-null  object 
 2   marital                750000 non-null  object 
 3   education              750000 non-null  object 
 4   default                750000 non-null  object 
 5   housing                750000 non-null  object 
 6   loan                   750000 non-null  object 
 7   contact                750000 non-null  object 
 8   duration               750000 non-null  int64  
 9   campaign               750000 non-null  int64  
 10  pdays                  750000 non-null  int64  
 11  previous               750000 non-null  int64  
 12  poutcome               750000 non-null  object 
 13  y                      750000 non-null  int64  
 14  log_balance            750000 non-nu

In [15]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# ---- Load data (adjust paths if needed) ----
train = df   # or your preprocessed train DataFrame
# test  = pd.read_csv("test.csv")    # uncomment when you have test.csv

TARGET_COL = "y"
DROP_COLS  = ["id"]                  # keep all other features for max signal

X = train.drop(columns=[TARGET_COL]).copy()
y = train[TARGET_COL].astype(int)

# ---- Categorical columns (LightGBM native handling) ----
categorical_cols = ["job","marital","housing","loan","contact","poutcome",'default','education','responsiveness','contact_recency']

for c in categorical_cols:
    if c in X.columns:
        X[c] = X[c].astype("category")

# ---- Class weight for imbalance ----
neg, pos = int((y == 0).sum()), int((y == 1).sum())
scale_pos_weight = neg / max(pos, 1)
print(f"Class balance -> pos: {pos}, neg: {neg}, scale_pos_weight: {scale_pos_weight:.2f}")

# ---- LightGBM params (good starting point) ----
params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 127,
    "min_data_in_leaf": 200,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.9,
    "bagging_freq": 1,
    "max_bin": 255,
    "scale_pos_weight": scale_pos_weight,
    "verbosity": -1,
    "seed": 42,
}

# ---- Stratified K-Fold CV ----
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

oof = np.zeros(len(train))
auc_scores = []

models = []  # keep models if you want to ensemble or inspect feature importance

for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    dtrain = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_cols, free_raw_data=False)
    dvalid = lgb.Dataset(X_va, y_va, categorical_feature=categorical_cols, free_raw_data=False)

    model = lgb.train(
        params,
        dtrain,
        num_boost_round=5000,
        valid_sets=[dtrain, dvalid],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=200, verbose=False),
            lgb.log_evaluation(period=200)
        ],
    )

    pred_va = model.predict(X_va, num_iteration=model.best_iteration)
    oof[va_idx] = pred_va
    auc = roc_auc_score(y_va, pred_va)
    auc_scores.append(auc)
    models.append(model)
    print(f"Fold {fold}: AUC = {auc:.5f} | iters = {model.best_iteration}")

print(f"\nCV AUC: {np.mean(auc_scores):.5f} ± {np.std(auc_scores):.5f}")

# ---- (Optional) Feature importance ----
importances = pd.DataFrame({
    "feature": X.columns,
    "gain": np.mean([m.feature_importance(importance_type="gain") for m in models], axis=0),
    "split": np.mean([m.feature_importance(importance_type="split") for m in models], axis=0),
}).sort_values("gain", ascending=False)
display(importances.head(20))


Class balance -> pos: 90488, neg: 659512, scale_pos_weight: 7.29
[200]	train's auc: 0.963623	valid's auc: 0.959239
[400]	train's auc: 0.968067	valid's auc: 0.960093
[600]	train's auc: 0.971167	valid's auc: 0.960327
[800]	train's auc: 0.973803	valid's auc: 0.960372
Fold 1: AUC = 0.96039 | iters = 749
[200]	train's auc: 0.963934	valid's auc: 0.958082
[400]	train's auc: 0.968348	valid's auc: 0.958854
[600]	train's auc: 0.971476	valid's auc: 0.95901
[800]	train's auc: 0.974065	valid's auc: 0.959057
Fold 2: AUC = 0.95908 | iters = 784
[200]	train's auc: 0.963848	valid's auc: 0.958531
[400]	train's auc: 0.968214	valid's auc: 0.95948
[600]	train's auc: 0.971369	valid's auc: 0.959679
[800]	train's auc: 0.973886	valid's auc: 0.959661
Fold 3: AUC = 0.95970 | iters = 648
[200]	train's auc: 0.963586	valid's auc: 0.959505
[400]	train's auc: 0.967982	valid's auc: 0.960202
[600]	train's auc: 0.971213	valid's auc: 0.960474
[800]	train's auc: 0.973768	valid's auc: 0.960484
[1000]	train's auc: 0.976014	

Unnamed: 0,feature,gain,split
8,duration,5794809.0,23930.4
13,log_balance,764004.4,21879.6
7,contact,572980.4,2430.6
15,prev_campaign_engaged,356527.7,382.8
0,age,347404.9,15431.4
5,housing,340260.4,2416.2
10,pdays,181500.2,5060.4
9,campaign,181178.0,5968.6
1,job,168262.3,6399.6
14,contact_sensitivity,102044.2,3418.2
