In [27]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [28]:
def create_features(df):
    """Create all engineered features"""
    # Balance features
    df['log_balance'] = np.log(df['balance'] - df['balance'].min() + 1)
    df['is_zero_balance'] = (df['balance'] == 0).astype(int)
    
    # Contact recency features
    df['contact_recency'] = 'long'
    df.loc[df['pdays'] == -1, 'contact_recency'] = 'never'
    df.loc[(df['pdays'] >= 0) & (df['pdays'] <= 3), 'contact_recency'] = 'recent'
    df.loc[(df['pdays'] > 3) & (df['pdays'] <= 14), 'contact_recency'] = 'medium'
    
    return df

In [29]:
df=pd.read_csv('../Data/raw/train.csv')

In [30]:
df = create_features(df)

In [31]:
df.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,...,month,duration,campaign,pdays,previous,poutcome,y,log_balance,is_zero_balance,contact_recency
0,0,42,technician,married,secondary,no,7,no,no,cellular,...,aug,117,3,-1,0,unknown,0,8.990566,0,never
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,...,jun,185,1,-1,0,unknown,0,9.051813,0,never
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,...,may,111,2,-1,0,unknown,0,9.062072,0,never
3,3,27,student,single,secondary,no,34,yes,no,unknown,...,may,10,2,-1,0,unknown,0,8.993924,0,never
4,4,26,technician,married,secondary,no,889,yes,no,cellular,...,feb,902,1,-1,0,unknown,1,9.094817,0,never


In [32]:
# Define features
features = [
    'log_balance', 'is_zero_balance',
    'pdays', 'age', 'contact', 'poutcome', 
    'housing', 'campaign', 'job', 'marital',
    'loan', 'education', 'previous', 'default',
    'contact_recency'  # XGBoost handles categoricals internally
]

# Define target
target = 'y'  # Replace with your target name

# Encode categoricals (XGBoost requires numeric input)
categorical_cols = ['contact', 'poutcome', 'housing', 'loan', 
                    'job', 'marital', 'education', 'default',
                    'contact_recency']

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save for future use

In [33]:
neg, pos = np.bincount(df[target])
scale_pos_weight = neg / pos  # ≈7.29 for your data

In [34]:
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',  # Faster training
    'scale_pos_weight': scale_pos_weight,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'reg_alpha': 0.1,      # L1 regularization
    'reg_lambda': 0.2,     # L2 regularization
    'gamma': 0.1,          # Minimum loss reduction
    'seed': 42
}

In [35]:
X = df[features]
y = df[target]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []
feature_importances = pd.DataFrame()

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), 1):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    # Create DMatrix (optimized XGBoost data structure)
    dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
    
    # Train model
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        early_stopping_rounds=50,
        verbose_eval=100
    )
    
    # Predict and evaluate
    valid_preds = model.predict(dvalid)
    auc = roc_auc_score(y_valid, valid_preds)
    fold_results.append(auc)
    print(f'Fold {fold}: AUC = {auc:.5f}')
    
    # Corrected feature importance extraction
fold_importance = pd.DataFrame({
    'feature': features,
    'fold': fold
})

# Get gain scores as dictionary
gain_dict = model.get_score(importance_type='gain')

# Map gains to features (assign 0 if feature not in dictionary)
fold_importance['gain'] = fold_importance['feature'].map(
    lambda x: gain_dict.get(x, 0.0)
)

[0]	train-auc:0.78589	valid-auc:0.78504
[100]	train-auc:0.82270	valid-auc:0.82127
[200]	train-auc:0.82910	valid-auc:0.82516
[300]	train-auc:0.83385	valid-auc:0.82767
[400]	train-auc:0.83765	valid-auc:0.82925
[500]	train-auc:0.84045	valid-auc:0.83019
[600]	train-auc:0.84310	valid-auc:0.83075
[700]	train-auc:0.84533	valid-auc:0.83115
[800]	train-auc:0.84746	valid-auc:0.83150
[900]	train-auc:0.84934	valid-auc:0.83163
[999]	train-auc:0.85122	valid-auc:0.83172
Fold 1: AUC = 0.83172
[0]	train-auc:0.78617	valid-auc:0.78383
[100]	train-auc:0.82365	valid-auc:0.81850
[200]	train-auc:0.82999	valid-auc:0.82214
[300]	train-auc:0.83473	valid-auc:0.82463
[400]	train-auc:0.83830	valid-auc:0.82595
[500]	train-auc:0.84117	valid-auc:0.82674
[600]	train-auc:0.84363	valid-auc:0.82717
[700]	train-auc:0.84569	valid-auc:0.82743
[800]	train-auc:0.84789	valid-auc:0.82765
[900]	train-auc:0.84973	valid-auc:0.82781
[999]	train-auc:0.85156	valid-auc:0.82786
Fold 2: AUC = 0.82786
[0]	train-auc:0.78540	valid-auc:0.78

In [38]:

mean_auc = np.mean(fold_results)
std_auc = np.std(fold_results)
print(f'\nCV AUC: {mean_auc:.5f} ± {std_auc:.5f}')

# Aggregate feature importance if available
if not feature_importances.empty:
    # CORRECTED: Proper aggregation and column handling
    final_importance = feature_importances.groupby('feature').agg(
        gain_mean=('gain', 'mean'),
        gain_std=('gain', 'std')
    ).reset_index().sort_values('gain_mean', ascending=False)

    print("\nCross-Validated Feature Importance:")
    print(final_importance)


CV AUC: 0.82946 ± 0.00134
