# Baseline LightGBM Model

First baseline following the competition strategy:
- Simple LightGBM with basic preprocessing
- Stratified K-Fold CV (k=5, seed=42)
- Basic categorical encoding
- Generate OOF predictions and test predictions

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Read the competition train/test tables from disk and report their sizes.
print("Loading data...")
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# Declare which columns belong to which feature group.
print("\nBasic preprocessing...")

# Columns listed here are only reported at this stage.
numeric_features = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
]
# Columns listed here are the ones label-encoded later in this file.
categorical_features = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'poutcome',
]

for group_label, group_columns in (
    ("Numeric", numeric_features),
    ("Categorical", categorical_features),
):
    print(f"{group_label} features: {group_columns}")

# Relative frequency of each value of the target column `y`.
print("\nTarget distribution:")
target_share = train['y'].value_counts(normalize=True)
print(target_share)

In [None]:
# Build the modelling matrices: features (X), target (y) and test features.
from sklearn.preprocessing import LabelEncoder

y = train['y']
X = train.drop(['id', 'y'], axis=1)
X_test = test.drop(['id'], axis=1)

# Work on a copy so the original list of categorical columns stays untouched.
cat_features = list(categorical_features)

# One fitted encoder per categorical column, kept for later reuse.
label_encoders = {}

for column in cat_features:
    # The encoder sees train and test values together, so every category
    # present in either frame gets an integer code.
    all_values = pd.concat([X[column], X_test[column]], axis=0)
    encoder = LabelEncoder().fit(all_values)

    X[column] = encoder.transform(X[column])
    X_test[column] = encoder.transform(X_test[column])
    label_encoders[column] = encoder

print("Data prepared for training")
print(f"X shape: {X.shape}, X_test shape: {X_test.shape}")

In [None]:
# Stratified K-fold training of a LightGBM binary classifier.
#
# Produces:
#   oof_predictions  - one out-of-fold prediction per training row
#   test_predictions - test-set predictions averaged over all folds
#   fold_scores      - validation AUC of each fold
#   cv_score         - AUC of the stitched OOF predictions against y
#   model            - the booster trained on the LAST fold (left in scope)

# Cross-validation setup
n_folds = 5
seed = 42
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

# Initialize arrays for OOF predictions and test predictions
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))

# Model parameters - basic LightGBM
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': seed
}

print(f"Starting {n_folds}-fold CV training...")
print(f"Parameters: {params}")

fold_scores = []

# Folds are numbered from 1 purely for the progress messages.
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), 1):
    print(f"\nFold {fold}/{n_folds}")

    # skf.split yields positional indices, hence .iloc.
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # Create LightGBM datasets; the validation set reuses the training
    # set as its reference.
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    # Train model: up to 1000 rounds, early stopping with a patience of 50
    # rounds on the validation fold.
    # NOTE(review): the same fold drives early stopping and is scored below,
    # so the reported AUC may be slightly optimistic - confirm this is
    # acceptable for the baseline.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )

    # Predictions, both taken at the best iteration found by early stopping.
    valid_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # Store predictions: each training row is written exactly once (in the
    # fold where it was held out); test predictions accumulate as a mean.
    oof_predictions[valid_idx] = valid_pred
    test_predictions += test_pred / n_folds

    # Calculate fold score
    fold_score = roc_auc_score(y_valid, valid_pred)
    fold_scores.append(fold_score)
    print(f"Fold {fold} AUC: {fold_score:.6f}")

# Overall CV score, computed on the full set of out-of-fold predictions.
cv_score = roc_auc_score(y, oof_predictions)
print(f"\n{'='*50}")
print(f"Overall CV AUC: {cv_score:.6f}")
# The plus-minus sign is written as an escape so the source stays ASCII and
# cannot be re-garbled by an encoding mismatch (it previously rendered as
# mojibake in the output).
print(f"Mean Fold AUC: {np.mean(fold_scores):.6f} \u00b1 {np.std(fold_scores):.6f}")
print(f"Fold scores: {fold_scores}")
print(f"{'='*50}")

In [None]:
# Rank features by gain-based importance.
# `model` is whatever booster was assigned last; with the CV loop in this
# file that is the final fold's model, so this ranking reflects that single
# fold rather than an average over folds.
feature_names = X.columns
feature_importance = model.feature_importance(importance_type='gain')

importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
)
# Most important feature first.
importance_df = importance_df.sort_values('importance', ascending=False)

print("Top 10 features by importance:")
print(importance_df.head(10))

In [None]:
# Persist the results: the fold-averaged test predictions as a submission
# file, and the out-of-fold predictions for later ensembling.
from pathlib import Path

# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'y': test_predictions
})

# Save submission
submission_path = '/home/submission/submission_001_baseline_lgbm.csv'
# to_csv does not create missing directories; ensure the target exists so a
# completed training run is not lost to an OSError at the last step.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"\nSubmission saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"Prediction range: [{submission['y'].min():.4f}, {submission['y'].max():.4f}]")
print(f"Prediction mean: {submission['y'].mean():.4f}")

# Save OOF predictions for ensembling.
# `y` and train['id'] both come from `train`, so they share its row index.
oof_df = pd.DataFrame({
    'id': train['id'],
    'oof_pred': oof_predictions,
    'target': y
})
oof_path = '/home/code/oof_predictions_001.csv'
Path(oof_path).parent.mkdir(parents=True, exist_ok=True)
oof_df.to_csv(oof_path, index=False)

print(f"\nOOF predictions saved to: {oof_path}")