# LightGBM with Cyclical Features

This notebook implements cyclical feature engineering for month and day, the highest-priority improvement identified in the strategy:
- Add sin/cos transformations for month and day
- Fix label encoding leakage by moving encoding inside CV loop
- Keep same LightGBM parameters to isolate feature impact
- Target CV AUC: 0.970-0.973 (expected gain over the baseline: 0.002-0.005 AUC)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Read the train and test CSV files into DataFrames and report their shapes
print("Loading data...")
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# Feature engineering function
def create_cyclical_features(df):
    """Return a copy of ``df`` with cyclical month/day encodings added.

    Five columns are added to the copy; the input frame is not modified:

    - ``month_num``: 1-12, looked up from the three-letter month
      abbreviation in ``df['month']``.
    - ``month_sin`` / ``month_cos``: sine and cosine of ``month_num`` on a
      12-step cycle.
    - ``day_sin`` / ``day_cos``: sine and cosine of ``df['day']`` on a
      31-step cycle.

    Month labels are matched case-insensitively and with surrounding
    whitespace ignored, so ``'Jan'`` and ``' jan '`` both map to 1. Labels
    that still do not match any abbreviation yield NaN in ``month_num`` and
    in both month encodings.

    Args:
        df: DataFrame with a ``month`` column of month abbreviations and a
            numeric ``day`` column.

    Returns:
        A new DataFrame containing the original columns plus the five
        columns described above.
    """
    df = df.copy()

    # Map month names to numbers
    month_map = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    }
    # Normalise before the lookup: an exact-match map would silently turn
    # 'Jan' or 'jan ' into NaN and poison the derived sin/cos columns.
    month_key = df['month'].astype(str).str.strip().str.lower()
    df['month_num'] = month_key.map(month_map)

    # Month cyclical features (12 months)
    month_angle = 2 * np.pi * df['month_num'] / 12
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

    # Day cyclical features (31 days max)
    day_angle = 2 * np.pi * df['day'] / 31
    df['day_sin'] = np.sin(day_angle)
    df['day_cos'] = np.cos(day_angle)

    return df

# Add the cyclical columns to both splits; each call returns a modified copy
print("\nCreating cyclical features...")
train, test = create_cyclical_features(train), create_cyclical_features(test)

print("New features created:")
print("  - month_num, month_sin, month_cos")
print("  - day_sin, day_cos")
print(f"\nTrain shape after feature engineering: {train.shape}")
print(f"Test shape after feature engineering: {test.shape}")

# Column groups by type. The raw numeric columns come first, followed by the
# sin/cos encodings. Note that month_num is created above but is not listed
# in numeric_features.
numeric_features = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
] + [
    'month_sin', 'month_cos', 'day_sin', 'day_cos',
]
# String-valued columns; these are the ones label-encoded during CV
categorical_features = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]

print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Relative frequency of each target class
print("\nTarget distribution:")
print(train['y'].value_counts(normalize=True))

In [None]:
# Separate the target from the model inputs; 'id' is dropped from both splits
y = train['y']
X = train.drop(columns=['id', 'y'])
X_test = test.drop(columns=['id'])

print(f"Training features shape: {X.shape}")
print(f"Test features shape: {X_test.shape}")

# Stratified, shuffled K-fold splitter with a fixed seed
seed = 42
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

# Accumulators filled by the training loop: per-fold scores, out-of-fold
# predictions (one per training row) and fold-averaged test predictions
fold_scores = []
oof_predictions = np.zeros(X.shape[0])
test_predictions = np.zeros(X_test.shape[0])

# LightGBM parameters - same as baseline (key order is kept because the dict is printed)
params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    random_state=seed,
)

print(f"\nStarting {n_folds}-fold CV training with cyclical features...")
print(f"Parameters: {params}")

In [None]:
# Training loop with proper label encoding inside folds
def _encode_with_unknown(encoder, values, unknown_code=-1):
    """Encode ``values`` with a fitted LabelEncoder, tolerating unseen labels.

    Labels present in ``encoder.classes_`` get exactly the code
    ``encoder.transform`` would give them; any other label gets
    ``unknown_code`` instead of raising ``ValueError``.

    Args:
        encoder: A fitted ``LabelEncoder``.
        values: pandas Series of labels to encode.
        unknown_code: Integer assigned to labels the encoder was not fitted on.

    Returns:
        int64 NumPy array of codes, positionally aligned with ``values``.
    """
    seen = values.isin(encoder.classes_).to_numpy()
    codes = np.full(len(values), unknown_code, dtype=np.int64)
    codes[seen] = encoder.transform(values[seen])
    return codes


for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), 1):
    print(f"\nFold {fold}/{n_folds}")

    # Split data; copies keep the in-place encoding below away from X / X_test
    X_train, X_valid = X.iloc[train_idx].copy(), X.iloc[valid_idx].copy()
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    X_test_fold = X_test.copy()

    # Label encoding - fit only on this fold's training rows
    label_encoders = {}
    for col in categorical_features:
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col])

        # Because the encoder never saw the validation/test rows, a category
        # that is missing from this fold's training rows would make a plain
        # le.transform raise ValueError; such labels are encoded as -1.
        X_valid[col] = _encode_with_unknown(le, X_valid[col])
        X_test_fold[col] = _encode_with_unknown(le, X_test_fold[col])
        label_encoders[col] = le

    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    # Train model; stops once validation AUC has not improved for 50 rounds
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )

    # Predictions at the best iteration found by early stopping
    valid_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test_fold, num_iteration=model.best_iteration)

    # Store predictions: OOF slots for this fold, running mean for test
    oof_predictions[valid_idx] = valid_pred
    test_predictions += test_pred / n_folds

    # Calculate fold score
    fold_score = roc_auc_score(y_valid, valid_pred)
    fold_scores.append(fold_score)
    print(f"Fold {fold} AUC: {fold_score:.6f}")

# Overall CV score, computed over all out-of-fold predictions at once
cv_score = roc_auc_score(y, oof_predictions)
print(f"\n{'='*50}")
print(f"Overall CV AUC: {cv_score:.6f}")
print(f"Mean Fold AUC: {np.mean(fold_scores):.6f} ± {np.std(fold_scores):.6f}")
print(f"Fold scores: {fold_scores}")
print(f"{'='*50}")

# Compare with baseline (hard-coded CV AUC of the previous experiment)
baseline_score = 0.968226
improvement = cv_score - baseline_score
print(f"Baseline CV AUC: {baseline_score:.6f}")
print(f"Improvement: {improvement:.6f}")
print(f"Success threshold (>0.001): {'✓' if improvement > 0.001 else '✗'}")

In [None]:
# Gain-based feature importance.
# NOTE: `model` is whatever the training loop assigned last, so this table
# describes the final fold's booster only, not an average over folds.
feature_names = X.columns
feature_importance = model.feature_importance(importance_type='gain')
importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    .sort_values('importance', ascending=False)
)

print("Top 15 features by importance:")
print(importance_df.head(15))

In [None]:
# Build the submission table: test ids alongside the fold-averaged predictions
submission = test[['id']].assign(y=test_predictions)

# Write it out without the index column
submission_path = '/home/submission/submission_002_cyclical_lgbm.csv'
submission.to_csv(submission_path, index=False)

print(f"\nSubmission saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"Prediction range: [{submission['y'].min():.4f}, {submission['y'].max():.4f}]")
print(f"Prediction mean: {submission['y'].mean():.4f}")

# Persist out-of-fold predictions with ids and true targets for ensembling
oof_path = '/home/code/oof_predictions_002.csv'
oof_df = pd.DataFrame(
    {
        'id': train['id'],
        'oof_pred': oof_predictions,
        'target': y,
    }
)
oof_df.to_csv(oof_path, index=False)

print(f"\nOOF predictions saved to: {oof_path}")