In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

In [16]:
# Load training and test data (FIX: train should use training_data.csv)
train_df = pd.read_csv('phase_balancing_training_data.csv')
test_df = pd.read_csv('phase_balancing_test_data.csv')

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"\nClass distribution:\n{train_df['switch'].value_counts()}")

Training data shape: (14000, 4)
Test data shape: (14000, 3)

Class distribution:
switch
not_switch    7000
switch        7000
Name: count, dtype: int64


In [17]:
# Prepare features and labels
X_train = train_df[['L1', 'L2', 'L3']].to_numpy()
y_train = train_df['switch'].to_numpy()

X_test = test_df[['L1', 'L2', 'L3']].to_numpy()

# Encode labels (switch -> 1, not_switch -> 0)
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

print(f"Training features: {X_train.shape}")
print(f"Test features: {X_test.shape}")
print(f"Label encoding: {dict(zip(list(le.classes_), list(le.transform(le.classes_))))}") # type: ignore

Training features: (14000, 3)
Test features: (14000, 3)
Label encoding: {'not_switch': np.int64(0), 'switch': np.int64(1)}


## Why XGBoost and LogLoss?

**Why XGBoost (Extreme Gradient Boosting)?**
- **Ensemble Learning**: Combines multiple weak learners (decision trees) to create a strong predictor
- **Handles Imbalanced Data**: Works well with our 50/50 split and can handle class imbalance
- **Regularization**: Built-in L1 (Lasso) and L2 (Ridge) regularization prevents overfitting
- **Feature Importance**: Provides insights into which features (L1, L2, L3) matter most
- **Speed & Efficiency**: Optimized for performance with parallel processing
- **Robust to Outliers**: Tree-based models handle extreme values well (important for our export scenarios)

**Why LogLoss (Cross-Entropy Loss)?**
- **Probabilistic Predictions**: Measures the quality of probability estimates, not just class labels
- **Penalizes Confident Wrong Predictions**: High penalty when model is confident but wrong
- **Smooth Gradient**: Continuous differentiable function enables gradient descent optimization
- **Binary Classification Standard**: Industry standard for binary classification problems like ours (switch/not_switch)
- **Better than Accuracy**: Captures prediction certainty, crucial for phase switching decisions where confidence matters

In [21]:
# Enhanced XGBoost with Anti-Overfitting Parameters
from sklearn.model_selection import cross_val_score, train_test_split

# Split data for validation monitoring
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train_encoded, test_size=0.2, random_state=42, stratify=y_train_encoded
)

# XGBoost model with anti-overfitting parameters
xgb_model = xgb.XGBClassifier(
    n_estimators=100,           # Reduced from 200 to prevent overfitting
    max_depth=4,                # Reduced from 6 - shallower trees generalize better
    learning_rate=0.05,         # Reduced from 0.1 - slower learning prevents overfitting
    subsample=0.7,              # Reduced from 0.8 - more randomness
    colsample_bytree=0.7,       # Reduced from 0.8 - use fewer features per tree
    colsample_bylevel=0.7,      # Reduced from 0.8
    gamma=1.0,                  # Increased from 0.1 - stronger pruning (minimum loss reduction)
    min_child_weight=5,         # Increased from 3 - require more samples in leaf
    reg_alpha=1.0,              # Increased from 0.1 - stronger L1 regularization
    reg_lambda=2.0,             # Increased from 1.0 - stronger L2 regularization
    scale_pos_weight=1,
    random_state=42,
    eval_metric='logloss',
    tree_method='hist',
    n_jobs=-1
)

# Fit with validation monitoring (for overfitting check)
xgb_model.fit(
    X_train_split, y_train_split,
    eval_set=[(X_train_split, y_train_split), (X_val_split, y_val_split)],
    verbose=False
)

# Check for overfitting by comparing train vs validation
train_pred = xgb_model.predict(X_train_split)
val_pred = xgb_model.predict(X_val_split)

from sklearn.metrics import accuracy_score
train_acc = accuracy_score(y_train_split, train_pred)
val_acc = accuracy_score(y_val_split, val_pred)

print(f"üìä Overfitting Check (Train/Val Split):")
print(f"Training Accuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")
print(f"Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)")
print(f"Gap (Train - Val): {(train_acc - val_acc):.4f}")

if train_acc - val_acc > 0.05:
    print("‚ö†Ô∏è  Warning: Significant overfitting detected (gap > 5%)")
elif train_acc - val_acc > 0.02:
    print("‚ö†Ô∏è  Mild overfitting detected (gap 2-5%)")
else:
    print("‚úÖ Good generalization (gap < 2%)")

# Now create fresh model for cross-validation (without early stopping)
xgb_model_cv = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    gamma=1.0,
    min_child_weight=5,
    reg_alpha=1.0,
    reg_lambda=2.0,
    scale_pos_weight=1,
    random_state=42,
    eval_metric='logloss',
    tree_method='hist',
    n_jobs=-1
)

# Cross-validation scores on full training data
scores = cross_val_score(xgb_model_cv, X_train, y_train_encoded, cv=5, scoring='accuracy')
print(f"\n5-Fold Cross-Validation:")
print(f"XGBoost - CV Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")
print(f"Individual fold scores: {[f'{s:.4f}' for s in scores]}")

# Train final model on full training data
xgb_model_cv.fit(X_train, y_train_encoded)

# Set as best model
best_model_name = 'XGBoost'
best_model = xgb_model_cv

print(f"\n‚úÖ Using Anti-Overfitting XGBoost with regularization")

üìä Overfitting Check (Train/Val Split):
Training Accuracy: 0.9908 (99.08%)
Validation Accuracy: 0.9914 (99.14%)
Gap (Train - Val): -0.0006
‚úÖ Good generalization (gap < 2%)

5-Fold Cross-Validation:
XGBoost - CV Accuracy: 0.9866 (+/- 0.0055)
Individual fold scores: ['0.9768', '0.9850', '0.9879', '0.9918', '0.9914']

‚úÖ Using Anti-Overfitting XGBoost with regularization


In [22]:
# Train the best model on full training data and predict on test set
from sklearn.metrics import log_loss, accuracy_score, classification_report, confusion_matrix

best_model.fit(X_train, y_train_encoded)

# Predict on test data
test_predictions = best_model.predict(X_test)
test_predictions_proba = best_model.predict_proba(X_test)  # Get probability estimates
test_predictions_labels = le.inverse_transform(test_predictions)

# Calculate metrics on training data (for comparison)
train_predictions = best_model.predict(X_train)
train_predictions_proba = best_model.predict_proba(X_train)

train_loss = log_loss(y_train_encoded, train_predictions_proba)
train_accuracy = accuracy_score(y_train_encoded, train_predictions)

print(f"üìä Model Performance Metrics using {best_model_name}:")
print("="*60)
print(f"\nüîπ Training Set:")
print(f"   Log Loss: {train_loss:.4f}")
print(f"   Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")

# Since test data doesn't have labels, we can't calculate test loss
# But we can show prediction distribution and confidence
print(f"\nüîπ Test Set Predictions:")
print(f"   Total predictions: {len(test_predictions)}")

# Show prediction distribution
unique, counts = np.unique(test_predictions_labels, return_counts=True)
for label, count in zip(unique, counts):
    print(f"   {label}: {count} ({count/len(test_predictions)*100:.1f}%)")

# Show confidence statistics
confidence_switch = test_predictions_proba[test_predictions == 1][:, 1]
confidence_not_switch = test_predictions_proba[test_predictions == 0][:, 0]

if len(confidence_switch) > 0:
    print(f"\nüîπ Prediction Confidence (switch):")
    print(f"   Mean: {confidence_switch.mean():.4f}")
    print(f"   Min: {confidence_switch.min():.4f}, Max: {confidence_switch.max():.4f}")

if len(confidence_not_switch) > 0:
    print(f"\nüîπ Prediction Confidence (not_switch):")
    print(f"   Mean: {confidence_not_switch.mean():.4f}")
    print(f"   Min: {confidence_not_switch.min():.4f}, Max: {confidence_not_switch.max():.4f}")

# Create results dataframe with confidence scores
results_df = test_df.copy()
results_df['predicted_switch'] = list(test_predictions_labels)
results_df['confidence'] = [test_predictions_proba[i][test_predictions[i]] for i in range(len(test_predictions))]

print(f"\nüìã First 10 predictions with confidence:")
print(results_df[['L1', 'L2', 'L3', 'predicted_switch', 'confidence']].head(10))

# Save results
results_df.to_csv('test_predictions.csv', index=False)
print(f"\n‚úÖ Results saved to test_predictions.csv")

üìä Model Performance Metrics using XGBoost:

üîπ Training Set:
   Log Loss: 0.1409
   Accuracy: 0.9907 (99.07%)

üîπ Test Set Predictions:
   Total predictions: 14000
   not_switch: 4827 (34.5%)
   switch: 9173 (65.5%)

üîπ Prediction Confidence (switch):
   Mean: 0.9110
   Min: 0.5005, Max: 0.9961

üîπ Prediction Confidence (not_switch):
   Mean: 0.8273
   Min: 0.5020, Max: 0.9168

üìã First 10 predictions with confidence:
     L1    L2    L3 predicted_switch  confidence
0  2.55  0.97  5.31           switch    0.896121
1 -1.31 -1.38 -1.29           switch    0.996085
2  3.74  3.66  3.45       not_switch    0.822054
3 -1.71 -1.80 -1.73           switch    0.996085
4  5.29  5.35  5.28       not_switch    0.916097
5  4.75  0.38  1.90           switch    0.858436
6  2.81  2.58  2.99       not_switch    0.819929
7  1.02  2.91  3.71           switch    0.754264
8  3.70  3.32  3.66       not_switch    0.830484
9  4.89  2.08  1.77           switch    0.752867

‚úÖ Results saved to test

In [23]:
# Save the trained model and label encoder
import pickle

# Save XGBoost model
model_path = 'phase_balancing_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)
print(f"‚úÖ Model saved to {model_path}")

# Save label encoder
encoder_path = 'label_encoder.pkl'
with open(encoder_path, 'wb') as f:
    pickle.dump(le, f)
print(f"‚úÖ Label encoder saved to {encoder_path}")

# Verify saved model by loading and testing
with open(model_path, 'rb') as f:
    loaded_model = pickle.load(f)

with open(encoder_path, 'rb') as f:
    loaded_encoder = pickle.load(f)

# Test with sample data
test_sample = [[2.5, 2.6, 2.4], [1.0, 5.5, 2.0]]
test_pred = loaded_model.predict(test_sample)
test_pred_labels = loaded_encoder.inverse_transform(test_pred)
test_pred_proba = loaded_model.predict_proba(test_sample)

print(f"\n‚úÖ Model verification successful!")
print(f"Sample predictions:")
for i, (sample, pred, proba) in enumerate(zip(test_sample, test_pred_labels, test_pred_proba)):
    confidence = proba[test_pred[i]]
    print(f"  L1={sample[0]}, L2={sample[1]}, L3={sample[2]} ‚Üí {pred} (confidence: {confidence:.4f})")

‚úÖ Model saved to phase_balancing_model.pkl
‚úÖ Label encoder saved to label_encoder.pkl

‚úÖ Model verification successful!
Sample predictions:
  L1=2.5, L2=2.6, L3=2.4 ‚Üí not_switch (confidence: 0.7943)
  L1=1.0, L2=5.5, L3=2.0 ‚Üí switch (confidence: 0.8790)
