# Improved Models with Class Balancing

Addressing class imbalance with SMOTE oversampling and class/sample weighting, and evaluating the impact of each on model performance.

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from collections import Counter

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

# Read the feature-engineered splits and their encoded targets from disk
X_train, X_test = np.load('data/X_train_fe.npy'), np.load('data/X_test_fe.npy')
y_train, y_test = np.load('data/y_train_encoded.npy'), np.load('data/y_test_encoded.npy')

# Label encoder object; its `classes_` array is indexed by encoded label below
# to turn the integer targets back into class names
le = joblib.load('data/label_encoder.joblib')

print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')

# Raw counts first, then one line per class with its share of the training set
train_label_counts = Counter(y_train)
n_train = len(y_train)
print('\nOriginal class distribution:')
print(train_label_counts)
for label_id, n_samples in train_label_counts.items():
    share = n_samples / n_train * 100
    print(f'{le.classes_[label_id]}: {n_samples} ({share:.1f}%)')

## Baseline Performance (Without Balancing)

In [None]:
# Reference point: fit both classifiers on the original (unbalanced) training
# data and score them on the test split
rf_baseline = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_pred_baseline = rf_baseline.fit(X_train, y_train).predict(X_test)

gb_baseline = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
gb_pred_baseline = gb_baseline.fit(X_train, y_train).predict(X_test)

print('BASELINE PERFORMANCE (No Balancing)')
print('=' * 60)

# Same three-part report for each model: accuracy, weighted F1, per-class table
for heading, predictions in (
    ('Random Forest:', rf_pred_baseline),
    ('\nGradient Boosting:', gb_pred_baseline),
):
    accuracy = accuracy_score(y_test, predictions)
    weighted_f1 = f1_score(y_test, predictions, average='weighted')
    print(heading)
    print(f'  Accuracy: {accuracy:.4f}')
    print(f'  F1 Score: {weighted_f1:.4f}')
    print(classification_report(y_test, predictions, target_names=le.classes_))

## Apply SMOTE for Class Balancing

In [None]:
# Oversample the training split with SMOTE; X_test / y_test are left untouched
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print('After SMOTE:')
print(f'Training set: {X_train_balanced.shape}')
print('\nBalanced class distribution:')

# One line per class: name, sample count, and share of the resampled set
n_balanced = len(y_train_balanced)
for label_id, n_samples in Counter(y_train_balanced).items():
    share = n_samples / n_balanced * 100
    print(f'{le.classes_[label_id]}: {n_samples} ({share:.1f}%)')

In [None]:
# Visualize class distribution before and after SMOTE, side by side.
#
# Bars are drawn in encoded-class order (i.e. the order of le.classes_) rather
# than in the order labels happen to appear in the data, and each bar's colour
# is looked up by class id. That keeps both panels comparable: a given class is
# always in the same position and always has the same colour.
orig_counts = Counter(y_train)
balanced_counts = Counter(y_train_balanced)

palette = ['#FF6B6B', '#4ECDC4', '#45B7D1']
panels = [
    ('Original Class Distribution', orig_counts),
    ('After SMOTE', balanced_counts),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

for ax, (title, label_counts) in zip(axes, panels):
    # Sort by encoded class id so the x-axis order is deterministic
    per_class = sorted(label_counts.items())
    class_ids = [class_id for class_id, _ in per_class]
    heights = [n_samples for _, n_samples in per_class]

    ax.bar(
        [le.classes_[class_id] for class_id in class_ids],
        heights,
        # Modulo keeps this working if there are more classes than colours
        color=[palette[class_id % len(palette)] for class_id in class_ids],
    )
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_ylabel('Count')
    # 10% headroom above the tallest bar
    ax.set_ylim(0, max(heights) * 1.1)

plt.tight_layout()
plt.savefig('data/class_distribution.png', dpi=150)
plt.show()

## Train Models with Balanced Data

In [None]:
# Same two model configurations as the baseline, but fitted on the
# SMOTE-resampled training data; evaluation still uses the original test split
rf_smote = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_pred_smote = rf_smote.fit(X_train_balanced, y_train_balanced).predict(X_test)

gb_smote = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
gb_pred_smote = gb_smote.fit(X_train_balanced, y_train_balanced).predict(X_test)

print('PERFORMANCE WITH SMOTE')
print('=' * 60)

# Accuracy, weighted F1 and the per-class report for each model in turn
for heading, predictions in (
    ('Random Forest + SMOTE:', rf_pred_smote),
    ('\nGradient Boosting + SMOTE:', gb_pred_smote),
):
    accuracy = accuracy_score(y_test, predictions)
    weighted_f1 = f1_score(y_test, predictions, average='weighted')
    print(heading)
    print(f'  Accuracy: {accuracy:.4f}')
    print(f'  F1 Score: {weighted_f1:.4f}')
    print(classification_report(y_test, predictions, target_names=le.classes_))

## Try Class Weights Instead

In [None]:
# Alternative to resampling: keep the original training data and reweight it
from sklearn.utils.class_weight import compute_sample_weight

# Random Forest accepts the reweighting directly via class_weight='balanced'
rf_weighted = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)
rf_pred_weighted = rf_weighted.fit(X_train, y_train).predict(X_test)

# GradientBoostingClassifier has no class_weight parameter, so the equivalent
# 'balanced' weighting is computed per sample and handed to fit() instead
gb_weighted = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
gb_weighted.fit(X_train, y_train, sample_weight=sample_weights)
gb_pred_weighted = gb_weighted.predict(X_test)

print('PERFORMANCE WITH CLASS WEIGHTS')
print('=' * 60)

# Accuracy, weighted F1 and the per-class report for each model in turn
for heading, predictions in (
    ('Random Forest (weighted):', rf_pred_weighted),
    ('\nGradient Boosting (weighted):', gb_pred_weighted),
):
    accuracy = accuracy_score(y_test, predictions)
    weighted_f1 = f1_score(y_test, predictions, average='weighted')
    print(heading)
    print(f'  Accuracy: {accuracy:.4f}')
    print(f'  F1 Score: {weighted_f1:.4f}')
    print(classification_report(y_test, predictions, target_names=le.classes_))

## Comparison of All Approaches

In [None]:
# Compare all approaches: one row per (model, balancing strategy) with its
# test-set accuracy and weighted F1, shown as a table, a bar chart and a CSV.

# Single source of truth for row order, so labels and predictions cannot drift
variants = [
    ('RF (Baseline)', rf_pred_baseline),
    ('RF + SMOTE', rf_pred_smote),
    ('RF + Class Weights', rf_pred_weighted),
    ('GB (Baseline)', gb_pred_baseline),
    ('GB + SMOTE', gb_pred_smote),
    ('GB + Sample Weights', gb_pred_weighted),
]

results = pd.DataFrame({
    'Model': [label for label, _ in variants],
    'Accuracy': [accuracy_score(y_test, preds) for _, preds in variants],
    'F1 Score': [f1_score(y_test, preds, average='weighted') for _, preds in variants],
})

print('\n' + '='*70)
print('COMPARISON: Baseline vs SMOTE vs Class Weights')
print('='*70)
print(results.to_string(index=False))
print('='*70)

# Visualize comparison: paired bars (accuracy, F1) per model variant
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(results))
width = 0.35

bars1 = ax.bar(x - width/2, results['Accuracy'], width, label='Accuracy', color='#3498db')
bars2 = ax.bar(x + width/2, results['F1 Score'], width, label='F1 Score', color='#e74c3c')

ax.set_xlabel('Model Variant')
ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison', fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(results['Model'], rotation=45, ha='right')
ax.legend()

# Zoom the y-axis around the observed scores instead of a fixed window: a
# hard-coded range clips or hides bars whenever a score falls outside it.
# Both metrics live in [0, 1], so the padded range is clamped to that interval.
score_values = results[['Accuracy', 'F1 Score']].to_numpy()
y_low = max(0.0, float(score_values.min()) - 0.05)
y_high = min(1.0, float(score_values.max()) + 0.05)
ax.set_ylim(y_low, y_high)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('data/balancing_comparison.png', dpi=150)
plt.show()

# Save results
results.to_csv('data/balancing_results.csv', index=False)

## Per-Class Performance Analysis

In [None]:
# Compare per-class F1 scores of the three Random Forest variants
models_preds = {
    'Baseline': rf_pred_baseline,
    'SMOTE': rf_pred_smote,
    'Weighted': rf_pred_weighted
}

# Encoded labels index le.classes_, so the full label set is 0..n_classes-1
class_ids = np.arange(len(le.classes_))
n_variants = len(models_preds)

fig, ax = plt.subplots(figsize=(10, 6))
x = class_ids
# Share 0.75 of each class slot between the variants (0.25 each for three)
width = 0.75 / n_variants

for i, (name, preds) in enumerate(models_preds.items()):
    # Passing labels= pins the output to one score per class in le.classes_
    # order. Without it, a class absent from both y_test and preds is dropped
    # and the score array no longer lines up with the x positions.
    f1_scores = f1_score(
        y_test, preds, labels=class_ids, average=None, zero_division=0
    )
    # Centre the group of bars on the class tick
    offset = (i - (n_variants - 1) / 2) * width
    ax.bar(x + offset, f1_scores, width, label=name)

ax.set_xlabel('Class')
ax.set_ylabel('F1 Score')
ax.set_title('Per-Class F1 Scores: Impact of Balancing Techniques', fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(le.classes_)
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('data/per_class_comparison.png', dpi=150)
plt.show()

## Save Best Model

In [None]:
# Determine best model based on weighted F1 score from the comparison table.
# NOTE(review): `results` is computed on the test split, so choosing the winner
# here means the reported F1 is an optimistic estimate for the saved model.
best_idx = results['F1 Score'].idxmax()
best_model_name = results.loc[best_idx, 'Model']
best_f1 = results.loc[best_idx, 'F1 Score']

print(f'Best model: {best_model_name} with F1={best_f1:.4f}')

# Every row of the comparison table maps to its fitted estimator, including the
# baselines. Previously a winning baseline matched no branch, so nothing was
# written to disk even though the "Model saved" message was still printed.
candidates = {
    'RF (Baseline)': (rf_baseline, 'Random Forest baseline (no balancing)'),
    'RF + SMOTE': (rf_smote, 'Random Forest trained on SMOTE data'),
    'RF + Class Weights': (rf_weighted, 'Random Forest with class weights'),
    'GB (Baseline)': (gb_baseline, 'Gradient Boosting baseline (no balancing)'),
    'GB + SMOTE': (gb_smote, 'Gradient Boosting trained on SMOTE data'),
    'GB + Sample Weights': (gb_weighted, 'Gradient Boosting with sample weights'),
}

# A KeyError here means the table gained a row that this mapping lacks;
# failing loudly is preferable to silently saving nothing.
best_model, description = candidates[best_model_name]

model_path = 'data/best_model_final.joblib'
joblib.dump(best_model, model_path)
print(f'Saved: {description}')

print(f'\nModel saved as: {model_path}')

## Key Insights

**Findings:**
- Class imbalance affects minority class ("Enrolled") performance
- SMOTE typically improves recall for minority classes but may reduce overall accuracy
- Class weights provide a good middle ground between resampling the data (SMOTE) and training on the original, unbalanced data
- The best approach depends on the business cost of different error types

**Recommendation:**
If identifying at-risk "Enrolled" students is critical, use SMOTE or weighted models despite slightly lower overall accuracy.