# Baseline Model Analysis

This notebook analyzes the performance of the XGBoost baseline models trained on Day 5.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the baseline results table (one row per code smell).
# The cells below read the columns listed here, so fail fast with a clear
# message if the CSV is empty or its schema has drifted, instead of hitting
# a KeyError/ValueError halfway through the analysis.
REQUIRED_COLUMNS = ('smell', 'val_accuracy', 'val_f1', 'val_roc_auc', 'training_time_seconds')

results = pd.read_csv('../results/baseline_results.csv')

missing_columns = [name for name in REQUIRED_COLUMNS if name not in results.columns]
assert not missing_columns, f"baseline_results.csv is missing expected columns: {missing_columns}"
assert not results.empty, "baseline_results.csv contains no rows"

results

In [None]:
# 1. Performance Comparison Bar Chart
# Draws one horizontal-bar panel per metric on a 2x2 grid, saves the figure
# as a PNG, and then displays it.
#
# Each spec is: (grid position, results column, bar color, x-axis label,
#                panel title, x-axis limits or None to keep autoscaling).
panel_specs = [
    ((0, 0), 'val_accuracy', 'steelblue',
     'Validation Accuracy', 'Baseline Accuracy by Code Smell', (0, 1.1)),
    ((0, 1), 'val_f1', 'coral',
     'Validation F1 Score', 'Baseline F1 Score by Code Smell', (0, 1.1)),
    ((1, 0), 'val_roc_auc', 'mediumseagreen',
     'Validation ROC-AUC', 'Baseline ROC-AUC by Code Smell', (0, 1.1)),
    # Training time is not bounded like the score metrics, so no fixed x-limits.
    ((1, 1), 'training_time_seconds', 'orchid',
     'Training Time (seconds)', 'Training Time by Code Smell', None),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

for (grid_row, grid_col), metric, bar_color, x_label, panel_title, x_limits in panel_specs:
    ax = axes[grid_row, grid_col]
    ax.barh(results['smell'], results[metric], color=bar_color)
    ax.set_xlabel(x_label)
    ax.set_title(panel_title)
    if x_limits is not None:
        ax.set_xlim(*x_limits)

plt.tight_layout()
plt.savefig('../results/baseline_comparison.png', dpi=300)
plt.show()

In [None]:
# 2. Identify best and worst performing smells
# Ranks smells by validation F1 and reports the two extremes.
print("PERFORMANCE ANALYSIS:")
print("="*60)

f1_scores = results['val_f1']

# idxmax/idxmin return the index label of the extreme row, which is then
# used to look up the matching smell name.
best_smell = results.loc[f1_scores.idxmax(), 'smell']
best_f1 = f1_scores.max()
print(f"\nBest performing (by F1): {best_smell}")
print(f"F1 Score: {best_f1:.4f}")

worst_smell = results.loc[f1_scores.idxmin(), 'smell']
worst_f1 = f1_scores.min()
print(f"\nWorst performing (by F1): {worst_smell}")
print(f"F1 Score: {worst_f1:.4f}")

In [None]:
# 3. Analysis insights
# Prints a short textual summary derived from the per-smell validation F1 scores.
DOCSTRING_SMELL = 'has_no_docstring'
CHALLENGING_F1_THRESHOLD = 0.90   # below this, detection is reported as challenging
NEAR_PERFECT_F1_THRESHOLD = 0.99  # above this, detection is reported as perfect/near-perfect

print("\n\nKEY INSIGHTS:")
print("="*60)

# Check whether 'No Docstring' detection falls below the "challenging" threshold.
# The selection is empty when the results have no row for this smell; guard it
# explicitly, because indexing the first element of an empty selection raises
# IndexError and would abort the rest of the summary.
docstring_f1 = results.loc[results['smell'] == DOCSTRING_SMELL, 'val_f1']
if docstring_f1.empty:
    print(f"'{DOCSTRING_SMELL}' is not present in the results; skipping the docstring check.")
elif docstring_f1.iloc[0] < CHALLENGING_F1_THRESHOLD:
    print("⚠️ 'No Docstring' detection is challenging (F1 < 0.90)")
    print("   Reason: We excluded direct docstring features to prevent leakage.")

# Check for perfect scores
perfect_smells = results.loc[results['val_f1'] > NEAR_PERFECT_F1_THRESHOLD, 'smell'].tolist()
if perfect_smells:
    print(f"✅ Perfect/Near-Perfect detection for: {', '.join(perfect_smells)}")
    print("   Reason: The engineered features (e.g. Radon complexity) align perfectly with the labeling logic.")

# Overall baseline quality
avg_f1 = results['val_f1'].mean()
print(f"\nOverall Average F1: {avg_f1:.4f}")