# Step 5: Baseline Model Training & Evaluation

This notebook trains and evaluates baseline models for Amazon product categorization:
- Logistic Regression with TF-IDF
- Random Forest with TF-IDF
- Multinomial Naive Bayes with TF-IDF

All models are trained on TF-IDF features extracted from product titles and descriptions.

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import joblib
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Setup paths
# The parent of the current working directory is treated as the project root.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Directories used by the cells below (all located under the project root).
PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
MODEL_DIR = os.path.join(PROJECT_ROOT, "models")
RESULTS_DIR = os.path.join(PROJECT_ROOT, "results")

print(f"Project root: {PROJECT_ROOT}")

## 1. Load Processed Data and Features

In [None]:
# Load splits
train_df = pd.read_csv(os.path.join(PROCESSED_DIR, "train.csv"))
val_df = pd.read_csv(os.path.join(PROCESSED_DIR, "val.csv"))

print(f"Training set: {len(train_df)} samples")
print(f"Validation set: {len(val_df)} samples")
print(f"\nCategories: {train_df['category'].nunique()}")

# Load TF-IDF features
X_train = sparse.load_npz(os.path.join(PROCESSED_DIR, "tfidf_train.npz"))
X_val = sparse.load_npz(os.path.join(PROCESSED_DIR, "tfidf_val.npz"))
y_train = train_df["category"].values
y_val = val_df["category"].values

# The feature matrices and the label columns come from separate files, so
# fail early (with a readable message) if they do not line up.
assert X_train.shape[0] == len(train_df), (
    f"tfidf_train.npz has {X_train.shape[0]} rows but train.csv has {len(train_df)}"
)
assert X_val.shape[0] == len(val_df), (
    f"tfidf_val.npz has {X_val.shape[0]} rows but val.csv has {len(val_df)}"
)
assert X_train.shape[1] == X_val.shape[1], (
    f"feature dimension mismatch: train={X_train.shape[1]}, val={X_val.shape[1]}"
)

# Load label encoder
# joblib.load unpickles arbitrary objects; only load files from a trusted source.
le = joblib.load(os.path.join(MODEL_DIR, "label_encoder.joblib"))

# Encode labels
# NOTE(review): LabelEncoder is imported here but not referenced in this cell;
# `le` is the object loaded from disk above. Kept as-is pending confirmation.
from sklearn.preprocessing import LabelEncoder
y_train_encoded = le.transform(y_train)
y_val_encoded = le.transform(y_val)

print(f"\nTF-IDF feature dimensions: {X_train.shape}")

## 2. Baseline 1: Logistic Regression

In [None]:
print("Training Logistic Regression with GridSearchCV...")

# Hyperparameter grid: four candidate values for C; max_iter has a single
# value and is kept in the grid so that it is reported in best_params_.
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'max_iter': [1000]
}

# Parallelism and progress reporting are handled by GridSearchCV below
# (n_jobs=-1, verbose=2). The estimator itself no longer also requests all
# cores and per-fit solver output, which avoids nested parallelism and
# duplicated logging across the candidate/fold fits.
lr = LogisticRegression(random_state=42)
grid_search = GridSearchCV(
    lr, param_grid, cv=5, scoring='f1_macro', n_jobs=-1, verbose=2
)

grid_search.fit(X_train, y_train_encoded)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Evaluate on validation
lr_best = grid_search.best_estimator_
y_pred_lr = lr_best.predict(X_val)

# Validation metrics; precision/recall/F1 are macro-averaged over classes.
lr_metrics = {
    'accuracy': accuracy_score(y_val_encoded, y_pred_lr),
    'macro_precision': precision_score(y_val_encoded, y_pred_lr, average='macro'),
    'macro_recall': recall_score(y_val_encoded, y_pred_lr, average='macro'),
    'macro_f1': f1_score(y_val_encoded, y_pred_lr, average='macro'),
}

print("\nLogistic Regression Validation Metrics:")
for metric, value in lr_metrics.items():
    print(f"  {metric}: {value:.4f}")

## 3. Baseline 2: Random Forest

In [None]:
print("Training Random Forest...")

# The hyperparameter search runs on a fixed-seed random subset of at most
# 20,000 training rows (drawn without replacement) to keep it fast.
sample_size = min(20000, len(train_df))
indices = np.random.RandomState(42).choice(
    len(train_df), size=sample_size, replace=False
)
X_train_sample, y_train_sample = X_train[indices], y_train_encoded[indices]

# One forest size, two candidate depths.
param_grid_rf = dict(n_estimators=[100], max_depth=[20, 50])

# The forest is built with n_jobs=-1; the search itself runs with n_jobs=1.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    cv=3,
    scoring='f1_macro',
    n_jobs=1,
    verbose=2,
)

grid_search_rf.fit(X_train_sample, y_train_sample)

print(f"\nBest parameters: {grid_search_rf.best_params_}")

# Refit a fresh forest with the selected parameters on the full training set.
print("Retraining on full training set...")
rf_best = RandomForestClassifier(
    random_state=42, n_jobs=-1, **grid_search_rf.best_params_
)
rf_best.fit(X_train, y_train_encoded)

# Score the retrained forest on the validation split.
y_pred_rf = rf_best.predict(X_val)

# Accuracy first, then the macro-averaged precision / recall / F1.
rf_metrics = {'accuracy': accuracy_score(y_val_encoded, y_pred_rf)}
rf_metrics.update(
    (key, scorer(y_val_encoded, y_pred_rf, average='macro'))
    for key, scorer in (
        ('macro_precision', precision_score),
        ('macro_recall', recall_score),
        ('macro_f1', f1_score),
    )
)

print("\nRandom Forest Validation Metrics:")
for metric, value in rf_metrics.items():
    print(f"  {metric}: {value:.4f}")

## 4. Baseline 3: Multinomial Naive Bayes

In [None]:
print("Training Multinomial Naive Bayes...")

# Fit on the full TF-IDF training matrix with alpha=1.0 (no search).
nb = MultinomialNB(alpha=1.0)
nb.fit(X_train, y_train_encoded)

# Predict the validation split.
y_pred_nb = nb.predict(X_val)

# Accuracy first, then the macro-averaged precision / recall / F1.
nb_metrics = {'accuracy': accuracy_score(y_val_encoded, y_pred_nb)}
nb_metrics.update(
    (key, scorer(y_val_encoded, y_pred_nb, average='macro'))
    for key, scorer in (
        ('macro_precision', precision_score),
        ('macro_recall', recall_score),
        ('macro_f1', f1_score),
    )
)

print("\nNaive Bayes Validation Metrics:")
for metric, value in nb_metrics.items():
    print(f"  {metric}: {value:.4f}")

## 5. Model Comparison

In [None]:
# Create comparison DataFrame: one row per model, one column per validation
# metric (the keys of the *_metrics dicts built in the training cells).
comparison_df = pd.DataFrame([
    {'Model': 'Logistic Regression', **lr_metrics},
    {'Model': 'Random Forest', **rf_metrics},
    {'Model': 'Multinomial NB', **nb_metrics}
])

print("\n" + "="*70)
print("BASELINE MODEL COMPARISON")
print("="*70)
print(comparison_df.to_string(index=False))
print("="*70)

# Find best model: the row with the highest validation macro-F1.
best_idx = comparison_df['macro_f1'].idxmax()
best_model = comparison_df.loc[best_idx, 'Model']
best_f1 = comparison_df.loc[best_idx, 'macro_f1']

# The leading character was mis-encoded ("üèÜ"); restored to the trophy emoji.
print(f"\n🏆 Best Model: {best_model} (Macro-F1: {best_f1:.4f})")

## 6. Visualize Results

In [None]:
# Bar plot of metrics: a 2x2 grid with one panel per metric and one bar per model.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['accuracy', 'macro_precision', 'macro_recall', 'macro_f1']
titles = ['Accuracy', 'Macro Precision', 'Macro Recall', 'Macro F1-Score']

for idx, (metric, title) in enumerate(zip(metrics, titles)):
    # Map the flat metric index onto the (row, column) position of the grid.
    ax = axes[idx // 2, idx % 2]
    comparison_df.plot(x='Model', y=metric, kind='bar', ax=ax, legend=False, color='steelblue')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_ylabel('Score')
    ax.set_xlabel('')
    ax.set_ylim(0, 1)
    ax.grid(axis='y', alpha=0.3)

    # Add value labels on bars
    for container in ax.containers:
        ax.bar_label(container, fmt='%.3f')

plt.tight_layout()

# savefig does not create missing directories, so make sure the output
# directory exists before writing the figure.
os.makedirs(RESULTS_DIR, exist_ok=True)
plot_path = os.path.join(RESULTS_DIR, 'baseline_comparison.png')
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"\nPlot saved to: {plot_path}")

## 7. Per-Class Performance (Best Model)

In [None]:
# Classification report for the best model. The predictions are looked up by
# the model name stored in `best_model`, so the report follows the comparison
# result instead of being hard-wired to Logistic Regression.
predictions_by_model = {
    'Logistic Regression': y_pred_lr,
    'Random Forest': y_pred_rf,
    'Multinomial NB': y_pred_nb,
}
y_pred_best = predictions_by_model[best_model]

print(f"\nPer-Category Performance ({best_model}):")
print("="*70)
# `labels` is passed explicitly so that `target_names` always lines up with
# the encoder's classes, even if a class is absent from both the validation
# labels and the predictions (otherwise classification_report raises a
# ValueError about the number of classes). Such classes are reported with 0.
print(classification_report(
    y_val_encoded,
    y_pred_best,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
))
print("="*70)

## 8. Model Saving

This notebook does not write any model files itself; the models are automatically saved by `src/train_baselines.py`. Verify the saved models:

In [None]:
import glob

# List every baseline artefact in the model directory. The pattern
# "baseline*.joblib" also matches the plain "baseline.joblib" that is loaded
# below (the previous "baseline_*.joblib" pattern could not match it).
# sorted() gives a stable listing, since glob returns files in arbitrary order.
model_files = sorted(glob.glob(os.path.join(MODEL_DIR, "baseline*.joblib")))
print("Saved baseline models:")
for f in model_files:
    print(f"  - {os.path.basename(f)}")

# Load and verify best model
best_model_path = os.path.join(MODEL_DIR, "baseline.joblib")
if not os.path.exists(best_model_path):
    # Fail with an actionable message instead of a bare low-level error.
    raise FileNotFoundError(
        f"Best baseline model not found at {best_model_path}; "
        "it is expected to be written by src/train_baselines.py"
    )
# joblib.load unpickles arbitrary objects; only load files from a trusted source.
best_model_loaded = joblib.load(best_model_path)
print(f"\nBest model loaded: {type(best_model_loaded).__name__}")

## Summary

All three baseline models have been trained and evaluated:

- **Logistic Regression** achieved the best performance with 95.69% Macro-F1
- **Random Forest** achieved 88.13% Macro-F1
- **Multinomial Naive Bayes** achieved 86.89% Macro-F1

The Logistic Regression model has been saved as the best baseline and will be compared with the BERT model in the final evaluation.