# COE379L Project 3: Classical Model Implementation and Optimization

## Cross-Model Comparison for News Topic Classification

This notebook covers:
- TF-IDF feature extraction (unigrams and bigrams)
- XGBoost classifier with hyperparameter optimization
- Support Vector Machine (SVM) implementation (LinearSVC and RBF kernel)
- Model evaluation and performance metrics
- Training time and inference latency measurement


## 1. Import Required Libraries


In [1]:
# Standard library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    log_loss, 
    confusion_matrix, 
    classification_report
)
from sklearn.pipeline import Pipeline

# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier

# Set style for visualizations
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("Libraries imported successfully!")


Libraries imported successfully!


## 2. Load Preprocessed Data


In [None]:
# Load preprocessed data from the EDA notebook
# If data files don't exist, we'll load from Hugging Face and preprocess
import os

if os.path.exists('data/train_processed.csv') and os.path.exists('data/test_processed.csv'):
    train_df = pd.read_csv('data/train_processed.csv')
    test_df = pd.read_csv('data/test_processed.csv')
    print("Loaded preprocessed data from CSV files")
else:
    # Fallback: Load and preprocess from Hugging Face
    from datasets import load_dataset
    
    print("Loading data from Hugging Face...")
    dataset = load_dataset("ag_news")
    train_df = pd.DataFrame(dataset['train'])
    test_df = pd.DataFrame(dataset['test'])
    
    # Note: AG News dataset has a single 'text' field that already contains title and description
    # The format is typically "Title. Description" - we'll use it directly as combined_text
    train_df['combined_text'] = train_df['text'].astype(str).str.strip()
    test_df['combined_text'] = test_df['text'].astype(str).str.strip()
    
    # Keep only necessary columns
    train_df = train_df[['label', 'combined_text']]
    test_df = test_df[['label', 'combined_text']]

print(f"Training samples: {len(train_df):,}")
print(f"Test samples: {len(test_df):,}")
print(f"\nClass distribution (training):")
print(train_df['label'].value_counts().sort_index())

# Prepare features and labels
X_train = train_df['combined_text'].values
y_train = train_df['label'].values
X_test = test_df['combined_text'].values
y_test = test_df['label'].values

print(f"\nX_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")


Loaded preprocessed data from CSV files
Training samples: 120,000
Test samples: 7,600

Class distribution (training):
label
0    30000
1    30000
2    30000
3    30000
Name: count, dtype: int64

X_train shape: (120000,)
y_train shape: (120000,)
X_test shape: (7600,)
y_test shape: (7600,)


## 3. TF-IDF Feature Extraction

According to the project requirements, we need to generate high-dimensional sparse vector representations using unigrams and bigrams.


In [3]:
# Initialize TF-IDF vectorizer with unigrams and bigrams
# Using max_features to limit dimensionality for computational efficiency
# Adjust max_features based on available memory and computational resources
print("Initializing TF-IDF vectorizer...")
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Unigrams and bigrams
    max_features=50000,  # Limit to top 50k features for efficiency
    min_df=2,            # Ignore terms that appear in fewer than 2 documents
    max_df=0.95,         # Ignore terms that appear in more than 95% of documents
    sublinear_tf=True,   # Apply sublinear tf scaling (1 + log(tf))
    stop_words='english' # Remove English stop words
)

# Fit and transform training data
print("Fitting TF-IDF on training data...")
start_time = time.time()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
tfidf_fit_time = time.time() - start_time

# Transform test data
print("Transforming test data...")
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"\nTF-IDF Feature Extraction Complete!")
print(f"Training TF-IDF fit time: {tfidf_fit_time:.2f} seconds")
print(f"Training features shape: {X_train_tfidf.shape}")
print(f"Test features shape: {X_test_tfidf.shape}")
print(f"Number of features: {X_train_tfidf.shape[1]:,}")
print(f"Sparsity: {(1.0 - X_train_tfidf.nnz / (X_train_tfidf.shape[0] * X_train_tfidf.shape[1])) * 100:.2f}%")


Initializing TF-IDF vectorizer...
Fitting TF-IDF on training data...
Transforming test data...

TF-IDF Feature Extraction Complete!
Training TF-IDF fit time: 6.00 seconds
Training features shape: (120000, 50000)
Test features shape: (7600, 50000)
Number of features: 50,000
Sparsity: 99.95%


## 4. Helper Functions for Model Evaluation

We'll create functions to evaluate models and measure training/inference times.


In [4]:
def evaluate_model(model, X_test, y_test, model_name):
    """
    Evaluate a trained model and return metrics.
    
    Args:
        model: Trained model
        X_test: Test features
        y_test: Test labels
        model_name: Name of the model for display
    
    Returns:
        Dictionary with metrics
    """
    # Predictions
    start_time = time.time()
    y_pred = model.predict(X_test)
    inference_time = time.time() - start_time
    
    # Probabilities (for log loss)
    try:
        y_pred_proba = model.predict_proba(X_test)
    except:
        # Some models might not have predict_proba
        y_pred_proba = None
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1_macro = f1_score(y_test, y_pred, average='macro')
    
    log_loss_score = None
    if y_pred_proba is not None:
        log_loss_score = log_loss(y_test, y_pred_proba)
    
    # Inference latency per 1000 samples
    num_samples = len(X_test)
    inference_latency_per_1k = (inference_time / num_samples) * 1000
    
    results = {
        'model_name': model_name,
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'log_loss': log_loss_score,
        'inference_time': inference_time,
        'inference_latency_per_1k': inference_latency_per_1k,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }
    
    return results

def measure_inference_latency(model, X_test, num_samples=1000):
    """
    Measure inference latency for a specific number of samples.
    
    Args:
        model: Trained model
        X_test: Test features
        num_samples: Number of samples to test (default 1000)
    
    Returns:
        Time taken for inference
    """
    # Sample random indices
    indices = np.random.choice(len(X_test), min(num_samples, len(X_test)), replace=False)
    X_sample = X_test[indices]
    
    # Warm-up
    _ = model.predict(X_sample[:10])
    
    # Measure inference time
    start_time = time.time()
    _ = model.predict(X_sample)
    inference_time = time.time() - start_time
    
    return inference_time

print("Helper functions defined!")


Helper functions defined!


In [5]:
# Initialize XGBoost classifier
# Note: XGBoost works better with dense matrices, but can handle sparse
# For large sparse matrices, we might need to convert or use a subset
print("=" * 80)
print("XGBoost Model Training")
print("=" * 80)

# For computational efficiency with large sparse matrices, we'll use a subset for hyperparameter tuning
# Then train final model on full dataset with best parameters
print("\nStep 1: Hyperparameter tuning on subset of data...")

# Use a subset for faster hyperparameter search
subset_size = 20000
indices = np.random.choice(len(X_train_tfidf), subset_size, replace=False)
X_train_subset = X_train_tfidf[indices]
y_train_subset = y_train[indices]

# Convert sparse matrix to dense for XGBoost (or use sparse matrix support)
# XGBoost supports sparse matrices, but dense might be faster for smaller subsets
X_train_subset_dense = X_train_subset.toarray()

# Define parameter grid for hyperparameter tuning
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

# Initialize base model
xgb_base = XGBClassifier(
    random_state=42,
    n_jobs=-1,
    eval_metric='mlogloss',
    use_label_encoder=False
)

# Use RandomizedSearchCV for faster search
print("Running RandomizedSearchCV...")
start_time = time.time()
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_grid_xgb,
    n_iter=10,  # Number of parameter settings sampled
    cv=3,        # 3-fold cross-validation
    scoring='f1_macro',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

random_search_xgb.fit(X_train_subset_dense, y_train_subset)
hyperparameter_search_time = time.time() - start_time

print(f"\nHyperparameter search completed in {hyperparameter_search_time:.2f} seconds")
print(f"Best parameters: {random_search_xgb.best_params_}")
print(f"Best CV score: {random_search_xgb.best_score_:.4f}")


XGBoost Model Training

Step 1: Hyperparameter tuning on subset of data...


TypeError: sparse array length is ambiguous; use getnnz() or shape[0]

In [None]:
# Step 2: Train final XGBoost model on full dataset with best parameters
print("\nStep 2: Training final XGBoost model on full dataset...")

# For full dataset, we'll use a sample if the dataset is too large
# XGBoost can handle large datasets, but for memory efficiency, we might sample
full_train_size = min(50000, len(X_train_tfidf))  # Use up to 50k samples for final training
if full_train_size < len(X_train_tfidf):
    print(f"Using {full_train_size:,} samples for final training (for computational efficiency)")
    indices_full = np.random.choice(len(X_train_tfidf), full_train_size, replace=False)
    X_train_final = X_train_tfidf[indices_full].toarray()
    y_train_final = y_train[indices_full]
else:
    X_train_final = X_train_tfidf.toarray()
    y_train_final = y_train

# Create final model with best parameters
xgb_final = XGBClassifier(
    **random_search_xgb.best_params_,
    random_state=42,
    n_jobs=-1,
    eval_metric='mlogloss',
    use_label_encoder=False
)

# Train final model
print("Training final model...")
start_time = time.time()
xgb_final.fit(X_train_final, y_train_final)
xgb_training_time = time.time() - start_time

print(f"Training completed in {xgb_training_time:.2f} seconds")

# Evaluate model
print("\nEvaluating XGBoost model...")
X_test_dense = X_test_tfidf.toarray()
xgb_results = evaluate_model(xgb_final, X_test_dense, y_test, "XGBoost")

print(f"\nXGBoost Results:")
print(f"  Accuracy: {xgb_results['accuracy']:.4f}")
print(f"  Macro F1-Score: {xgb_results['f1_macro']:.4f}")
print(f"  Log Loss: {xgb_results['log_loss']:.4f}")
print(f"  Training Time: {xgb_training_time:.2f} seconds")
print(f"  Inference Latency (per 1,000 samples): {xgb_results['inference_latency_per_1k']:.4f} seconds")


## 6. Support Vector Machine (SVM) - LinearSVC

We'll implement LinearSVC which is more efficient than SVC for large datasets.


In [None]:
# LinearSVC is more efficient for large sparse matrices
print("=" * 80)
print("SVM LinearSVC Model Training")
print("=" * 80)

# Define parameter grid for LinearSVC
param_grid_svm_linear = {
    'C': [0.1, 1.0, 10.0, 100.0],
    'penalty': ['l2'],
    'loss': ['squared_hinge'],
    'max_iter': [1000, 2000]
}

# Initialize base model
svm_linear_base = LinearSVC(random_state=42, dual=False)  # dual=False for n_samples > n_features

# Use RandomizedSearchCV
print("Running RandomizedSearchCV for LinearSVC...")
start_time = time.time()
random_search_svm_linear = RandomizedSearchCV(
    estimator=svm_linear_base,
    param_distributions=param_grid_svm_linear,
    n_iter=8,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

# Use subset for hyperparameter tuning
random_search_svm_linear.fit(X_train_subset, y_train_subset)
svm_linear_hyperparameter_time = time.time() - start_time

print(f"\nHyperparameter search completed in {svm_linear_hyperparameter_time:.2f} seconds")
print(f"Best parameters: {random_search_svm_linear.best_params_}")
print(f"Best CV score: {random_search_svm_linear.best_score_:.4f}")


In [None]:
# Train final LinearSVC model on full dataset
print("\nTraining final LinearSVC model on full dataset...")

svm_linear_final = LinearSVC(
    **random_search_svm_linear.best_params_,
    random_state=42,
    dual=False
)

start_time = time.time()
svm_linear_final.fit(X_train_tfidf, y_train)
svm_linear_training_time = time.time() - start_time

print(f"Training completed in {svm_linear_training_time:.2f} seconds")

# Evaluate model
print("\nEvaluating LinearSVC model...")
svm_linear_results = evaluate_model(svm_linear_final, X_test_tfidf, y_test, "SVM-LinearSVC")

# LinearSVC doesn't have predict_proba by default, so log_loss will be None
# This is expected behavior for LinearSVC
print(f"\nSVM LinearSVC Results:")
print(f"  Accuracy: {svm_linear_results['accuracy']:.4f}")
print(f"  Macro F1-Score: {svm_linear_results['f1_macro']:.4f}")
print(f"  Log Loss: {svm_linear_results['log_loss']} (LinearSVC doesn't support probability estimates)")
print(f"  Training Time: {svm_linear_training_time:.2f} seconds")
print(f"  Inference Latency (per 1,000 samples): {svm_linear_results['inference_latency_per_1k']:.4f} seconds")


## 7. Support Vector Machine (SVM) - RBF Kernel

We'll also implement SVC with RBF kernel for comparison, though it's computationally more expensive.


In [None]:
# SVC with RBF kernel - more computationally expensive
# We'll use a smaller subset due to computational constraints
print("=" * 80)
print("SVM RBF Kernel Model Training")
print("=" * 80)
print("Note: RBF kernel is computationally expensive. Using smaller subset for training.")

# Use smaller subset for RBF kernel
rbf_subset_size = 10000
indices_rbf = np.random.choice(len(X_train_tfidf), rbf_subset_size, replace=False)
X_train_rbf = X_train_tfidf[indices_rbf].toarray()  # RBF needs dense matrix
y_train_rbf = y_train[indices_rbf]

# Define parameter grid for RBF SVC
param_grid_svm_rbf = {
    'C': [0.1, 1.0, 10.0],
    'gamma': ['scale', 'auto', 0.001, 0.01]
}

# Initialize base model
svm_rbf_base = SVC(kernel='rbf', random_state=42, probability=True)  # probability=True for predict_proba

# Use RandomizedSearchCV
print("Running RandomizedSearchCV for RBF SVC...")
start_time = time.time()
random_search_svm_rbf = RandomizedSearchCV(
    estimator=svm_rbf_base,
    param_distributions=param_grid_svm_rbf,
    n_iter=6,  # Fewer iterations due to computational cost
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

random_search_svm_rbf.fit(X_train_rbf, y_train_rbf)
svm_rbf_hyperparameter_time = time.time() - start_time

print(f"\nHyperparameter search completed in {svm_rbf_hyperparameter_time:.2f} seconds")
print(f"Best parameters: {random_search_svm_rbf.best_params_}")
print(f"Best CV score: {random_search_svm_rbf.best_score_:.4f}")


In [None]:
# Train final RBF SVC model
print("\nTraining final RBF SVC model...")

svm_rbf_final = SVC(
    **random_search_svm_rbf.best_params_,
    kernel='rbf',
    random_state=42,
    probability=True
)

start_time = time.time()
svm_rbf_final.fit(X_train_rbf, y_train_rbf)
svm_rbf_training_time = time.time() - start_time

print(f"Training completed in {svm_rbf_training_time:.2f} seconds")

# Evaluate model
print("\nEvaluating RBF SVC model...")
X_test_dense = X_test_tfidf.toarray()
svm_rbf_results = evaluate_model(svm_rbf_final, X_test_dense, y_test, "SVM-RBF")

print(f"\nSVM RBF Results:")
print(f"  Accuracy: {svm_rbf_results['accuracy']:.4f}")
print(f"  Macro F1-Score: {svm_rbf_results['f1_macro']:.4f}")
print(f"  Log Loss: {svm_rbf_results['log_loss']:.4f}")
print(f"  Training Time: {svm_rbf_training_time:.2f} seconds")
print(f"  Inference Latency (per 1,000 samples): {svm_rbf_results['inference_latency_per_1k']:.4f} seconds")


## 8. Results Summary and Comparison


In [None]:
# Compile all results
results_summary = []

# Add XGBoost results
results_summary.append({
    'Model': 'XGBoost',
    'Accuracy': xgb_results['accuracy'],
    'Macro F1-Score': xgb_results['f1_macro'],
    'Log Loss': xgb_results['log_loss'],
    'Training Time (s)': xgb_training_time,
    'Inference Latency per 1k (s)': xgb_results['inference_latency_per_1k']
})

# Add LinearSVC results
results_summary.append({
    'Model': 'SVM-LinearSVC',
    'Accuracy': svm_linear_results['accuracy'],
    'Macro F1-Score': svm_linear_results['f1_macro'],
    'Log Loss': svm_linear_results['log_loss'] if svm_linear_results['log_loss'] is not None else np.nan,
    'Training Time (s)': svm_linear_training_time,
    'Inference Latency per 1k (s)': svm_linear_results['inference_latency_per_1k']
})

# Add RBF SVC results
results_summary.append({
    'Model': 'SVM-RBF',
    'Accuracy': svm_rbf_results['accuracy'],
    'Macro F1-Score': svm_rbf_results['f1_macro'],
    'Log Loss': svm_rbf_results['log_loss'],
    'Training Time (s)': svm_rbf_training_time,
    'Inference Latency per 1k (s)': svm_rbf_results['inference_latency_per_1k']
})

# Create DataFrame
results_df = pd.DataFrame(results_summary)

print("=" * 80)
print("CLASSICAL MODELS - RESULTS SUMMARY")
print("=" * 80)
print(results_df.to_string(index=False))
print("=" * 80)

# Save results
results_df.to_csv('data/classical_models_results.csv', index=False)
print("\nResults saved to data/classical_models_results.csv")


## 9. Visualizations


In [None]:
# Bar chart comparing F1-scores
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# F1-Score comparison
models = results_df['Model'].values
f1_scores = results_df['Macro F1-Score'].values

axes[0].bar(models, f1_scores, color=['steelblue', 'coral', 'lightgreen'])
axes[0].set_title('Macro F1-Score Comparison - Classical Models', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Macro F1-Score', fontsize=12)
axes[0].set_ylim([0, 1])
axes[0].grid(axis='y', alpha=0.3)
for i, v in enumerate(f1_scores):
    axes[0].text(i, v + 0.01, f'{v:.4f}', ha='center', fontweight='bold')

# Training time comparison
training_times = results_df['Training Time (s)'].values
axes[1].bar(models, training_times, color=['steelblue', 'coral', 'lightgreen'])
axes[1].set_title('Training Time Comparison - Classical Models', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Training Time (seconds)', fontsize=12)
axes[1].set_yscale('log')  # Log scale for better visualization
axes[1].grid(axis='y', alpha=0.3)
for i, v in enumerate(training_times):
    axes[1].text(i, v * 1.2, f'{v:.1f}s', ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('data/classical_models_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
print("Visualization saved to data/classical_models_comparison.png")


## 10. Confusion Matrices

We'll create confusion matrices for the best-performing classical model.


In [None]:
# Find best model based on F1-score
best_model_idx = results_df['Macro F1-Score'].idxmax()
best_model_name = results_df.loc[best_model_idx, 'Model']

print(f"Best performing classical model: {best_model_name}")
print(f"F1-Score: {results_df.loc[best_model_idx, 'Macro F1-Score']:.4f}")

# Get predictions for best model
if best_model_name == 'XGBoost':
    best_predictions = xgb_results['y_pred']
    best_model_obj = xgb_final
elif best_model_name == 'SVM-LinearSVC':
    best_predictions = svm_linear_results['y_pred']
    best_model_obj = svm_linear_final
else:  # SVM-RBF
    best_predictions = svm_rbf_results['y_pred']
    best_model_obj = svm_rbf_final

# Class labels
class_labels = ['World', 'Sports', 'Business', 'Sci/Tech']

# Create confusion matrix
cm = confusion_matrix(y_test, best_predictions)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_labels, yticklabels=class_labels,
            cbar_kws={'label': 'Count'})
plt.title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()
plt.savefig('data/classical_models_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()
print("Confusion matrix saved to data/classical_models_confusion_matrix.png")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, best_predictions, target_names=class_labels))


## 11. Save Models and Results

We'll save the trained models and all results for later use.


In [None]:
# Save models using joblib
import joblib

print("Saving models and vectorizer...")

# Save TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'data/tfidf_vectorizer.pkl')
print("✓ TF-IDF vectorizer saved")

# Save XGBoost model
joblib.dump(xgb_final, 'data/xgb_model.pkl')
print("✓ XGBoost model saved")

# Save LinearSVC model
joblib.dump(svm_linear_final, 'data/svm_linear_model.pkl')
print("✓ SVM LinearSVC model saved")

# Save RBF SVC model
joblib.dump(svm_rbf_final, 'data/svm_rbf_model.pkl')
print("✓ SVM RBF model saved")

print("\nAll models and results saved successfully!")
print("Ready for comparison with transformer models in the next notebook.")
