# Baseline Model Training

This notebook trains a baseline sentiment classifier (DistilBERT) on each of four hierarchically nested training sets (100, 250, 500, and 1000 samples) and evaluates every resulting model on the held-out test split.

In [None]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project's src/ directory importable for the local imports below.
# '..' is resolved against the current working directory, so this assumes the
# kernel was started in the notebook's own folder.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate sys.path entries.
src_dir = os.path.abspath(os.path.join('..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

from data_loader import load_and_split_data
from models import SentimentClassifier

## Load Data

In [None]:
# Build all data splits in one call; the result is indexed by split name below.
splits = load_and_split_data()

# Pull out the test and validation frames that the later cells pass to the model.
test_df, val_df = splits['test'], splits['val']

## Train and Evaluate on Different Sizes

We will train a separate model for each training set size and record the test accuracy and F1 score.

In [None]:
# Training-set sizes to sweep; each one must exist in `splits` under 'train_<size>'.
train_sizes = [100, 250, 500, 1000]

# One record per size. Re-created here so re-running the cell starts from empty.
results = []

for size in train_sizes:
    print(f"\n=== Training on {size} samples ===")

    # Training subset for this run.
    train_df = splits[f'train_{size}']

    # A new classifier is built for every size, each with its own output directory.
    # NOTE(review): no random seed is set in this cell; run-to-run reproducibility
    # depends on what SentimentClassifier does internally — confirm.
    output_dir = f"../models/baseline_{size}"
    classifier = SentimentClassifier(
        model_name="distilbert-base-uncased",
        output_dir=output_dir,
    )

    # val_df is handed to train(); the test frame is only used for the final score.
    classifier.train(train_df, val_df, epochs=3, batch_size=16)
    metrics = classifier.evaluate(test_df)
    print(f"Results for {size}: {metrics}")

    # Keep just the two metrics that the learning-curve cell plots.
    record = {
        'train_size': size,
        'accuracy': metrics['eval_accuracy'],
        'f1': metrics['eval_f1'],
    }
    results.append(record)

## Plot Learning Curve

In [None]:
# Tabulate the per-size records gathered by the training loop.
results_df = pd.DataFrame(results)

# Draw both metrics against training-set size on a single axes.
# F1 is plotted first so the default colour cycle matches the legend order.
fig, ax = plt.subplots(figsize=(10, 6))
sizes = results_df['train_size']
ax.plot(sizes, results_df['f1'], marker='o', label='F1 Score')
ax.plot(sizes, results_df['accuracy'], marker='s', label='Accuracy')

# Title and axis labels so the figure stands on its own.
ax.set(
    title='Learning Curve: Baseline Model Performance vs Training Size',
    xlabel='Number of Training Samples',
    ylabel='Score',
)
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Show the table of train_size / accuracy / f1 as this cell's rich output.
results_df