# Semi-Supervised Training

This notebook trains the sentiment classifier on a combined dataset of hard labels (ground truth) and weak labels (generated by k-NN).

In [None]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the local project modules importable by appending the absolute form of
# '../src' to sys.path. os.path.abspath resolves the relative path against the
# current working directory.
# NOTE(review): assumes the kernel's working directory is the notebook's own
# folder, so that '../src' points at the project's source directory — confirm.
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from data_loader import load_and_split_data
from models import SentimentClassifier
from weak_labeling import WeakLabeler

## Load Data

In [None]:
# Load all dataset splits once; the per-size training and unlabeled frames are
# looked up from `splits` later, while the held-out test and validation frames
# are pulled out here because every run shares them.
splits = load_and_split_data()
test_df, val_df = splits['test'], splits['val']

## Initialize Weak Labeler

In [None]:
# Build the weak labeler once; the same instance is reused for every split size
# in the training loop (both to fit the k-NN and to predict weak labels).
# NOTE(review): "all-mpnet-base-v2" is presumably the name of a sentence-embedding
# model that WeakLabeler loads — confirm against weak_labeling.WeakLabeler.
labeler = WeakLabeler(model_name="all-mpnet-base-v2")

## Train Semi-Supervised Models

For each split size:
1. Train k-NN on labeled data.
2. Generate weak labels for unlabeled data.
3. Combine labeled and weak-labeled data.
4. Train classifier on combined data.
5. Evaluate.

In [None]:
# Labeled-set sizes to sweep. Each size must have matching 'train_<size>' and
# 'unlabeled_<size>' entries in `splits`.
train_sizes = [100, 250, 500, 1000]

# Hyperparameters shared by every run, named once so they are easy to find and tune.
N_NEIGHBORS = 5
EPOCHS = 3
BATCH_SIZE = 16
CLASSIFIER_MODEL_NAME = "distilbert-base-uncased"

# Columns kept in the combined training frame: the classifier only needs the
# text and its label, so both sources are reduced to exactly these columns.
TRAIN_COLUMNS = ['sentence', 'label']

# One metrics record per labeled-set size; rebuilt from scratch on every run of
# this cell so re-running it never duplicates rows.
results = []

for size in train_sizes:
    print(f"\n=== Semi-Supervised Training for {size} labeled samples ===")
    train_df = splits[f'train_{size}']
    unlabeled_df = splits[f'unlabeled_{size}']

    # 1. Train k-NN on the hard-labeled samples only.
    knn = labeler.train_knn(train_df, n_neighbors=N_NEIGHBORS)

    # 2. Generate weak labels for the unlabeled pool.
    weak_labeled_df = labeler.predict(knn, unlabeled_df)

    # 3. Combine data.
    # Select the same columns from BOTH frames: pd.concat takes the union of
    # columns, so any extra column present only in train_df would otherwise be
    # NaN-filled for every weak-labeled row. ignore_index=True gives the
    # combined frame a fresh 0..n-1 index in one step.
    combined_df = pd.concat(
        [train_df[TRAIN_COLUMNS], weak_labeled_df[TRAIN_COLUMNS]],
        ignore_index=True,
    )
    print(f"Combined training set size: {len(combined_df)}")

    # 4. Train the classifier on hard + weak labels, validating on the shared val split.
    classifier = SentimentClassifier(
        model_name=CLASSIFIER_MODEL_NAME,
        output_dir=f"../models/semi_supervised_{size}",
    )
    classifier.train(combined_df, val_df, epochs=EPOCHS, batch_size=BATCH_SIZE)

    # 5. Evaluate on the held-out test split.
    metrics = classifier.evaluate(test_df)
    print(f"Results for {size} (Semi-Supervised): {metrics}")

    results.append({
        'train_size': size,
        'accuracy': metrics['eval_accuracy'],
        'f1': metrics['eval_f1']
    })

## Plot Comparison (Baseline vs Semi-Supervised)

Note: The cell below plots only the semi-supervised results. To compare them against the baseline, enter the baseline results manually or load them from a saved file, and add them to the same plot.

In [None]:
# Tabulate the per-size metrics collected by the training loop.
results_df = pd.DataFrame(results)
labeled_sizes = results_df['train_size']

# Draw both metrics on a single axes using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
for metric_col, metric_marker, metric_label in [
    ('f1', 'o', 'Semi-Supervised F1'),
    ('accuracy', 's', 'Semi-Supervised Accuracy'),
]:
    ax.plot(labeled_sizes, results_df[metric_col], marker=metric_marker, label=metric_label)

# Title and axis labels so the figure stands alone when the notebook is skimmed.
ax.set_title('Semi-Supervised Model Performance vs Training Size')
ax.set_xlabel('Number of Labeled Samples')
ax.set_ylabel('Score')
ax.legend()
ax.grid(True)
plt.show()