# Weak Labeling

This notebook generates weak labels for the unlabeled data using k-Nearest Neighbors on sentence embeddings.

In [None]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score

# Make the sibling `src` directory importable so the local modules imported
# just below can be found (presumably data_loader and weak_labeling live there).
# NOTE(review): '..' is resolved against the current working directory, so
# this assumes the notebook is run from its own folder — confirm.
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from data_loader import load_and_split_data
from weak_labeling import WeakLabeler

## Load Data

In [None]:
# Load the dataset splits once; the cells below look up entries by the keys
# 'train_<size>' and 'unlabeled_<size>'.
splits = load_and_split_data()

## Initialize Weak Labeler

We use `all-mpnet-base-v2` for generating embeddings.

In [None]:
# Build the weak labeler with the `all-mpnet-base-v2` embedding model; the
# same instance is reused for every split size in the loop below.
labeler = WeakLabeler(model_name="all-mpnet-base-v2")

## Generate and Evaluate Weak Labels

For each split size, we fit a k-NN classifier (k=5) on the labeled set and predict labels for the unlabeled set. We then compare these weak labels against the true labels (which are retained in the unlabeled split for evaluation purposes only), reporting accuracy and macro-averaged F1.

In [None]:
# Seed-set sizes to evaluate. `splits` must provide a matching
# 'train_<size>' / 'unlabeled_<size>' pair for each of them.
train_sizes = [100, 250, 500, 1000]

# One metrics record per seed-set size, consumed by the plotting cell.
results = []

for size in train_sizes:
    print(f"\n=== Weak Labeling for {size} labeled samples ===")
    train_df, unlabeled_df = splits[f'train_{size}'], splits[f'unlabeled_{size}']

    # Fit the k-NN on the labeled seed set, then weak-label the rest.
    knn = labeler.train_knn(train_df, n_neighbors=5)
    weak_labeled_df = labeler.predict(knn, unlabeled_df)

    # The "unlabeled" split still carries its ground-truth 'label' column,
    # which is what makes this quality check possible.
    # NOTE(review): this assumes predict() does not overwrite 'label' on
    # unlabeled_df in place — confirm against WeakLabeler.
    true_labels, predicted_labels = unlabeled_df['label'], weak_labeled_df['label']

    acc = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='macro')
    print(f"Weak Label Quality (Size {size}): Accuracy={acc:.4f}, F1={f1:.4f}")

    results.append(dict(train_size=size, weak_accuracy=acc, weak_f1=f1))

    # Optional: persist the weak-labeled frame for the next step instead of
    # regenerating it there.
    # weak_labeled_df.to_csv(f"../data/weak_labeled_{size}.csv", index=False)

## Plot Weak Label Quality

In [None]:
# Tabulate the per-size metrics collected by the weak-labeling loop.
results_df = pd.DataFrame(results)

plt.figure(figsize=(10, 6))

# Draw one curve per metric against the seed-set size; F1 is plotted first
# so the default color assignment stays the same.
for metric_column, marker_style, legend_text in (
    ('weak_f1', 'o', 'Weak Label F1'),
    ('weak_accuracy', 's', 'Weak Label Accuracy'),
):
    plt.plot(
        results_df['train_size'],
        results_df[metric_column],
        marker=marker_style,
        label=legend_text,
    )

plt.title('Quality of Weak Labels vs Seed Training Size')
plt.xlabel('Number of Labeled Samples')
plt.ylabel('Score')
plt.legend()
plt.grid(True)
plt.show()