In [None]:
# !pip install datasets
# !pip install transformers[torch]

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, losses, InputExample
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import Dataset
from datasets import Dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint that every later cell fine-tunes or evaluates against.
base_model = 'sentence-transformers/all-mpnet-base-v2'

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f"Using device: {device}")

Using device: cpu


In [27]:
# Read the triplet file (one JSON object per line) and show the first rows.
df = pd.read_json(
    path_or_buf='./Data/synthetic_data_for_contrastive_learning.jsonl',
    lines=True,
)
df.head(5)

Unnamed: 0,model_name,anchor_story,similar_story,dissimilar_story
0,meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo,"A mysterious individual, known only by their a...","In the secluded hamlet of Ravenshire, a myster...","In the coastal city of Tidal Cove, a reclusive..."
1,gpt-4o,A mysterious drifter arrives in the lawless fr...,A lone wanderer arrives in the turbulent minin...,"In a sprawling, rain-soaked city, a quiet mech..."
2,OpenAI GPT4o Mini,"A team of paranormal investigators, led by sea...","A group of spectral researchers, led by experi...","In a bustling modern city, a group of amateur ..."
3,OpenAI GPT 5 Chat,A prolonged drought devastates a rural farming...,A severe heatwave grips the remote farming set...,"In a remote coastal town, a series of mysterio..."
4,meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo,The film revolves around a Marine who is sever...,The film follows Gunnery Sergeant Ryder Thomps...,"In a dystopian future, Captain Rachel Kim, a r..."


In [28]:
def load_and_prepare_data(data_path='./Data/synthetic_data_for_contrastive_learning.jsonl'):
    """
    Load triplet data and convert it to the training format.

    Rows with a missing value in any of the three story columns are dropped
    before conversion.

    Args:
        data_path: Path to a JSON Lines file with the columns
            'anchor_story', 'similar_story' and 'dissimilar_story'

    Returns:
        List of InputExample objects whose ``texts`` are
        [anchor, similar, dissimilar]
    """
    story_columns = ['anchor_story', 'similar_story', 'dissimilar_story']

    # Read from the caller-supplied path (previously the default path was
    # hard-coded here, so the argument was silently ignored).
    df = pd.read_json(data_path, lines=True)

    df = df.dropna(subset=story_columns)

    print(f"Loaded {len(df)} examples from {data_path}")

    # Walk the three columns in lockstep; avoids building a Series per row.
    examples = [
        InputExample(texts=[anchor, similar, dissimilar])
        for anchor, similar, dissimilar in zip(
            df['anchor_story'], df['similar_story'], df['dissimilar_story']
        )
    ]
    print(f"Created {len(examples)} training examples")
    return examples

In [29]:
def create_train_val_split(examples, val_size=0.15, random_state=42):
    """
    Partition examples into a training part and a validation part.

    Args:
        examples: Sequence of examples to split
        val_size: Forwarded to train_test_split as ``test_size``
        random_state: Forwarded to train_test_split as ``random_state``

    Returns:
        Tuple of (train part, validation part)
    """
    split_parts = train_test_split(examples, test_size=val_size, random_state=random_state)
    train_part, val_part = split_parts
    print(f"Train: {len(train_part)}, Val: {len(val_part)}")
    return train_part, val_part

In [None]:
def fine_tune(train_examples, output_path, epochs, batch_size, warmup_steps, val_examples=None):
    """
    Fine-tune the base model using triplet loss.

    Reads the notebook-level ``base_model`` and ``device`` settings.

    Args:
        train_examples: List of InputExample triplets [anchor, similar, dissimilar]
        output_path: Directory passed to ``model.fit`` as ``output_path``
        epochs: Number of training epochs
        batch_size: Number of triplets per training batch
        warmup_steps: Number of learning-rate warmup steps
        val_examples: Optional list of InputExample triplets. When given, a
            TripletEvaluator is built from them and passed to ``model.fit`` so
            that ``save_best_model=True`` has a score to compare against.
            When omitted, training runs without an evaluator, as before.

    Returns:
        The fine-tuned SentenceTransformer model

    Raises:
        ValueError: If ``train_examples`` is empty
    """
    if not train_examples:
        raise ValueError("train_examples is empty; nothing to fine-tune on")

    print("\n" + "="*60)
    print("Starting Fine-Tuning")
    print("="*60)

    # Load base model
    print(f"Loading base model: {base_model}")
    model = SentenceTransformer(base_model, device=device)

    # Create dataloader
    train_dataloader = DataLoader(
        train_examples,
        shuffle=True,
        batch_size=batch_size
    )

    # Define loss function (Triplet Loss)
    train_loss = losses.TripletLoss(model)

    # Optional validation evaluator; without one, save_best_model has no
    # score to act on.
    evaluator = None
    if val_examples:
        from sentence_transformers.evaluation import TripletEvaluator
        evaluator = TripletEvaluator.from_input_examples(val_examples, name='val')

    # Calculate training steps
    steps_per_epoch = len(train_dataloader)
    total_steps = steps_per_epoch * epochs

    print(f"\nTraining Configuration:")
    print(f"  Epochs: {epochs}")
    print(f"  Batch size: {batch_size}")
    print(f"  Steps per epoch: {steps_per_epoch}")
    print(f"  Total steps: {total_steps}")
    print(f"  Warmup steps: {warmup_steps}")

    # Warmup that spans the whole schedule means the learning rate is still
    # ramping up when training ends; surface that instead of hiding it.
    if warmup_steps >= total_steps:
        print(f"  WARNING: warmup_steps ({warmup_steps}) >= total steps ({total_steps}); "
              "the learning rate will not get past warmup")

    # Fine-tune with lower learning rate
    print("\nTraining...")
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        epochs=epochs,
        warmup_steps=warmup_steps,
        optimizer_params={'lr': 5e-6},
        output_path=output_path,
        show_progress_bar=True,
        save_best_model=True
    )

    print(f"\nModel saved to: {output_path}")
    return model

In [None]:

def evaluate_on_test(model_path, test_data_path):
    """
    Evaluate a model on a two-candidate similarity test set.

    For every row the model predicts that ``text_a`` is the closer candidate
    when cos(anchor_text, text_a) > cos(anchor_text, text_b); the prediction
    is compared against the ``text_a_is_closer`` column.

    Args:
        model_path: Path (or model name) passed to SentenceTransformer
        test_data_path: Path to a JSON Lines file with the columns
            'anchor_text', 'text_a', 'text_b' and 'text_a_is_closer'

    Returns:
        Tuple of (accuracy, predictions), where predictions is a list of
        plain bools, one per row

    Raises:
        ValueError: If the test file contains no rows
    """
    print("\n" + "="*60)
    print("Evaluating Fine-tuned Model")
    print("="*60)

    # Load model
    model = SentenceTransformer(model_path, device=device)

    # Load test data
    df = pd.read_json(test_data_path, lines=True)
    print(f"Test set size: {len(df)}")

    if len(df) == 0:
        # Accuracy is undefined on an empty set; fail with a clear message
        # rather than a ZeroDivisionError further down.
        raise ValueError(f"No test rows found in {test_data_path}")

    def _embed(column):
        # Encode a whole column in one call so the model can batch internally.
        return np.asarray(model.encode(df[column].tolist()))

    def _rowwise_cosine(left, right):
        # Cosine similarity between matching rows; zero-norm vectors score 0.
        dots = np.einsum('ij,ij->i', left, right)
        norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
        return dots / np.where(norms == 0, 1.0, norms)

    anchor_emb = _embed('anchor_text')
    sim_a = _rowwise_cosine(anchor_emb, _embed('text_a'))
    sim_b = _rowwise_cosine(anchor_emb, _embed('text_b'))

    # Predict: True means text_a is judged closer to the anchor
    predictions = [bool(flag) for flag in (sim_a > sim_b)]

    labels = df['text_a_is_closer'].tolist()
    correct = sum(1 for pred, label in zip(predictions, labels) if pred == label)

    accuracy = correct / len(df)

    print(f"\n✓ Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"  Correct: {correct}/{len(df)}")

    return accuracy, predictions

In [None]:
"""
Complete pipeline: train, evaluate, and compare with baseline
"""

# Needed by the baseline loop below; without it the cell fails with a
# NameError on a fresh kernel.
from sklearn.metrics.pairwise import cosine_similarity


# 1. Load and prepare training data
print("\n[1/5] Loading training data...")
train_examples = load_and_prepare_data()

# 2. Split into train/val
# NOTE: val_examples is held out here but not consumed later in this cell.
print("\n[2/5] Splitting data...")
train_examples, val_examples = create_train_val_split(train_examples)

# 3. Evaluate baseline (before fine-tuning)
print("\n[3/5] Evaluating baseline model...")
# Load onto the same device as the fine-tuned model so both are evaluated alike.
baseline_model = SentenceTransformer(base_model, device=device)

test_data_path = 'dev_track_a.jsonl'
df_test = pd.read_json(test_data_path, lines=True)

correct_baseline = 0
for idx, row in df_test.iterrows():
    # Embedding order: [anchor, candidate a, candidate b]
    embeddings = baseline_model.encode([row['anchor_text'], row['text_a'], row['text_b']])
    sim_a = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    sim_b = cosine_similarity([embeddings[0]], [embeddings[2]])[0][0]
    # A row counts as correct when "a is more similar" matches the label.
    if (sim_a > sim_b) == row['text_a_is_closer']:
        correct_baseline += 1

baseline_acc = correct_baseline / len(df_test)
print(f"Baseline accuracy: {baseline_acc:.4f} ({baseline_acc*100:.2f}%)")

# 4. Fine-tune
print("\n[4/5] Fine-tuning model...")
finetuned_model = fine_tune(
    train_examples,
    output_path='./finetuned_narrative_model',
    epochs=1,
    batch_size=8,
    warmup_steps=100
)

# 5. Evaluate fine-tuned model (reloaded from the directory written in step 4)
print("\n[5/5] Evaluating fine-tuned model...")
finetuned_acc, _ = evaluate_on_test(
    './finetuned_narrative_model',
    test_data_path
)

# Summary
print("\n" + "="*60)
print("FINAL COMPARISON")
print("="*60)
print(f"Baseline:    {baseline_acc:.4f} ({baseline_acc*100:.2f}%)")
print(f"Fine-tuned:  {finetuned_acc:.4f} ({finetuned_acc*100:.2f}%)")
improvement = (finetuned_acc - baseline_acc) * 100
print(f"Improvement: {improvement:+.2f} percentage points")

if finetuned_acc > baseline_acc:
    print(f"\n Fine-tuning improved performance!")
else:
    print(f"\n Fine-tuning did not improve performance")
    print("   Consider: more data, different hyperparameters, or data quality issues")



[1/5] Loading training data...
Loaded 1897 examples from ./Data/synthetic_data_for_contrastive_learning.jsonl
Created 1897 training examples

[2/5] Splitting data...
Train: 1612, Val: 285

[4/5] Fine-tuning model...

Starting Fine-Tuning
Loading base model: sentence-transformers/all-mpnet-base-v2

Training Configuration:
  Epochs: 1
  Batch size: 16
  Steps per epoch: 101
  Total steps: 101
  Warmup steps: 100

Training...
101




KeyboardInterrupt: 

In [33]:
# Print the first 100 characters of each text in the first three training triplets.
for i in range(3):
    triplet_texts = train_examples[i].texts
    print(f"\nExample {i}:")
    print(f"Anchor: {triplet_texts[0][:100]}")
    print(f"Similar: {triplet_texts[1][:100]}")
    print(f"Dissimilar: {triplet_texts[2][:100]}")


Example 0:
Anchor: Two rival bands, the Silver Strikes and the Midnight Echoes, compete for dominance in their local mu
Similar: Two competing dance crews, the Fire Steps and the Shadow Beats, vie for supremacy in their city's un
Dissimilar: In a bustling coastal town, two rival street artists, Lina and Marco, vie for the prime mural spaces

Example 1:
Anchor: The film follows the life of a former priest who, after a crisis of faith, becomes an outspoken athe
Similar: The film follows the life of a former imam who, after a crisis of faith, becomes an outspoken skepti
Dissimilar: In a dystopian future, a brilliant scientist, haunted by the consequences of her groundbreaking tech

Example 2:
Anchor: In a world on the brink of environmental catastrophe, rising global temperatures and sea levels wrea
Similar: In a world teetering on the edge of ecological collapse, soaring global temperatures and rising ocea
Dissimilar: In a dystopian metropolis, a lone archivist uncovers a hidden reposit