In [2]:
# Package conflicts: manual environment fixes (all commands below are commented out)

# !pip uninstall -y transformers huggingface_hub
# !pip cache purge
# !pip install "transformers==4.56.1" "huggingface_hub==0.35.0"
# !pip install -U sentence-transformers==5.1.0

# !pip show torch

# !pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

# !pip install hf_xet

In [3]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel
import warnings
warnings.filterwarnings('ignore')

In [4]:
def _encode_longformer(texts):
    """Encode using Longformer with mean pooling"""
    embeddings = []
    
    for text in texts:
        # Tokenize
        inputs = tokenizer(text, return_tensors='pt', 
                                truncation=True, max_length=4096, 
                                padding=True).to(device)
    
        # Get embeddings
        with torch.no_grad():
            outputs = model(**inputs)
            # Mean pooling over tokens
            embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            embeddings.append(embedding[0])
    
    return np.array(embeddings)

In [5]:
def encode(texts):
    """Encode texts into embeddings with the currently loaded model.

    Dispatches on the notebook-level ``model_type`` and uses the
    notebook-level ``model`` (plus ``tokenizer``/``device`` on the
    Longformer path).

    Args:
        texts: List of strings to embed.

    Returns:
        The embeddings produced by ``model.encode`` for 'sbert', or by
        ``_encode_longformer`` for 'longformer'.

    Raises:
        ValueError: If ``model_type`` is neither 'sbert' nor 'longformer'.
    """
    if model_type == 'sbert':
        return model.encode(texts, show_progress_bar=False, convert_to_numpy=True)
    if model_type == 'longformer':
        return _encode_longformer(texts)
    # Fail loudly instead of returning None, which would only surface later
    # as an unrelated indexing error in the callers.
    raise ValueError(
        f"Unknown model_type {model_type!r}; expected 'sbert' or 'longformer'"
    )

In [6]:
def predict_triple(anchor, text_a, text_b):
    """
    Decide which of two candidate texts is closer to the anchor.

    All three texts are embedded in a single ``encode`` call and the
    candidates are compared by cosine similarity to the anchor.

    Returns:
        True if text_a is closer, False if text_b is closer
    """
    anchor_vec, vec_a, vec_b = encode([anchor, text_a, text_b])

    def _similarity_to_anchor(candidate_vec):
        # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here
        return cosine_similarity([anchor_vec], [candidate_vec])[0][0]

    score_a = _similarity_to_anchor(vec_a)
    score_b = _similarity_to_anchor(vec_b)
    return score_a > score_b

In [7]:
def evaluate(df):
    """
    Evaluate the currently loaded model on a dataset of triples.

    For every row the anchor and both candidates are embedded with
    ``encode`` and the candidate with the higher cosine similarity to the
    anchor is predicted as the closer one.

    Args:
        df: DataFrame with columns ['anchor_text', 'text_a', 'text_b', 'text_a_is_closer']

    Returns:
        (accuracy, result_df): ``accuracy`` is the fraction of rows whose
        prediction equals 'text_a_is_closer'; ``result_df`` is a new
        DataFrame with the input columns plus 'prediction', 'sim_a' and
        'sim_b'. The input ``df`` is left unmodified.
    """
    predictions = []
    similarities_a = []
    similarities_b = []

    print(f"Evaluating {len(df)} samples...")

    # Iterate over the three text columns directly; the row index is not needed
    triples = zip(df['anchor_text'], df['text_a'], df['text_b'])
    for anchor, text_a, text_b in tqdm(triples, total=len(df)):
        # Encode
        embeddings = encode([anchor, text_a, text_b])

        # Calculate similarities (cosine_similarity returns a 1x1 matrix)
        sim_a = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
        sim_b = cosine_similarity([embeddings[0]], [embeddings[2]])[0][0]

        # Predict: True means text_a is closer to the anchor
        predictions.append(sim_a > sim_b)
        similarities_a.append(sim_a)
        similarities_b.append(sim_b)

    # Attach the results to a new frame so the caller's DataFrame is not mutated
    result_df = df.assign(
        prediction=predictions,
        sim_a=similarities_a,
        sim_b=similarities_b,
    )

    # Calculate accuracy
    accuracy = (result_df['prediction'] == result_df['text_a_is_closer']).mean()

    return accuracy, result_df

In [8]:
# Read the track A sample file into a DataFrame; lines=True parses it as
# JSON Lines (one JSON record per line).
df = pd.read_json('./Data/SemEval2026-Task_4-sample-v1/sample_track_a.jsonl', lines=True)
# Preview the first rows as the cell output
df.head()

Unnamed: 0,anchor_text,text_a,text_b,text_a_is_closer
0,The film takes place in Burma and India during...,"During the Irish War of Independence in 1921, ...","1914, German advance through Belgium: the youn...",False
1,"The foursome (Gérard Rinaldi, Jean Sarrus, Gér...",The old grandmother Tina arrives in town to at...,Brendan Byers III is a rich playboy who enlist...,True
2,Anna (Gry Bay) is a single woman who seeks to ...,A psychological portrait of relationships and ...,The story of this film starts with the difficu...,True
3,"A stevedore in Thessaloniki, Greece, Salamo Ar...","The story takes place in Berlin in 1940, where...","During the World War II, in the 1930s to 1940s...",True
4,A teenage girl accuses her primary schoolteach...,Art Brooks and Kelly Moore are a couple who ge...,"Tonino is a high school student, in love with ...",False


In [9]:
# Report how many rows were loaded and which columns are available
print(f"Loaded {len(df)} samples")
print(f"Columns: {df.columns.tolist()}")

Loaded 39 samples
Columns: ['anchor_text', 'text_a', 'text_b', 'text_a_is_closer']


In [11]:
# Checkpoints to compare, as (model name, model type) pairs
models_to_test = [
    ('sentence-transformers/all-MiniLM-L6-v2', 'sbert'),
    ('sentence-transformers/all-mpnet-base-v2', 'sbert'),
    ('allenai/longformer-base-4096', 'longformer'),
]
results = []

# The device does not depend on the model, so resolve it once up front
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE: `model_name`, `model_type`, `model`, `tokenizer` and `device` are
# deliberately module-level names: encode() and _encode_longformer() read
# `model_type`, `model`, `tokenizer` and `device` as globals.
for model_name, model_type in models_to_test:
    print(f"Loading {model_name} on {device}...")

    if model_type == 'sbert':
        model = SentenceTransformer(model_name, device=device)
    elif model_type == 'longformer':
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).to(device)

    # Evaluate on a copy so every model starts from the untouched data
    accuracy, predictions_df = evaluate(df.copy())

    # Store results
    results.append({
        'model': model_name,
        'type': model_type,
        'accuracy': accuracy,
        'predictions_df': predictions_df
    })

    print(f"\nAccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

    # Show some statistics
    correct = (predictions_df['prediction'] == predictions_df['text_a_is_closer']).sum()
    total = len(predictions_df)
    print(f"  Correct: {correct}/{total}")

    # Analyze errors
    errors = predictions_df[predictions_df['prediction'] != predictions_df['text_a_is_closer']]
    if len(errors) > 0:
        # Mean gap between the two similarities on misclassified rows
        avg_sim_diff = (errors['sim_a'] - errors['sim_b']).abs().mean()
        print(f"  Error cases: {len(errors)}")
        print(f"  Avg similarity difference in errors: {avg_sim_diff:.4f}")

print(f"\n{'='*60}")
print("SUMMARY")
print(f"{'='*60}")

# Every entry in `results` carries an accuracy, so it can be formatted directly
summary_df = pd.DataFrame([
    {
        'Model': r['model'],
        'Type': r['type'],
        'Accuracy': f"{r['accuracy']:.4f}",
        'Percentage': f"{r['accuracy']*100:.2f}%"
    }
    for r in results
])

print(summary_df.to_string(index=False))

# Find best model (models with zero accuracy are not considered)
valid_results = [r for r in results if r['accuracy'] > 0]
if valid_results:
    best = max(valid_results, key=lambda x: x['accuracy'])
    print(f"\n Best Model: {best['model']}")
    print(f"   Accuracy: {best['accuracy']:.4f} ({best['accuracy']*100:.2f}%)")

Loading sentence-transformers/all-MiniLM-L6-v2 on cpu...
Evaluating 39 samples...


100%|██████████| 39/39 [00:02<00:00, 14.57it/s]



Accuracy: 0.5897 (58.97%)
  Correct: 23/39
  Error cases: 16
Loading sentence-transformers/all-mpnet-base-v2 on cpu...
Evaluating 39 samples...


100%|██████████| 39/39 [00:17<00:00,  2.24it/s]



Accuracy: 0.6410 (64.10%)
  Correct: 25/39
  Error cases: 14
Loading allenai/longformer-base-4096 on cpu...
Evaluating 39 samples...


100%|██████████| 39/39 [01:29<00:00,  2.31s/it]


Accuracy: 0.5897 (58.97%)
  Correct: 23/39
  Error cases: 16

SUMMARY
                                  Model       Type Accuracy Percentage
 sentence-transformers/all-MiniLM-L6-v2      sbert   0.5897     58.97%
sentence-transformers/all-mpnet-base-v2      sbert   0.6410     64.10%
           allenai/longformer-base-4096 longformer   0.5897     58.97%

 Best Model: sentence-transformers/all-mpnet-base-v2
   Accuracy: 0.6410 (64.10%)





In [None]:


    

    

    



def run_baseline_comparison(data_path, models_config):
    """
    Run comparison of multiple baseline models

    Args:
        data_path: Path to a .csv, .json or .jsonl (JSON Lines) file with data
        models_config: List of (model_name, model_type) tuples

    Returns:
        List with one dict per model. Successful runs contain 'model',
        'type', 'accuracy' and 'predictions_df'; failed runs contain
        'model', 'type', 'accuracy' (0.0) and 'error' (the exception text).

    Raises:
        ValueError: If the file extension is unsupported or a required
            column is missing.
    """
    # Load data
    print(f"Loading data from {data_path}...")
    if data_path.endswith('.csv'):
        df = pd.read_csv(data_path)
    elif data_path.endswith('.jsonl'):
        # JSON Lines: one JSON record per line
        df = pd.read_json(data_path, lines=True)
    elif data_path.endswith('.json'):
        df = pd.read_json(data_path)
    else:
        raise ValueError("Data must be CSV or JSON")

    print(f"Loaded {len(df)} samples")
    print(f"Columns: {df.columns.tolist()}")

    # Verify required columns
    # NOTE(review): the sample data loaded earlier in this notebook names the
    # anchor column 'anchor_text'; confirm which name
    # NarrativeSimilarityBaseline.evaluate expects.
    required = ['anchor', 'text_a', 'text_b', 'text_a_is_closer']
    missing = set(required) - set(df.columns)
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    # Results storage
    results = []

    # Test each model
    for model_name, model_type in models_config:
        print(f"\n{'='*60}")
        print(f"Testing: {model_name}")
        print(f"{'='*60}")

        try:
            # Initialize model
            # NOTE(review): NarrativeSimilarityBaseline is not defined in the
            # visible part of this notebook; it must be defined or imported
            # before this function is called.
            baseline = NarrativeSimilarityBaseline(model_name, model_type)

            # Evaluate on a copy so each model sees the untouched data
            accuracy, predictions_df = baseline.evaluate(df.copy())

            # Store results
            results.append({
                'model': model_name,
                'type': model_type,
                'accuracy': accuracy,
                'predictions_df': predictions_df
            })

            print(f"\n✓ Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

            # Show some statistics
            correct = (predictions_df['prediction'] == predictions_df['text_a_is_closer']).sum()
            total = len(predictions_df)
            print(f"  Correct: {correct}/{total}")

            # Analyze errors
            errors = predictions_df[predictions_df['prediction'] != predictions_df['text_a_is_closer']]
            if len(errors) > 0:
                avg_sim_diff = (errors['sim_a'] - errors['sim_b']).abs().mean()
                print(f"  Error cases: {len(errors)}")
                print(f"  Avg similarity difference in errors: {avg_sim_diff:.4f}")

        except Exception as e:
            # Best effort: record the failure and continue with the next model
            print(f"✗ Error with {model_name}: {str(e)}")
            results.append({
                'model': model_name,
                'type': model_type,
                'accuracy': 0.0,
                'error': str(e)
            })

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")

    # Failed runs also store an accuracy (0.0), so the 'error' key is what
    # identifies them and triggers the ERROR label.
    summary_df = pd.DataFrame([
        {
            'Model': r['model'],
            'Type': r['type'],
            'Accuracy': f"{r['accuracy']:.4f}" if 'error' not in r else 'ERROR',
            'Percentage': f"{r['accuracy']*100:.2f}%" if 'error' not in r else 'ERROR'
        }
        for r in results
    ])

    print(summary_df.to_string(index=False))

    # Find best model among the runs that succeeded with non-zero accuracy
    valid_results = [r for r in results if 'error' not in r and r['accuracy'] > 0]
    if valid_results:
        best = max(valid_results, key=lambda x: x['accuracy'])
        print(f"\n🏆 Best Model: {best['model']}")
        print(f"   Accuracy: {best['accuracy']:.4f} ({best['accuracy']*100:.2f}%)")

    return results


# Example usage
if __name__ == "__main__":
    # Compare every model listed in `models_to_test` on the dataset.
    # 'your_data.csv' is a placeholder: replace it with the actual file path.
    results = run_baseline_comparison(
        data_path='your_data.csv',  # UPDATE THIS
        models_config=models_to_test
    )

    # Optional: save the detailed predictions of each run that produced them
    for result in results:
        if 'predictions_df' not in result:
            continue
        # Slashes in the model name are not valid in a file name
        model_safe_name = result['model'].replace('/', '_')
        predictions_path = f"predictions_{model_safe_name}.csv"
        result['predictions_df'].to_csv(predictions_path, index=False)
        print(f"Saved predictions to: {predictions_path}")

In [1]:
# NOTE(review): this cell carries execution count 1 but sits at the end of the
# notebook, so it was run before the cells above it.
from transformers import AutoTokenizer, AutoModel
# Load the pretrained "allenai/longformer-base-4096" checkpoint into `model`
model = AutoModel.from_pretrained("allenai/longformer-base-4096")

  from .autonotebook import tqdm as notebook_tqdm
