# Metaphor Label Projection: Spanish to Catalan (Test Set)

This notebook projects existing metaphor labels from a labeled Spanish test set to a parallel Catalan test set.

In [None]:
# Install required packages
print("Installing dependencies...")
!pip install transformers torch awesome-align sentencepiece --quiet

print("\nSetup complete.")

In [None]:
# Mount Google Drive at /content/drive (requires the Colab-only `google.colab` module).
# NOTE(review): the data paths configured further down are relative
# ("hle_files/...") and do not point under /content/drive — confirm the working
# directory, or adjust the paths, before running the projection.
from google.colab import drive
drive.mount('/content/drive')

## 2. Define Helper Functions

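These functions read the CoNLL-style `.tsv` files (one `token<TAB>label` pair per line, with a blank line between sentences), write files in the same format, and project the source labels onto the target tokens using word alignments.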

In [None]:
from tqdm.notebook import tqdm
import torch
import awesome_align as align
import os

def read_conll_file(file_path):
    """Load a CoNLL-style file into parallel per-sentence token and label lists.

    Each non-blank line holds tab-separated fields: the first field is the
    token and the second (if present) is its label. Lines without a second
    field get the label 'O'. Blank (whitespace-only) lines separate sentences;
    consecutive blank lines do not produce empty sentences.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A tuple ``(sentences, labels)`` where ``sentences[i]`` is the list of
        tokens of sentence ``i`` and ``labels[i]`` the matching list of labels.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = [raw.strip() for raw in handle]
    # A trailing empty entry acts as a sentinel so that the last sentence is
    # flushed even when the file does not end with a blank line.
    stripped_lines.append('')

    all_tokens, all_tags = [], []
    tokens, tags = [], []
    for text in stripped_lines:
        if text:
            fields = text.split('\t')
            tokens.append(fields[0])
            # Fall back to 'O' when the line carries no label column.
            tags.append(fields[1] if len(fields) > 1 else 'O')
            continue
        # Sentence boundary: only flush when something was collected.
        if tokens:
            all_tokens.append(tokens)
            all_tags.append(tags)
            tokens, tags = [], []
    return all_tokens, all_tags

def write_conll_file(file_path, sentences, labels):
    """Serialize tokens and their labels to a CoNLL-style file.

    Every token is written on its own line as ``token<TAB>label`` and each
    sentence is followed by one blank line.

    Args:
        file_path: Destination path; the file is created or overwritten and
            written as UTF-8.
        sentences: List of sentences, each a list of tokens.
        labels: List of label lists; ``labels[i][j]`` is the label written
            next to ``sentences[i][j]``.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        for sent_idx, tokens in enumerate(sentences):
            for tok_idx, token in enumerate(tokens):
                # Labels are looked up by position so a missing label raises
                # instead of being silently dropped.
                out.write(f"{token}\t{labels[sent_idx][tok_idx]}\n")
            out.write('\n')

def _parse_alignment_pairs(alignment_str):
    """Turns a whitespace-separated string of 'src-tgt' items into (src, tgt) int pairs."""
    pairs = []
    for item in alignment_str.split():
        # Unpacking into exactly two parts rejects malformed items such as
        # '1-2-3' with a ValueError instead of letting them through.
        src_part, tgt_part = item.split('-')
        pairs.append((int(src_part), int(tgt_part)))
    return pairs


def _project_sentence_labels(src_labels, num_src_tokens, num_tgt_tokens, alignments):
    """Copies every non-'O' source label onto the target tokens aligned to it."""
    tgt_labels = ['O'] * num_tgt_tokens

    src_to_tgt = {}
    for s_idx, t_idx in alignments:
        # Links pointing outside either sentence are dropped; indexing with
        # them would otherwise raise and abort the whole projection run.
        if 0 <= s_idx < num_src_tokens and 0 <= t_idx < num_tgt_tokens:
            src_to_tgt.setdefault(s_idx, []).append(t_idx)

    for s_idx, label in enumerate(src_labels):
        if label == 'O':
            continue
        for t_idx in src_to_tgt.get(s_idx, ()):
            # Labels are copied verbatim; a more advanced implementation
            # could repair B-/I- transitions on the target side.
            tgt_labels[t_idx] = label
    return tgt_labels


def project_labels_conll(source_sents, target_sents, source_labels, output_file, aligner=None):
    """
    Projects BIO labels from source-language sentences onto parallel target sentences.

    Each source/target sentence pair is word-aligned; every source token whose
    label is not 'O' passes that label to the target tokens aligned with it.
    Target tokens without such a link keep the label 'O'. Sentence pairs whose
    alignment fails are written with all-'O' labels so the output stays
    parallel to the input.

    Args:
        source_sents: List of source sentences, each a list of tokens.
        target_sents: List of target sentences, parallel to ``source_sents``.
        source_labels: List of label lists, parallel to ``source_sents``.
        output_file: Path of the CoNLL-style file to write the target tokens
            and their projected labels to.
        aligner: Optional object exposing ``align(src_sentence, tgt_sentence)``
            that returns a string of whitespace-separated 'i-j' index pairs.
            When omitted, a new aligner is created. Passing one lets several
            calls share a single loaded model.

    Raises:
        AssertionError: If the three input lists differ in length.
    """
    print(f"\nProjecting labels to '{output_file}'...")

    # Validate first so mismatched inputs fail before the alignment model is loaded.
    assert len(source_sents) == len(target_sents) == len(source_labels), "File line count mismatch."

    if aligner is None:
        # NOTE(review): confirm that the installed awesome_align package really
        # exposes `AwesomeAlign(model_name_or_path=...)` with an
        # `align(src, tgt)` method returning 'i-j' pairs.
        aligner = align.AwesomeAlign(model_name_or_path='bert-base-multilingual-cased')

    projected_target_labels = []
    failed_pairs = 0
    for i in tqdm(range(len(source_sents)), desc="Processing items"):
        src_tokens, tgt_tokens, src_labels = source_sents[i], target_sents[i], source_labels[i]
        src_sent_str = ' '.join(src_tokens)
        tgt_sent_str = ' '.join(tgt_tokens)

        try:
            alignment_str = aligner.align(src_sent_str, tgt_sent_str)
            alignments = _parse_alignment_pairs(alignment_str)
        except Exception as e:
            print(f"Could not align sentence pair {i}, skipping. Error: {e}")
            failed_pairs += 1
            # Add 'O' labels for the unaligned sentence to keep counts consistent
            projected_target_labels.append(['O'] * len(tgt_tokens))
            continue

        projected_target_labels.append(
            _project_sentence_labels(src_labels, len(src_tokens), len(tgt_tokens), alignments)
        )

    if failed_pairs:
        # Make systematic alignment failures visible instead of leaving them
        # buried in the per-sentence messages.
        print(f"Warning: {failed_pairs} of {len(source_sents)} sentence pairs "
              f"could not be aligned and were labeled 'O'.")

    # Write the final output
    write_conll_file(output_file, target_sents, projected_target_labels)
    print(f"Projection finished. Output at '{output_file}'.")

## 3. Run the Projection

This block sets the file paths and executes the projection for the premise and hypothesis files.

In [None]:
print("--- Starting Catalan Label Projection for Test Set ---")

# --- Configuration ---
# NOTE(review): all paths below are relative to the current working directory;
# confirm that it is the directory containing `hle_files/`.
# Source language data (Spanish - Labeled)
SOURCE_PREMISE_FILE = "hle_files/data/meta4xnli/detection/source_datasets/es/xnli_test_prem.tsv"
SOURCE_HYPOTHESIS_FILE = "hle_files/data/meta4xnli/detection/source_datasets/es/xnli_test_hyp.tsv"

# Target language data (Catalan - Unlabeled)
CATALAN_PREMISE_FILE = "hle_files/data/meta4xnli/detection/source_datasets/ca/xnli_test_prem.tsv"
CATALAN_HYPOTHESIS_FILE = "hle_files/data/meta4xnli/detection/source_datasets/ca/xnli_test_hyp.tsv"

# Final output paths (the directory is created if it does not exist yet)
output_dir = "hle_files/data/output/projected_ca_test/"
os.makedirs(output_dir, exist_ok=True)
PROJECTED_PREMISE_FILE = os.path.join(output_dir, "meta4xnli_ca_test_prem.tsv")
PROJECTED_HYPOTHESIS_FILE = os.path.join(output_dir, "meta4xnli_ca_test_hyp.tsv")

# --- Read Data ---
# Labels read from the Catalan files are discarded: only the Spanish labels
# are used as the projection source.
print("Reading source and target files...")
src_prem_sents, src_prem_labels = read_conll_file(SOURCE_PREMISE_FILE)
tgt_prem_sents, _ = read_conll_file(CATALAN_PREMISE_FILE)

src_hyp_sents, src_hyp_labels = read_conll_file(SOURCE_HYPOTHESIS_FILE)
tgt_hyp_sents, _ = read_conll_file(CATALAN_HYPOTHESIS_FILE)

# --- Project Labels ---
# Premises and hypotheses are projected separately, one output file each.
project_labels_conll(src_prem_sents, tgt_prem_sents, src_prem_labels, PROJECTED_PREMISE_FILE)
project_labels_conll(src_hyp_sents, tgt_hyp_sents, src_hyp_labels, PROJECTED_HYPOTHESIS_FILE)

print("\n--- Pipeline Finished Successfully! ---")