In [1]:
# Install the 'datasets' library from Hugging Face
!pip install datasets pandas --quiet

print("Dependencies installed.")

Dependencies installed.


## 2. Load Dataset and Process Data

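This cell loads the `projecte-aina/xnli-ca` dataset and processes its `validation` (exported as `dev`) and `test` splits separately. For each split, the de-duplicated premise and hypothesis sentences are written to their own TSV files, one whitespace-separated token per line with the label `O`, and a blank line after each sentence.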

In [None]:
from datasets import load_dataset
import os

print("Loading 'projecte-aina/xnli-ca' dataset from Hugging Face Hub...")
ds = load_dataset("projecte-aina/xnli-ca")

# Keys are the split names used in the output file names; the dataset's
# 'validation' split is exported under the name 'dev'.
data = {'dev': ds['validation'], 'test': ds['test']}

# The destination defaults to the original local checkout path but can be
# overridden with the XNLI_CA_OUTPUT_DIR environment variable, so the notebook
# also runs on machines where that absolute path does not exist.
output_dir = os.environ.get(
    "XNLI_CA_OUTPUT_DIR",
    "C:/Users/xavid/Documents/GitHub/meta4xnli/data/meta4xnli/detection/source_datasets/ca/",
)
os.makedirs(output_dir, exist_ok=True)
print(f"Output directory '{output_dir}' is ready.")

def save_tokenized(file_path, sentences):
    """Write unique sentences to ``file_path``, one token per line.

    Each sentence is split on whitespace. Every token is written on its own
    line as the token, a tab, and the label ``O``; each sentence is followed
    by a blank line.

    Sentences are de-duplicated on their token sequence, keeping the first
    occurrence and preserving input order, so sentences that differ only in
    leading, trailing or repeated whitespace produce a single record.
    Sentences that contain no tokens (empty or whitespace-only) are skipped,
    so the output never contains an empty record.

    Args:
        file_path: Destination path. The file is created or overwritten and
            encoded as UTF-8.
        sentences: Iterable of sentence strings.
    """
    seen = set()
    with open(file_path, "w", encoding="utf-8") as f:
        for sent in sentences:
            # split() with no argument already ignores surrounding whitespace.
            tokens = tuple(sent.split())
            # Skip token-less sentences and anything already written.
            if not tokens or tokens in seen:
                continue
            seen.add(tokens)
            f.writelines(f"{token}\tO\n" for token in tokens)
            # Blank line marks the end of the sentence.
            f.write("\n")

# Export every split: one token file for its premises, one for its hypotheses.
for split_name in data:
    split = data[split_name]
    print(f"Processing split: {split_name} ({len(split)} sentence pairs)")

    # Premise sentences of this split.
    premise_file = os.path.join(output_dir, f"xnli_{split_name}_prem.tsv")
    save_tokenized(premise_file, split["premise"])
    print(f"Premises saved to: {premise_file}")

    # Hypothesis sentences of this split.
    hypothesis_file = os.path.join(output_dir, f"xnli_{split_name}_hyp.tsv")
    save_tokenized(hypothesis_file, split["hypothesis"])
    print(f"Hypotheses saved to: {hypothesis_file}")

print("Data preparation complete.")

Loading 'projecte-aina/xnli-ca' dataset from Hugging Face Hub...
Output directory 'C:/Users/xavid/Documents/GitHub/meta4xnli/data/meta4xnli/detection/source_datasets/ca/' is ready.
Processing split: dev (2490 sentence pairs)
Premises saved to: C:/Users/xavid/Documents/GitHub/meta4xnli/data/meta4xnli/detection/source_datasets/ca/xnli_dev_prem.tsv
Hypotheses saved to: C:/Users/xavid/Documents/GitHub/meta4xnli/data/meta4xnli/detection/source_datasets/ca/xnli_dev_hyp.tsv
Processing split: test (5010 sentence pairs)
Premises saved to: C:/Users/xavid/Documents/GitHub/meta4xnli/data/meta4xnli/detection/source_datasets/ca/xnli_test_prem.tsv
Hypotheses saved to: C:/Users/xavid/Documents/GitHub/meta4xnli/data/meta4xnli/detection/source_datasets/ca/xnli_test_hyp.tsv
Data preparation complete.
