In [2]:
import os

# Root folder of the detection data; every dataset directory below lives under it.
BASE_PATH = r"c:\Users\xavid\Documents\GitHub\meta4xnli\data\meta4xnli\detection"

# Display label -> dataset directory, built from (label, subfolder, language) rows.
DATASETS = {
    label: os.path.join(BASE_PATH, subfolder, language)
    for label, subfolder, language in (
        ("Spanish (ES)", "source_datasets", "es"),
        ("English (EN)", "source_datasets", "en"),
        ("Catalan Original (CA)", "source_datasets", "ca"),
        ("Catalan Projection (CA-ES)", "projected_labels", "ca-es"),
        ("Catalan Projection (CA-EN)", "projected_labels", "ca-en"),
    )
}

def get_sentence_lengths(file_path):
    """Return the number of tokens in each sentence of a dataset file.

    The file is read as UTF-8 text with one token per line (presumably a
    token and its label; only the presence of text on the line matters
    here). Every non-blank line counts as one token, and a run of one or
    more blank or whitespace-only lines ends the current sentence.

    Args:
        file_path: Path to the dataset file.

    Returns:
        A list with one token count per sentence, in file order. The list
        is empty when ``file_path`` is not an existing regular file or the
        file contains no non-blank lines.
    """
    # isfile() rather than exists(): a directory passes an exists() check
    # but makes open() raise, e.g. for sub-folders found while listing.
    if not os.path.isfile(file_path):
        return []

    sentence_lengths = []
    current_length = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                # Non-blank line: one more token in the current sentence.
                current_length += 1
            elif current_length:
                # Blank line after at least one token: the sentence ends.
                sentence_lengths.append(current_length)
                current_length = 0

    # Flush the last sentence when the file does not end with a blank line.
    if current_length:
        sentence_lengths.append(current_length)
    return sentence_lengths

# Print, for every configured dataset, the sentence count and the average
# sentence length (in tokens) of each file in its directory.
for dataset_name, dataset_path in DATASETS.items():
    print(f"Dataset: {dataset_name}")

    # A missing directory is reported instead of aborting the whole report.
    if not os.path.isdir(dataset_path):
        print(f"  Directory not found: {dataset_path}")
        print()
        continue

    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for file_name in sorted(os.listdir(dataset_path)):
        file_path = os.path.join(dataset_path, file_name)
        lengths = get_sentence_lengths(file_path)
        if lengths:
            avg_length = sum(lengths) / len(lengths)
            print(
                f"  File: {file_name} - Total Sentences: {len(lengths)}, "
                f"Avg Length: {avg_length:.2f} tokens"
            )
        else:
            # Must be an f-string so the real file name is interpolated.
            print(f"  File: {file_name} - No sentences found.")
    print()

Dataset: Spanish (ES)
  File: esxnli_hyp.tsv - Total Sentences: 2490, Avg Length: 9.34 tokens
  File: esxnli_prem.tsv - Total Sentences: 830, Avg Length: 23.34 tokens
  File: xnli_dev_hyp.tsv - Total Sentences: 2490, Avg Length: 9.78 tokens
  File: xnli_dev_prem.tsv - Total Sentences: 830, Avg Length: 19.45 tokens
  File: xnli_test_hyp.tsv - Total Sentences: 5010, Avg Length: 9.72 tokens
  File: xnli_test_prem.tsv - Total Sentences: 1670, Avg Length: 19.65 tokens

Dataset: English (EN)
  File: esxnli_hyp.tsv - Total Sentences: 2490, Avg Length: 9.05 tokens
  File: esxnli_prem.tsv - Total Sentences: 830, Avg Length: 22.80 tokens
  File: xnli_dev_hyp.tsv - Total Sentences: 2490, Avg Length: 9.35 tokens
  File: xnli_dev_prem.tsv - Total Sentences: 830, Avg Length: 18.19 tokens
  File: xnli_test_hyp.tsv - Total Sentences: 5010, Avg Length: 9.30 tokens
  File: xnli_test_prem.tsv - Total Sentences: 1670, Avg Length: 18.21 tokens

Dataset: Catalan Original (CA)
  File: xnli_dev_hyp.tsv - Tota