In [1]:
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
import numpy as np

In [2]:
base_dir = Path("../data")
raw_dir = base_dir / "raw" / "wmt_de_en"
processed_dir = base_dir / "processed" / "de_en"
eval_dir = base_dir / "evaluation_sets" / "de_en"
error_dir = base_dir / "error_dataset" / "de_en"
reserve_dir = base_dir / "reserve" / "de_en"

for dir_path in [processed_dir, eval_dir, error_dir, reserve_dir]:
    dir_path.mkdir(parents=True, exist_ok=True)

In [3]:
train_file = raw_dir / "train" / "europarl-v9.de-en.tsv"

source_texts = []
target_texts = []
problematic_lines = 0
total_lines = 0

with open(train_file, 'r', encoding='utf-8') as f:
    for line in f:
        total_lines += 1
        if not line.strip():
            continue
            
        fields = [f.strip() for f in line.strip().split('\t') if f.strip()]
        
        if len(fields) == 2:
            source_texts.append(fields[0])
            target_texts.append(fields[1])
        else:
            problematic_lines += 1

train_df = pd.DataFrame({
    'source_de': source_texts,
    'target_en': target_texts
})

print(f"Total lines: {total_lines:,}")
print(f"Valid pairs: {len(train_df):,}")

Total lines: 1,838,568
Valid pairs: 1,817,761


In [4]:
train_df['source_words'] = train_df['source_de'].str.split().str.len()
train_df['target_words'] = train_df['target_en'].str.split().str.len()

train_filtered = train_df[
    (train_df['source_words'] >= 10) & 
    (train_df['source_words'] <= 40) &
    (train_df['target_words'] >= 10) & 
    (train_df['target_words'] <= 40)
].copy()

train_filtered = train_filtered.drop(['source_words', 'target_words'], axis=1)
train_clean = train_filtered.drop_duplicates(subset=['source_de', 'target_en'])

print(f"After filtering and deduplication: {len(train_clean):,}")

After filtering and deduplication: 1,281,027


In [5]:
def read_sgm_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'xml')
    return [seg.text.strip() for seg in soup.find_all('seg')]

In [6]:
test_files = [
    ("newstest2019-deen-src.de.sgm", "newstest2019-deen-ref.en.sgm"),
    ("newstest2018-deen-src.de.sgm", "newstest2018-deen-ref.en.sgm"),
    ("newstest2017-deen-src.de.sgm", "newstest2017-deen-ref.en.sgm"),
    ("newstest2016-deen-src.de.sgm", "newstest2016-deen-ref.en.sgm"),
    ("newstest2015-deen-src.de.sgm", "newstest2015-deen-ref.en.sgm"),
    ("newstest2014-deen-src.de.sgm", "newstest2014-deen-ref.en.sgm"),
    ("newstest2013-src.de.sgm", "newstest2013-src.en.sgm"),
    ("newstest2012-src.de.sgm", "newstest2012-src.en.sgm"),
    ("newstest2011-src.de.sgm", "newstest2011-src.en.sgm"),
    ("newstest2010-src.de.sgm", "newstest2010-src.en.sgm"),
    ("newstest2009-src.de.sgm", "newstest2009-src.en.sgm"),
    ("test2008-src.de.sgm", "test2008-src.en.sgm")
]

all_test_data = []
test_dir = raw_dir / "test"

for de_file, en_file in test_files:
    de_path = test_dir / de_file
    en_path = test_dir / en_file
    
    if de_path.exists() and en_path.exists():
        de_segments = read_sgm_file(de_path)
        en_segments = read_sgm_file(en_path)
        
        for de, en in zip(de_segments, en_segments):
            all_test_data.append({'source_de': de, 'target_en': en})

test_df = pd.DataFrame(all_test_data)
print(f"Total test pairs: {len(test_df):,}")

Total test pairs: 30,697


In [7]:
test_df['source_words'] = test_df['source_de'].str.split().str.len()
test_df['target_words'] = test_df['target_en'].str.split().str.len()

test_filtered = test_df[
    (test_df['source_words'] >= 10) & 
    (test_df['source_words'] <= 40) &
    (test_df['target_words'] >= 10) & 
    (test_df['target_words'] <= 40)
].copy()

test_filtered = test_filtered.drop(['source_words', 'target_words'], axis=1)
test_clean = test_filtered.drop_duplicates(subset=['source_de', 'target_en'])

print(f"After filtering and deduplication: {len(test_clean):,}")

After filtering and deduplication: 22,414


In [8]:
error_file = error_dir / "full_analysis_german_low_comet_score_translations_0-5_0-75_final_balanced.xlsx"
error_df = pd.read_excel(error_file)

print(f"Error dataset size: {len(error_df)}")

error_sources_normalized = set(error_df['source_de'].str.strip().str.lower())

test_clean['source_normalized'] = test_clean['source_de'].str.strip().str.lower()
test_without_error = test_clean[~test_clean['source_normalized'].isin(error_sources_normalized)].copy()
test_without_error = test_without_error.drop('source_normalized', axis=1)

print(f"Test data after removing error dataset: {len(test_without_error):,}")
print(f"Removed: {len(test_clean) - len(test_without_error)} sentences")

Error dataset size: 736
Test data after removing error dataset: 21,679
Removed: 735 sentences


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_clean['source_normalized'] = test_clean['source_de'].str.strip().str.lower()


In [9]:
test_shuffled = test_without_error.sample(frac=1, random_state=42).reset_index(drop=True)

test_eval_pool = test_shuffled.iloc[:5000].copy()
test_error_augment_pool = test_shuffled.iloc[5000:6000].copy()  
test_reserve_pool = test_shuffled.iloc[6000:].copy()

print(f"Test evaluation pool: {len(test_eval_pool):,}")
print(f"Test error augmentation pool: {len(test_error_augment_pool):,}")
print(f"Test reserve pool: {len(test_reserve_pool):,}")

Test evaluation pool: 5,000
Test error augmentation pool: 1,000
Test reserve pool: 15,679


In [10]:
train_shuffled = train_clean.sample(frac=1, random_state=42).reset_index(drop=True)

split_point = int(len(train_shuffled) * 0.9)

train_pool = train_shuffled.iloc[:split_point].copy()
train_reserve_pool = train_shuffled.iloc[split_point:].copy()

print(f"Training pool: {len(train_pool):,}")
print(f"Training reserve pool: {len(train_reserve_pool):,}")

Training pool: 1,152,924
Training reserve pool: 128,103


In [11]:
train_sizes = [1000, 5000, 10000, 20000, 50000, 100000, 200000, 400000, 600000, 800000, 1000000]

for size in train_sizes:
    if size <= len(train_pool):
        train_subset = train_pool.sample(n=size, random_state=42)
        output_path = processed_dir / f"train_{size}.tsv"
        train_subset.to_csv(output_path, sep='\t', index=False)
        print(f"Saved train_{size}.tsv")

output_path = processed_dir / "train_full.tsv"
train_pool.to_csv(output_path, sep='\t', index=False)
print(f"Saved train_full.tsv: {len(train_pool):,} samples")

Saved train_1000.tsv
Saved train_5000.tsv
Saved train_10000.tsv
Saved train_20000.tsv
Saved train_50000.tsv
Saved train_100000.tsv
Saved train_200000.tsv
Saved train_400000.tsv
Saved train_600000.tsv
Saved train_800000.tsv
Saved train_1000000.tsv
Saved train_full.tsv: 1,152,924 samples


In [12]:
test_sizes = [500, 1000, 2000, 5000]

for size in test_sizes:
    test_subset = test_eval_pool.sample(n=size, random_state=42)
    output_path = eval_dir / f"test_{size}_clean.tsv"
    test_subset.to_csv(output_path, sep='\t', index=False)
    print(f"Saved test_{size}_clean.tsv")

Saved test_500_clean.tsv
Saved test_1000_clean.tsv
Saved test_2000_clean.tsv
Saved test_5000_clean.tsv


In [13]:
test_reserve_path = reserve_dir / "test_reserve.tsv"
test_reserve_pool.to_csv(test_reserve_path, sep='\t', index=False)
print(f"test reserve: {len(test_reserve_pool):,} samples")

train_reserve_path = reserve_dir / "train_reserve.tsv"
train_reserve_pool.to_csv(train_reserve_path, sep='\t', index=False)
print(f"train reserve: {len(train_reserve_pool):,} samples")

error_augment_path = reserve_dir / "test_error_augment_pool.tsv"
test_error_augment_pool.to_csv(error_augment_path, sep='\t', index=False)
print(f"error augmentation pool: {len(test_error_augment_pool):,} samples")

test reserve: 15,679 samples
train reserve: 128,103 samples
error augmentation pool: 1,000 samples


In [14]:
train_sample = pd.read_csv(processed_dir / "train_50000.tsv", delimiter='\t')
test_sample = pd.read_csv(eval_dir / "test_2000_clean.tsv", delimiter='\t')

train_normalized = set(train_sample['source_de'].str.strip().str.lower())
test_normalized = set(test_sample['source_de'].str.strip().str.lower())
error_normalized = set(error_df['source_de'].str.strip().str.lower())

test_train_overlap = len(test_normalized.intersection(train_normalized))
test_error_overlap = len(test_normalized.intersection(error_normalized))
train_error_overlap = len(train_normalized.intersection(error_normalized))

print("Contamination Check:")
print(f"Test vs Train: {test_train_overlap} overlaps")
print(f"Test vs Error: {test_error_overlap} overlaps")
print(f"Train vs Error: {train_error_overlap} overlaps")

if test_train_overlap == 0 and test_error_overlap == 0:
    print("No contamination in test set")
else:
    print("Contamination")

Contamination Check:
Test vs Train: 0 overlaps
Test vs Error: 0 overlaps
Train vs Error: 0 overlaps
No contamination in test set


In [16]:
print("Error Dataset Verification:")
print(f"Error vs test_eval_pool: {len(error_normalized.intersection(set(test_eval_pool['source_de'].str.strip().str.lower())))} overlaps")
print(f"Error vs test_error_augment: {len(error_normalized.intersection(set(test_error_augment_pool['source_de'].str.strip().str.lower())))} overlaps")
print(f"Error vs test_reserve: {len(error_normalized.intersection(set(test_reserve_pool['source_de'].str.strip().str.lower())))} overlaps")
print(f"Error vs train_pool: {len(error_normalized.intersection(set(train_pool.sample(n=50000, random_state=42)['source_de'].str.strip().str.lower())))} overlaps")
print(f"Error vs train_reserve: {len(error_normalized.intersection(set(train_reserve_pool.sample(n=10000, random_state=42)['source_de'].str.strip().str.lower())))} overlaps")

print("\nReserve Pools Verification:")
train_reserve_norm = set(train_reserve_pool.sample(n=10000, random_state=42)['source_de'].str.strip().str.lower())
test_reserve_norm = set(test_reserve_pool.sample(n=5000, random_state=42)['source_de'].str.strip().str.lower())
test_error_augment_norm = set(test_error_augment_pool['source_de'].str.strip().str.lower())

print(f"train_reserve vs test_reserve: {len(train_reserve_norm.intersection(test_reserve_norm))} overlaps")
print(f"train_reserve vs test_error_augment: {len(train_reserve_norm.intersection(test_error_augment_norm))} overlaps")
print(f"test_reserve vs test_error_augment: {len(test_reserve_norm.intersection(test_error_augment_norm))} overlaps")

print(f"train_reserve vs test_eval: {len(train_reserve_norm.intersection(set(test_eval_pool.sample(n=2000, random_state=42)['source_de'].str.strip().str.lower())))} overlaps")
print(f"test_reserve vs test_eval: {len(test_reserve_norm.intersection(set(test_eval_pool.sample(n=2000, random_state=42)['source_de'].str.strip().str.lower())))} overlaps")

Error Dataset Verification:
Error vs test_eval_pool: 0 overlaps
Error vs test_error_augment: 0 overlaps
Error vs test_reserve: 0 overlaps
Error vs train_pool: 0 overlaps
Error vs train_reserve: 0 overlaps

Reserve Pools Verification:
train_reserve vs test_reserve: 0 overlaps
train_reserve vs test_error_augment: 0 overlaps
test_reserve vs test_error_augment: 0 overlaps
train_reserve vs test_eval: 0 overlaps
test_reserve vs test_eval: 0 overlaps
