In [18]:
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
import numpy as np

In [19]:
base_dir = Path("../data")
raw_dir = base_dir / "raw" / "wmt_de_en"
processed_dir = base_dir / "processed" / "de_en"
eval_dir = base_dir / "evaluation_sets" / "de_en"
error_dir = base_dir / "error_dataset" / "de_en"
reserve_dir = base_dir / "reserve" / "de_en"

for dir_path in [processed_dir, eval_dir, error_dir, reserve_dir]:
    dir_path.mkdir(parents=True, exist_ok=True)

In [20]:
train_file = raw_dir / "train" / "europarl-v9.de-en.tsv"

source_texts = []
target_texts = []
problematic_lines = 0
total_lines = 0

with open(train_file, 'r', encoding='utf-8') as f:
    for line in f:
        total_lines += 1
        if not line.strip():
            continue
            
        fields = [f.strip() for f in line.strip().split('\t') if f.strip()]
        
        if len(fields) == 2:
            source_texts.append(fields[0])
            target_texts.append(fields[1])
        else:
            problematic_lines += 1

train_df = pd.DataFrame({
    'source_de': source_texts,
    'target_en': target_texts
})

print(f"Total lines: {total_lines:,}")
print(f"Valid pairs: {len(train_df):,}")

Total lines: 1,838,568
Valid pairs: 1,817,761


In [21]:
train_df['source_words'] = train_df['source_de'].str.split().str.len()
train_df['target_words'] = train_df['target_en'].str.split().str.len()

train_filtered = train_df[
    (train_df['source_words'] >= 10) & 
    (train_df['source_words'] <= 40) &
    (train_df['target_words'] >= 10) & 
    (train_df['target_words'] <= 40)
].copy()

train_filtered = train_filtered.drop(['source_words', 'target_words'], axis=1)
train_clean = train_filtered.drop_duplicates(subset=['source_de', 'target_en'])

print(f"After filtering and deduplication: {len(train_clean):,}")

After filtering and deduplication: 1,281,027


In [22]:
def read_sgm_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'xml')
    return [seg.text.strip() for seg in soup.find_all('seg')]

In [23]:
test_files = [
    ("newstest2019-deen-src.de.sgm", "newstest2019-deen-ref.en.sgm"),
    ("newstest2018-deen-src.de.sgm", "newstest2018-deen-ref.en.sgm"),
    ("newstest2017-deen-src.de.sgm", "newstest2017-deen-ref.en.sgm"),
    ("newstest2016-deen-src.de.sgm", "newstest2016-deen-ref.en.sgm"),
    ("newstest2015-deen-src.de.sgm", "newstest2015-deen-ref.en.sgm"),
    ("newstest2014-deen-src.de.sgm", "newstest2014-deen-ref.en.sgm"),
    ("newstest2013-src.de.sgm", "newstest2013-src.en.sgm"),
    ("newstest2012-src.de.sgm", "newstest2012-src.en.sgm"),
    ("newstest2011-src.de.sgm", "newstest2011-src.en.sgm"),
    ("newstest2010-src.de.sgm", "newstest2010-src.en.sgm"),
    ("newstest2009-src.de.sgm", "newstest2009-src.en.sgm"),
    ("test2008-src.de.sgm", "test2008-src.en.sgm")
]

all_test_data = []
test_dir = raw_dir / "test"

for de_file, en_file in test_files:
    de_path = test_dir / de_file
    en_path = test_dir / en_file
    
    if de_path.exists() and en_path.exists():
        de_segments = read_sgm_file(de_path)
        en_segments = read_sgm_file(en_path)
        
        for de, en in zip(de_segments, en_segments):
            all_test_data.append({'source_de': de, 'target_en': en})

test_df = pd.DataFrame(all_test_data)
print(f"Total test pairs: {len(test_df):,}")

Total test pairs: 30,697


In [24]:
test_df['source_words'] = test_df['source_de'].str.split().str.len()
test_df['target_words'] = test_df['target_en'].str.split().str.len()

test_filtered = test_df[
    (test_df['source_words'] >= 10) & 
    (test_df['source_words'] <= 40) &
    (test_df['target_words'] >= 10) & 
    (test_df['target_words'] <= 40)
].copy()

test_filtered = test_filtered.drop(['source_words', 'target_words'], axis=1)
test_clean = test_filtered.drop_duplicates(subset=['source_de', 'target_en'])

print(f"After filtering and deduplication: {len(test_clean):,}")

After filtering and deduplication: 22,414


In [25]:
error_file = error_dir / "full_analysis_german_low_comet_score_translations_0-5_0-75_final_balanced.xlsx"
error_df = pd.read_excel(error_file)
print(f"Error dataset size: {len(error_df)}")

isolated_clean_file = Path("../data/isolated_clean/de_en/isolated_clean_736.tsv")
self_correction_file = Path("../data/isolated_clean/de_en/self_correction_isolated_training.tsv")
dev_mini_file = Path("../data/processed/de_en/dev_mini200.tsv")
mix2k_dev_file = Path("../data/processed/de_en/mix2k_dev.tsv")

def load_contamination_source(file_path, name):
    """Load contamination source with proper error handling."""
    if not file_path.exists():
        print(f"Warning: {name} not found at {file_path}")
        return pd.DataFrame(columns=['source_de', 'target_en'])
    
    try:
        if file_path.suffix == '.xlsx':
            df = pd.read_excel(file_path)
        else:
            df = pd.read_csv(file_path, sep='\t')
        
        df.columns = [col.strip().lower() for col in df.columns]
        
        if 'src' in df.columns and 'ref' in df.columns:
            df = df[['src', 'ref']].rename(columns={'src': 'source_de', 'ref': 'target_en'})
        elif 'source_de' in df.columns and 'target_en' in df.columns:
            df = df[['source_de', 'target_en']].copy()
        else:
            df = df.iloc[:, :2].copy()
            df.columns = ['source_de', 'target_en']
        
        df = df.dropna()
        df = df[(df['source_de'].str.strip() != '') & (df['target_en'].str.strip() != '')]
        
        print(f"Loaded {name}: {len(df)} samples")
        return df
    
    except Exception as e:
        print(f"Error loading {name}: {e}")
        return pd.DataFrame(columns=['source_de', 'target_en'])

isolated_clean_df = load_contamination_source(isolated_clean_file, "isolated_clean_736.tsv")
self_correction_df = load_contamination_source(self_correction_file, "self_correction_isolated_training.tsv")
dev_mini_df = load_contamination_source(dev_mini_file, "dev_mini200.tsv")
mix2k_dev_df = load_contamination_source(mix2k_dev_file, "mix2k_dev.tsv")

all_contamination_dfs = [error_df, isolated_clean_df, self_correction_df, dev_mini_df, mix2k_dev_df]
all_contamination = pd.concat([df for df in all_contamination_dfs if not df.empty], ignore_index=True)

print(f"Total contamination samples before dedup: {len(all_contamination)}")

all_contamination_clean = all_contamination.drop_duplicates(subset=['source_de', 'target_en'])
print(f"Total unique contamination samples: {len(all_contamination_clean)}")

all_contamination_normalized = set(all_contamination_clean['source_de'].str.strip().str.lower())

print(f"Final exclusion set size: {len(all_contamination_normalized)} unique German sources")

test_clean['source_normalized'] = test_clean['source_de'].str.strip().str.lower()
test_without_contamination = test_clean[~test_clean['source_normalized'].isin(all_contamination_normalized)].copy()
test_without_contamination = test_without_contamination.drop('source_normalized', axis=1)

print(f"Test data before contamination removal: {len(test_clean):,}")
print(f"Test data after contamination removal: {len(test_without_contamination):,}")
print(f"Removed: {len(test_clean) - len(test_without_contamination)} sentences")

test_without_error = test_without_contamination

Loading contamination sources for exclusion...
Error dataset size: 736
Loaded isolated_clean_736.tsv: 736 samples
Loaded self_correction_isolated_training.tsv: 1472 samples
Loaded dev_mini200.tsv: 199 samples
Loaded mix2k_dev.tsv: 1999 samples
Total contamination samples before dedup: 5142
Total unique contamination samples: 3670
Final exclusion set size: 3670 unique German sources
Test data before contamination removal: 22,414
Test data after contamination removal: 19,479
Removed: 2935 sentences


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_clean['source_normalized'] = test_clean['source_de'].str.strip().str.lower()


In [26]:
# Also exclude contamination from training data
print("Applying contamination exclusion to training data...")

train_clean['source_normalized'] = train_clean['source_de'].str.strip().str.lower()
train_without_contamination = train_clean[~train_clean['source_normalized'].isin(all_contamination_normalized)].copy()
train_without_contamination = train_without_contamination.drop('source_normalized', axis=1)

print(f"Training data before contamination removal: {len(train_clean):,}")
print(f"Training data after contamination removal: {len(train_without_contamination):,}")
print(f"Removed: {len(train_clean) - len(train_without_contamination)} sentences")

# Update the variable for rest of your notebook
train_clean = train_without_contamination

Applying contamination exclusion to training data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_clean['source_normalized'] = train_clean['source_de'].str.strip().str.lower()


Training data before contamination removal: 1,281,027
Training data after contamination removal: 1,280,291
Removed: 736 sentences


In [27]:
test_shuffled = test_without_error.sample(frac=1, random_state=42).reset_index(drop=True)

test_eval_pool = test_shuffled.iloc[:5000].copy()
test_error_augment_pool = test_shuffled.iloc[5000:6000].copy()  
test_reserve_pool = test_shuffled.iloc[6000:].copy()

print(f"Test evaluation pool: {len(test_eval_pool):,}")
print(f"Test error augmentation pool: {len(test_error_augment_pool):,}")
print(f"Test reserve pool: {len(test_reserve_pool):,}")

Test evaluation pool: 5,000
Test error augmentation pool: 1,000
Test reserve pool: 13,479


In [28]:
train_shuffled = train_clean.sample(frac=1, random_state=42).reset_index(drop=True)

split_point = int(len(train_shuffled) * 0.9)

train_pool = train_shuffled.iloc[:split_point].copy()
train_reserve_pool = train_shuffled.iloc[split_point:].copy()

print(f"Training pool: {len(train_pool):,}")
print(f"Training reserve pool: {len(train_reserve_pool):,}")

Training pool: 1,152,261
Training reserve pool: 128,030


In [29]:
train_sizes = [1000, 5000, 10000, 15000, 20000, 30000, 50000, 100000, 200000, 400000, 600000, 800000, 1000000]

for size in train_sizes:
    if size <= len(train_pool):
        train_subset = train_pool.sample(n=size, random_state=42)
        output_path = processed_dir / f"train_{size}.tsv"
        train_subset.to_csv(output_path, sep='\t', index=False)
        print(f"Saved train_{size}.tsv")

output_path = processed_dir / "train_full.tsv"
train_pool.to_csv(output_path, sep='\t', index=False)
print(f"Saved train_full.tsv: {len(train_pool):,} samples")

Saved train_1000.tsv
Saved train_5000.tsv
Saved train_10000.tsv
Saved train_15000.tsv
Saved train_20000.tsv
Saved train_30000.tsv
Saved train_50000.tsv
Saved train_100000.tsv
Saved train_200000.tsv
Saved train_400000.tsv
Saved train_600000.tsv
Saved train_800000.tsv
Saved train_1000000.tsv
Saved train_full.tsv: 1,152,261 samples


In [30]:
test_sizes = [500, 1000, 2000, 5000]

for size in test_sizes:
    test_subset = test_eval_pool.sample(n=size, random_state=42)
    output_path = eval_dir / f"test_{size}_clean.tsv"
    test_subset.to_csv(output_path, sep='\t', index=False)
    print(f"Saved test_{size}_clean.tsv")

Saved test_500_clean.tsv
Saved test_1000_clean.tsv
Saved test_2000_clean.tsv
Saved test_5000_clean.tsv


In [31]:
test_reserve_path = reserve_dir / "test_reserve.tsv"
test_reserve_pool.to_csv(test_reserve_path, sep='\t', index=False)
print(f"test reserve: {len(test_reserve_pool):,} samples")

train_reserve_path = reserve_dir / "train_reserve.tsv"
train_reserve_pool.to_csv(train_reserve_path, sep='\t', index=False)
print(f"train reserve: {len(train_reserve_pool):,} samples")

error_augment_path = reserve_dir / "test_error_augment_pool.tsv"
test_error_augment_pool.to_csv(error_augment_path, sep='\t', index=False)
print(f"error augmentation pool: {len(test_error_augment_pool):,} samples")

test reserve: 13,479 samples
train reserve: 128,030 samples
error augmentation pool: 1,000 samples


In [32]:
print("Comprehensive Contamination Check:")

train_sample = pd.read_csv(processed_dir / "train_50000.tsv", delimiter='\t')
test_sample = pd.read_csv(eval_dir / "test_2000_clean.tsv", delimiter='\t')

train_normalized = set(train_sample['source_de'].str.strip().str.lower())
test_normalized = set(test_sample['source_de'].str.strip().str.lower())

contamination_checks = {
    "Error dataset": set(error_df['source_de'].str.strip().str.lower()),
    "Isolated clean": set(isolated_clean_df['source_de'].str.strip().str.lower()) if not isolated_clean_df.empty else set(),
    "Self correction": set(self_correction_df['source_de'].str.strip().str.lower()) if not self_correction_df.empty else set(),
    "Dev mini": set(dev_mini_df['source_de'].str.strip().str.lower()) if not dev_mini_df.empty else set(),
    "Mix2k dev": set(mix2k_dev_df['source_de'].str.strip().str.lower()) if not mix2k_dev_df.empty else set(),
}

for name, contamination_set in contamination_checks.items():
    if contamination_set:  # Only check non-empty sets
        test_overlap = len(test_normalized.intersection(contamination_set))
        train_overlap = len(train_normalized.intersection(contamination_set))
        print(f"Test vs {name}: {test_overlap} overlaps")
        print(f"Train vs {name}: {train_overlap} overlaps")

test_train_overlap = len(test_normalized.intersection(train_normalized))
print(f"Test vs Train: {test_train_overlap} overlaps")

total_contamination = sum([
    len(test_normalized.intersection(contamination_set)) 
    for contamination_set in contamination_checks.values() 
    if contamination_set
]) + test_train_overlap

if total_contamination == 0:
    print("No contamination")
else:
    print(f"Total contamination detected: {total_contamination} overlaps")

Comprehensive Contamination Check:
Test vs Error dataset: 0 overlaps
Train vs Error dataset: 0 overlaps
Test vs Isolated clean: 0 overlaps
Train vs Isolated clean: 0 overlaps
Test vs Self correction: 0 overlaps
Train vs Self correction: 0 overlaps
Test vs Dev mini: 0 overlaps
Train vs Dev mini: 0 overlaps
Test vs Mix2k dev: 0 overlaps
Train vs Mix2k dev: 0 overlaps
Test vs Train: 0 overlaps
No contamination


In [33]:
print("Comprehensive Dataset Verification:")

print(f"All contamination vs test_eval_pool: {len(all_contamination_normalized.intersection(set(test_eval_pool['source_de'].str.strip().str.lower())))} overlaps")
print(f"All contamination vs test_error_augment: {len(all_contamination_normalized.intersection(set(test_error_augment_pool['source_de'].str.strip().str.lower())))} overlaps")
print(f"All contamination vs test_reserve: {len(all_contamination_normalized.intersection(set(test_reserve_pool['source_de'].str.strip().str.lower())))} overlaps")
print(f"All contamination vs train_pool: {len(all_contamination_normalized.intersection(set(train_pool.sample(n=50000, random_state=42)['source_de'].str.strip().str.lower())))} overlaps")
print(f"All contamination vs train_reserve: {len(all_contamination_normalized.intersection(set(train_reserve_pool.sample(n=10000, random_state=42)['source_de'].str.strip().str.lower())))} overlaps")

print("\nReserve Pools Verification:")
train_reserve_norm = set(train_reserve_pool.sample(n=10000, random_state=42)['source_de'].str.strip().str.lower())
test_reserve_norm = set(test_reserve_pool.sample(n=5000, random_state=42)['source_de'].str.strip().str.lower())
test_error_augment_norm = set(test_error_augment_pool['source_de'].str.strip().str.lower())

print(f"train_reserve vs test_reserve: {len(train_reserve_norm.intersection(test_reserve_norm))} overlaps")
print(f"train_reserve vs test_error_augment: {len(train_reserve_norm.intersection(test_error_augment_norm))} overlaps")
print(f"test_reserve vs test_error_augment: {len(test_reserve_norm.intersection(test_error_augment_norm))} overlaps")
print(f"train_reserve vs test_eval: {len(train_reserve_norm.intersection(set(test_eval_pool.sample(n=2000, random_state=42)['source_de'].str.strip().str.lower())))} overlaps")
print(f"test_reserve vs test_eval: {len(test_reserve_norm.intersection(set(test_eval_pool.sample(n=2000, random_state=42)['source_de'].str.strip().str.lower())))} overlaps")

print("\nIndividual Contamination Source Checks:")
if not isolated_clean_df.empty:
    isolated_norm = set(isolated_clean_df['source_de'].str.strip().str.lower())
    print(f"Isolated clean vs train_pool: {len(isolated_norm.intersection(set(train_pool.sample(n=50000, random_state=42)['source_de'].str.strip().str.lower())))} overlaps")

if not self_correction_df.empty:
    self_corr_norm = set(self_correction_df['source_de'].str.strip().str.lower())
    print(f"Self correction vs train_pool: {len(self_corr_norm.intersection(set(train_pool.sample(n=50000, random_state=42)['source_de'].str.strip().str.lower())))} overlaps")

Comprehensive Dataset Verification:
All contamination vs test_eval_pool: 0 overlaps
All contamination vs test_error_augment: 0 overlaps
All contamination vs test_reserve: 0 overlaps
All contamination vs train_pool: 0 overlaps
All contamination vs train_reserve: 0 overlaps

Reserve Pools Verification:
train_reserve vs test_reserve: 0 overlaps
train_reserve vs test_error_augment: 0 overlaps
test_reserve vs test_error_augment: 0 overlaps
train_reserve vs test_eval: 0 overlaps
test_reserve vs test_eval: 0 overlaps

Individual Contamination Source Checks:
Isolated clean vs train_pool: 0 overlaps
Self correction vs train_pool: 0 overlaps


In [None]:
##creating mixed train dataset

In [34]:
import pandas as pd
from pathlib import Path
import numpy as np

TRAIN_15K_PATH = "/sc/home/sandeep.uprety/thesis_project/self_correction_llm_based_translation_thesis/data/processed/de_en/train_15000.tsv"
TEST_RESERVE_PATH = "/sc/home/sandeep.uprety/thesis_project/self_correction_llm_based_translation_thesis/data/reserve/de_en/test_reserve.tsv"
OUTPUT_DIR = Path("/sc/home/sandeep.uprety/thesis_project/self_correction_llm_based_translation_thesis/data/processed/de_en/")

train_15k = pd.read_csv(TRAIN_15K_PATH, sep='\t', dtype=str, keep_default_na=False)
test_reserve = pd.read_csv(TEST_RESERVE_PATH, sep='\t', dtype=str, keep_default_na=False)

print(f"Train 15k: {len(train_15k)}")
print(f"Test reserve: {len(test_reserve)}")

Train 15k: 15000
Test reserve: 13479


In [35]:
np.random.seed(42)

train_13k = train_15k.sample(n=13000, random_state=42).reset_index(drop=True)
test_reserve_13k = test_reserve.sample(n=13000, random_state=42).reset_index(drop=True)

combined = pd.concat([train_13k, test_reserve_13k], ignore_index=True)
combined_dedup = combined.drop_duplicates(subset=['source_de', 'target_en'], keep='first')
combined_final = combined_dedup.sample(frac=1, random_state=42).reset_index(drop=True)

filename = f"train_europarl_newstest_balanced_{len(combined_final)}.tsv"
output_path = OUTPUT_DIR / filename
combined_final.to_csv(output_path, sep='\t', index=False)

print(f"Created: {filename}")
print(f"Size: {len(combined_final)}")

Created: train_europarl_newstest_balanced_26000.tsv
Size: 26000


In [None]:
# contamination check
def load_contamination_sources():
    sources = []
    paths = [
        "../data/isolated_clean/de_en/isolated_clean_736.tsv",
        "../data/isolated_clean/de_en/self_correction_isolated_training.tsv", 
        "../data/processed/de_en/dev_mini200.tsv",
        "../data/processed/de_en/mix2k_dev.tsv"
    ]
    
    for path in paths:
        try:
            df = pd.read_csv(path, sep='\t')
            sources.extend(df.iloc[:, 0].str.strip().str.lower().tolist())
        except:
            continue
    
    return set(sources)

contamination_sources = load_contamination_sources()
combined_sources = set(combined_final['source_de'].str.strip().str.lower())
overlap = len(combined_sources & contamination_sources)

print(f"Contamination overlap: {overlap}")

Contamination overlap: 0
