In [None]:
import pandas as pd
import gzip
from pathlib import Path
from bs4 import BeautifulSoup
import numpy as np

# Locations of the project data tree and of the raw WMT zh-en download.
base_dir = Path("../data")
raw_dir = base_dir.joinpath("raw", "wmt_zh_en")

print("Checking raw data structure:")
raw_dir_present = raw_dir.exists()
print(f"\nRaw directory exists: {raw_dir_present}")

# List every regular file below the raw folder, shown relative to that folder.
if raw_dir_present:
    print("\nContents of raw/wmt_zh_en:")
    raw_files = (entry for entry in raw_dir.rglob("*") if entry.is_file())
    for raw_file in raw_files:
        print(f"  {raw_file.relative_to(raw_dir)}")

Checking raw data structure:

Raw directory exists: True

Contents of raw/wmt_zh_en:
  newscommentary/news-commentary-v15.en-zh.tsv.gz
  newstest/newstest2017-zhen-ref.en.sgm
  newstest/newstest2017-zhen-src.zh.sgm
  newstest/newstest2018-zhen-ref.en.sgm
  newstest/newstest2018-zhen-src.zh.sgm
  newstest/newstest2019-zhen-ref.en.sgm
  newstest/newstest2019-zhen-src.zh.sgm
  newstest/newstestB2020-zhen-ref.en.sgm
  newstest/newstestB2020-zhen-src.zh.sgm


In [2]:
# One output folder per artefact family, all under the shared data root.
processed_dir = base_dir.joinpath("processed", "zh_en")
eval_dir = base_dir.joinpath("evaluation_sets", "zh_en")
error_dir = base_dir.joinpath("error_dataset", "zh_en")
reserve_dir = base_dir.joinpath("reserve", "zh_en")

# Create each folder together with any missing parents; existing folders are
# accepted as-is (exist_ok=True).
for output_dir in (processed_dir, eval_dir, error_dir, reserve_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

print("Folder structure created")

Folder structure created


In [3]:
train_file = raw_dir / "newscommentary" / "news-commentary-v15.en-zh.tsv.gz"

# Peek at the first few lines to confirm the tab-separated layout before the
# whole file is parsed. Pairing the file iterator with range(5) stops cleanly
# when the file holds fewer than five lines; calling next() past the end of
# the file would raise StopIteration instead.
with gzip.open(train_file, 'rt', encoding='utf-8') as f:
    first_lines = [line for _, line in zip(range(5), f)]

print("Sample lines from training data:")
for i, line in enumerate(first_lines, 1):
    fields = line.strip().split('\t')
    print(f"Line {i}: {len(fields)} fields")
    # Only show a preview when both columns are present.
    if len(fields) >= 2:
        print(f"  Field 1 (first 80 chars): {fields[0][:80]}")
        print(f"  Field 2 (first 80 chars): {fields[1][:80]}")

Sample lines from training data:
Line 1: 2 fields
  Field 1 (first 80 chars): 1929 or 1989?
  Field 2 (first 80 chars): 1929年还是1989年?
Line 2: 2 fields
  Field 1 (first 80 chars): PARIS – As the economic crisis deepens and widens, the world has been searching 
  Field 2 (first 80 chars): 巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正在发生的情况。
Line 3: 2 fields
  Field 1 (first 80 chars): At the start of the crisis, many people likened it to 1982 or 1973, which was re
  Field 2 (first 80 chars): 一开始，很多人把这次危机比作1982年或1973年所发生的情况，这样得类比是令人宽心的，因为这两段时期意味着典型的周期性衰退。
Line 4: 2 fields
  Field 1 (first 80 chars): Today, the mood is much grimmer, with references to 1929 and 1931 beginning to a
  Field 2 (first 80 chars): 如今人们的心情却是沉重多了，许多人开始把这次危机与1929年和1931年相比，即使一些国家政府的表现仍然似乎把视目前的情况为是典型的而看见的衰退。
Line 5: 2 fields
  Field 1 (first 80 chars): The tendency is either excessive restraint (Europe) or a diffusion of the effort
  Field 2 (first 80 chars): 目前的趋势是，要么是过度的克制（欧洲 ） ， 要么是努力的扩展（美国 ） 。


In [4]:
train_file = raw_dir / "newscommentary" / "news-commentary-v15.en-zh.tsv.gz"

source_texts = []
target_texts = []
problematic_lines = 0
blank_lines = 0
total_lines = 0

# Stream the gzipped TSV once. Each usable line holds exactly two non-empty
# tab-separated fields: English first, Chinese second.
with gzip.open(train_file, 'rt', encoding='utf-8') as f:
    for line in f:
        total_lines += 1
        if not line.strip():
            # Whitespace-only lines are skipped, but counted so the totals
            # printed below add up.
            blank_lines += 1
            continue

        # Drop empty fields so a line with one side missing is detected as
        # malformed rather than stored as a pair with an empty string.
        fields = [field.strip() for field in line.strip().split('\t') if field.strip()]

        if len(fields) == 2:
            source_texts.append(fields[1])  # Chinese
            target_texts.append(fields[0])  # English
        else:
            problematic_lines += 1

train_df = pd.DataFrame({
    'source_zh': source_texts,
    'target_en': target_texts
})

print(f"Total lines: {total_lines:,}")
print(f"Valid pairs: {len(train_df):,}")
print(f"Problematic lines: {problematic_lines:,}")
print(f"Blank lines skipped: {blank_lines:,}")

Total lines: 320,713
Valid pairs: 312,268
Problematic lines: 498


In [5]:
# Whitespace-delimited token counts for each side of every training pair.
# str.split() with no argument splits on whitespace only.
for text_col, count_col in (('source_zh', 'source_words'), ('target_en', 'target_words')):
    train_df[count_col] = train_df[text_col].str.split().str.len()

# Keep pairs where both sides have between 10 and 40 tokens (inclusive).
train_length_ok = (
    train_df['source_words'].between(10, 40)
    & train_df['target_words'].between(10, 40)
)
train_filtered = train_df.loc[train_length_ok].copy()

# The helper count columns are not part of the exported data.
train_filtered = train_filtered.drop(columns=['source_words', 'target_words'])
train_clean = train_filtered.drop_duplicates(subset=['source_zh', 'target_en'])

print(f"After word length filtering (10-40): {len(train_filtered):,}")
print(f"After deduplication: {len(train_clean):,}")

After word length filtering (10-40): 1,610
After deduplication: 1,604


In [6]:
def read_sgm_file(file_path):
    """Return the stripped text of every ``<seg>`` element in an SGM file.

    The file is read as UTF-8 and parsed with BeautifulSoup's ``'xml'``
    parser. Segments are returned as a list of strings, in the order
    ``find_all('seg')`` yields them.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        soup = BeautifulSoup(handle.read(), 'xml')

    segments = []
    for seg in soup.find_all('seg'):
        segments.append(seg.text.strip())
    return segments

# (Chinese source file, English reference file) for each newstest year.
test_files = [
    ("newstest2019-zhen-src.zh.sgm", "newstest2019-zhen-ref.en.sgm"),
    ("newstest2018-zhen-src.zh.sgm", "newstest2018-zhen-ref.en.sgm"),
    ("newstest2017-zhen-src.zh.sgm", "newstest2017-zhen-ref.en.sgm"),
    ("newstestB2020-zhen-src.zh.sgm", "newstestB2020-zhen-ref.en.sgm"),
]

all_test_data = []
test_dir = raw_dir / "newstest"

for zh_file, en_file in test_files:
    zh_path = test_dir / zh_file
    en_path = test_dir / en_file
    set_name = zh_file.split('-')[0]

    # Report a missing file instead of skipping the whole test set silently.
    if not (zh_path.exists() and en_path.exists()):
        print(f"{set_name}: skipped (source or reference file not found)")
        continue

    zh_segments = read_sgm_file(zh_path)
    en_segments = read_sgm_file(en_path)

    # Segments are paired purely by position, so differing counts would make
    # zip() drop the surplus silently and could misalign every later pair.
    if len(zh_segments) != len(en_segments):
        raise ValueError(
            f"{set_name}: segment count mismatch "
            f"({len(zh_segments)} source vs {len(en_segments)} reference)"
        )

    print(f"{set_name}: {len(zh_segments)} pairs")

    all_test_data.extend(
        {'source_zh': zh, 'target_en': en}
        for zh, en in zip(zh_segments, en_segments)
    )

# Naming the columns keeps the frame usable by later cells even when no test
# file was found and the list is empty.
test_df = pd.DataFrame(all_test_data, columns=['source_zh', 'target_en'])
print(f"\nTotal test pairs: {len(test_df):,}")

newstest2019: 2000 pairs
newstest2018: 3981 pairs
newstest2017: 2001 pairs
newstestB2020: 2000 pairs

Total test pairs: 9,982


In [7]:
# Whitespace-delimited token counts for each side of every newstest pair.
for text_col, count_col in (('source_zh', 'source_words'), ('target_en', 'target_words')):
    test_df[count_col] = test_df[text_col].str.split().str.len()

# Keep pairs where both sides have between 10 and 40 tokens (inclusive).
test_length_ok = (
    test_df['source_words'].between(10, 40)
    & test_df['target_words'].between(10, 40)
)
test_filtered = test_df.loc[test_length_ok].copy()

# The helper count columns are not part of the exported data.
test_filtered = test_filtered.drop(columns=['source_words', 'target_words'])
test_clean = test_filtered.drop_duplicates(subset=['source_zh', 'target_en'])

print(f"After word length filtering (10-40): {len(test_filtered):,}")
print(f"After deduplication: {len(test_clean):,}")

After word length filtering (10-40): 21
After deduplication: 21


In [8]:
# Rebuild unfiltered frames from the parsed lists so the statistics below
# describe every pair, not the filtered subsets.
train_df_orig = pd.DataFrame({'source_zh': source_texts, 'target_en': target_texts})
test_df_orig = pd.DataFrame(all_test_data)

# Whitespace-delimited token counts for both sides of both frames.
for frame in (train_df_orig, test_df_orig):
    frame['source_words'] = frame['source_zh'].str.split().str.len()
    frame['target_words'] = frame['target_en'].str.split().str.len()

word_cols = ['source_words', 'target_words']

print("Training data word count statistics:")
print(train_df_orig[word_cols].describe())

print("\nTest data word count statistics:")
print(test_df_orig[word_cols].describe())

# Number of pairs whose source AND target counts fall inside each candidate
# inclusive range.
word_ranges = [(5, 30), (5, 40), (5, 50), (10, 40), (10, 50)]
for heading, frame in (
    ("\nTrain - samples by word count ranges:", train_df_orig),
    ("\nTest - samples by word count ranges:", test_df_orig),
):
    print(heading)
    for lo, hi in word_ranges:
        in_range = frame['source_words'].between(lo, hi) & frame['target_words'].between(lo, hi)
        print(f"  {lo}-{hi} words: {int(in_range.sum()):,}")

Training data word count statistics:
        source_words   target_words
count  312268.000000  312268.000000
mean        1.794904      22.057806
std         1.955388      10.796307
min         1.000000       1.000000
25%         1.000000      14.000000
50%         1.000000      21.000000
75%         1.000000      29.000000
max        70.000000     153.000000

Test data word count statistics:
       source_words  target_words
count   9982.000000   9982.000000
mean       1.575235     28.473552
std        1.598798     17.519407
min        1.000000      1.000000
25%        1.000000     17.000000
50%        1.000000     25.000000
75%        1.000000     36.000000
max       30.000000    217.000000

Train - samples by word count ranges:
  5-30 words: 11,272
  5-40 words: 18,701
  5-50 words: 22,430
  10-40 words: 1,610
  10-50 words: 2,201

Test - samples by word count ranges:
  5-30 words: 210
  5-40 words: 329
  5-50 words: 402
  10-40 words: 21
  10-50 words: 33


In [9]:
# Character lengths for both sides of both frames.
for frame in (train_df_orig, test_df_orig):
    frame['source_chars'] = frame['source_zh'].str.len()
    frame['target_chars'] = frame['target_en'].str.len()

char_cols = ['source_chars', 'target_chars']

print("Training data character count statistics:")
print(train_df_orig[char_cols].describe())

print("\nTest data character count statistics:")
print(test_df_orig[char_cols].describe())

# Number of pairs whose source AND target lengths fall inside each candidate
# inclusive range.
char_ranges = [(50, 200), (50, 250), (100, 250), (100, 300)]
for heading, frame in (
    ("\nTrain - samples by character count ranges:", train_df_orig),
    ("\nTest - samples by character count ranges:", test_df_orig),
):
    print(heading)
    for lo, hi in char_ranges:
        in_range = frame['source_chars'].between(lo, hi) & frame['target_chars'].between(lo, hi)
        print(f"  {lo}-{hi} chars: {int(in_range.sum()):,}")

Training data character count statistics:
        source_chars   target_chars
count  312268.000000  312268.000000
mean       42.643838     138.316712
std        22.706085      68.917720
min         2.000000       2.000000
25%        26.000000      88.000000
50%        39.000000     131.000000
75%        55.000000     181.000000
max       518.000000    1022.000000

Test data character count statistics:
       source_chars  target_chars
count   9982.000000   9982.000000
mean      45.761571    177.135143
std       26.966578    114.148919
min        1.000000      1.000000
25%       28.000000    102.000000
50%       41.000000    155.000000
75%       57.000000    222.000000
max      446.000000   1427.000000

Train - samples by character count ranges:
  50-200 chars: 47,210
  50-250 chars: 80,892
  100-250 chars: 1,498
  100-300 chars: 3,182

Test - samples by character count ranges:
  50-200 chars: 915
  50-250 chars: 1,783
  100-250 chars: 26
  100-300 chars: 46


In [10]:
# Keep pairs whose source AND target are 50-250 characters long (inclusive),
# retaining only the two text columns.
train_in_range = (
    train_df_orig['source_chars'].between(50, 250)
    & train_df_orig['target_chars'].between(50, 250)
)
train_filtered = train_df_orig.loc[train_in_range, ['source_zh', 'target_en']].copy()
# Duplicates are identified on the (source, target) pair, not on one side alone.
train_clean = train_filtered.drop_duplicates(subset=['source_zh', 'target_en'])

test_in_range = (
    test_df_orig['source_chars'].between(50, 250)
    & test_df_orig['target_chars'].between(50, 250)
)
test_filtered = test_df_orig.loc[test_in_range, ['source_zh', 'target_en']].copy()
test_clean = test_filtered.drop_duplicates(subset=['source_zh', 'target_en'])

print("Training data:")
print(f"  After filtering (50-250 chars): {len(train_filtered):,}")
print(f"  After deduplication: {len(train_clean):,}")

print("\nTest data:")
print(f"  After filtering (50-250 chars): {len(test_filtered):,}")
print(f"  After deduplication: {len(test_clean):,}")

Training data:
  After filtering (50-250 chars): 80,892
  After deduplication: 80,834

Test data:
  After filtering (50-250 chars): 1,783
  After deduplication: 1,783


In [None]:
# Look for zh_en files from earlier runs that could act as contamination
# sources. Only the presence of .tsv/.xlsx files is reported here; nothing is
# loaded or compared.
contamination_paths = [
    base_dir / "isolated_clean" / "zh_en",
    base_dir / "error_dataset" / "zh_en",
    base_dir / "processed" / "zh_en",
]

existing_files = []
for path in contamination_paths:
    if not path.exists():
        continue
    # Within each folder, .tsv matches are listed before .xlsx matches.
    for pattern in ("*.tsv", "*.xlsx"):
        for file in path.glob(pattern):
            existing_files.append(file)
            print(f"  Found: {file}")

if len(existing_files) == 0:
    print("  No existing contamination sources found")

  No existing contamination sources found


In [14]:
# Shuffle once with a fixed seed, then carve the rows into four consecutive,
# non-overlapping slices: train, test, dev and everything left over.
train_shuffled = train_clean.sample(frac=1, random_state=42).reset_index(drop=True)

train_pool, test_pool, dev_pool, reserve_pool = (
    train_shuffled.iloc[start:stop].copy()
    for start, stop in ((0, 50000), (50000, 55000), (55000, 57000), (57000, None))
)

pool_total = sum(len(pool) for pool in (train_pool, test_pool, dev_pool, reserve_pool))

print("Data split from training:")
print(f"  Training pool: {len(train_pool):,}")
print(f"  Test pool: {len(test_pool):,}")
print(f"  Dev pool: {len(dev_pool):,}")
print(f"  Reserve pool: {len(reserve_pool):,}")
print(f"  Total: {pool_total:,}")

Data split from training:
  Training pool: 50,000
  Test pool: 5,000
  Dev pool: 2,000
  Reserve pool: 23,834
  Total: 80,834


In [15]:
train_sizes = [1000, 5000, 10000, 15000, 20000, 30000, 40000, 50000]

# Write one seeded random subset per requested size; sizes larger than the
# training pool are skipped.
for size in train_sizes:
    if size > len(train_pool):
        continue
    subset_name = f"train_{size}.tsv"
    output_path = processed_dir / subset_name
    train_subset = train_pool.sample(n=size, random_state=42)
    train_subset.to_csv(output_path, sep='\t', index=False)
    print(f"Saved {subset_name}")

# The complete training pool, in its shuffled order.
output_path = processed_dir.joinpath("train_full.tsv")
train_pool.to_csv(output_path, sep='\t', index=False)
print(f"Saved train_full.tsv: {len(train_pool):,} samples")

Saved train_1000.tsv
Saved train_5000.tsv
Saved train_10000.tsv
Saved train_15000.tsv
Saved train_20000.tsv
Saved train_30000.tsv
Saved train_40000.tsv
Saved train_50000.tsv
Saved train_full.tsv: 50,000 samples


In [16]:
test_sizes = [500, 1000, 2000, 5000]

# Write one seeded random subset of the test pool per requested size; sizes
# larger than the pool are skipped.
for size in test_sizes:
    if size > len(test_pool):
        continue
    subset_name = f"test_{size}_clean.tsv"
    output_path = eval_dir / subset_name
    test_subset = test_pool.sample(n=size, random_state=42)
    test_subset.to_csv(output_path, sep='\t', index=False)
    print(f"Saved {subset_name}")

# The dev pool is written whole, alongside the training files.
dev_output = processed_dir.joinpath("dev_2000.tsv")
dev_pool.to_csv(dev_output, sep='\t', index=False)
print(f"Saved dev_2000.tsv: {len(dev_pool):,} samples")

Saved test_500_clean.tsv
Saved test_1000_clean.tsv
Saved test_2000_clean.tsv
Saved test_5000_clean.tsv
Saved dev_2000.tsv: 2,000 samples


In [17]:
# Persist every row that was not assigned to the train, test or dev pools.
reserve_output = reserve_dir.joinpath("reserve_pool.tsv")
reserve_pool.to_csv(reserve_output, sep='\t', index=False)
reserve_count = len(reserve_pool)
print(f"Saved reserve_pool.tsv: {reserve_count:,} samples")

Saved reserve_pool.tsv: 23,834 samples


In [18]:
# Reshuffle the reserve with a fixed seed, take the first 10,000 rows as the
# error-dataset pool and keep the rest as the remaining reserve.
reserve_shuffled = reserve_pool.sample(frac=1, random_state=42).reset_index(drop=True)

error_dataset_pool = reserve_shuffled.head(10000).copy()
remaining_reserve = reserve_shuffled.iloc[10000:].copy()

error_output = reserve_dir.joinpath("error_dataset_pool_10000.tsv")
error_dataset_pool.to_csv(error_output, sep='\t', index=False)
error_count = len(error_dataset_pool)
print(f"Saved error_dataset_pool_10000.tsv: {error_count:,} samples")

reserve_output = reserve_dir.joinpath("remaining_reserve.tsv")
remaining_reserve.to_csv(reserve_output, sep='\t', index=False)
remaining_count = len(remaining_reserve)
print(f"Saved remaining_reserve.tsv: {remaining_count:,} samples")

# The two files above replace the undivided reserve file, so delete it if it
# is still on disk.
old_reserve = reserve_dir.joinpath("reserve_pool.tsv")
if old_reserve.exists():
    old_reserve.unlink()
    print("Removed old reserve_pool.tsv")

Saved error_dataset_pool_10000.tsv: 10,000 samples
Saved remaining_reserve.tsv: 13,834 samples
Removed old reserve_pool.tsv


In [19]:
print("Verification - checking for overlaps:")

# Verify the COMPLETE pools as written to disk. train_full.tsv and
# test_5000_clean.tsv hold every training and test row, so a clean result
# here covers all exported subsets rather than a single sample of each.
split_files = [
    ("Train", processed_dir / "train_full.tsv"),
    ("Test", eval_dir / "test_5000_clean.tsv"),
    ("Dev", processed_dir / "dev_2000.tsv"),
    ("Error pool", reserve_dir / "error_dataset_pool_10000.tsv"),
    ("Reserve", reserve_dir / "remaining_reserve.tsv"),
]

# Compare on the normalised Chinese source (stripped, lower-cased). This is
# stricter than the earlier de-duplication, which keyed on the
# (source, target) pair, so a source sentence shared by two splits is caught
# even when its translations differ.
normalized_sources = []
for label, path in split_files:
    frame = pd.read_csv(path, delimiter='\t')
    normalized_sources.append((label, set(frame['source_zh'].str.strip().str.lower())))

# Every unordered pair of splits is checked exactly once.
total_overlaps = 0
for position, (left_label, left_set) in enumerate(normalized_sources):
    for right_label, right_set in normalized_sources[position + 1:]:
        overlap_count = len(left_set & right_set)
        total_overlaps += overlap_count
        print(f"{left_label} vs {right_label}: {overlap_count} overlaps")

if total_overlaps == 0:
    print("\nNo contamination detected")
else:
    print(f"\nTotal overlaps detected: {total_overlaps}")

Verification - checking for overlaps:
Train vs Test: 0 overlaps
Train vs Dev: 0 overlaps
Train vs Error pool: 0 overlaps
Train vs Reserve: 0 overlaps
Test vs Dev: 0 overlaps
Test vs Error pool: 0 overlaps
Test vs Reserve: 0 overlaps
Dev vs Error pool: 0 overlaps
Dev vs Reserve: 0 overlaps
Error pool vs Reserve: 0 overlaps

No contamination detected


In [21]:
# Report the row count of every TSV written to each output folder.
for heading, directory in (
    ("\nProcessed directory (training sets):", processed_dir),
    ("\nEvaluation directory (test sets):", eval_dir),
    ("\nReserve directory:", reserve_dir),
):
    print(heading)
    for file in sorted(directory.glob("*.tsv")):
        size = len(pd.read_csv(file, delimiter='\t'))
        print(f"  {file.name}: {size:,} samples")

# Take the total from the de-duplicated frame that was split into pools, so
# the figure cannot go stale when the input data or filters change.
print(f"\nTotal samples used: {len(train_clean):,}")
print("Character range: 50-250 characters (both source and target)")
print("Data source: WMT News Commentary v15")


Processed directory (training sets):
  dev_2000.tsv: 2,000 samples
  train_1000.tsv: 1,000 samples
  train_10000.tsv: 10,000 samples
  train_15000.tsv: 15,000 samples
  train_20000.tsv: 20,000 samples
  train_30000.tsv: 30,000 samples
  train_40000.tsv: 40,000 samples
  train_5000.tsv: 5,000 samples
  train_50000.tsv: 50,000 samples
  train_full.tsv: 50,000 samples

Evaluation directory (test sets):
  test_1000_clean.tsv: 1,000 samples
  test_2000_clean.tsv: 2,000 samples
  test_5000_clean.tsv: 5,000 samples
  test_500_clean.tsv: 500 samples

Reserve directory:
  error_dataset_pool_10000.tsv: 10,000 samples
  remaining_reserve.tsv: 13,834 samples

Total samples used: 80,834
Character range: 50-250 characters (both source and target)
Data source: WMT News Commentary v15
