In [1]:
import pandas as pd
from pathlib import Path
import random

# Seed Python's built-in `random` generator. The DataFrame.sample calls in the
# later cells pass their own random_state=42, so they do not depend on this seed.
random.seed(42)

In [None]:
# Directory holding the processed zh-en TSV files (relative to the working directory)
DATA_DIR = Path("../../data/processed/zh_en/")

# Inputs: per-corpus train and dev splits
TRAIN_NEWS = DATA_DIR.joinpath("train_50000.tsv")
TRAIN_UN = DATA_DIR.joinpath("train_un_version_one_15000.tsv")
DEV_NEWS = DATA_DIR.joinpath("dev_2000.tsv")
DEV_UN = DATA_DIR.joinpath("dev_un_version_one_2000.tsv")

# Outputs: the mixed train and dev files this notebook writes
TRAIN_COMBINED = DATA_DIR.joinpath("train_news_un_balanced_30000.tsv")
DEV_COMBINED = DATA_DIR.joinpath("mix2k_dev.tsv")

# Echo every path so the run log records exactly which files are involved
path_report = [
    "Input files:",
    f"  Train News: {TRAIN_NEWS}",
    f"  Train UN:   {TRAIN_UN}",
    f"  Dev News:   {DEV_NEWS}",
    f"  Dev UN:     {DEV_UN}",
    "\nOutput files:",
    f"  Train Combined: {TRAIN_COMBINED}",
    f"  Dev Combined:   {DEV_COMBINED}",
]
print("\n".join(path_report))

Input files:
  Train News: ../../data/processed/zh_en/train_50000.tsv
  Train UN:   ../../data/processed/zh_en/train_un_version_one_15000.tsv
  Dev News:   ../../data/processed/zh_en/dev_2000.tsv
  Dev UN:     ../../data/processed/zh_en/dev_un_version_one_2000.tsv

Output files:
  Train Combined: ../../data/processed/zh_en/train_news_un_balanced_30000.tsv
  Dev Combined:   ../../data/processed/zh_en/mix2k_dev.tsv


In [None]:
# Reader options shared by all four inputs: tab-separated, every column read as
# str, and pandas' default NaN markers disabled (keep_default_na=False).
tsv_read_options = dict(sep="\t", dtype=str, keep_default_na=False)

train_news_df = pd.read_csv(TRAIN_NEWS, **tsv_read_options)
train_un_df = pd.read_csv(TRAIN_UN, **tsv_read_options)
dev_news_df = pd.read_csv(DEV_NEWS, **tsv_read_options)
dev_un_df = pd.read_csv(DEV_UN, **tsv_read_options)

# Row count and column names for each loaded frame
stats_report = [
    "Dataset Statistics",
    f"\nTrain NewsCommentary: {len(train_news_df):,} samples",
    f"Columns: {train_news_df.columns.tolist()}",
    f"\nTrain UN Parallel:    {len(train_un_df):,} samples",
    f"Columns: {train_un_df.columns.tolist()}",
    f"\nDev NewsCommentary:   {len(dev_news_df):,} samples",
    f"Columns: {dev_news_df.columns.tolist()}",
    f"\nDev UN Parallel:      {len(dev_un_df):,} samples",
    f"Columns: {dev_un_df.columns.tolist()}",
]
print("\n".join(stats_report))

Dataset Statistics

Train NewsCommentary: 50,000 samples
Columns: ['source_zh', 'target_en']

Train UN Parallel:    15,000 samples
Columns: ['source_zh', 'target_en']

Dev NewsCommentary:   2,000 samples
Columns: ['source_zh', 'target_en']

Dev UN Parallel:      2,000 samples
Columns: ['source_zh', 'target_en']


In [4]:
# Show the first sentence pair of every loaded frame as a quick sanity check.
print("Sample Data")

# Maximum number of characters of each sentence to display
sample_preview_chars = 100

for heading, frame in (
    ("Train NewsCommentary", train_news_df),
    ("Train UN Parallel", train_un_df),
    ("Dev NewsCommentary", dev_news_df),
    ("Dev UN Parallel", dev_un_df),
):
    print(f"\n[{heading} - First Sample]")
    if frame.empty:
        # .iloc[0] would raise IndexError on a frame without rows
        print("(no rows)")
        continue
    first_row = frame.iloc[0]
    for language, column in (("Chinese", "source_zh"), ("English", "target_en")):
        text = first_row[column]
        # Append the ellipsis only when characters were actually cut off, so a
        # complete sentence is not displayed as if it had been truncated.
        suffix = "..." if len(text) > sample_preview_chars else ""
        print(f"{language}: {text[:sample_preview_chars]}{suffix}")

Sample Data

[Train NewsCommentary - First Sample]
Chinese: 最鲜为人知的或许是欧洲大型强子对撞机中用于加速粒子的巨型磁体，而该设备则被利用来探索物质的基本原理。...
English: Perhaps the most exotic are the huge magnets used to accelerate particles in the Large Hadron Collid...

[Train UN Parallel - First Sample]
Chinese: 1. Value and Process of National Dialogue, al-Falaq e-journal, January 2010....
English: 1. Value and process of national dialogue, al-Falaq e-journal, January 2010...

[Dev NewsCommentary - First Sample]
Chinese: 但是，另一个事实是强化技术训练并不能提供解决更抽象但极其重要的问题的充分基础。 这些问题最终将指导全球政策和决策。...
English: It is also true, however, that such training does not provide an adequate foundation for addressing ...

[Dev UN Parallel - First Sample]
Chinese: 登记发言请求应发给NancyBeteta女士(传真:1(212)963-5935;电子邮件:beteta@un.org)。...
English: Requests for inscription should be sent to Ms. Nancy Beteta (fax 1 (212) 963-5935; e-mail beteta@un....


In [None]:
samples_per_source = 15000
print("Combined Training Set 30k")

# NewsCommentary share: draw the quota, or keep the whole corpus if it is smaller
news_meets_quota = len(train_news_df) >= samples_per_source
train_news_sampled = (
    train_news_df.sample(n=samples_per_source, random_state=42)
    if news_meets_quota
    else train_news_df
)
print(
    f"Sampled {samples_per_source:,} from NewsCommentary"
    if news_meets_quota
    else f"NewsCommentary has only {len(train_news_df):,} samples (wanted {samples_per_source:,})"
)

# UN Parallel share: same rule
un_meets_quota = len(train_un_df) >= samples_per_source
train_un_sampled = (
    train_un_df.sample(n=samples_per_source, random_state=42)
    if un_meets_quota
    else train_un_df
)
print(
    f"Sampled {samples_per_source:,} from UN Parallel"
    if un_meets_quota
    else f"UN Parallel has only {len(train_un_df):,} samples (wanted {samples_per_source:,})"
)

# Tag each row with its corpus of origin; .assign returns a new frame, so the
# loaded inputs are left without the extra column.
train_news_sampled = train_news_sampled.assign(source_corpus='newscommentary')
train_un_sampled = train_un_sampled.assign(source_corpus='un_parallel')

# Stack both shares, shuffle the rows with a fixed seed, and renumber the index
train_combined = (
    pd.concat([train_news_sampled, train_un_sampled], ignore_index=True)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

print(f"\n Combined training set: {len(train_combined):,} samples")
print("\nSource distribution:")
print(train_combined['source_corpus'].value_counts())

Combined Training Set 30k
Sampled 15,000 from NewsCommentary
Sampled 15,000 from UN Parallel

 Combined training set: 30,000 samples

Source distribution:
source_corpus
newscommentary    15000
un_parallel       15000
Name: count, dtype: int64


In [6]:
dev_samples_per_source = 1000
print("Combined Dev Set mix2k")

# NewsCommentary dev share: draw the quota, or keep the whole split if it is smaller
dev_news_meets_quota = len(dev_news_df) >= dev_samples_per_source
dev_news_sampled = (
    dev_news_df.sample(n=dev_samples_per_source, random_state=42)
    if dev_news_meets_quota
    else dev_news_df
)
print(
    f" Sampled {dev_samples_per_source:,} from NewsCommentary dev"
    if dev_news_meets_quota
    else f" NewsCommentary dev has only {len(dev_news_df):,} samples (wanted {dev_samples_per_source:,})"
)

# UN Parallel dev share: same rule
dev_un_meets_quota = len(dev_un_df) >= dev_samples_per_source
dev_un_sampled = (
    dev_un_df.sample(n=dev_samples_per_source, random_state=42)
    if dev_un_meets_quota
    else dev_un_df
)
print(
    f" Sampled {dev_samples_per_source:,} from UN Parallel dev"
    if dev_un_meets_quota
    else f" UN Parallel dev has only {len(dev_un_df):,} samples (wanted {dev_samples_per_source:,})"
)

# Tag each row with its corpus of origin; .assign returns a new frame, so the
# loaded dev inputs are left without the extra column.
dev_news_sampled = dev_news_sampled.assign(source_corpus='newscommentary')
dev_un_sampled = dev_un_sampled.assign(source_corpus='un_parallel')

# Stack both shares, shuffle the rows with a fixed seed, and renumber the index
dev_combined = (
    pd.concat([dev_news_sampled, dev_un_sampled], ignore_index=True)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

print(f"\n Combined dev set: {len(dev_combined):,} samples")
print("\nSource distribution:")
print(dev_combined['source_corpus'].value_counts())

Combined Dev Set mix2k
 Sampled 1,000 from NewsCommentary dev
 Sampled 1,000 from UN Parallel dev

 Combined dev set: 2,000 samples

Source distribution:
source_corpus
un_parallel       1000
newscommentary    1000
Name: count, dtype: int64


In [None]:
# Only the sentence pair is written; the source_corpus tracking column stays in memory
export_columns = ['source_zh', 'target_en']

# Training set
train_output = train_combined[export_columns]
train_output.to_csv(TRAIN_COMBINED, sep="\t", index=False)
print(
    f" Saved training set: {TRAIN_COMBINED}",
    f"  Size: {len(train_output):,} samples",
    sep="\n",
)

# Dev set
dev_output = dev_combined[export_columns]
dev_output.to_csv(DEV_COMBINED, sep="\t", index=False)
print(
    f" Saved dev set: {DEV_COMBINED}",
    f"  Size: {len(dev_output):,} samples",
    sep="\n",
)

 Saved training set: ../../data/processed/zh_en/train_news_un_balanced_30000.tsv
  Size: 30,000 samples
 Saved dev set: ../../data/processed/zh_en/mix2k_dev.tsv
  Size: 2,000 samples


In [8]:
# Read both written files back and check them against the in-memory frames
# before announcing success.
train_verify = pd.read_csv(TRAIN_COMBINED, sep="\t", dtype=str, keep_default_na=False)
dev_verify = pd.read_csv(DEV_COMBINED, sep="\t", dtype=str, keep_default_na=False)

# Columns the saved files are expected to contain, in order
expected_columns = ['source_zh', 'target_en']
# Maximum number of characters of each sentence to display
verify_preview_chars = 80

for label, reloaded, in_memory in (
    ("Train", train_verify, train_combined),
    ("Dev", dev_verify, dev_combined),
):
    print(f"\n {label} file loaded: {len(reloaded):,} samples")
    print(f"  Columns: {reloaded.columns.tolist()}")

    # Fail loudly if the round trip through disk changed the schema or lost or
    # gained rows, instead of reporting success unconditionally.
    assert reloaded.columns.tolist() == expected_columns, (
        f"{label} file has columns {reloaded.columns.tolist()}, expected {expected_columns}"
    )
    assert len(reloaded) == len(in_memory), (
        f"{label} file has {len(reloaded):,} rows, expected {len(in_memory):,}"
    )

    if reloaded.empty:
        # Nothing to preview; .iloc[0] would raise IndexError
        continue
    for language, column in (("Chinese", "source_zh"), ("English", "target_en")):
        text = reloaded[column].iloc[0]
        # Append the ellipsis only when characters were actually cut off
        suffix = "..." if len(text) > verify_preview_chars else ""
        print(f"  First row {language}: {text[:verify_preview_chars]}{suffix}")

print("datasets created")

print(f"train_tsv {TRAIN_COMBINED}")
print(f"dev_tsv {DEV_COMBINED}")


 Train file loaded: 30,000 samples
  Columns: ['source_zh', 'target_en']
  First row Chinese: 当一国年轻人身体健康、受到良好教育时，他们就能找到高薪工作，赢得尊严并成功地调整渡过全球劳动力市场的波动期。...
  First row English: When its young people are healthy and well educated, they can find gainful emplo...

 Dev file loaded: 2,000 samples
  Columns: ['source_zh', 'target_en']
  First row Chinese: 126. 截止2013年10月1日,哈萨克斯坦居民总人数为17,098,546名,其中8,845,067人为妇女(占51.8%)。...
  First row English: 126. As of 1 October 2013, women numbered 8,845,067 (51.8 per cent) in the total...
datasets created
train_tsv ../../data/processed/zh_en/train_news_un_balanced_30000.tsv
dev_tsv ../../data/processed/zh_en/mix2k_dev.tsv


In [11]:
# Final recap for both outputs: destination path, total rows, and rows per source corpus
train_origin = train_combined['source_corpus']
dev_origin = dev_combined['source_corpus']

summary_lines = [
    "Summary",
    f"Training Set: {TRAIN_COMBINED}",
    f"Total samples: {len(train_combined):,}",
    f"NewsCommentary: {(train_origin == 'newscommentary').sum():,}",
    f"UN Parallel: {(train_origin == 'un_parallel').sum():,}",
    f"\nDev Set: {DEV_COMBINED}",
    f"Total samples: {len(dev_combined):,}",
    f"NewsCommentary: {(dev_origin == 'newscommentary').sum():,}",
    f"UN Parallel: {(dev_origin == 'un_parallel').sum():,}",
]
print("\n".join(summary_lines))


Summary
Training Set: ../../data/processed/zh_en/train_news_un_balanced_30000.tsv
Total samples: 30,000
NewsCommentary: 15,000
UN Parallel: 15,000

Dev Set: ../../data/processed/zh_en/mix2k_dev.tsv
Total samples: 2,000
NewsCommentary: 1,000
UN Parallel: 1,000
