In [15]:
import pandas as pd
import re
from pathlib import Path
from sklearn.model_selection import train_test_split

# Language pair processed by this run. Must be a key of the lookup tables
# below: "zh_en" or "de_en".
LANG_PAIR = "zh_en"

# Seed passed to every random split / sample in this notebook.
SEED = 42

# Name of the column holding the source-language sentence, per language pair.
SOURCE_COL = dict(zh_en="source_zh", de_en="source_de")

# Input TSV with the isolated self-correction examples, per language pair.
INPUT_PATHS = dict(
    zh_en="../../data/isolated_clean/zh_en/chinese_english_analysis_with_clean_analysis_fixed.tsv",
    de_en="../../data/isolated_clean/de_en/self_correction_isolated_training_with_clean_analysis.tsv",
)

# Directory receiving the processed train/val TSVs, per language pair.
OUTPUT_DIR = dict(
    zh_en="../../data/processed/zh_en/self_correction_zh_en",
    de_en="../../data/processed/de_en/self_correction_de_en",
)

In [16]:
def clean_analysis(text, data_type):
    """Reduce a raw analysis string to the bare explanation of the error.

    Args:
        text: Raw analysis text. Empty strings, ``None`` and non-string
            placeholders (e.g. NaN) are all treated as "no analysis".
        data_type: ``"clean_translation"`` or any other label; the rest of
            the notebook uses ``"error_correction"`` for the latter.

    Returns:
        For ``"clean_translation"`` always the fixed sentence
        ``"the translation accurately captures the meaning"``, whatever
        ``text`` is. Otherwise ``text`` with the ``|||END|||`` marker, the
        leading ``The model translation is "..."`` quotation and a trailing
        "the correct translation ..." remark removed, and without trailing
        periods. An empty string when there is nothing to clean.
    """
    if data_type == "clean_translation":
        return "the translation accurately captures the meaning"

    # None / NaN / blank input: nothing to clean.
    if not isinstance(text, str) or not text.strip():
        return ""

    text = text.replace("|||END|||", "").strip()

    if "The model translation is" in text:
        # Cut only once, at the first '."' (the end of the quoted model
        # translation). Splitting on every '."' and keeping the second piece
        # would drop everything after a later quotation that ends in '."'.
        _, closing, remainder = text.partition('."')
        if closing:
            text = remainder.strip()
        else:
            # No closing quote found: drop leading sentences that still talk
            # about the model translation.
            sentences = text.split(".")
            for i, sent in enumerate(sentences):
                if "model translation" not in sent.lower():
                    # Re-join with the exact separator that was split on so
                    # spacing (and things like "3.5") is restored unchanged.
                    text = ".".join(sentences[i:])
                    break

    # Remove the trailing statement of the reference translation. The loop
    # re-checks the already shortened text for the second pattern.
    for pattern in ["correct translation should be", "correct translation is"]:
        if pattern in text.lower():
            idx = text.lower().rfind("the correct translation")
            if idx != -1:
                text = text[:idx].strip()

    return text.strip().rstrip(".")


def format_example(row, source_col):
    """Build one prompt/completion training record from a dataset row.

    Args:
        row: Mapping-like row (supports ``row[key]`` and ``row.get``) with
            ``source_col``, ``"target_en"``, ``"data_type"`` and, for
            ``"error_correction"`` rows, ``"predicted_en"``. ``"analysis"``
            is optional.
        source_col: Name of the source-sentence column. The prompt says
            "Chinese" when the name contains ``"zh"`` and "German" otherwise.

    Returns:
        dict with keys ``"prompt"``, ``"completion"``, ``"data_type"``,
        ``source_col`` and ``"target_en"``, in that order.
    """
    source = row[source_col].strip()
    target = row["target_en"].strip()
    data_type = row["data_type"]
    explanation = clean_analysis(row.get("analysis", ""), data_type)

    if "zh" in source_col:
        language = "Chinese"
    else:
        language = "German"

    # Error rows start from the model's own prediction; every other row
    # starts from the reference, so initial and corrected text coincide.
    if data_type == "error_correction":
        first_attempt = row["predicted_en"].strip()
    else:
        first_attempt = target

    completion = "\n\n".join(
        (
            f"Initial translation: {first_attempt}",
            f"Analysis: {explanation}",
            f"Corrected translation: {target}",
        )
    )

    record = {}
    record["prompt"] = f"Translate the following {language} to English:\n{source}"
    record["completion"] = completion
    record["data_type"] = data_type
    record[source_col] = source
    record["target_en"] = target
    return record


def create_70_30_split(df, seed=42):
    """Return a shuffled frame with about 70% error rows and 30% clean rows.

    All ``"error_correction"`` rows are kept. ``"clean_translation"`` rows
    are down-sampled to ``int(n_error / 0.7 * 0.3)``; if fewer clean rows
    than that exist, all of them are used and the ratio is not reached.

    Args:
        df: Frame with a ``"data_type"`` column.
        seed: ``random_state`` used for both the down-sampling and the
            final shuffle.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    kinds = df["data_type"]
    errors = df[kinds == "error_correction"].copy()
    cleans = df[kinds == "clean_translation"].copy()

    # Clean rows needed so that the error rows make up 70% of the result.
    wanted = int(len(errors) / 0.7 * 0.3)

    if wanted <= len(cleans):
        cleans = cleans.sample(n=wanted, random_state=seed)

    mixed = pd.concat([errors, cleans], ignore_index=True)
    shuffled = mixed.sample(frac=1, random_state=seed)
    return shuffled.reset_index(drop=True)

In [17]:
source_col = SOURCE_COL[LANG_PAIR]

# Read every column as text and keep empty cells as "" instead of NaN, so the
# .strip() calls in the formatting helpers always receive strings.
df = pd.read_csv(
    INPUT_PATHS[LANG_PAIR],
    sep="\t",
    dtype=str,
    keep_default_na=False,
)
# Normalise header names: surrounding whitespace removed, lower-cased.
df.columns = df.columns.str.strip().str.lower()

n_error = df["data_type"].eq("error_correction").sum()
n_clean = df["data_type"].eq("clean_translation").sum()
print(f"Loaded {len(df)} examples (error: {n_error}, clean: {n_clean})")

# Preview: first error-correction row, before and after formatting.
error_ex = df.loc[df["data_type"] == "error_correction"].iloc[0]
print("\noriginal format")
print(error_ex.get("analysis", "")[:400])

formatted_ex = format_example(error_ex, source_col)
print("\nnew one")
print(formatted_ex["completion"])

Loaded 1600 examples (error: 800, clean: 800)

original format
The model translation is "If you play the Russian roulette with just one or two bullets, your chances of survival are certainly better than if you play with six, but the stakes are so high – or the value of what you are betting is so low – that it is not a smart bet."
The model adds a comparison to playing with six bullets that is not in the original text. The original statement is about how even 

new one
Initial translation: If you play the Russian roulette with just one or two bullets, your chances of survival are certainly better than if you play with six, but the stakes are so high – or the value of what you are betting is so low – that it is not a smart bet.

Analysis: The model adds a comparison to playing with six bullets that is not in the original text. The original statement is about how even with good survival odds, the stakes make it unwise. The model also changes "you are more likely to survive than not" to "y

In [18]:
# Convert every raw row into a prompt/completion record.
formatted = [format_example(record, source_col) for _, record in df.iterrows()]
formatted_df = pd.DataFrame(formatted)

# Hold out 12.5% for validation, stratified on data_type so both splits keep
# the same error/clean proportions as the full set.
train_df, val_df = train_test_split(
    formatted_df,
    test_size=0.125,
    random_state=SEED,
    stratify=formatted_df["data_type"],
)

train_error = train_df["data_type"].eq("error_correction").sum()
train_clean = train_df["data_type"].eq("clean_translation").sum()
val_error = val_df["data_type"].eq("error_correction").sum()
val_clean = val_df["data_type"].eq("clean_translation").sum()

print("50-50 Split:")
print(f"  Train: {len(train_df)} (error: {train_error}, clean: {train_clean})")
print(f"  Val: {len(val_df)} (error: {val_error}, clean: {val_clean})")

# Second training variant: same error rows, clean rows down-sampled towards
# a 70/30 mix. The validation set is shared with the first variant.
train_70_30 = create_70_30_split(train_df, SEED)
train_error_70 = train_70_30["data_type"].eq("error_correction").sum()
train_clean_70 = train_70_30["data_type"].eq("clean_translation").sum()

print("\n70-30 Split:")
print(
    f"  Train: {len(train_70_30)} "
    f"(error: {train_error_70} [{100*train_error_70/len(train_70_30):.1f}%], "
    f"clean: {train_clean_70} [{100*train_clean_70/len(train_70_30):.1f}%])"
)
print(f"  Val: {len(val_df)} (balanced)")

50-50 Split:
  Train: 1400 (error: 700, clean: 700)
  Val: 200 (error: 100, clean: 100)

70-30 Split:
  Train: 1000 (error: 700 [70.0%], clean: 300 [30.0%])
  Val: 200 (balanced)


In [19]:
output_dir = Path(OUTPUT_DIR[LANG_PAIR])
# The 70-30 variant goes to a sibling directory with a "_70-30" suffix.
output_dir_70_30 = Path(f"{output_dir}_70-30")

# Each variant gets its own train.tsv; the same validation split is written
# into both directories.
for label, split_dir, train_split in (
    ("50-50", output_dir, train_df),
    ("70-30", output_dir_70_30, train_70_30),
):
    split_dir.mkdir(parents=True, exist_ok=True)
    train_split.to_csv(split_dir / "train.tsv", sep="\t", index=False)
    val_df.to_csv(split_dir / "val.tsv", sep="\t", index=False)
    print(f"Saved {label} to {split_dir}")

Saved 50-50 to ../../data/processed/zh_en/self_correction_zh_en
Saved 70-30 to ../../data/processed/zh_en/self_correction_zh_en_70-30
