In [1]:
import os
import pandas as pd
import re
from collections import Counter

def fix_first_two_columns(row, expected_cols):
    """Split a row into cells, repairing a first cell that hides two columns.

    Args:
        row: Either a raw tab-delimited line (``str``) or a sequence of
            already-split cells (for example a parsed row whose first cell
            still contains an embedded tab).
        expected_cols: Number of columns a well-formed row should have.

    Returns:
        A list of cells. When ``row`` is a sequence that is exactly one cell
        short and its first cell contains a tab, that cell is split once so
        the row regains the missing column. Otherwise the cells are returned
        as they are.
    """
    if isinstance(row, str):
        # A raw line is split on every tab, so no resulting cell can still
        # contain one: there is nothing left to repair for string input.
        return row.split('\t')

    parts = list(row)
    first_cell_is_merged = (
        len(parts) == expected_cols - 1
        and isinstance(parts[0], str)
        and '\t' in parts[0]
    )
    if first_cell_is_merged:
        # Split only once: the aim is to recover exactly one missing column.
        return parts[0].split('\t', 1) + parts[1:]
    return parts

def is_garbage_sentence(text):
    """Return True when ``text`` looks anomalous and its row should be dropped.

    Non-string or blank input is always garbage. Otherwise the text is
    garbage as soon as any one of the heuristic rules R1-R7 below fires.
    """
    if not (isinstance(text, str) and text.strip()):
        return True

    raw_words = text.split()
    lowered_words = text.lower().split()
    frequency = Counter(lowered_words)

    rule_hits = (
        # R1: template/image tags
        re.search(r'thumb\d+px|File:|Image:', text, re.IGNORECASE) is not None,
        # R2: escape sequences or markup
        re.search(r"<nowiki\s*/>|'{2,}|\\[tn]|&[a-z]+;", text) is not None,
        # R3: at least two very long "words" (12+ word characters each)
        len(re.findall(r'\b\w{12,}\b', text)) >= 2,
        # R4: fragment starting with a digit, dot, slash, comma or quote
        re.match(r"^[\d\./,'\"]", text.strip()) is not None,
        # R5: some token (case-insensitive) occurs four or more times
        max(frequency.values(), default=0) >= 4,
        # R6: three or more all-uppercase tokens longer than one character
        len([word for word in raw_words if len(word) > 1 and word.isupper()]) >= 3,
        # R7: the whole text is one token repeated three or more times
        len(frequency) == 1 and len(lowered_words) >= 3,
    )
    return any(rule_hits)

def clean_text(text):
    """Return a cleaned copy of a sentence that survived the garbage filter.

    Non-string input is returned unchanged. For strings the steps are, in
    order: drop wiki emphasis markup (runs of two or more apostrophes), drop
    ``<nowiki/>`` tags, replace slashes (with surrounding whitespace) by a
    single space, drop ``KG <digits>`` codes, strip leading commas and
    whitespace, collapse whitespace runs, and capitalize all-uppercase tokens
    longer than one character.

    Args:
        text: The cell value to clean.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text
    # Only runs of 2+ apostrophes are markup. A lone apostrophe is ordinary
    # punctuation ("don't", "O'Neil") and must survive cleaning.
    text = re.sub(r"'{2,}", '', text)
    # Remove nowiki tags
    text = re.sub(r"<nowiki\s*/>", '', text)
    # Replace slashes (and any whitespace around them) with a single space
    text = re.sub(r"\s*/+\s*", ' ', text)
    # Remove 'KG 76' style codes
    text = re.sub(r"\bKG\s*\d+\b", '', text)
    # Remove leading commas and whitespace
    text = re.sub(r"^[,\s]+", '', text)
    # Normalize whitespace
    text = re.sub(r"\s+", ' ', text)
    # Tone down shouting: "HELLO" -> "Hello"; single letters are left alone
    text = ' '.join(
        tok.capitalize() if tok.isupper() and len(tok) > 1 else tok
        for tok in text.split()
    )
    return text.strip()

def clean_and_process_file(filepath):
    """Clean one delimited file and split its rows into cleaned/abnormal outputs.

    Simplified cleaner that auto-detects the column count: it is the number
    of non-blank cells in the first row. Every row is truncated or padded to
    that width. Rows whose first or second cell is flagged by
    ``is_garbage_sentence`` are written to ``abnormal_<name>``; the remaining
    rows have their first two cells passed through ``clean_text`` and are
    written to ``cleaned_<name>``. Both outputs go next to the input file as
    tab-separated text without header or index, and each is only written when
    it has at least one row.

    Args:
        filepath: Path of the input file. Names ending in ``.tsv`` are read
            tab-separated, anything else comma-separated.

    Returns:
        Tuple ``(cleaned_row_count, abnormal_row_count)``; ``(0, 0)`` when the
        file holds no data.
    """
    print(f"\nProcessing: {filepath}")

    # Pick the separator from the file extension
    sep = '\t' if filepath.endswith('.tsv') else ','

    # Read the file
    try:
        df = pd.read_csv(filepath, sep=sep, header=None, dtype=str, keep_default_na=False)
    except pd.errors.EmptyDataError:
        # A zero-byte file has nothing to parse; report it the same way as an
        # empty frame instead of failing in the fallback reader.
        print("File is empty")
        return 0, 0
    except Exception:
        # Fallback reader: let the python engine sniff the separator.
        # NOTE(review): the recorded run of this notebook shows this fallback
        # itself failing on some inputs ("'\t' expected after '\"'");
        # consider quoting=3 / on_bad_lines='skip' here - confirm.
        df = pd.read_csv(filepath, sep=None, engine='python', header=None,
                         dtype=str, keep_default_na=False)

    if df.empty:
        print("File is empty")
        return 0, 0

    # Detect the expected column count from the first row
    first_row = df.iloc[0]
    expected_columns = len([col for col in first_row if pd.notna(col) and str(col).strip()])
    print(f"Detected {expected_columns} columns from first row")

    cleaned_rows = []
    abnormal_rows = []

    for _, row in df.iterrows():
        row_list = row.tolist()

        # If the row is too short, try to repair it by re-splitting
        if len(row_list) < expected_columns:
            row_str = '\t'.join(str(x) for x in row_list)
            row_list = fix_first_two_columns(row_str, expected_columns)

        # Force the row to the expected width
        if len(row_list) > expected_columns:
            row_list = row_list[:expected_columns]
        elif len(row_list) < expected_columns:
            row_list = row_list + [''] * (expected_columns - len(row_list))

        # NOTE(review): this assumes columns 0 and 1 hold the sentence text.
        # A leading numeric id column would be rejected by the "starts with a
        # digit" rule on every row - confirm per dataset.
        if any(is_garbage_sentence(str(cell)) for cell in row_list[:2]):
            abnormal_rows.append(row_list)
            continue

        # Clean only the first two columns; pass the rest through untouched
        cleaned_rows.append(
            [clean_text(str(cell)) if i < 2 else cell for i, cell in enumerate(row_list)]
        )

    # Save the results next to the input file
    base = os.path.basename(filepath)
    dirname = os.path.dirname(filepath)

    if cleaned_rows:
        cleaned_path = os.path.join(dirname, f"cleaned_{base}")
        cleaned_df = pd.DataFrame(cleaned_rows)
        cleaned_df.to_csv(cleaned_path, sep='\t', header=False, index=False)
        print(f"✓ Saved {len(cleaned_rows)} cleaned rows to: {cleaned_path}")

    if abnormal_rows:
        abnormal_path = os.path.join(dirname, f"abnormal_{base}")
        abnormal_df = pd.DataFrame(abnormal_rows)
        abnormal_df.to_csv(abnormal_path, sep='\t', header=False, index=False)
        print(f"✗ Saved {len(abnormal_rows)} abnormal rows to: {abnormal_path}")

    return len(cleaned_rows), len(abnormal_rows)

def batch_clean_data(root_path):
    """Clean every .tsv/.csv file found anywhere under ``root_path``.

    Files whose names start with ``cleaned_`` or ``abnormal_`` are skipped so
    earlier outputs are not processed again. Each remaining file is handed to
    ``clean_and_process_file``, which detects that file's column count. A
    failure on one file is printed and the walk continues. A summary of the
    totals is printed at the end; nothing is returned.
    """
    banner = "=" * 50
    print(f"Starting batch processing in: {root_path}")
    print(banner)

    cleaned_total = 0
    abnormal_total = 0
    file_count = 0

    for folder, _, names in os.walk(root_path):
        for name in names:
            if not name.endswith(('.tsv', '.csv')):
                continue
            # Skip files produced by an earlier run
            if name.startswith(('cleaned_', 'abnormal_')):
                continue

            target = os.path.join(folder, name)
            try:
                n_cleaned, n_abnormal = clean_and_process_file(target)
                cleaned_total += n_cleaned
                abnormal_total += n_abnormal
                file_count += 1
            except Exception as exc:
                print(f"ERROR processing {target}: {str(exc)}")

    print("\n" + banner)
    print("Processing complete!")
    print(f"Files processed: {file_count}")
    print(f"Total cleaned rows: {cleaned_total}")
    print(f"Total abnormal rows: {abnormal_total}")
    print(banner)

In [2]:
# Run the cleaner over every .tsv/.csv file found under ../data.
batch_clean_data("../data")

Starting batch processing in: ../data

Processing: ../data/da/train.enzh.df.short.tsv
ERROR processing ../data/da/train.enzh.df.short.tsv: '	' expected after '"'

Processing: ../data/da/train.roen.df.short.tsv
Detected 8 columns from first row
✓ Saved 1 cleaned rows to: ../data/da/cleaned_train.roen.df.short.tsv
✗ Saved 6776 abnormal rows to: ../data/da/abnormal_train.roen.df.short.tsv

Processing: ../data/da/train.neen.df.short.tsv
ERROR processing ../data/da/train.neen.df.short.tsv: '	' expected after '"'

Processing: ../data/da/train.ende.df.short.tsv
Detected 8 columns from first row
✓ Saved 1 cleaned rows to: ../data/da/cleaned_train.ende.df.short.tsv
✗ Saved 7000 abnormal rows to: ../data/da/abnormal_train.ende.df.short.tsv

Processing: ../data/mqm/zh-en/zh-en-mqm.2021-ted.csv
ERROR processing ../data/mqm/zh-en/zh-en-mqm.2021-ted.csv: Expected 0 fields in line 4, saw 1

Processing: ../data/mqm/zh-en/zh-en-mqm.2020.csv
Detected 8 columns from first row
✗ Saved 20001 abnormal rows 

In [None]:
import os
import pandas as pd
import re
from collections import Counter

def fix_first_two_columns(row, expected_cols):
    """Split a row into cells, repairing a first cell that hides two columns.

    Args:
        row: Either a raw tab-delimited line (``str``) or a sequence of
            already-split cells (for example a parsed row whose first cell
            still contains an embedded tab).
        expected_cols: Number of columns a well-formed row should have.

    Returns:
        A list of cells. When ``row`` is a sequence that is exactly one cell
        short and its first cell contains a tab, that cell is split once so
        the row regains the missing column. Otherwise the cells are returned
        as they are.
    """
    if isinstance(row, str):
        # A raw line is split on every tab, so no resulting cell can still
        # contain one: there is nothing left to repair for string input.
        return row.split('\t')

    parts = list(row)
    first_cell_is_merged = (
        len(parts) == expected_cols - 1
        and isinstance(parts[0], str)
        and '\t' in parts[0]
    )
    if first_cell_is_merged:
        # Split only once: the aim is to recover exactly one missing column.
        return parts[0].split('\t', 1) + parts[1:]
    return parts

def split_row_fields(row):
    """Separate leading text from bracketed lists and decimals fused into one field.

    If ``row`` contains a ``[...]`` group, everything before the first such
    group is the text part (stripped), and every ``[...]`` group or decimal
    number (``-?digits.digits``) found from that point on becomes its own
    field. Without a bracketed group the row is returned as a single,
    unmodified field.
    """
    match = re.match(r'^(.*?)(\s*)(\[.*\].*)$', row)
    if match is None:
        return [row]

    leading_text = match.group(1).strip()
    trailing = match.group(3).strip()
    if not trailing:
        return [leading_text]
    return [leading_text, *re.findall(r'\[[^\]]+\]|-?\d+\.\d+', trailing)]

def is_garbage_sentence(text):
    """Return True when ``text`` looks anomalous and its row should be dropped.

    Non-string or blank input is always garbage. Otherwise the text is
    garbage as soon as any one of the heuristic rules R1-R7 below fires.
    """
    if not (isinstance(text, str) and text.strip()):
        return True

    raw_words = text.split()
    lowered_words = text.lower().split()
    frequency = Counter(lowered_words)

    rule_hits = (
        # R1: template/image tags
        re.search(r'thumb\d+px|File:|Image:', text, re.IGNORECASE) is not None,
        # R2: escape sequences or markup
        re.search(r"<nowiki\s*/>|'{2,}|\\[tn]|&[a-z]+;", text) is not None,
        # R3: at least two very long "words" (12+ word characters each)
        len(re.findall(r'\b\w{12,}\b', text)) >= 2,
        # R4: fragment starting with a digit, dot, slash, comma or quote
        re.match(r"^[\d\./,'\"]", text.strip()) is not None,
        # R5: some token (case-insensitive) occurs four or more times
        max(frequency.values(), default=0) >= 4,
        # R6: three or more all-uppercase tokens longer than one character
        len([word for word in raw_words if len(word) > 1 and word.isupper()]) >= 3,
        # R7: the whole text is one token repeated three or more times
        len(frequency) == 1 and len(lowered_words) >= 3,
    )
    return any(rule_hits)

def clean_text(text):
    """Return a cleaned copy of a sentence that survived the garbage filter.

    Non-string input is returned unchanged. For strings the steps are, in
    order: drop wiki emphasis markup (runs of two or more apostrophes), drop
    ``<nowiki/>`` tags, replace slashes (with surrounding whitespace) by a
    single space, drop ``KG <digits>`` codes, strip leading commas and
    whitespace, collapse whitespace runs, and capitalize all-uppercase tokens
    longer than one character.

    Args:
        text: The cell value to clean.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text
    # Only runs of 2+ apostrophes are markup. A lone apostrophe is ordinary
    # punctuation ("don't", "O'Neil") and must survive cleaning.
    text = re.sub(r"'{2,}", '', text)
    # Remove nowiki tags
    text = re.sub(r"<nowiki\s*/>", '', text)
    # Replace slashes (and any whitespace around them) with a single space
    text = re.sub(r"\s*/+\s*", ' ', text)
    # Remove 'KG 76' style codes
    text = re.sub(r"\bKG\s*\d+\b", '', text)
    # Remove leading commas and whitespace
    text = re.sub(r"^[,\s]+", '', text)
    # Normalize whitespace
    text = re.sub(r"\s+", ' ', text)
    # Tone down shouting: "HELLO" -> "Hello"; single letters are left alone
    text = ' '.join(
        tok.capitalize() if tok.isupper() and len(tok) > 1 else tok
        for tok in text.split()
    )
    return text.strip()

def fix_row_column_count(row, expected_columns):
    """Bring ``row`` toward ``expected_columns`` cells, splitting cells on tab characters.

    Behaviour by case:
      * one string cell: it is split on tabs and truncated to
        ``expected_columns``; a result that is still too short is returned
        as-is (not padded).
      * fewer cells than expected: every string cell containing a tab is
        expanded in place, then the row is truncated or right-padded with
        empty strings to exactly ``expected_columns``.
      * more cells than expected: the row is truncated.
      * exactly the expected number: the row is returned unchanged.
    """
    width = len(row)

    if width == 1 and isinstance(row[0], str):
        # Slicing is a no-op when the split yields fewer cells than expected.
        return row[0].split('\t')[:expected_columns]

    if width > expected_columns:
        return row[:expected_columns]
    if width == expected_columns:
        return row

    # Too few cells: expand any cell that still carries embedded tabs.
    expanded = []
    for cell in row:
        if isinstance(cell, str) and '\t' in cell:
            expanded.extend(cell.split('\t'))
        else:
            expanded.append(cell)

    shortfall = expected_columns - len(expanded)
    if shortfall > 0:
        return expanded + [''] * shortfall
    return expanded[:expected_columns]

def clean_and_process_file(filepath, expected_columns=None):
    """Clean one delimited file and split its rows into cleaned/abnormal outputs.

    Each row is re-joined with tabs and run through the column-repair helpers
    (``fix_first_two_columns``, ``split_row_fields``,
    ``fix_row_column_count``). Rows that still do not have
    ``expected_columns`` cells, or whose first or second cell is flagged by
    ``is_garbage_sentence``, are written to ``abnormal_<name>``. The remaining
    rows have their first two cells passed through ``clean_text`` and are
    written to ``cleaned_<name>``. Both outputs go next to the input file as
    tab-separated text without header or index, and each is only written when
    it has at least one row.

    Args:
        filepath: Path of the input file. Names ending in ``.tsv`` are read
            tab-separated, anything else comma-separated.
        expected_columns: Number of cells a valid row must have. When ``None``
            the most common per-row count of non-missing cells is used
            (3 if the file yields no rows).

    Returns:
        Tuple ``(cleaned_row_count, abnormal_row_count)``; ``(0, 0)`` when the
        file holds no data.
    """
    # Pick the separator from the file extension
    sep = '\t' if filepath.endswith('.tsv') else ','

    try:
        # First attempt: read normally
        df = pd.read_csv(filepath, sep=sep, header=None, dtype=str, keep_default_na=False)
    except pd.errors.EmptyDataError:
        # A zero-byte file has nothing to parse; treat it as "no rows"
        # instead of failing again in the fallback reader.
        print("File is empty")
        return 0, 0
    except Exception:
        # Second attempt for problematic files: sniff the separator, treat
        # quote characters literally (quoting=3 is csv.QUOTE_NONE) and skip
        # lines that cannot be parsed.
        df = pd.read_csv(filepath, sep=None, engine='python', header=None,
                         quoting=3, on_bad_lines='skip', dtype=str, keep_default_na=False)

    # Determine expected columns if not provided
    if expected_columns is None:
        # Use the mode of the per-row non-missing cell counts
        col_counts = df.apply(lambda x: len(x.dropna()), axis=1)
        # int() turns the numpy integer from mode() into a plain Python int
        expected_columns = int(col_counts.mode()[0]) if not col_counts.empty else 3

    cleaned_rows = []
    abnormal_rows = []

    for _, row in df.iterrows():
        # Re-join the row so the repair helpers can re-split it
        row_str = '\t'.join(str(x) for x in row.tolist() if pd.notna(x))

        parts = fix_first_two_columns(row_str, expected_columns)

        # If the column count is still wrong, try peeling list/number fields
        if len(parts) != expected_columns:
            parts = split_row_fields(row_str)

        parts = fix_row_column_count(parts, expected_columns)

        # Rows that cannot be brought to the expected width are abnormal
        if len(parts) != expected_columns:
            abnormal_rows.append(parts)
            continue

        # Garbage in either of the first two columns rejects the row
        if any(is_garbage_sentence(part) for part in parts[:2]):
            abnormal_rows.append(parts)
            continue

        # Only the first two columns are assumed to be text and get cleaned
        cleaned_rows.append(
            [clean_text(part) if i < 2 else part for i, part in enumerate(parts)]
        )

    # Save results next to the input file
    base = os.path.basename(filepath)
    dirname = os.path.dirname(filepath)
    cleaned_path = os.path.join(dirname, f"cleaned_{base}")
    abnormal_path = os.path.join(dirname, f"abnormal_{base}")

    columns = [f'col{i}' for i in range(expected_columns)]

    if cleaned_rows:
        cleaned_df = pd.DataFrame(cleaned_rows, columns=columns)
        cleaned_df.to_csv(cleaned_path, sep='\t', header=False, index=False)
        print(f"✓ Cleaned data saved: {cleaned_path} ({len(cleaned_rows)} rows)")

    if abnormal_rows:
        # Abnormal rows can differ in length; pad them to a common width
        max_cols = max(len(row) for row in abnormal_rows)
        padded_abnormal = [row + [''] * (max_cols - len(row)) for row in abnormal_rows]

        abnormal_columns = [f'col{i}' for i in range(max_cols)]
        abnormal_df = pd.DataFrame(padded_abnormal, columns=abnormal_columns)
        abnormal_df.to_csv(abnormal_path, sep='\t', header=False, index=False)
        print(f"✗ Abnormal data saved: {abnormal_path} ({len(abnormal_rows)} rows)")

    return len(cleaned_rows), len(abnormal_rows)

def batch_clean_data(root_path, expected_columns=None):
    """Clean every .tsv/.csv file found anywhere under ``root_path``.

    Files whose names start with ``cleaned_`` or ``abnormal_`` are skipped so
    earlier outputs are not processed again. Each remaining file is handed to
    ``clean_and_process_file`` together with ``expected_columns``. A failure
    on one file is printed and the walk continues. A summary of the totals is
    printed at the end; nothing is returned.
    """
    cleaned_total = 0
    abnormal_total = 0
    file_count = 0

    print(f"Starting batch processing in: {root_path}")
    print("-" * 50)

    for folder, _, names in os.walk(root_path):
        for name in names:
            if not name.endswith(('.tsv', '.csv')):
                continue
            # Skip files produced by an earlier run
            if name.startswith(('cleaned_', 'abnormal_')):
                continue

            target = os.path.join(folder, name)
            print(f"\nProcessing: {target}")

            try:
                n_cleaned, n_abnormal = clean_and_process_file(target, expected_columns)
                cleaned_total += n_cleaned
                abnormal_total += n_abnormal
                file_count += 1
            except Exception as exc:
                print(f"ERROR processing {target}: {str(exc)}")

    closing_rule = "=" * 50
    print("\n" + closing_rule)
    print("Batch processing complete!")
    print(f"Files processed: {file_count}")
    print(f"Total cleaned rows: {cleaned_total}")
    print(f"Total abnormal rows: {abnormal_total}")
    print(closing_rule)



In [4]:
# IPython shell escape: run `mtdata echo` for one dataset id and redirect its
# standard output into a .tsv file in the current directory.
!mtdata echo Statmt-commoncrawl_wmt13-1-ces-eng > statmt_commoncrawl_wmt13-1-ces-eng.tsv

2025-05-14 23:37:14 main.echo_data:92 INFO:: Total rows=161,838


In [5]:
# Language-pair label (used as the output sub-directory name) and the mtdata
# dataset ids to fetch for it. ces = Czech, eng = English.
lang = "ces-eng"
ids = [
    "Statmt-europarl-10-ces-eng",
    "ParaCrawl-paracrawl-9-eng-ces",
    "Statmt-commoncrawl_wmt13-1-ces-eng",
    "Statmt-news_commentary-18.1-ces-eng"
]

In [6]:
output_dir = f"../data/{lang}"
os.makedirs(output_dir, exist_ok=True)

for id in ids:
    output_file = os.path.join(output_dir, f"{id}.tsv")
    !mtdata echo {id} > "{output_file}"

2025-05-15 00:09:10 cache.download:212 INFO:: Downloading: http://www.statmt.org/europarl/v10/training/europarl-v10.cs-en.tsv.gz → /Users/ningmac/.mtdata/www.statmt.org/25dc/31ec77aca23b312f9f89f50e5b8a/europarl.tsv.gz
2025-05-15 00:09:30 main.echo_data:92 INFO:: Total rows=645,330
2025-05-15 00:09:31 cache.download:212 INFO:: Downloading: https://s3.amazonaws.com/web-language-models/paracrawl/release9/en-cs/en-cs.txt.gz → /Users/ningmac/.mtdata/s3.amazonaws.com/24b2/decfb7905a16f1e59bd6a3b055e5/paracrawl.tsv.gz
2025-05-15 00:47:01 main.echo_data:92 INFO:: Total rows=50,632,492
2025-05-15 00:47:04 main.echo_data:92 INFO:: Total rows=161,838
2025-05-15 00:47:08 main.echo_data:92 INFO:: Total rows=272,440


In [7]:
# Further Czech-English (ces-eng) mtdata dataset ids to fetch.
lang = "ces-eng"
ids_ces_eng = [
    "Statmt-wikititles-3-ces-eng",
    "Facebook-wikimatrix-1-ces-eng",
    "Tilde-eesc-2017-ces-eng",
    "Tilde-ema-2016-ces-eng",
    "Tilde-ecb-2017-ces-eng",
    "Tilde-rapid-2019-ces-eng"]
def save_mtdata_to_csv(lang, ids):
    output_dir = f"../data/{lang}"
    os.makedirs(output_dir, exist_ok=True)

    for id in ids:
        output_file = os.path.join(output_dir, f"{id}.tsv")
        !mtdata echo {id} > "{output_file}"
    return None
# Fetch every dataset listed in ids_ces_eng into ../data/ces-eng.
save_mtdata_to_csv(lang, ids_ces_eng)

2025-05-15 00:47:11 main.echo_data:92 INFO:: Total rows=410,978
2025-05-15 00:47:13 cache.download:212 INFO:: Downloading: https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.cs-en.tsv.gz → /Users/ningmac/.mtdata/dl.fbaipublicfiles.com/8d48/b1bee8b8682a74869fadbc923b6a/wikimatrix.tsv.gz
2025-05-15 00:47:44 main.echo_data:92 INFO:: Total rows=2,094,650
2025-05-15 00:47:56 tmx.parse_tmx:41 INFO:: 0:00:10 :: Parsed: 853,560
2025-05-15 00:48:02 tmx.read_tmx:87 INFO:: Extracted 1329010 pairs from TMX ZipPath(root=PosixPath('/Users/ningmac/.mtdata/tilde-model.s3-eu-west-1.amazonaws.com/168b/c72c2e4df1f10445666b8e81d6c7/eesc.zip'), name='EESC.cs-en.tmx')
2025-05-15 00:48:02 main.echo_data:92 INFO:: Total rows=1,329,010
2025-05-15 00:48:03 cache.download:212 INFO:: Downloading: https://tilde-model.s3-eu-west-1.amazonaws.com/EMA2016.cs-en.tmx.zip → /Users/ningmac/.mtdata/tilde-model.s3-eu-west-1.amazonaws.com/3339/0b1834dad80a90f0bdc9edf5ae44/ema.zip
2025-05-15 00:48:11 tmx.read_tmx:8

In [8]:
# English-Chinese (eng-zho) mtdata dataset ids, fetched into ../data/eng-zho.
lang2 = "eng-zho"
ids_eng_zho = [
    "ParaCrawl-paracrawl-1_bonus-eng-zho",
    "Statmt-news_commentary-18.1-eng-zho",
    "Statmt-wikititles-3-zho-eng",
    "OPUS-unpc-v1.0-eng-zho",
    "Facebook-wikimatrix-1-eng-zho",
    "Statmt-backtrans_enzh-wmt20-eng-zho"]
save_mtdata_to_csv(lang2, ids_eng_zho)

2025-05-15 00:48:22 cache.download:212 INFO:: Downloading: https://s3.amazonaws.com/web-language-models/paracrawl/bonus/en-zh-v1.txt.gz → /Users/ningmac/.mtdata/s3.amazonaws.com/f873/a9dcc827e92e62ac138830dea908/paracrawl.tsv.gz
2025-05-15 00:54:32 main.echo_data:92 INFO:: Total rows=14,170,585
2025-05-15 00:54:34 cache.download:212 INFO:: Downloading: http://data.statmt.org/news-commentary/v18.1/training/news-commentary-v18.en-zh.tsv.gz → /Users/ningmac/.mtdata/data.statmt.org/a5b5/5d06ca91b6900fceca3f186042b9/news_commentary.tsv.gz
2025-05-15 00:54:41 main.echo_data:92 INFO:: Total rows=454,824
2025-05-15 00:54:43 cache.download:212 INFO:: Downloading: http://data.statmt.org/wikititles/v3/wikititles-v3.zh-en.tsv → /Users/ningmac/.mtdata/data.statmt.org/e444/6989fe6b6d718223f6ca762d5b1f/wikititles.tsv
2025-05-15 00:54:49 main.echo_data:92 INFO:: Total rows=922,195
2025-05-15 00:54:51 cache.download:212 INFO:: Downloading: https://object.pouta.csc.fi/OPUS-UNPC/v1.0/moses/en-zh.txt.zip 

In [9]:
# English-German (eng-deu) mtdata dataset ids, fetched into ../data/eng-deu.
lang3 = "eng-deu"
ids_eng_deu = [
    "Statmt-europarl-10-deu-eng",
    "ParaCrawl-paracrawl-9-eng-deu",
    "Statmt-commoncrawl_wmt13-1-deu-eng",
    "Statmt-news_commentary-18.1-deu-eng",
    "Statmt-wikititles-3-deu-eng",
    "Facebook-wikimatrix-1-deu-eng",
    "Tilde-eesc-2017-deu-eng",
    "Tilde-ema-2016-deu-eng",
    "Tilde-airbaltic-1-deu-eng",
    "Tilde-czechtourism-1-deu-eng",
    "Tilde-ecb-2017-deu-eng",
    "Tilde-rapid-2016-deu-eng",
    "Tilde-rapid-2019-deu-eng"]
save_mtdata_to_csv(lang3, ids_eng_deu)

2025-05-15 01:06:43 cache.download:212 INFO:: Downloading: http://www.statmt.org/europarl/v10/training/europarl-v10.de-en.tsv.gz → /Users/ningmac/.mtdata/www.statmt.org/bec3/4b8f5740a25562da9cc9a06ecd1d/europarl.tsv.gz
2025-05-15 01:07:18 main.echo_data:92 INFO:: Total rows=1,828,521
2025-05-15 01:07:20 cache.download:212 INFO:: Downloading: https://s3.amazonaws.com/web-language-models/paracrawl/release9/en-de/en-de.txt.gz → /Users/ningmac/.mtdata/s3.amazonaws.com/4fd6/6719600a2e8880ba41ed442ac730/paracrawl.tsv.gz
2025-05-15 02:40:53 main.echo_data:92 INFO:: Total rows=278,310,907
2025-05-15 02:41:07 main.echo_data:92 INFO:: Total rows=2,399,123
2025-05-15 02:41:08 cache.download:212 INFO:: Downloading: http://data.statmt.org/news-commentary/v18.1/training/news-commentary-v18.de-en.tsv.gz → /Users/ningmac/.mtdata/data.statmt.org/963a/5f82977774b97f7a360517698777/news_commentary.tsv.gz
2025-05-15 02:41:16 main.echo_data:92 INFO:: Total rows=449,333
2025-05-15 02:41:18 cache.download:212

In [10]:
# English-Spanish (eng-spa) mtdata dataset ids, first batch, fetched into
# ../data/eng-spa.
lang4 = "eng-spa"
ids_eng_spa = [
    "Statmt-commoncrawl_wmt13-1-spa-eng",
    "Statmt-europarl-7-spa-eng",
    "Statmt-news_commentary-18.1-eng-spa",
    "Statmt-ccaligned-1-eng-spa",
    "ParaCrawl-paracrawl-9-eng-spa",
    "Tilde-eesc-2017-eng-spa",
    "Tilde-ema-2016-eng-spa",
    "Tilde-czechtourism-1-eng-spa",
    "Tilde-ecb-2017-eng-spa",
    "Tilde-rapid-2016-eng-spa",
    "Tilde-worldbank-1-eng-spa",
    "Facebook-wikimatrix-1-eng-spa",
    "EU-ecdc-1-eng-spa",
    "EU-eac_forms-1-eng-spa",
    "EU-eac_reference-1-eng-spa",
    "EU-dcep-1-eng-spa",
    "LinguaTools-wikititles-2014-eng-spa",
    "Neulab-tedtalks_train-1-eng-spa",
    "OPUS-books-v1-eng-spa",
    "OPUS-dgt-v2019-eng-spa",
    "OPUS-dgt-v4-eng-spa",
    "OPUS-ecb-v1-eng-spa",
    "OPUS-elitr_eca-v1-eng-spa",
    "OPUS-elra_w0147-v1-eng-spa",
    "OPUS-elra_w0305-v1-eng-spa",
    "OPUS-elrc_1076_euipo_law-v1-eng-spa",
    "OPUS-elrc_1082_cnio-v1-eng-spa",
    "OPUS-elrc_1083_aecosan-v1-eng-spa",
    "OPUS-elrc_1084_agencia_tributaria-v1-eng-spa",
    "OPUS-elrc_1096_euipo_list-v1-eng-spa",
    "OPUS-elrc_1125_cordis_news-v1-eng-spa"]
save_mtdata_to_csv(lang4, ids_eng_spa)

2025-05-15 02:44:59 main.echo_data:92 INFO:: Total rows=1,845,286
2025-05-15 02:45:01 cache.download:212 INFO:: Downloading: http://www.statmt.org/europarl/v7/es-en.tgz → /Users/ningmac/.mtdata/www.statmt.org/f9bf/02547baf31a5af3fec93edad91b9/europarl.tgz
2025-05-15 02:45:25 utils.extract:215 INFO:: extracting /Users/ningmac/.mtdata/www.statmt.org/f9bf/02547baf31a5af3fec93edad91b9/europarl.tgz
2025-05-15 02:45:38 main.echo_data:92 INFO:: Total rows=1,965,734
2025-05-15 02:45:40 cache.download:212 INFO:: Downloading: http://data.statmt.org/news-commentary/v18.1/training/news-commentary-v18.en-es.tsv.gz → /Users/ningmac/.mtdata/data.statmt.org/ec45/dc1e599f26aec7a35c0d56b2bcdb/news_commentary.tsv.gz
2025-05-15 02:45:48 main.echo_data:92 INFO:: Total rows=513,608
2025-05-15 02:45:50 cache.download:212 INFO:: Downloading: http://data.statmt.org/cc-aligned/sentence-aligned/en_XX-es_XX.tsv.xz → /Users/ningmac/.mtdata/data.statmt.org/e806/b7d69a7a2e7eb4a1615c7936e76e/ccaligned.tsv.xz
2025-05-

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



2025-05-15 03:15:26 cache.download:212 INFO:: Downloading: http://data.statmt.org/DCEP/EN-ES.tsv.xz → /Users/ningmac/.mtdata/data.statmt.org/4c18/83605d69e3040c0c76896531a7bd/dcep.tsv.xz
2025-05-15 03:16:00 main.echo_data:92 INFO:: Total rows=3,710,534
2025-05-15 03:16:01 cache.download:212 INFO:: Downloading: https://www.dropbox.com/s/7kb21jpaegtdprs/wikititles-2014_enes.tgz?dl=1 → /Users/ningmac/.mtdata/www.dropbox.com/56c8/f6fb64fbca1eeb727c2fe150218c/wikititles.tgz
2025-05-15 03:16:09 utils.extract:215 INFO:: extracting /Users/ningmac/.mtdata/www.dropbox.com/56c8/f6fb64fbca1eeb727c2fe150218c/wikititles.tgz
2025-05-15 03:17:22 main.echo_data:92 INFO:: Total rows=16,598,521
2025-05-15 03:17:24 cache.download:212 INFO:: Downloading: http://phontron.com/data/ted_talks.tar.gz → /Users/ningmac/.mtdata/phontron.com/c251/c510aec60c779b4c603b717c6a49/neulab_ted_talksv1.tar.gz
2025-05-15 03:18:15 utils.extract:215 INFO:: extracting /Users/ningmac/.mtdata/phontron.com/c251/c510aec60c779b4c603

In [11]:
# English-Spanish mtdata dataset ids, second batch. Reuses lang4 ("eng-spa")
# from the earlier cell, so the files land in the same ../data/eng-spa folder.
ids_eng_spa2 = [
    "OPUS-elrc_1126_cordis_results_brief-v1-eng-spa",
    "OPUS-elrc_2015_euipo_2017-v1-eng-spa",
    "OPUS-elrc_2410_portal_oficial_turis-v1-eng-spa",
    "OPUS-elrc_2478_glossrio_pt_en-v1-eng-spa",
    "OPUS-elrc_2479_lei_orgnica_2-v1-eng-spa",
    "OPUS-elrc_2480_estatuto_dos_deputad-v1-eng-spa",
    "OPUS-elrc_2481_constituio_da_repbli-v1-eng-spa",
    "OPUS-elrc_2498_plan_nacional_e-v1-eng-spa",
    "OPUS-elrc_2502_termitur-v1-eng-spa",
    "OPUS-elrc_2503_descripciones_vulner-v1-eng-spa",
    "OPUS-elrc_2536_estatuto_da_vtima-v1-eng-spa",
    "OPUS-elrc_2538_lei_25_2009-v1-eng-spa",
    "OPUS-elrc_2543_inteliterm-v1-eng-spa",
    "OPUS-elrc_2558_government_websites_-v1-eng-spa",
    "OPUS-elrc_2612_artigos_visitportuga-v1-eng-spa",
    "OPUS-elrc_2614_localidades_2007-v1-eng-spa",
    "OPUS-elrc_2616_museus_2007-v1-eng-spa",
    "OPUS-elrc_2622_arquitectura_2007-v1-eng-spa",
    "OPUS-elrc_2623_patrimnio_aores_2006-v1-eng-spa",
    "OPUS-elrc_2638_monumentos_2007-v1-eng-spa",
    "OPUS-elrc_2639_parques_e_reservas-v1-eng-spa",
    "OPUS-elrc_2641_praias_2007-v1-eng-spa",
    "OPUS-elrc_2722_emea-v1-eng-spa",
    "OPUS-elrc_2738_vaccination-v1-eng-spa",
    "OPUS-elrc_2881_eu_publications_medi-v1-eng-spa",
    "OPUS-elrc_3077_wikipedia_health-v1-eng-spa",
    "OPUS-elrc_3210_antibiotic-v1-eng-spa",
    "OPUS-elrc_3299_europarl_covid-v1-eng-spa",
    "OPUS-elrc_3470_ec_europa_covid-v1-eng-spa",
    "OPUS-elrc_3571_eur_lex_covid-v1-eng-spa",
    "OPUS-elrc_3612_presscorner_covid-v1-eng-spa"]
save_mtdata_to_csv(lang4, ids_eng_spa2)

2025-05-15 03:20:34 cache.download:212 INFO:: Downloading: https://object.pouta.csc.fi/OPUS-ELRC-1126-CORDIS_Results_Brief/v1/moses/en-es.txt.zip → /Users/ningmac/.mtdata/object.pouta.csc.fi/38c7/5f735fa869f892e46d37386ea08b/elrc_1126_cordis_results_brief.zip
2025-05-15 03:20:38 main.echo_data:92 INFO:: Total rows=242,561
2025-05-15 03:20:39 cache.download:212 INFO:: Downloading: https://object.pouta.csc.fi/OPUS-ELRC-2015-EUIPO_2017/v1/moses/en-es.txt.zip → /Users/ningmac/.mtdata/object.pouta.csc.fi/9894/aa8956ebf0c55da62d3c6e801780/elrc_2015_euipo_2017.zip
2025-05-15 03:20:43 main.echo_data:92 INFO:: Total rows=16,440
2025-05-15 03:20:45 cache.download:212 INFO:: Downloading: https://object.pouta.csc.fi/OPUS-ELRC-2410-Portal_oficial_turis/v1/moses/en-es.txt.zip → /Users/ningmac/.mtdata/object.pouta.csc.fi/dce2/b27dfbf0e5c10b969d6aa9fc4962/elrc_2410_portal_oficial_turis.zip
2025-05-15 03:20:48 main.echo_data:92 INFO:: Total rows=198,583
2025-05-15 03:20:49 cache.download:212 INFO:: Dow

In [12]:
# English-Japanese (eng-jpn) mtdata dataset ids, fetched into ../data/eng-jpn.
lang5 = "eng-jpn"
ids_eng_jpn = [
    "Statmt-news_commentary-18.1-eng-jpn",
    "KECL-paracrawl-3-eng-jpn",
    "Statmt-wikititles-3-jpn-eng",
    "Facebook-wikimatrix-1-eng-jpn",
    "Statmt-ted-wmt20-eng-jpn",
    "StanfordNLP-jesc_train-1-eng-jpn",
    "Phontron-kftt_train-1-eng-jpn"]
save_mtdata_to_csv(lang5, ids_eng_jpn)

2025-05-15 03:21:59 cache.download:212 INFO:: Downloading: http://data.statmt.org/news-commentary/v18.1/training/news-commentary-v18.en-ja.tsv.gz → /Users/ningmac/.mtdata/data.statmt.org/ac66/7af63ac503c02ad4838afac71212/news_commentary.tsv.gz
2025-05-15 03:21:59 main.echo_data:92 INFO:: Total rows=1,947
2025-05-15 03:22:01 cache.download:212 INFO:: Downloading: http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/3.0/bitext/en-ja.tar.gz → /Users/ningmac/.mtdata/www.kecl.ntt.co.jp/6dd5/d4e5231e790b2bafe2da0d29f721/paracrawl.tar.gz
2025-05-15 05:03:55 utils.extract:215 INFO:: extracting /Users/ningmac/.mtdata/www.kecl.ntt.co.jp/6dd5/d4e5231e790b2bafe2da0d29f721/paracrawl.tar.gz
2025-05-15 05:07:02 main.echo_data:92 INFO:: Total rows=25,740,835
2025-05-15 05:07:04 cache.download:212 INFO:: Downloading: http://data.statmt.org/wikititles/v3/wikititles-v3.ja-en.tsv → /Users/ningmac/.mtdata/data.statmt.org/8dfe/019165dec34dc96df2ccf41719c4/wikititles.tsv
2025-05-15 05:07:10 main.echo_data:9

In [13]:
# English-Russian (eng-rus) mtdata dataset ids, fetched into ../data/eng-rus.
lang6 = "eng-rus"
ids_eng_rus = [
    "ParaCrawl-paracrawl-1_bonus-eng-rus",
    "Statmt-commoncrawl_wmt13-1-rus-eng",
    "Statmt-news_commentary-18.1-eng-rus",
    "Statmt-yandex-wmt22-eng-rus",
    "Statmt-wikititles-3-rus-eng",
    "OPUS-unpc-v1.0-eng-rus",
    "Facebook-wikimatrix-1-eng-rus",
    "Tilde-airbaltic-1-eng-rus",
    "Tilde-czechtourism-1-eng-rus",
    "Tilde-worldbank-1-eng-rus",
    "Statmt-backtrans_ruen-wmt20-rus-eng"]
save_mtdata_to_csv(lang6, ids_eng_rus)

2025-05-15 05:09:28 cache.download:212 INFO:: Downloading: https://s3.amazonaws.com/web-language-models/paracrawl/bonus/en-ru.txt.gz → /Users/ningmac/.mtdata/s3.amazonaws.com/da1b/956a6fd52a9d52be6b72ce521151/paracrawl.tsv.gz
2025-05-15 05:11:04 main.echo_data:92 INFO:: Total rows=5,377,911
2025-05-15 05:11:10 main.echo_data:92 INFO:: Total rows=878,386
2025-05-15 05:11:12 cache.download:212 INFO:: Downloading: http://data.statmt.org/news-commentary/v18.1/training/news-commentary-v18.en-ru.tsv.gz → /Users/ningmac/.mtdata/data.statmt.org/b499/f85656e4cf924f9cb7c447e54530/news_commentary.tsv.gz
2025-05-15 05:11:19 main.echo_data:92 INFO:: Total rows=390,384
2025-05-15 05:11:21 cache.download:212 INFO:: Downloading: https://github.com/mashashma/WMT2022-data/archive/refs/heads/main.zip → /Users/ningmac/.mtdata/github.com/706f/86c1b4601262c7f0a8f5948cfc38/yandex.zip
2025-05-15 05:11:58 main.echo_data:92 INFO:: Total rows=1,000,000
2025-05-15 05:12:00 cache.download:212 INFO:: Downloading: h

In [14]:
# Czech-Ukrainian (ces-ukr) mtdata dataset ids, fetched into ../data/ces-ukr.
lang7 = "ces-ukr"
ids_ces_ukr = [
    "Facebook-wikimatrix-1-ces-ukr",
    "ELRC-acts_ukrainian-1-ces-ukr",
    "OPUS-ccmatrix-v1-ces-ukr",
    "OPUS-elrc_5179_acts_ukrainian-v1-ces-ukr",
    "OPUS-elrc_wikipedia_health-v1-ces-ukr",
    "OPUS-eubookshop-v2-ces-ukr",
    "OPUS-gnome-v1-ces-ukr",
    "OPUS-kde4-v2-ces-ukr",
    "OPUS-multiccaligned-v1.1-ces-ukr",
    "OPUS-multiparacrawl-v9b-ces-ukr",
    "OPUS-opensubtitles-v2018-ces-ukr",
    "OPUS-qed-v2.0a-ces-ukr",
    "OPUS-ted2020-v1-ces-ukr",
    "OPUS-tatoeba-v20220303-ces-ukr",
    "OPUS-ubuntu-v14.10-ces-ukr",
    "OPUS-xlent-v1.1-ces-ukr",
    "OPUS-bible_uedin-v1-ces-ukr",
    "OPUS-wikimedia-v20210402-ces-ukr"]
save_mtdata_to_csv(lang7, ids_ces_ukr)

2025-05-15 05:29:32 cache.download:212 INFO:: Downloading: https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.cs-uk.tsv.gz → /Users/ningmac/.mtdata/dl.fbaipublicfiles.com/0f4b/984d38e986b8503f541dca71bfb4/wikimatrix.tsv.gz
2025-05-15 05:29:45 main.echo_data:92 INFO:: Total rows=848,961
2025-05-15 05:29:47 cache.download:212 INFO:: Downloading: https://elrc-share.eu/repository/download/71205868ae7011ec9c1a00155d026706d86232eb1bba43b691bdb6e8a8ec3ccf/ → /Users/ningmac/.mtdata/elrc-share.eu/e303/b4de967c1825731af26ef6c2b148/ELRC_5179.zip
2025-05-15 05:30:33 tmx.read_tmx:87 INFO:: Extracted 130003 pairs from TMX ZipPath(root=PosixPath('/Users/ningmac/.mtdata/elrc-share.eu/e303/b4de967c1825731af26ef6c2b148/ELRC_5179.zip'), name='tmx/cs-uk.tmx')
2025-05-15 05:30:33 main.echo_data:92 INFO:: Total rows=130,003
2025-05-15 05:30:34 cache.download:212 INFO:: Downloading: https://object.pouta.csc.fi/OPUS-CCMatrix/v1/moses/cs-uk.txt.zip → /Users/ningmac/.mtdata/object.pouta.csc.fi/a19a/7

In [15]:
# eng-ukr: language-pair tag plus the dataset ids handed to
# save_mtdata_to_csv; list order is unchanged.
lang8 = "eng-ukr"
ids_eng_ukr = [
    "ParaCrawl-paracrawl-1_bonus-eng-ukr",
    "Tilde-worldbank-1-eng-ukr",
    "Facebook-wikimatrix-1-eng-ukr",
    "ELRC-acts_ukrainian-1-eng-ukr",
    # This id carries a region suffix on the target side (ukr_UA).
    "Statmt-ccaligned-1-eng-ukr_UA",
]
save_mtdata_to_csv(lang8, ids_eng_ukr)

2025-05-15 05:33:17 cache.download:212 INFO:: Downloading: https://s3.amazonaws.com/web-language-models/paracrawl/bonus/en-uk-v1.txt.gz → /Users/ningmac/.mtdata/s3.amazonaws.com/82b5/cc6b449a4251393761fe5ffabf82/paracrawl.tsv.gz
2025-05-15 05:39:41 main.echo_data:92 INFO:: Total rows=13,354,365
2025-05-15 05:39:43 cache.download:212 INFO:: Downloading: https://tilde-model.s3-eu-west-1.amazonaws.com/worldbank.en-uk.tmx.zip → /Users/ningmac/.mtdata/tilde-model.s3-eu-west-1.amazonaws.com/950c/a2c2ed07bd1e45179ae8902ce693/worldbank.zip
2025-05-15 05:39:43 tmx.read_tmx:87 INFO:: Extracted 1628 pairs from TMX ZipPath(root=PosixPath('/Users/ningmac/.mtdata/tilde-model.s3-eu-west-1.amazonaws.com/950c/a2c2ed07bd1e45179ae8902ce693/worldbank.zip'), name='worldbank.UNIQUE.en-uk.tmx')
2025-05-15 05:39:43 main.echo_data:92 INFO:: Total rows=1,628
2025-05-15 05:39:45 cache.download:212 INFO:: Downloading: https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.en-uk.tsv.gz → /Users/ningmac/.mtd

In [16]:
# ces-ukr again, this time tagged through lang9. The id list below rebinds
# ids_ces_ukr; entries are grouped by provider prefix and the order is
# unchanged.
lang9 = "ces-ukr"
ids_ces_ukr = [
    # Facebook / ELRC
    "Facebook-wikimatrix-1-ces-ukr",
    "ELRC-acts_ukrainian-1-ces-ukr",
    # OPUS
    "OPUS-ccmatrix-v1-ces-ukr",
    "OPUS-elrc_5179_acts_ukrainian-v1-ces-ukr",
    "OPUS-elrc_wikipedia_health-v1-ces-ukr",
    "OPUS-eubookshop-v2-ces-ukr",
    "OPUS-gnome-v1-ces-ukr",
    "OPUS-kde4-v2-ces-ukr",
    "OPUS-multiccaligned-v1.1-ces-ukr",
    "OPUS-multiparacrawl-v9b-ces-ukr",
    "OPUS-opensubtitles-v2018-ces-ukr",
    "OPUS-qed-v2.0a-ces-ukr",
    "OPUS-ted2020-v1-ces-ukr",
    "OPUS-tatoeba-v20220303-ces-ukr",
    "OPUS-ubuntu-v14.10-ces-ukr",
    "OPUS-xlent-v1.1-ces-ukr",
    "OPUS-bible_uedin-v1-ces-ukr",
    "OPUS-wikimedia-v20210402-ces-ukr",
]
save_mtdata_to_csv(lang9, ids_ces_ukr)

2025-05-15 05:42:28 main.echo_data:92 INFO:: Total rows=848,961
2025-05-15 05:42:32 tmx.read_tmx:87 INFO:: Extracted 130003 pairs from TMX ZipPath(root=PosixPath('/Users/ningmac/.mtdata/elrc-share.eu/e303/b4de967c1825731af26ef6c2b148/ELRC_5179.zip'), name='tmx/cs-uk.tmx')
2025-05-15 05:42:32 main.echo_data:92 INFO:: Total rows=130,003
2025-05-15 05:42:56 main.echo_data:92 INFO:: Total rows=3,991,954
2025-05-15 05:42:58 main.echo_data:92 INFO:: Total rows=130,004
2025-05-15 05:43:00 main.echo_data:92 INFO:: Total rows=192
2025-05-15 05:43:02 main.echo_data:92 INFO:: Total rows=1,506
2025-05-15 05:43:03 main.echo_data:92 INFO:: Total rows=150
2025-05-15 05:43:06 main.echo_data:92 INFO:: Total rows=133,673
2025-05-15 05:43:17 main.echo_data:92 INFO:: Total rows=1,606,502
2025-05-15 05:43:31 main.echo_data:92 INFO:: Total rows=2,200,277
2025-05-15 05:43:35 main.echo_data:92 INFO:: Total rows=730,804
2025-05-15 05:43:38 main.echo_data:92 INFO:: Total rows=161,020
2025-05-15 05:43:40 main.ec

In [17]:
# jpn-zho: language-pair tag plus the dataset ids handed to
# save_mtdata_to_csv. Entries are grouped by the provider prefix of the id;
# list order is unchanged.
lang10 = "jpn-zho"
ids_jpn_zho = [
    # Non-OPUS providers (the KECL ids spell the pair as zho-jpn)
    "Statmt-news_commentary-18.1-jpn-zho",
    "KECL-paracrawl-2-zho-jpn",
    "KECL-paracrawl-2wmt24-zho-jpn",
    "Facebook-wikimatrix-1-jpn-zho",
    "Neulab-tedtalks_train-1-jpn-zho",
    "LinguaTools-wikititles-2014-jpn-zho",
    # OPUS (some ids use the zho_CN variant; ubuntu appears with both forms)
    "OPUS-ccmatrix-v1-jpn-zho",
    "OPUS-gnome-v1-jpn-zho_CN",
    "OPUS-kde4-v2-jpn-zho_CN",
    "OPUS-multiccaligned-v1-jpn-zho_CN",
    "OPUS-openoffice-v3-jpn-zho_CN",
    "OPUS-opensubtitles-v2018-jpn-zho_CN",
    "OPUS-php-v1-jpn-zho",
    "OPUS-qed-v2.0a-jpn-zho",
    "OPUS-ted2020-v1-jpn-zho",
    "OPUS-tanzil-v1-jpn-zho",
    "OPUS-ubuntu-v14.10-jpn-zho",
    "OPUS-ubuntu-v14.10-jpn-zho_CN",
    "OPUS-xlent-v1.1-jpn-zho",
    "OPUS-bible_uedin-v1-jpn-zho",
    "OPUS-wikimedia-v20210402-jpn-zho",
]
save_mtdata_to_csv(lang10, ids_jpn_zho)

2025-05-15 05:43:53 cache.download:212 INFO:: Downloading: http://data.statmt.org/news-commentary/v18.1/training/news-commentary-v18.ja-zh.tsv.gz → /Users/ningmac/.mtdata/data.statmt.org/b9a6/a062e7cab34fa811dcbb34dea550/news_commentary.tsv.gz
2025-05-15 05:43:53 main.echo_data:92 INFO:: Total rows=1,677
2025-05-15 05:43:55 cache.download:212 INFO:: Downloading: http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/zh/2.0/bitext/zh-ja.tar.gz → /Users/ningmac/.mtdata/www.kecl.ntt.co.jp/ee19/9ebb6f81a2b5eb1a7f08b867634b/jparacrawl-2.0-zh-ja.tar.gz
2025-05-15 06:02:44 utils.extract:215 INFO:: extracting /Users/ningmac/.mtdata/www.kecl.ntt.co.jp/ee19/9ebb6f81a2b5eb1a7f08b867634b/jparacrawl-2.0-zh-ja.tar.gz
2025-05-15 06:02:54 main.echo_data:92 INFO:: Total rows=83,892
2025-05-15 06:03:20 main.echo_data:92 INFO:: Total rows=4,602,328
2025-05-15 06:03:21 cache.download:212 INFO:: Downloading: https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.ja-zh.tsv.gz → /Users/ningmac/.mtdata/

In [18]:
# eng-isl: language-pair tag plus the dataset ids handed to
# save_mtdata_to_csv. Entries are grouped by the provider prefix of the id;
# list order is unchanged.
lang11 = "eng-isl"
ids_eng_isl = [
    # Statmt (the wikititles id spells the pair as isl-eng)
    "Statmt-wikititles-3-isl-eng",
    "Statmt-ccaligned-1-eng-isl_IS",
    # ParaCrawl / Tilde / Facebook
    "ParaCrawl-paracrawl-9-eng-isl",
    "Tilde-eesc-2017-eng-isl",
    "Tilde-ema-2016-eng-isl",
    "Tilde-rapid-2016-eng-isl",
    "Facebook-wikimatrix-1-eng-isl",
    # ParIce
    "ParIce-eea_train-20.05-eng-isl",
    "ParIce-ema_train-20.05-eng-isl",
    # EU
    "EU-ecdc-1-eng-isl",
    "EU-eac_forms-1-eng-isl",
    "EU-eac_reference-1-eng-isl",
    # OPUS: ccmatrix and the ELRC-derived sets
    "OPUS-ccmatrix-v1-eng-isl",
    "OPUS-elrc_2718_emea-v1-eng-isl",
    "OPUS-elrc_3206_antibiotic-v1-eng-isl",
    "OPUS-elrc_4295_www.malfong.is-v1-eng-isl",
    "OPUS-elrc_4324_government_offices_i-v1-eng-isl",
    "OPUS-elrc_4327_government_offices_i-v1-eng-isl",
    "OPUS-elrc_4334_rkiskaup_2020-v1-eng-isl",
    "OPUS-elrc_4338_university_iceland-v1-eng-isl",
    "OPUS-elrc_502_icelandic_financial_-v1-eng-isl",
    "OPUS-elrc_504_www.iceida.is-v1-eng-isl",
    "OPUS-elrc_505_www.pfs.is-v1-eng-isl",
    "OPUS-elrc_506_www.lanamal.is-v1-eng-isl",
    "OPUS-elrc_5067_scipar-v1-eng-isl",
    "OPUS-elrc_508_tilde_statistics_ice-v1-eng-isl",
    "OPUS-elrc_509_gallery_iceland-v1-eng-isl",
    "OPUS-elrc_510_harpa_reykjavik_conc-v1-eng-isl",
    "OPUS-elrc_511_bokmenntaborgin_is-v1-eng-isl",
    "OPUS-elrc_516_icelandic_medicines-v1-eng-isl",
    "OPUS-elrc_517_icelandic_directorat-v1-eng-isl",
    "OPUS-elrc_597_www.nordisketax.net-v1-eng-isl",
    "OPUS-elrc_718_statistics_iceland-v1-eng-isl",
    "OPUS-elrc_728_www.norden.org-v1-eng-isl",
    "OPUS-elrc_emea-v1-eng-isl",
    "OPUS-elrc_antibiotic-v1-eng-isl",
    "OPUS-elrc_www.norden.org-v1-eng-isl",
    "OPUS-elrc_www.nordisketax.net-v1-eng-isl",
    # OPUS: remaining collections
    "OPUS-eubookshop-v2-eng-isl",
    "OPUS-multiccaligned-v1-eng-isl",
    "OPUS-multiparacrawl-v7.1-eng-isl",
    "OPUS-opensubtitles-v2018-eng-isl",
    "OPUS-ted2020-v1-eng-isl",
    "OPUS-tatoeba-v20220303-eng-isl",
    "OPUS-ubuntu-v14.10-eng-isl",
    "OPUS-wikimatrix-v1-eng-isl",
    "OPUS-wikititles-v3-eng-isl",
    "OPUS-xlent-v1.1-eng-isl",
    "OPUS-wikimedia-v20210402-eng-isl",
]
save_mtdata_to_csv(lang11, ids_eng_isl)

2025-05-15 06:07:55 cache.download:212 INFO:: Downloading: http://data.statmt.org/wikititles/v3/wikititles-v3.is-en.tsv → /Users/ningmac/.mtdata/data.statmt.org/c7ae/6c7bad829f93a72ca71556f99b8a/wikititles.tsv
2025-05-15 06:07:55 main.echo_data:92 INFO:: Total rows=50,182
2025-05-15 06:07:57 cache.download:212 INFO:: Downloading: http://data.statmt.org/cc-aligned/sentence-aligned/en_XX-is_IS.tsv.xz → /Users/ningmac/.mtdata/data.statmt.org/0170/ce5fb98f8f02d7d482460855dc51/ccaligned.tsv.xz
2025-05-15 06:08:11 main.echo_data:92 INFO:: Total rows=1,192,536
2025-05-15 06:08:13 cache.download:212 INFO:: Downloading: https://s3.amazonaws.com/web-language-models/paracrawl/release9/en-is/en-is.txt.gz → /Users/ningmac/.mtdata/s3.amazonaws.com/ceb0/49e90004a0200acacff85b78e4d6/paracrawl.tsv.gz
2025-05-15 06:08:49 main.echo_data:92 INFO:: Total rows=2,967,519
2025-05-15 06:08:51 cache.download:212 INFO:: Downloading: https://tilde-model.s3-eu-west-1.amazonaws.com/EESC2017.en-is.tmx.zip → /Users/n

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



2025-05-15 06:09:43 cache.download:212 INFO:: Downloading: https://object.pouta.csc.fi/OPUS-CCMatrix/v1/moses/en-is.txt.zip → /Users/ningmac/.mtdata/object.pouta.csc.fi/9573/0fd22750f8fb14359b312cdecb76/ccmatrix.zip
2025-05-15 06:11:03 main.echo_data:92 INFO:: Total rows=8,723,145
2025-05-15 06:11:05 cache.download:212 INFO:: Downloading: https://object.pouta.csc.fi/OPUS-ELRC-2718-EMEA/v1/moses/en-is.txt.zip → /Users/ningmac/.mtdata/object.pouta.csc.fi/9a78/47e2ccd0975154b8d9116b7e8803/elrc_2718_emea.zip
2025-05-15 06:11:05 cache.get_local_path:149 ERROR:: Error downloading OPUS-elrc_2718_emea-v1-eng-isl
URL: https://object.pouta.csc.fi/OPUS-ELRC-2718-EMEA/v1/moses/en-is.txt.zip
Path:/Users/ningmac/.mtdata/object.pouta.csc.fi/9a78/47e2ccd0975154b8d9116b7e8803/elrc_2718_emea.zip
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^

In [20]:
# eng-hin: language-pair tag plus the dataset ids handed to
# save_mtdata_to_csv. Entries are grouped by the provider prefix of the id;
# list order is unchanged.
lang12 = "eng-hin"
ids_eng_hin = [
    # Statmt
    "Statmt-news_commentary-18.1-eng-hin",
    "Statmt-pmindia-1-eng-hin",
    "Statmt-ccaligned-1-eng-hin_IN",
    # Assorted single-provider sets (JoshuaDec and IITB ids spell the pair
    # as hin-eng)
    "JoshuaDec-indian_training-1-hin-eng",
    "Facebook-wikimatrix-1-eng-hin",
    "IITB-hien_train-1.5-hin-eng",
    "Neulab-tedtalks_train-1-eng-hin",
    "ELRC-wikipedia_health-1-eng-hin",
    "AI4Bharath-samananthar-0.2-eng-hin",
    # Anuvaad
    "Anuvaad-internal_judicial_2021-v1-eng-hin",
    "Anuvaad-legal_terms_2021-v1-eng-hin",
    "Anuvaad-pib_2017-2020-eng-hin",
    "Anuvaad-pibarchives_2009-2016-eng-hin",
    "Anuvaad-wikipedia-20210201-eng-hin",
    "Anuvaad-drivespark-20210303-eng-hin",
    "Anuvaad-nativeplanet-20210315-eng-hin",
    "Anuvaad-catchnews-20210320-eng-hin",
    "Anuvaad-dwnews_2008-2020-eng-hin",
    "Anuvaad-oneindia-20210320-eng-hin",
    "Anuvaad-mk-20210320-eng-hin",
    "Anuvaad-goodreturns-20210320-eng-hin",
    "Anuvaad-ie_sports-20210320-eng-hin",
    "Anuvaad-ie_tech-20210320-eng-hin",
    "Anuvaad-ie_news-20210320-eng-hin",
    "Anuvaad-ie_lifestyle-20210320-eng-hin",
    "Anuvaad-ie_general-20210320-eng-hin",
    "Anuvaad-ie_entertainment-20210320-eng-hin",
    "Anuvaad-ie_education-20210320-eng-hin",
    "Anuvaad-ie_business-20210320-eng-hin",
    "Anuvaad-fin_express-20210320-eng-hin",
    "Anuvaad-thewire-20210320-eng-hin",
    "Anuvaad-tribune-20210320-eng-hin",
    "Anuvaad-zeebiz-20210320-eng-hin",
    "Anuvaad-pa_govt-20210320-eng-hin",
    "Anuvaad-betterindia-20210320-eng-hin",
    "Anuvaad-jagran_news-20210320-eng-hin",
    "Anuvaad-jagran_tech-20210320-eng-hin",
    "Anuvaad-jagran_education-20210320-eng-hin",
    "Anuvaad-jagran_entertainment-20210320-eng-hin",
    "Anuvaad-jagran_business-20210320-eng-hin",
    "Anuvaad-jagran_sports-20210320-eng-hin",
    "Anuvaad-jagran_lifestyle-20210320-eng-hin",
    "Anuvaad-asianetnews-20210320-eng-hin",
    "Anuvaad-business_standard-20210320-eng-hin",
    "Anuvaad-pranabmukherjee-20210320-eng-hin",
    "Anuvaad-lokmat_entertainment-20210501-eng-hin",
    "Anuvaad-lokmat_news-20210501-eng-hin",
    "Anuvaad-lokmat_lifestyle-20210501-eng-hin",
    "Anuvaad-lokmat_sports-20210501-eng-hin",
    "Anuvaad-lokmat_tech-20210501-eng-hin",
    "Anuvaad-lokmat_financial-20210501-eng-hin",
    "Anuvaad-lokmat_healthcare-20210501-eng-hin",
    # AllenAi
    "AllenAi-nllb-1-eng-hin",
    # OPUS
    "OPUS-elrc_wikipedia_health-v1-eng-hin",
    "OPUS-elrc_2922-v1-eng-hin",
    "OPUS-globalvoices-v2018q4-eng-hin",
    "OPUS-iitb-v2.0-eng-hin",
    "OPUS-multiccaligned-v1-eng-hin",
    "OPUS-opensubtitles-v2018-eng-hin",
    "OPUS-ted2020-v1-eng-hin",
    "OPUS-tanzil-v1-eng-hin",
    "OPUS-tatoeba-v20220303-eng-hin",
    "OPUS-ubuntu-v14.10-eng-hin",
    "OPUS-xlent-v1.1-eng-hin",
    "OPUS-tico_19-v20201028-eng-hin",
    "OPUS-wikimedia-v20210402-eng-hin",
]
save_mtdata_to_csv(lang12, ids_eng_hin)

2025-05-15 13:32:40 cache.download:212 INFO:: Downloading: http://data.statmt.org/news-commentary/v18.1/training/news-commentary-v18.en-hi.tsv.gz → /Users/ningmac/.mtdata/data.statmt.org/5bce/710fff734f71f5a58a20c8b57867/news_commentary.tsv.gz
2025-05-15 13:32:40 main.echo_data:92 INFO:: Total rows=5,773
2025-05-15 13:32:42 cache.download:212 INFO:: Downloading: http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.hi-en.tsv → /Users/ningmac/.mtdata/data.statmt.org/7653/8db1e9b9d6844705d43cdff22597/pmindia.tsv
2025-05-15 13:32:44 main.echo_data:92 INFO:: Total rows=56,831
2025-05-15 13:32:46 cache.download:212 INFO:: Downloading: http://data.statmt.org/cc-aligned/sentence-aligned/en_XX-hi_IN.tsv.xz → /Users/ningmac/.mtdata/data.statmt.org/3c1d/ee652c19c894c6bca19e2d86d89d/ccaligned.tsv.xz
2025-05-15 13:34:36 main.echo_data:92 INFO:: Total rows=8,181,584
2025-05-15 13:34:38 cache.download:212 INFO:: Downloading: https://github.com/joshua-decoder/indian-parallel-corpora/archive/a2cd1a99.