In [1]:
import numpy as np
import pandas as pd
import datasets
from seacrowd import SEACrowdConfigHelper, Tasks

from itertools import chain
import json
from collections import Counter

In [2]:
config2lang = {
	'aya_collection_ceb_seacrowd_t2t': 'ceb',
	'aya_collection_eng_seacrowd_t2t': 'eng',
	'aya_collection_ind_seacrowd_t2t': 'ind',
	'aya_collection_jav_seacrowd_t2t': 'jav',
	'aya_collection_khm_seacrowd_t2t': 'khm',
	'aya_collection_lao_seacrowd_t2t': 'lao',
	'aya_collection_mya_seacrowd_t2t': 'mya',
	'aya_collection_sun_seacrowd_t2t': 'sun',
	'aya_collection_tha_seacrowd_t2t': 'tha',
	'aya_collection_vie_seacrowd_t2t': 'vie',
	'aya_collection_zsm_seacrowd_t2t': 'zsm',
	'aya_dataset_ceb_seacrowd_t2t': 'ceb',
	'aya_dataset_ind_seacrowd_t2t': 'ind',
	'aya_dataset_jav_seacrowd_t2t': 'jav',
	'aya_dataset_mya_seacrowd_t2t': 'mya',
	'aya_dataset_sun_seacrowd_t2t': 'sun',
	'aya_dataset_tha_seacrowd_t2t': 'tha',
	'aya_dataset_vie_seacrowd_t2t': 'vie',
	'aya_dataset_zsm_seacrowd_t2t': 'zsm',
	'cc3m_mt_ind_seacrowd_imtext': 'ind',
	'cc3m_mt_tgl_seacrowd_imtext': 'tgl',
	'cc3m_mt_tha_seacrowd_imtext': 'tha',
	'cc3m_mt_vie_seacrowd_imtext': 'vie',
	'tlunified_ner_seacrowd_seq_label': 'tgl',
    'filipino_slang_norm_seacrowd_t2t': 'tgl',
	'mkqa_khm_seacrowd_qa': 'khm',
	'mkqa_tha_seacrowd_qa': 'tha',
	'mkqa_vie_seacrowd_qa': 'vie',
	'mkqa_zsm_seacrowd_qa': 'zsm',
	'mozilla_pontoon_eng_ceb_seacrowd_t2t': 'ceb',
	'mozilla_pontoon_eng_ind_seacrowd_t2t': 'ind',
	'mozilla_pontoon_eng_jav_seacrowd_t2t': 'jav',
	'mozilla_pontoon_eng_khm_seacrowd_t2t': 'khm',
	'mozilla_pontoon_eng_lao_seacrowd_t2t': 'lao',
	'mozilla_pontoon_eng_mya_seacrowd_t2t': 'mya',
	'mozilla_pontoon_eng_tgl_seacrowd_t2t': 'tgl',
	'mozilla_pontoon_eng_tha_seacrowd_t2t': 'tha',
	'mozilla_pontoon_eng_vie_seacrowd_t2t': 'vie',
    'mozilla_pontoon_eng_zlm_seacrowd_t2t': 'zsm',
	'ntrex_128_ind_eng-US_seacrowd_t2t': 'ind',
	'ntrex_128_khm_eng-US_seacrowd_t2t': 'khm',
	'ntrex_128_lao_eng-US_seacrowd_t2t': 'lao',
	'ntrex_128_mya_eng-US_seacrowd_t2t': 'mya',
	'ntrex_128_tha_eng-US_seacrowd_t2t': 'tha',
	'ntrex_128_vie_eng-US_seacrowd_t2t': 'vie',
    'ntrex_128_zlm_eng-US_seacrowd_t2t': 'vie',
	'nusax_mt_jav_ind_seacrowd_t2t': 'jav',
	'nusax_mt_sun_ind_seacrowd_t2t': 'sun',
	'paracotta_id_seacrowd_t2t': 'ind',
	'tatoeba_ind_eng_seacrowd_t2t': 'ind',
	'tatoeba_jav_eng_seacrowd_t2t': 'jav',
	'tatoeba_tgl_eng_seacrowd_t2t': 'tgl',
	'tatoeba_tha_eng_seacrowd_t2t': 'tha',
	'tatoeba_vie_eng_seacrowd_t2t': 'vie',
    'tatoeba_eng_vie_seacrowd_t2t': 'eng',
	'xcopa_ind_seacrowd_qa': 'ind',
	'xcopa_tha_seacrowd_qa': 'tha',
	'xcopa_vie_seacrowd_qa': 'vie',
	'yunshan_cup_2020_seacrowd_seq_label': 'lao',
	'ara_close_bcl_seacrowd_text' : 'bcl',
	'ara_close_ceb_seacrowd_text' : 'ceb',
	'aya_evaluation_suite_ceb_seacrowd_t2t' : 'ceb',
	'aya_evaluation_suite_eng_seacrowd_t2t' : 'eng',
	'aya_evaluation_suite_ind_seacrowd_t2t' : 'ind',
	'aya_evaluation_suite_jav_seacrowd_t2t' : 'jav',
	'aya_evaluation_suite_khm_seacrowd_t2t' : 'khm',
	'aya_evaluation_suite_lao_seacrowd_t2t' : 'lao',
	'aya_evaluation_suite_mya_seacrowd_t2t' : 'mya',
	'aya_evaluation_suite_sun_seacrowd_t2t' : 'sun',
	'aya_evaluation_suite_tha_seacrowd_t2t' : 'tha',
	'aya_evaluation_suite_vie_seacrowd_t2t' : 'vie',
	'aya_evaluation_suite_zsm_seacrowd_t2t' : 'zsm',
	'bactrian_x_id_seacrowd_t2t' : 'ind',
	'bactrian_x_km_seacrowd_t2t' : 'khm',
	'bactrian_x_my_seacrowd_t2t' : 'mya',
	'bactrian_x_th_seacrowd_t2t' : 'tha',
	'bactrian_x_tl_seacrowd_t2t' : 'tgl',
	'bactrian_x_vi_seacrowd_t2t' : 'vie',
	'belebele_ceb_latn_seacrowd_qa' : 'ceb',
	'belebele_ind_latn_seacrowd_qa' : 'ind',
	'belebele_jav_latn_seacrowd_qa' : 'jav',
	'belebele_khm_khmr_seacrowd_qa' : 'khm',
	'belebele_lao_laoo_seacrowd_qa' : 'lao',
	'belebele_mya_mymr_seacrowd_qa' : 'mya',
	'belebele_sun_latn_seacrowd_qa' : 'sun',
	'belebele_tgl_latn_seacrowd_qa' : 'tgl',
	'belebele_tha_thai_seacrowd_qa' : 'tha',
	'belebele_vie_latn_seacrowd_qa' : 'vie',
	'belebele_zsm_latn_seacrowd_qa' : 'zsm',
	'cosem_seacrowd_ssp' : 'eng',
	'emotes_3k_tgl_seacrowd_t2t' : 'tgl',
	'emotes_3k_eng_seacrowd_t2t' : 'eng',
	'khmer_alt_pos_seacrowd_seq_label' : 'khm',
	'korpus_nusantara_jav_ind_seacrowd_t2t' : 'jav',
	'korpus_nusantara_sun_ind_seacrowd_t2t' : 'sun',
	'nusaparagraph_rhetoric_jav_seacrowd_text' : 'jav',
	'nusaparagraph_rhetoric_sun_seacrowd_text' : 'sun',
	'nusatranslation_mt_jav_ind_seacrowd_t2t' : 'jav',
	'nusatranslation_mt_sun_ind_seacrowd_t2t' : 'sun',
	'parallel_asian_treebank_ind_eng_seacrowd_t2t' : 'ind',
	'parallel_asian_treebank_khm_eng_seacrowd_t2t' : 'khm',
	'parallel_asian_treebank_lao_eng_seacrowd_t2t' : 'lao',
	'parallel_asian_treebank_mya_eng_seacrowd_t2t' : 'mya',
	'parallel_asian_treebank_tha_eng_seacrowd_t2t' : 'tha',
	'parallel_asian_treebank_vie_eng_seacrowd_t2t' : 'vie',
    'parallel_asian_treebank_zlm_eng_seacrowd_t2t' : 'zsm',
	'sea_bench_ind_seacrowd_t2t' : 'ind',
	'sea_bench_khm_seacrowd_t2t' : 'khm',
	'sea_bench_lao_seacrowd_t2t' : 'lao',
	'sea_bench_mya_seacrowd_t2t' : 'mya',
	'sea_bench_tgl_seacrowd_t2t' : 'tgl',
	'sea_bench_tha_seacrowd_t2t' : 'tha',
	'sea_bench_vie_seacrowd_t2t' : 'vie',
    'sea_bench_zlm_seacrowd_t2t' : 'zsm',
	'seaeval_cross_logiqa_ind_seacrowd_qa' : 'ind',
	'seaeval_cross_logiqa_vie_seacrowd_qa' : 'vie',
    'seaeval_cross_logiqa_zlm_seacrowd_qa' : 'zsm',
    'seaeval_cross_logiqa_tgl_seacrowd_qa' : 'tgl',
	'seaeval_cross_mmlu_ind_seacrowd_qa' : 'ind',
	'seaeval_cross_mmlu_vie_seacrowd_qa' : 'vie',
    'seaeval_cross_mmlu_zlm_seacrowd_qa' : 'zsm',
    'seaeval_cross_mmlu_tgl_seacrowd_qa' : 'tgl',
    'tatoeba_eng_vie_seacrowd_t2t': 'eng',
    'tlunified_ner_seacrowd_seq_label': 'tgl',
    'filipino_slang_norm_seacrowd_t2t': 'tgl',
    # 'multilingual_alpaca_seacrowd_t2t': 'vie',
    'cebuaner_seacrowd_seq_label': 'ceb',
    'copal_seacrowd_qa': 'ind',
    'copal_colloquial_seacrowd_qa': 'ind',
    'mabl_jav_seacrowd_qa': 'jav',
    'mabl_sun_seacrowd_qa': 'sun',
    'my_paraphrase_all_seacrowd_t2t': 'mya',
    'gklmip_newsclass_seacrowd_text': 'khm',
    'gklmip_sentiment_seacrowd_text': 'mya',
    'vistec_tp_th_21_seacrowd_seq_label': 'tha'
}

In [3]:
dset_df = pd.read_csv('dataset_list.csv')

In [4]:
sc_conhelp = SEACrowdConfigHelper()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 340/340 [00:39<00:00,  8.55it/s]


# SEACrowd Dataset

In [5]:
def retrieve_text(dset, config_name):
    if 'paracotta_id' in config_name:
        return dset['train']['text_1'] + dset['train']['text_2']

    if 'aya_dataset' in config_name:
        return dset['train']['text_1'] + dset['train']['text_2']

    if 'tlunified_ner' in config_name:
        return list(map(lambda x: ' '.join(x['tokens']), dset['train']))
    
    if 'filipino_slang_norm' in config_name:
        return dset['train']['text_1']

    if 'mkqa' in config_name:
        return dset['train']['question']

    if 'yunshan_cup_2020' in config_name:
        return list(map(lambda x: ' '.join(x['tokens']), dset['train'])) + list(map(lambda x: ' '.join(x['tokens']), dset['validation']))

    if 'mozilla_pontoon' in config_name:
        return dset['train']['text_2']

    if 'ntrex_128' in config_name:
        return dset['test']['text_1']

    if 'nusax_mt' in config_name:
        return dset['train']['text_1']

    if 'tatoeba' in config_name:
        return dset['validation']['text_1']

    if 'xcopa' in config_name:
        return dset['validation']['context'] + dset['validation']['context'] + list(chain.from_iterable(list(dset['validation']['choices'])))

    if 'bactrian_x' in config_name:
        return dset['train']['text_1'] + dset['train']['text_2']

    if 'ara_close' in config_name:
        return dset['train']['text']

    if 'cosem' in config_name:
        return dset['train']['text']

    if 'nusaparagraph_rhetoric' in config_name:
        return dset['test']['text']

    if 'emotes_3k' in config_name and 'seacrowd_t2t' in config_name:
        return dset['test']['text_1'] + dset['test']['text_2']

    if 'sea_bench' in config_name:
        return dset['train']['text_1'] + dset['train']['text_2']

    if 'seaeval_cross_mmlu' in config_name:
        return dset['test']['question'] + list(map(lambda x: x[0], dset['test']['answer']))

    if 'seaeval_cross_logiqa' in config_name:
        return dset['test']['question'] + list(map(lambda x: x[0], dset['test']['answer'])) + list(chain.from_iterable(list(dset['test']['choices'])))

    if 'belebele' in config_name:
        return dset['test']['question'] + dset['test']['context'] + list(chain.from_iterable(list(dset['test']['choices'])))

    if 'khmer_alt_pos' in config_name:
        return list(map(lambda x: ' '.join(x['tokens']), dset['train']))

    if 'korpus_nusantara' in config_name:
        return dset['train']['text_1']

    if 'nusatranslation_mt' in config_name:
        return dset['test']['text_1']

    if 'parallel_asian_treebank' in config_name:
        return dset['test']['text_1']
    
    # if 'multilingual_alpaca_seacrowd_t2t' in config_name:
    #     return dset['train']['text_1'] + dset['train']['text_2']

    raise ValueError(f'Unknown Config Name `{config_name}`')

In [6]:
valid_config_names = {}
for split in ['Train', 'Test']:
    valid_config_names[split] = {}
    for label_type in ['Machine-translated', 'Natural', 'Human-translated']:
        valid_config_names[split][label_type] = []
        sc_dset_names = dset_df.loc[(dset_df['split'] == split) & (dset_df['type'] == label_type), 'dataloader_name'].tolist()
        for con in sc_conhelp.filtered(
            lambda x: (x.dataset_name in sc_dset_names) and ('_source' not in x.config.name)
        ):
            if 'cc3m_35l' in con.dataset_name:
                continue

            lang_list = dset_df.loc[dset_df['dataloader_name'] == con.dataset_name, 'lang'].values[0].split(',')
            if 'zsm' in lang_list:
                lang_list.append('zlm')

            if len(lang_list) > 1:
                if 'bactrian_x' in con.dataset_name:
                    # Bactrian config name use ISO639-2 instead of ISO639-3, so just force it in
                    valid_config_names[split][label_type].append(con.config.name)
                else:
                    for lang in lang_list:
                        if Tasks.MACHINE_TRANSLATION in list(con.tasks):
                            # Skip Redundant MT Config
                            if ('ntrex' in con.config.name and 'eng-US_seacrowd' not in con.config.name) or \
                                ('nusax_mt' in con.config.name and 'ind_seacrowd' not in con.config.name) or \
                                ('parallel_asian_treebank' in con.config.name and 'eng_seacrowd' not in con.config.name) or \
                                ('korpus_nusantara' in con.config.name and 'ind_seacrowd' not in con.config.name) or \
                                ('tatoeba' in con.config.name and 'eng_vie_seacrowd' not in con.config.name and 'eng_seacrowd' not in con.config.name) or \
                                ('nusatranslation_mt_' in con.config.name and 'ind_seacrowd' not in con.config.name):
                                continue
                            if lang in con.config.name:
                                valid_config_names[split][label_type].append(con.config.name)
                        else:
                            if lang in con.config.name:
                                valid_config_names[split][label_type].append(con.config.name)
            else:
                if 'emotes' in con.config.name:
                    if 'seacrowd_text' in con.config.name:
                        continue              
                    if 'tgl' in con.config.name and label_type == 'Natural':
                        valid_config_names[split][label_type].append(con.config.name)
                    elif 'eng' in con.config.name and label_type == 'Human-translated':
                        valid_config_names[split][label_type].append(con.config.name)
                elif 'ara_close' in con.config.name:
                    if lang_list[0] in con.config.name:
                        valid_config_names[split][label_type].append(con.config.name)
                elif Tasks.MACHINE_TRANSLATION in list(con.tasks):
                    # Skip Redundant MT Config
                    if ('ntrex' in con.config.name and 'eng-US_seacrowd' not in con.config.name) or \
                        ('nusax_mt' in con.config.name and 'ind_seacrowd' not in con.config.name) or \
                        ('parallel_asian_treebank' in con.config.name and 'eng_seacrowd' not in con.config.name) or \
                        ('korpus_nusantara' in con.config.name and 'ind_seacrowd' not in con.config.name):
                        continue
                    valid_config_names[split][label_type].append(con.config.name)
                else:
                    valid_config_names[split][label_type].append(con.config.name)

In [70]:
%%time
config_stats = {}
train_data = {'text': [], 'label': [], 'config': []}
test_data = {'text': [], 'label': [], 'config': []}

data = {'Train': train_data, 'Test': test_data}
for split in valid_config_names.keys():
    for label_type in valid_config_names[split].keys():
        for config_name in valid_config_names[split][label_type]:
            texts = retrieve_text(sc_conhelp.for_config_name(config_name).load_dataset(), config_name)
            labels = [label_type for _ in range(len(texts))]
            configs = [config_name for _ in range(len(texts))]
            
            config_stats[config_name] = len(texts)
            data[split]['text'] += texts
            data[split]['label'] += labels
            data[split]['config'] += configs
config_stats

CPU times: user 22.7 s, sys: 1.83 s, total: 24.5 s
Wall time: 29.9 s


{'paracotta_id_seacrowd_t2t': 12000000,
 'aya_dataset_ceb_seacrowd_t2t': 1454,
 'aya_dataset_ind_seacrowd_t2t': 1572,
 'aya_dataset_jav_seacrowd_t2t': 494,
 'aya_dataset_mya_seacrowd_t2t': 944,
 'aya_dataset_sun_seacrowd_t2t': 388,
 'aya_dataset_tha_seacrowd_t2t': 1448,
 'aya_dataset_vie_seacrowd_t2t': 17352,
 'aya_dataset_zsm_seacrowd_t2t': 20146,
 'filipino_slang_norm_seacrowd_t2t': 303,
 'mkqa_khm_seacrowd_qa': 10000,
 'mkqa_zsm_seacrowd_qa': 10000,
 'mkqa_tha_seacrowd_qa': 10000,
 'mkqa_vie_seacrowd_qa': 10000,
 'tlunified_ner_seacrowd_seq_label': 6252,
 'yunshan_cup_2020_seacrowd_seq_label': 8000,
 'mozilla_pontoon_eng_mya_seacrowd_t2t': 13699,
 'mozilla_pontoon_eng_ceb_seacrowd_t2t': 470,
 'mozilla_pontoon_eng_ind_seacrowd_t2t': 43789,
 'mozilla_pontoon_eng_jav_seacrowd_t2t': 685,
 'mozilla_pontoon_eng_khm_seacrowd_t2t': 14481,
 'mozilla_pontoon_eng_lao_seacrowd_t2t': 15302,
 'mozilla_pontoon_eng_zlm_seacrowd_t2t': 29549,
 'mozilla_pontoon_eng_tgl_seacrowd_t2t': 12252,
 'mozilla_

# Custom Data 
- Aya Collection
- Aya Evaluation Suite
- CC3M-35L

In [71]:
aya_col_dset = datasets.load_dataset('CohereForAI/aya_collection', name='translated_dolly')
aya_col_dset = aya_col_dset.filter(lambda x: x['language'] in ['ceb','tha','mya','zsm','jav','ind','vie','sun','khm','lao','eng'])
texts = aya_col_dset['train']['inputs'] + aya_col_dset['train']['targets']
labels = ['Machine-translated' for _ in range(len(texts))]
configs = list(map(lambda lang: 'aya_collection_'+lang+'_seacrowd_t2t', aya_col_dset['train']['language'])) * 2

train_data['text'] += texts
train_data['label'] += labels
train_data['config'] += configs

In [72]:
aya_eval_dset = datasets.load_dataset('CohereForAI/aya_evaluation_suite', name='dolly_machine_translated')
aya_eval_dset = aya_eval_dset.filter(lambda x: x['language'] in ['ceb','tha','mya','zsm','jav','ind','vie','sun','khm','lao','eng'])
texts = aya_eval_dset['test']['inputs'] + aya_eval_dset['test']['targets']
labels = ['Machine-translated' for _ in range(len(texts))]
configs = list(map(lambda lang: 'aya_evaluation_suite_'+lang+'_seacrowd_t2t', aya_eval_dset['test']['language'])) * 2

test_data['text'] += texts
test_data['label'] += labels
test_data['config'] += configs

In [None]:
%%time
texts, labels, configs = [], [], []
lang_map = {'id': 'ind', 'th': 'tha', 'fil': 'tgl', 'vi': 'vie'}
with open('./cc3m_mt_train.jsonl') as f:
    for i, line in enumerate(f):
        row = json.loads(line)
        if row['trg_lang'] in ['fil', 'tgl', 'ind', 'tha', 'vie', 'tl','id','th','vi']:
            texts.append(row['translation_tokenized'])
            labels.append('Machine-translated')
            configs.append('cc3m_mt_' + lang_map[row['trg_lang']] + '_seacrowd_imtext')

train_data['text'] += texts
train_data['label'] += labels
train_data['config'] += configs

CPU times: user 526 ms, sys: 13 ms, total: 539 ms
Wall time: 546 ms


In [74]:
# Sanity Check
print('Train: ', len(train_data['text']), len(train_data['label']), len(train_data['config']))
print('Test: ', len(test_data['text']), len(test_data['label']), len(test_data['config']))

Train:  12661274 12661274 12661274
Test:  2328921 2328921 2328921


In [75]:
import datasets
train_dset = datasets.Dataset.from_dict(train_data)
test_dset = datasets.Dataset.from_dict(test_data)

In [76]:
datasets.DatasetDict({
    'train': train_dset,
    'test': test_dset
}).push_to_hub("SEACrowd/sea_translationese")

Creating parquet from Arrow format: 100%|██████████| 3166/3166 [00:02<00:00, 1360.69ba/s]
Creating parquet from Arrow format: 100%|██████████| 3166/3166 [00:02<00:00, 1271.60ba/s]
Creating parquet from Arrow format: 100%|██████████| 3166/3166 [00:02<00:00, 1387.74ba/s]
Creating parquet from Arrow format: 100%|██████████| 3166/3166 [00:03<00:00, 935.46ba/s]
Uploading the dataset shards: 100%|██████████| 4/4 [00:14<00:00,  3.70s/it]
Creating parquet from Arrow format: 100%|██████████| 1165/1165 [00:02<00:00, 440.02ba/s]
Creating parquet from Arrow format: 100%|██████████| 1165/1165 [00:00<00:00, 1482.24ba/s]
Uploading the dataset shards: 100%|██████████| 2/2 [00:46<00:00, 23.41s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/SEACrowd/sea_translationese/commit/fca20a07e6eac4a3a131ffe63098461ed964d5c5', commit_message='Upload dataset', commit_description='', oid='fca20a07e6eac4a3a131ffe63098461ed964d5c5', pr_url=None, pr_revision=None, pr_num=None)

In [82]:
train_df = train_dset.to_pandas()
test_df = test_dset.to_pandas()

In [83]:
test_df = test_df.loc[~test_df['config'].isin(['korpus_nusantara_ind_jav_seacrowd_t2t','korpus_nusantara_ind_sun_seacrowd_t2t'])]

In [84]:
train_df['lang'] = train_df['config'].apply(lambda x: config2lang[x])
test_df['lang'] = test_df['config'].apply(lambda x: config2lang[x])

In [87]:
train_df.groupby('config').size()

config
aya_collection_ceb_seacrowd_t2t            29616
aya_collection_eng_seacrowd_t2t            29616
aya_collection_ind_seacrowd_t2t            29616
aya_collection_jav_seacrowd_t2t            29616
aya_collection_khm_seacrowd_t2t            29616
aya_collection_lao_seacrowd_t2t            29616
aya_collection_mya_seacrowd_t2t            29616
aya_collection_sun_seacrowd_t2t            29616
aya_collection_tha_seacrowd_t2t            29616
aya_collection_vie_seacrowd_t2t            29616
aya_collection_zsm_seacrowd_t2t            29616
aya_dataset_ceb_seacrowd_t2t                1454
aya_dataset_ind_seacrowd_t2t                1572
aya_dataset_jav_seacrowd_t2t                 494
aya_dataset_mya_seacrowd_t2t                 944
aya_dataset_sun_seacrowd_t2t                 388
aya_dataset_tha_seacrowd_t2t                1448
aya_dataset_vie_seacrowd_t2t               17352
aya_dataset_zsm_seacrowd_t2t               20146
filipino_slang_norm_seacrowd_t2t             303
mkqa_khm_seac

In [88]:
train_df.groupby(['label','lang']).size()

label               lang
Human-translated    ceb          470
                    eng         2000
                    ind        48186
                    jav         1595
                    khm        16478
                    lao        17299
                    mya        15696
                    sun          500
                    tgl        14252
                    tha        41184
                    vie        49936
                    zsm        29549
Machine-translated  ceb        29616
                    eng        29616
                    ind     12029616
                    jav        29616
                    khm        29616
                    lao        29616
                    mya        29616
                    sun        29616
                    tha        29616
                    vie        29616
                    zsm        29616
Natural             ceb         1454
                    ind         1572
                    jav          494
             

In [89]:
train_df.sort_values('lang').groupby(['label'])['lang'].unique().to_dict()

{'Human-translated': array(['ceb', 'eng', 'ind', 'jav', 'khm', 'lao', 'mya', 'sun', 'tgl',
        'tha', 'vie', 'zsm'], dtype=object),
 'Machine-translated': array(['ceb', 'eng', 'ind', 'jav', 'khm', 'lao', 'mya', 'sun', 'tha',
        'vie', 'zsm'], dtype=object),
 'Natural': array(['ceb', 'ind', 'jav', 'khm', 'lao', 'mya', 'sun', 'tgl', 'tha',
        'vie', 'zsm'], dtype=object)}

In [90]:
test_df.sort_values('lang').groupby(['label'])['lang'].unique().to_dict()

{'Human-translated': array(['ceb', 'eng', 'ind', 'jav', 'khm', 'lao', 'mya', 'sun', 'tgl',
        'tha', 'vie', 'zsm'], dtype=object),
 'Machine-translated': array(['ceb', 'eng', 'ind', 'jav', 'khm', 'lao', 'mya', 'sun', 'tgl',
        'tha', 'vie', 'zsm'], dtype=object),
 'Natural': array(['ceb', 'eng', 'ind', 'jav', 'khm', 'lao', 'mya', 'sun', 'tgl',
        'tha', 'vie', 'zsm'], dtype=object)}

In [96]:
train_df.groupby(['label', 'lang', 'config']).size().to_frame('count').reset_index().to_csv('train_translationese.csv', index=False)
test_df.groupby(['label', 'lang', 'config']).size().to_frame('count').reset_index().to_csv('test_translationese.csv', index=False)

# Sampling

In [5]:
trans_dset = datasets.load_dataset("SEACrowd/sea_translationese")

In [6]:
train_df = trans_dset['train'].to_pandas()
test_df = trans_dset['test'].to_pandas()

In [7]:
train_df['lang'] = train_df['config'].apply(lambda x: config2lang[x])
test_df['lang'] = test_df['config'].apply(lambda x: config2lang[x])

### Add additional Datasets

In [8]:
%%time
# CC3M_35L
texts, labels, langs, configs = [], [], [], []
lang_map = {'id': 'ind', 'th': 'tha', 'fil': 'tgl', 'vi': 'vie'}
with open('./cc3m_mt_train.jsonl') as f:
    for i, line in enumerate(f):
        row = json.loads(line)
        if row['trg_lang'] in ['fil', 'tgl', 'ind', 'tha', 'vie', 'tl','id','th','vi']:
            texts.append(row['translation_tokenized'])
            labels.append('Machine-translated')
            langs.append(lang_map[row['trg_lang']])
            configs.append('cc3m_mt_' + lang_map[row['trg_lang']] + '_seacrowd_imtext')
            if len(texts) == 100000:
                break

cc3m_df = pd.DataFrame({
    'text': texts,
    'label': labels,
    'config': configs,
    'lang': langs,
})

CPU times: user 4.4 s, sys: 192 ms, total: 4.59 s
Wall time: 4.86 s


In [9]:
%%time
import xml.etree.ElementTree as ET
def xml_element_to_dict(element: ET.Element):
        """Converts an xml element to a dictionary."""
        element_dict = {}

        # add text with key '$', attributes with '@' prefix
        element_dict["$"] = element.text
        for attrib, value in element.attrib.items():
            element_dict[f"@{attrib}"] = value

        # recursively
        for child in element:
            child_dict = xml_element_to_dict(child)
            element_dict[child.tag] = child_dict

        return element_dict

data_root = ET.parse('smsCorpus_en_2015.03.09_all.xml').getroot()
data_list = [xml_element_to_dict(child) for child in data_root]

texts, labels, langs, configs = [], [], [], []
for row in data_list:
    texts.append(row['text']['$'])
    langs.append('eng')
    labels.append('Natural')
    configs.append('nus_sms_corpus_seacrowd_ssp')

nus_sms_df = pd.DataFrame({
    'text': texts,
    'label': labels,
    'config': configs,
    'lang': langs,
})

CPU times: user 3.36 s, sys: 376 ms, total: 3.73 s
Wall time: 3.77 s


#### Additional Data - 25 May 2024

In [10]:
valid_config_names = {
    'Train': {
        'Natural': [
           'cebuaner_seacrowd_seq_label',
           'copal_seacrowd_qa',
           'copal_colloquial_seacrowd_qa',
           'mabl_jav_seacrowd_qa',
           'mabl_sun_seacrowd_qa',
           'my_paraphrase_all_seacrowd_t2t'
        ]
    },
    'Test': {
        'Natural': [
            'gklmip_newsclass_seacrowd_text',
            'gklmip_sentiment_seacrowd_text',
            'vistec_tp_th_21_seacrowd_seq_label'
        ]
    }
}

In [11]:
def retrieve_text(dset, config_name):
    if 'cebuaner' in config_name:
        return list(map(lambda x: ' '.join(x['tokens']), dset['train']))

    if 'copal' in config_name:
        return dset['test']['context'] + list(chain.from_iterable(list(dset['test']['choices'])))

    if 'mabl' in config_name:
        return dset['test']['question'] + list(chain.from_iterable(list(dset['test']['choices'])))
        
    if 'gklmip' in config_name:
        return dset['test']['text']
    
    if 'vistec_tp_th_21' in config_name:
        return list(map(lambda x: ' '.join(x['tokens']), dset['test']))

    raise ValueError(f'Unknown Config Name `{config_name}`')

In [12]:
%%time
config_stats = {}
train_data = {'text': [], 'label': [], 'config': []}
test_data = {'text': [], 'label': [], 'config': []}

data = {'Train': train_data, 'Test': test_data}
for split in valid_config_names.keys():
    for label_type in valid_config_names[split].keys():
        for config_name in valid_config_names[split][label_type]:
            if 'my_paraphrase' in config_name:
                df = pd.read_csv('https://github.com/ye-kyaw-thu/myParaphrase/raw/main/corpus/ver1.0/csv-qqp/train.csv', header=None)
                texts = list(set(df[1].tolist() + df[2].tolist()))
                labels = [label_type for _ in range(len(texts))]
                configs = [config_name for _ in range(len(texts))]                
            else:
                texts = retrieve_text(sc_conhelp.for_config_name(config_name).load_dataset(), config_name)
                labels = [label_type for _ in range(len(texts))]
                configs = [config_name for _ in range(len(texts))]
            
            config_stats[config_name] = len(texts)
            data[split]['text'] += texts
            data[split]['label'] += labels
            data[split]['config'] += configs
config_stats

CPU times: user 2.54 s, sys: 97.5 ms, total: 2.64 s
Wall time: 4.63 s


{'cebuaner_seacrowd_seq_label': 2980,
 'copal_seacrowd_qa': 1677,
 'copal_colloquial_seacrowd_qa': 1677,
 'mabl_jav_seacrowd_qa': 1800,
 'mabl_sun_seacrowd_qa': 1800,
 'my_paraphrase_all_seacrowd_t2t': 56275,
 'gklmip_newsclass_seacrowd_text': 1436,
 'gklmip_sentiment_seacrowd_text': 716,
 'vistec_tp_th_21_seacrowd_seq_label': 10000}

In [17]:
extra_train_df = pd.DataFrame(train_data)
extra_test_df = pd.DataFrame(test_data)

In [18]:
extra_train_df['lang'] = extra_train_df['config'].apply(lambda x: config2lang[x])
extra_test_df['lang'] = extra_test_df['config'].apply(lambda x: config2lang[x])

##### Merge All

In [19]:
train_df = pd.concat([train_df, nus_sms_df, cc3m_df, extra_train_df])
test_df = pd.concat([test_df, extra_test_df])

In [21]:
train_dfs = []
for key, tdf in train_df.groupby(['lang', 'label']):
    if tdf.shape[0] > 5000: # Down Sampling
        train_dfs.append(tdf.sample(5000, random_state=0))
    elif tdf.shape[0] < 2000: # Up Sampling
        train_dfs.append(pd.concat([
            tdf, tdf.sample(2000 - tdf.shape[0], random_state=0, replace=True)
        ]))
    else: # Keep
        train_dfs.append(tdf)       
s_train_df = pd.concat(train_dfs)

In [23]:
test_dfs = []
for key, tdf in test_df.groupby(['lang', 'label']):
    if tdf.shape[0] > 2000: # Down Sampling
        test_dfs.append(tdf.sample(2000, random_state=0))
    # elif tdf.shape[0] < 1000: # Up Sampling
    #     test_dfs.append(pd.concat([
    #         tdf, tdf.sample(1000 - tdf.shape[0], random_state=0, replace=True)
    #     ]))
    else: # Keep
        test_dfs.append(tdf)       
s_test_df = pd.concat(test_dfs)

In [29]:
s_train_df.shape, s_test_df.shape

((161842, 4), (56009, 4))

In [30]:
train_df.shape, test_df.shape

((12883318, 4), (2341073, 4))

In [31]:
s_train_df.groupby(['lang', 'label']).size()

lang  label             
ceb   Human-translated      2000
      Machine-translated    5000
      Natural               4434
eng   Human-translated      2000
      Machine-translated    5000
      Natural               5000
ind   Human-translated      5000
      Machine-translated    5000
      Natural               4926
jav   Human-translated      2000
      Machine-translated    5000
      Natural               2294
khm   Human-translated      5000
      Machine-translated    5000
      Natural               5000
lao   Human-translated      5000
      Machine-translated    5000
      Natural               5000
mya   Human-translated      5000
      Machine-translated    5000
      Natural               5000
sun   Human-translated      2000
      Machine-translated    5000
      Natural               2188
tgl   Human-translated      5000
      Machine-translated    5000
      Natural               5000
tha   Human-translated      5000
      Machine-translated    5000
      Natural     

In [32]:
s_test_df.groupby(['lang', 'label']).size()

lang  label             
ceb   Human-translated      2000
      Machine-translated     400
      Natural                349
eng   Human-translated      2000
      Machine-translated     400
      Natural               2000
ind   Human-translated      2000
      Machine-translated    2000
      Natural               1556
jav   Human-translated      2000
      Machine-translated     400
      Natural               2000
khm   Human-translated      2000
      Machine-translated    2000
      Natural               1596
lao   Human-translated      2000
      Machine-translated     400
      Natural                160
mya   Human-translated      2000
      Machine-translated    2000
      Natural                876
sun   Human-translated      2000
      Machine-translated     400
      Natural               2000
tgl   Human-translated      2000
      Machine-translated    2000
      Natural               2000
tha   Human-translated      2000
      Machine-translated    2000
      Natural     

In [33]:
trans_dset_dict = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(s_train_df.set_index(['label', 'config', 'lang'])),
    'test': datasets.Dataset.from_pandas(s_test_df.set_index(['label', 'config', 'lang']))
})
trans_dset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'config', 'lang'],
        num_rows: 161842
    })
    test: Dataset({
        features: ['text', 'label', 'config', 'lang'],
        num_rows: 56009
    })
})

In [34]:
trans_dset_dict.push_to_hub("SEACrowd/sea_translationese_resampling_v2")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/162 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/57 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SEACrowd/sea_translationese_resampling_v2/commit/cef02f6bafe0d7e678ece196b907d3e0a0ef6c65', commit_message='Upload dataset', commit_description='', oid='cef02f6bafe0d7e678ece196b907d3e0a0ef6c65', pr_url=None, pr_revision=None, pr_num=None)

# Sandbox

In [97]:
dset = sc_conhelp.for_config_name('paracotta_id_seacrowd_t2t').load_dataset()
# dset['train']['text_1'] + dset['train']['text_2']

In [98]:
dset = sc_conhelp.for_config_name('aya_dataset_ceb_seacrowd_t2t').load_dataset()
# dset['train']['text_1'] + dset['train']['text_2']

In [99]:
dset = sc_conhelp.for_config_name('filipino_slang_norm_seacrowd_t2t').load_dataset()
# dset['train']['text_1']

In [100]:
dset = sc_conhelp.for_config_name('tlunified_ner_seacrowd_seq_label').load_dataset()
# list(map(lambda x: ' '.join(x['tokens']), dset['train']))

In [101]:
dset = sc_conhelp.for_config_name('mkqa_khm_seacrowd_qa').load_dataset()
# dset['train']['question']

In [102]:
dset = sc_conhelp.for_config_name('yunshan_cup_2020_seacrowd_seq_label').load_dataset()
# list(map(lambda x: ' '.join(x['tokens']), dset['train'])) + list(map(lambda x: ' '.join(x['tokens']), dset['validation']))

In [103]:
dset = sc_conhelp.for_config_name('mozilla_pontoon_eng_lao_seacrowd_t2t').load_dataset()
# dset['train']['text_2']

In [104]:
dset = sc_conhelp.for_config_name('ntrex_128_lao_eng-US_seacrowd_t2t').load_dataset()
# dset['train']['text_1']

In [105]:
dset = sc_conhelp.for_config_name('nusax_mt_jav_ind_seacrowd_t2t').load_dataset()
# dset['train']['text_1']

In [106]:
dset = sc_conhelp.for_config_name('tatoeba_ind_eng_seacrowd_t2t').load_dataset()
# dset['validation']['text_1']

In [107]:
dset = sc_conhelp.for_config_name('xcopa_vie_seacrowd_qa').load_dataset()
# dset['validation']['context'] + dset['validation']['context'] + list(chain.from_iterable(list(dset['validation']['choices'])))

In [108]:
dset = sc_conhelp.for_config_name('bactrian_x_my_seacrowd_t2t').load_dataset()
# dset['train']['text_1'] + dset['train']['text_2']

In [109]:
dset = sc_conhelp.for_config_name('ara_close_bcl_seacrowd_text').load_dataset()
# dset['train']['text']

Downloading data: 329kB [00:00, 27.5MB/s]                   
Generating train split: 150 examples [00:00, 4007.02 examples/s]


In [110]:
dset = sc_conhelp.for_config_name('cosem_seacrowd_ssp').load_dataset()
# dset['train']['text']

In [111]:
dset = sc_conhelp.for_config_name('nusaparagraph_rhetoric_jav_seacrowd_text').load_dataset()
# dset['test']['text']

In [112]:
dset = sc_conhelp.for_config_name('emotes_3k_eng_seacrowd_text').load_dataset()
# dset['test']['text']

In [113]:
dset = sc_conhelp.for_config_name('emotes_3k_eng_seacrowd_t2t').load_dataset()
# dset['test']['text_1'] + dset['test']['text_2']

In [114]:
dset = sc_conhelp.for_config_name('sea_bench_tgl_seacrowd_t2t').load_dataset()
# dset['train']['text_1'] + dset['train']['text_2']

In [115]:
dset = sc_conhelp.for_config_name('seaeval_cross_mmlu_ind_seacrowd_qa').load_dataset()
# dset['test']['question'] + list(map(lambda x: x[0], dset['test']['answer']))

In [116]:
dset = sc_conhelp.for_config_name('seaeval_cross_logiqa_ind_seacrowd_qa').load_dataset()
# dset['test']['question'] + list(map(lambda x: x[0], dset['test']['answer'])) + list(chain.from_iterable(list(dset['test']['choices'])))

In [117]:
dset = sc_conhelp.for_config_name('belebele_ceb_latn_seacrowd_qa').load_dataset()
# dset['test']['question'] + dset['test']['context'] + list(chain.from_iterable(list(dset['test']['choices'])))

In [118]:
dset = sc_conhelp.for_config_name('khmer_alt_pos_seacrowd_seq_label').load_dataset()
# list(map(lambda x: ' '.join(x['tokens']), dset['train']))

In [None]:
dset = sc_conhelp.for_config_name('nusatranslation_mt_jav_ind_seacrowd_t2t').load_dataset()
# dset['test']['text_1']

In [None]:
dset = sc_conhelp.for_config_name('parallel_asian_treebank_khm_eng_seacrowd_t2t').load_dataset()
# dset['test']['text_1']

In [None]:
dset['train'][:3]

In [None]:
dset_df.loc[(~dset_df['split'].isna()), 'dataloader_name'].tolist()

In [None]:
%%time
c = Counter()
with open('./cc3m_mt_train.jsonl') as f:
    for line in f:
        row = json.loads(line)
        if row['trg_lang'] in ['fil', 'tgl', 'ind', 'tha', 'vie', 'tl','id','th','vi']:
            c[row['trg_lang']] += 1

In [None]:
c