In [1]:
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer, M2M100Model
from datasets import load_metric, load_dataset
import datasets
import pickle
import pandas
import os 

Everything used below (the M2M100 tokenizer and the WMT19 datasets) comes from Hugging Face: https://huggingface.co/

In [21]:
# Czech-English pairs: 'train' split of the 'cs-en' configuration of WMT19.
cz_dataset = load_dataset(
    'wmt19',
    'cs-en',
    split='train',
)



In [22]:
# German-English pairs: 'train' split of the 'de-en' configuration of WMT19.
de_dataset = load_dataset(
    'wmt19',
    'de-en',
    split='train',
)



# Process Data

In [14]:
def process_data(dataset, lang_code, max_length, start_idx, end_idx, use_whole_dataset):
    """Tokenize a range of translation pairs into fixed-length tensors.

    Args:
        dataset: Indexable collection whose rows provide
            ``row['translation'][lang_code]`` (source sentence) and
            ``row['translation']['en']`` (English target sentence).
        lang_code: Key of the source language inside ``row['translation']``;
            also handed to the tokenizer as ``src_lang``.
        max_length: Every sequence is truncated and padded to this length.
        start_idx: Index of the first row to tokenize (inclusive).
        end_idx: Index to stop at (exclusive). Ignored when
            ``use_whole_dataset`` is truthy.
        use_whole_dataset: When truthy, ``end_idx`` is replaced by
            ``len(dataset)``. ``start_idx`` is NOT reset, so rows before
            ``start_idx`` are still skipped.

    Returns:
        dict: ``'x'`` is the full tokenizer output for the source sentences
        and ``'y'`` is the ``input_ids`` of the tokenized English sentences.
    """
    tokenizer = M2M100Tokenizer.from_pretrained(
        "facebook/m2m100_418M",
        src_lang=lang_code,
        tgt_lang="en",
        padding_side='right',
        truncation_side='right',
    )

    # Only the upper bound is affected by use_whole_dataset.
    stop_idx = len(dataset) if use_whole_dataset else end_idx
    row_ids = range(start_idx, stop_idx)

    # Source and target sentences are encoded with the same settings.
    encode_kwargs = {
        'return_tensors': "pt",
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
    }

    source_sentences = [dataset[i]['translation'][lang_code] for i in row_ids]
    x = tokenizer(source_sentences, **encode_kwargs)

    # NOTE(review): as_target_tokenizer() is reportedly deprecated in newer
    # transformers releases in favour of `text_target=` -- confirm against the
    # installed version before upgrading.
    with tokenizer.as_target_tokenizer():
        target_sentences = [dataset[i]['translation']['en'] for i in row_ids]
        y = tokenizer(target_sentences, **encode_kwargs).input_ids

    return {'x': x, 'y': y}

In [19]:
# Tokenize the Czech-English pairs and cache the result on disk.
# end_idx=0 is a placeholder: use_whole_dataset=True makes process_data replace
# it with len(dataset). start_idx=4000 still applies, so the first 4000 rows of
# cz_dataset are not part of the saved tensors.
# The 8 in the file name is written separately from max_length=8; keep them in sync.
cz_train = process_data(
    dataset=cz_dataset,
    lang_code='cs',
    max_length=8,
    start_idx=4000,
    end_idx=0,
    use_whole_dataset=True,
)
with open('cs_dataset_train_full_'+str(8)+'.pkl', mode='wb') as handle:
    pickle.dump(cz_train, handle)

In [None]:
# Tokenize the German-English pairs and cache the result on disk.
# end_idx=0 is a placeholder: use_whole_dataset=True makes process_data replace
# it with len(dataset). start_idx=4000 still applies, so the first 4000 rows of
# de_dataset are not part of the saved tensors.
# The 8 in the file name is written separately from max_length=8; keep them in sync.
de_train = process_data(
    dataset=de_dataset,
    lang_code='de',
    max_length=8,
    start_idx=4000,
    end_idx=0,
    use_whole_dataset=True,
)
with open('de_dataset_train_full_'+str(8)+'.pkl', mode='wb') as handle:
    pickle.dump(de_train, handle)

In [4]:
def subset_data(train_data, lang_code, idxs, path):
    """Cut one index window out of tokenized data and pickle it to ``path``.

    Args:
        train_data: Dict with an ``'x'`` entry (holding ``'input_ids'`` and
            ``'attention_mask'``) and a ``'y'`` entry, as consumed by
            ``select_subset``.
        lang_code: Currently unused; kept so existing call sites keep working.
        idxs: Sequence of ``(start, end)`` pairs. Only the first pair,
            ``idxs[0]``, is used; any further pairs are ignored.
        path: Destination file. Missing parent directories are created so the
            write does not fail on a fresh checkout.

    Raises:
        IndexError: If ``idxs`` is an empty list (there is no first pair).
    """
    window = idxs[0]
    subset = select_subset(window[0], window[1], train_data)

    # open(..., 'wb') does not create directories; make sure the target exists.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, 'wb') as handle:
        pickle.dump(subset, handle)
        
def select_subset(start_idx, end_idx, data):
    """Return the ``[start_idx:end_idx]`` window of tokenized data.

    Args:
        start_idx: Start of the slice (inclusive).
        end_idx: End of the slice (exclusive).
        data: Dict with ``data['x']['input_ids']``,
            ``data['x']['attention_mask']`` and ``data['y']``, each of which
            must support slicing.

    Returns:
        dict: Same layout as ``data``, reduced to the two ``'x'`` fields named
        above plus ``'y'``, each sliced with the same window. Any other keys
        under ``data['x']`` are dropped.
    """
    window = slice(start_idx, end_idx)
    features = data['x']
    sliced_features = {
        field: features[field][window]
        for field in ('input_ids', 'attention_mask')
    }
    return {'x': sliced_features, 'y': data['y'][window]}

In [2]:
# Suffix of the cached pickle files to reload; it has to match the number used
# in the file names when the tokenized data was dumped.
checkpoint_size = 8

cs_pickle_path = 'cs_dataset_train_full_' + str(checkpoint_size) + '.pkl'
de_pickle_path = 'de_dataset_train_full_' + str(checkpoint_size) + '.pkl'

# pickle.load can execute arbitrary code: only load files produced locally.
with open(cs_pickle_path, mode='rb') as cs_file:
    cs_train = pickle.load(cs_file)
with open(de_pickle_path, mode='rb') as de_file:
    de_train = pickle.load(de_file)

In [24]:
# Export rows 4000..13999 of the cached tensors for ./MFEA/data.
# Indices refer to positions in the loaded pickles, not in the raw datasets.
# NOTE(review): this window lies inside the (4000, 7500000) window exported to
# ./train_data -- confirm the overlap is intended.
idxs = [(4000, 14000)]
subset_data(train_data=de_train, lang_code='de', idxs=idxs, path='./MFEA/data/de_evo.pkl')
subset_data(train_data=cs_train, lang_code='cs', idxs=idxs, path='./MFEA/data/cs_evo.pkl')

In [25]:
# Export the training window: positions 4000 up to (but excluding) 7500000 of
# the cached tensors. select_subset uses plain slicing on each field.
# NOTE(review): if a language has fewer than 7500000 cached rows the slice is
# presumably just shorter -- verify the resulting sizes.
idxs = [(4000, 7500000)]
subset_data(train_data=de_train, lang_code='de', idxs=idxs, path='./train_data/de_train.pkl')
subset_data(train_data=cs_train, lang_code='cs', idxs=idxs, path='./train_data/cs_train.pkl')

In [26]:
# Export the held-out window: the first 4000 positions of the cached tensors,
# which do not overlap the (4000, ...) windows exported above.
idxs = [(0, 4000)]
subset_data(train_data=de_train, lang_code='de', idxs=idxs, path='./train_data/de_test.pkl')
subset_data(train_data=cs_train, lang_code='cs', idxs=idxs, path='./train_data/cs_test.pkl')