In [9]:
from datasets import load_dataset,concatenate_datasets,Dataset
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from transformers import BertTokenizer
from pandarallel import pandarallel
# Set up pandarallel (with progress bars) and tqdm's pandas integration.
pandarallel.initialize(progress_bar=True)
tqdm.pandas()

# Training splits of the four English-Japanese corpora used below.
# opus_openoffice is selected by its "en_GB-ja" configuration name; its
# English side is later read through the "en_GB" key.
_train_ubuntu = load_dataset(
    "opus_ubuntu", split='train', lang1="en", lang2="ja"
)
_train_gnome = load_dataset(
    "opus_gnome", split='train', lang1="en", lang2="ja"
)
_train_office = load_dataset(
    "opus_openoffice", "en_GB-ja", split='train'
)
_train_kde = load_dataset(
    "kde4", split='train', lang1="en", lang2="ja"
)

from ast import arg

# Running row counter shared by every call to custom_column.
idx = -1


def custom_column(example, src_code, tgt_code, ds):
    """Flatten one translation pair into the row layout used downstream.

    Args:
        example: Row holding a ``'translation'`` mapping keyed by language code.
        src_code: Language code of the source side; also used as output key.
        tgt_code: Language code of the target side. ``'en_GB'`` is stored
            under the output key ``'en'``; any other code is kept as is.
        ds: Indexable collection the row comes from; its first and last
            elements are compared with ``example`` to flag the boundaries.

    Returns:
        dict with ``idx`` (global running counter), the two sentences,
        ``is_Start`` / ``is_End`` boundary flags and ``label`` fixed to 1.
    """
    global idx
    idx += 1

    # Boundary flags are decided by equality with the first / last element.
    at_start = bool(ds[0] == example)
    at_end = bool(ds[-1] == example)

    pair = example['translation']
    out_key = 'en' if tgt_code == 'en_GB' else tgt_code
    return {
        "idx": idx,
        out_key: pair[tgt_code],
        src_code: pair[src_code],
        "is_Start": at_start,
        "is_End": at_end,
        'label': 1,
    }

def func_train_ubuntu(example):
    """Map callback for the opus_ubuntu split (ja source, en target)."""
    return custom_column(example, "ja", 'en', _train_ubuntu)
def func_train_gnome(example):
    """Map callback for the opus_gnome split (ja source, en target)."""
    return custom_column(example, "ja", 'en', _train_gnome)
def func_train_office(example):
    """Map callback for the opus_openoffice split (ja source, en_GB target)."""
    return custom_column(example, "ja", 'en_GB', _train_office)
def func_train_kde(example):
    """Map callback for the kde4 split (ja source, en target)."""
    return custom_column(example, "ja", 'en', _train_kde)

# Flatten every corpus into the common row layout, then drop the original
# nested "translation" column. The corpora are mapped in this fixed order
# because custom_column advances a global row counter on every call.
train_ubuntu = _train_ubuntu.map(func_train_ubuntu)
train_ubuntu = train_ubuntu.remove_columns(["translation"])

train_gnome = _train_gnome.map(func_train_gnome)
train_gnome = train_gnome.remove_columns(["translation"])

train_office = _train_office.map(func_train_office)
train_office = train_office.remove_columns(["translation"])

train_kde = _train_kde.map(func_train_kde)
train_kde = train_kde.remove_columns(["translation"])

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def merge_and_tokenize_function(x):
    """Add fixed-length token-id columns for both sides of a sentence pair.

    Both sentences are encoded without special tokens and padded or
    truncated to exactly 256 ids.

    Args:
        x: Row with ``id``, ``idx``, ``en``, ``ja``, ``is_Start`` and
            ``is_End`` entries.

    Returns:
        dict carrying the input fields over, plus ``en_ids`` / ``ja_ids``
        and ``label`` fixed to 1.
    """
    encode_options = {
        'add_special_tokens': False,
        'padding': "max_length",
        'max_length': 256,
        'truncation': True,
    }
    english_ids = tokenizer(x["en"], **encode_options)['input_ids']
    japanese_ids = tokenizer(x["ja"], **encode_options)['input_ids']

    row = {key: x[key] for key in ('id', 'idx', 'en', 'ja')}
    row['en_ids'] = english_ids
    row['ja_ids'] = japanese_ids
    row['is_Start'] = x['is_Start']
    row['is_End'] = x['is_End']
    row['label'] = 1
    return row

# Merge the four corpora into one dataset and tokenize every sentence pair.
dataset_positive_train = concatenate_datasets(
    [train_ubuntu, train_gnome, train_office, train_kde]
).map(merge_and_tokenize_function)

pd_dataset_positive_train = pd.DataFrame(dataset_positive_train)

# Work on a copy so the helper columns added below do not leak into the
# positive frame (a plain assignment would only alias the same object).
pd_dataset_negative_train = pd_dataset_positive_train.copy()

# Neighbouring target sentences within a window of two rows.
# pandas semantics: shift(k) with k > 0 moves values DOWN, so row i receives
# the value of row i - k (a preceding sentence); shift(-k) gives row i the
# value of row i + k (a following sentence). Rows without such a neighbour
# receive NaN.
pd_dataset_negative_train['ja_next'] = pd_dataset_positive_train['ja_ids'].shift(-1)
pd_dataset_negative_train['ja_nnext'] = pd_dataset_positive_train['ja_ids'].shift(-2)
pd_dataset_negative_train['ja_prev'] = pd_dataset_positive_train['ja_ids'].shift(1)
pd_dataset_negative_train['ja_pprev'] = pd_dataset_positive_train['ja_ids'].shift(2)

# Boundary flags of the direct neighbours: when the following row ends its
# corpus, the row two steps ahead belongs to another corpus; likewise when
# the preceding row starts its corpus, the row two steps back does.
pd_dataset_negative_train['next_is_End'] = pd_dataset_positive_train['is_End'].shift(-1)
pd_dataset_negative_train['prev_is_Start'] = pd_dataset_positive_train['is_Start'].shift(1)



def swap(sentence,num,z_index):
    """Return a copy of ``sentence`` with random pairs of tokens exchanged.

    Only positions ``0 .. z_index - 1`` take part; anything from ``z_index``
    onwards (e.g. padding) keeps its place. The input is never modified, so
    a list that is shared with other data stays intact.

    Args:
        sentence: Sequence of token ids.
        num: Number of positions to move. Odd values are rounded down to the
            next even number because positions are exchanged in pairs; values
            larger than the swappable range are clamped to it.
        z_index: Number of leading positions that may be swapped.

    Returns:
        A new list of the same length as ``sentence``.
    """
    shuffled = list(sentence)
    # Never index past the end of the sequence, whatever z_index claims.
    limit = max(0, min(z_index, len(shuffled)))
    positions = list(range(limit))
    random.shuffle(positions)
    # Consecutive entries of the shuffled positions form disjoint swap pairs.
    pair_count = max(0, min(num, limit)) // 2
    for k in range(pair_count):
        a, b = positions[2 * k], positions[2 * k + 1]
        shuffled[a], shuffled[b] = shuffled[b], shuffled[a]
    return shuffled

def _is_missing(value):
    """Return True for None and float NaN (what pandas ``shift`` leaves behind)."""
    return value is None or (isinstance(value, float) and value != value)


def _content_length(ids):
    """Return how many ids precede the first 0, or ``len(ids)`` if there is none."""
    # presumably 0 is the tokenizer's padding id -- verify against the vocab
    return ids.index(0) if 0 in ids else len(ids)


def sampler(sentence):
    """Derive a corrupted (source, target) id pair from one positive row.

    One of three corruptions is picked with equal probability:

    1. Replace the target with a neighbouring target sentence (window of two
       rows on each side, restricted near boundaries and missing neighbours).
    2. Truncate the source or the target to 30%-70% of its unpadded length,
       re-padding with 0 to the original length.
    3. Swap the order of 30%-70% of the unpadded tokens of the source or the
       target.

    Args:
        sentence: Mapping with ``en_ids``, ``ja_ids``, ``is_Start``,
            ``is_End``, ``ja_prev``, ``ja_pprev``, ``ja_next``, ``ja_nnext``,
            ``prev_is_Start`` and ``next_is_End``. The id lists are not
            modified.

    Returns:
        Tuple ``(src, tgt)`` of id lists. ``tgt`` may be a missing value when
        the chosen neighbour does not exist.
    """
    src = sentence['en_ids']
    tgt = sentence['ja_ids']
    n = random.random()

    if n > 2/3:
        # Corruption 1: neighbouring target sentence.
        if sentence["is_Start"]:
            keys = ['ja_next', 'ja_nnext']
        elif sentence["is_End"]:
            keys = ['ja_prev', 'ja_pprev']
        # The flag columns may hold NaN, which is truthy, hence the explicit
        # comparison with True instead of a bare truth test.
        elif _is_missing(sentence['ja_pprev']) or sentence['prev_is_Start'] == True:
            keys = ['ja_prev', 'ja_next', 'ja_nnext']
        elif _is_missing(sentence['ja_nnext']) or sentence['next_is_End'] == True:
            keys = ['ja_pprev', 'ja_prev', 'ja_next']
        else:
            keys = ['ja_pprev', 'ja_prev', 'ja_next', 'ja_nnext']
        tgt = random.choice([sentence[key] for key in keys])

    elif n > 1/3:
        # Corruption 2: truncate one side and pad back to full length.
        u = random.random()
        r = random.uniform(0.3, 0.7)
        if u < 0.5:
            ids = sentence['en_ids']
            keep = int(_content_length(ids) * r)
            src = ids[:keep] + [0] * (len(ids) - keep)
        else:
            ids = sentence['ja_ids']
            keep = int(_content_length(ids) * r)
            tgt = ids[:keep] + [0] * (len(ids) - keep)

    else:
        # Corruption 3: swap tokens on one side. A copy is handed to swap so
        # the row's own list is never reordered.
        u = random.random()
        r = random.uniform(0.3, 0.7)
        if u < 0.5:
            ids = list(sentence['en_ids'])
            length = _content_length(ids)
            src = swap(ids, int(length * r), length)
        else:
            ids = list(sentence['ja_ids'])
            length = _content_length(ids)
            tgt = swap(ids, int(length * r), length)

    return src, tgt  # en<sep>tgt

def negative_creater(sentence):
    """Turn a positive row into a negative one.

    The row's ``en_ids`` / ``ja_ids`` are overwritten with the pair produced
    by ``sampler`` and its ``label`` is set to 0. The same (modified) row
    object is returned.
    """
    corrupted_src, corrupted_tgt = sampler(sentence)
    sentence['en_ids'] = corrupted_src
    sentence['ja_ids'] = corrupted_tgt
    sentence['label'] = 0
    return sentence

def func(e):
    """Row callback for ``parallel_apply``; delegates to ``negative_creater``."""
    negative_row = negative_creater(e)
    return negative_row

# Derive one negative example from every row, processing rows in parallel.
pd_dataset_negative_train = pd_dataset_negative_train.parallel_apply(
    func, axis=1
)

# Combine the negative and positive examples, keeping only the two id
# columns and the label; rows containing missing values are dropped.
pd_data = pd.concat(
    [
        pd_dataset_positive_train.loc[:, ['en_ids', 'ja_ids', 'label']],
        pd_dataset_negative_train.loc[:, ['en_ids', 'ja_ids', 'label']],
    ]
).dropna()

data = Dataset.from_pandas(pd_data)
# Drop the index column before uploading.
data.remove_columns(
    ['__index_level_0__']
).push_to_hub('ahclab/acceptability_filtering_data_en-ja')

INFO: Pandarallel will run on 48 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Using custom data configuration en-ja-lang1=en,lang2=ja
Reusing dataset opus_ubuntu (/home/is/koki-tan/.cache/huggingface/datasets/opus_ubuntu/en-ja-lang1=en,lang2=ja/0.0.0/7ac83b46edf6d0b6ff96bc86d5aadfb8b877c2f136a94af490988c442d3814b8)
Using custom data configuration en-ja-lang1=en,lang2=ja
Reusing dataset opus_gnome (/home/is/koki-tan/.cache/huggingface/datasets/opus_gnome/en-ja-lang1=en,lang2=ja/0.0.0/c00e5dfb1b3b508d7898e160feee1d391e67a3651a06570b45d54ab6a8886217)
Reusing dataset opus_openoffice (/home/is/koki-tan/.cache/huggingface/datasets/opus_openoffice/en_GB-ja/1.0.0/e891f281b0d9d5d57027b62c759ddc0826ecb289101e88b0ae004c5fe07162ca)
Using custom data configuration en-ja-lang1=en,lang2=ja
Reusing dataset kde4 (/home/is/koki-tan/.cache/huggingface/datasets/kde4/en-ja-lang1=en,lang2=ja/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac)


  0%|          | 0/4925 [00:00<?, ?ex/s]

  0%|          | 0/50 [00:00<?, ?ex/s]

  0%|          | 0/69149 [00:00<?, ?ex/s]

  0%|          | 0/131429 [00:00<?, ?ex/s]

  0%|          | 0/205553 [00:00<?, ?ex/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4283), Label(value='0 / 4283'))), …

Pushing dataset shards to the dataset hub:   0%|          | 0/4 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Updating downloaded metadata with the new split.
