In [25]:
import json
import gzip
import os
from pathlib import Path
import copy
import nltk
from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize
from transformers import (
    AutoProcessor,
    CLIPImageProcessor,
    BertTokenizerFast,
    RobertaTokenizer,
    AutoTokenizer,
    AutoImageProcessor,
)
# --- Configuration --------------------------------------------------------
# Send HTTP(S) traffic through a proxy listening on localhost:7890.
# NOTE(review): presumably needed so the from_pretrained() calls below can
# download their files; this address is machine-specific and unconditionally
# overrides any proxy already set in the environment.
os.environ["http_proxy"] = "http://127.0.0.1:7890"
os.environ["https_proxy"] = "http://127.0.0.1:7890"

# Tokenizer applied to the full instruction text.
text_processor = RobertaTokenizer.from_pretrained(
    "roberta-base"
)
# Tokenizer applied to each individual sub-instruction.
sub_processor = AutoTokenizer.from_pretrained(
    "sentence-transformers/all-mpnet-base-v2"
)

# The next five constants are not referenced by the cells visible in this
# part of the notebook; they are kept unchanged for any cell that uses them.
RAND_MIN = 25
RAND_MAX = 100000
MIN_DEPTH = 0.0
MAX_DEPTH = 10.0
DEPTH_SCALE = 1000.0

# Fixed token length of a tokenized full instruction.
LEN = 256
# Padding ids are read from the tokenizers instead of being hard-coded, so the
# hand-built padding rows always match what the tokenizers themselves emit.
# Both evaluate to 1 for the two checkpoints loaded above.
PAD_IDX = text_processor.pad_token_id
SUB_PAD_IDX = sub_processor.pad_token_id
# Fixed token length of one tokenized sub-instruction.
SUB_LEN = 50
# Fixed number of sub-instruction rows stored per episode.
SUB_NUM = 12
# An all-padding row used to fill episodes with fewer than SUB_NUM sub-instructions.
EMPTY_SUB = [SUB_PAD_IDX] * SUB_LEN

In [32]:
# Build the "meev1" dataset variant for every split.
#   meev1: sub-instructions come from each episode's stored "sub_instruction" field (FSASub).
#   meev2: sub-instructions come from nltk's sent_tokenize (built in a separate cell).
#
# For each split this reads <split>_sub.json.gz, adds
#   ep["instruction"]["instruction_tokens"]  - LEN token ids for the full instruction
#   ep["sub_instruction_tokens"]             - exactly SUB_NUM rows of token ids
# to every episode, and writes the result to <split>_meev1.json.gz.
FOLDER = Path("/root/EvoEnc/data/datasets/R2R_VLNCE_NRSub_T")
splits = ["train","val_seen","val_unseen","test","joint_train_envdrop","envdrop"]
for split in splits:
    raw_file = FOLDER / split / "{}_sub.json.gz".format(split)
    new_file = FOLDER / split / "{}_meev1.json.gz".format(split)
    with gzip.open(raw_file, "r") as f:
        data = json.loads(f.read())
    episodes = data["episodes"]
    new_episodes = []
    for ep in tqdm(episodes, desc=split):
        inst = ep["instruction"]["instruction_text"]
        # assumes "sub_instruction" is a list of strings, one per sub-instruction - TODO confirm
        sub = ep["sub_instruction"]

        # Full instruction: padded / truncated to exactly LEN token ids.
        ep["instruction"]["instruction_tokens"] = text_processor(
            inst, padding="max_length", truncation=True, max_length=LEN
        ).input_ids

        # Sub-instructions: each row is padded / truncated to SUB_LEN token ids.
        sub_ids = sub_processor(
            sub, padding="max_length", truncation=True, max_length=SUB_LEN
        ).input_ids
        # Force exactly SUB_NUM rows: drop the surplus, then fill the shortfall
        # with all-padding rows (a non-positive multiplier adds nothing).
        # The filler rows all reference the single EMPTY_SUB list, which is fine
        # because they are only serialised, never modified.
        sub_ids = sub_ids[:SUB_NUM]
        sub_ids += [EMPTY_SUB] * (SUB_NUM - len(sub_ids))
        ep["sub_instruction_tokens"] = sub_ids

        new_episodes.append(ep)

    # The episode dicts were updated in place above, so a shallow copy of the
    # top-level mapping is sufficient; deep-copying the whole split first would
    # only duplicate data that is immediately replaced.
    new_data = dict(data)
    new_data["episodes"] = new_episodes
    with gzip.open(new_file, "w") as f:
        f.write(json.dumps(new_data).encode("utf-8"))


train:   0%|          | 0/10819 [00:00<?, ?it/s]

val_seen:   0%|          | 0/778 [00:00<?, ?it/s]

val_unseen:   0%|          | 0/1839 [00:00<?, ?it/s]

test:   0%|          | 0/3408 [00:00<?, ?it/s]

joint_train_envdrop:   0%|          | 0/157232 [00:00<?, ?it/s]

envdrop:   0%|          | 0/146413 [00:00<?, ?it/s]

In [33]:
# Build the "meev2" dataset variant for every split.
#   meev1: sub-instructions come from each episode's stored "sub_instruction" field (FSASub).
#   meev2: sub-instructions are the sentences nltk's sent_tokenize finds in the instruction text.
#
# For each split this reads <split>_sub.json.gz, adds
#   ep["instruction"]["instruction_tokens"]  - LEN token ids for the full instruction
#   ep["sub_instruction_tokens"]             - exactly SUB_NUM rows of token ids
# to every episode, and writes the result to <split>_meev2.json.gz.
FOLDER = Path("/root/EvoEnc/data/datasets/R2R_VLNCE_NRSub_T")
splits = ["train","val_seen","val_unseen","test","joint_train_envdrop","envdrop"]
for split in splits:
    raw_file = FOLDER / split / "{}_sub.json.gz".format(split)
    new_file = FOLDER / split / "{}_meev2.json.gz".format(split)
    with gzip.open(raw_file, "r") as f:
        data = json.loads(f.read())
    episodes = data["episodes"]
    new_episodes = []
    for ep in tqdm(episodes, desc=split):
        inst = ep["instruction"]["instruction_text"]
        # Unlike meev1, the stored ep["sub_instruction"] is ignored here: the
        # instruction is split into sentences instead.
        sub = sent_tokenize(inst)

        # Full instruction: padded / truncated to exactly LEN token ids.
        ep["instruction"]["instruction_tokens"] = text_processor(
            inst, padding="max_length", truncation=True, max_length=LEN
        ).input_ids

        # Sub-instructions: each sentence is padded / truncated to SUB_LEN token ids.
        # An instruction with no sentences yields an empty list; that case is
        # handled here rather than passing an empty batch to the tokenizer
        # (NOTE(review): some transformers versions reportedly raise on an
        # empty batch - confirm against the installed version).
        if sub:
            sub_ids = sub_processor(
                sub, padding="max_length", truncation=True, max_length=SUB_LEN
            ).input_ids
        else:
            sub_ids = []
        # Force exactly SUB_NUM rows: drop the surplus, then fill the shortfall
        # with all-padding rows (a non-positive multiplier adds nothing).
        # The filler rows all reference the single EMPTY_SUB list, which is fine
        # because they are only serialised, never modified.
        sub_ids = sub_ids[:SUB_NUM]
        sub_ids += [EMPTY_SUB] * (SUB_NUM - len(sub_ids))
        ep["sub_instruction_tokens"] = sub_ids

        new_episodes.append(ep)

    # The episode dicts were updated in place above, so a shallow copy of the
    # top-level mapping is sufficient; deep-copying the whole split first would
    # only duplicate data that is immediately replaced.
    new_data = dict(data)
    new_data["episodes"] = new_episodes
    with gzip.open(new_file, "w") as f:
        f.write(json.dumps(new_data).encode("utf-8"))


train:   0%|          | 0/10819 [00:00<?, ?it/s]

val_seen:   0%|          | 0/778 [00:00<?, ?it/s]

val_unseen:   0%|          | 0/1839 [00:00<?, ?it/s]

test:   0%|          | 0/3408 [00:00<?, ?it/s]

joint_train_envdrop:   0%|          | 0/157232 [00:00<?, ?it/s]

envdrop:   0%|          | 0/146413 [00:00<?, ?it/s]

In [37]:
# Spot check: list the fields of the first episode's "instruction" entry in
# the most recently built split (`new_data` is left over from the build loop).
first_instruction = new_data["episodes"][0]["instruction"]
first_instruction.keys()

dict_keys(['instruction_text', 'instruction_tokens'])

In [23]:
# Spot check: show the complete tokenizer output (not only input_ids) for a
# single instruction, using the same padding settings as the build loops.
# `inst` is not defined in this cell - it is whatever value a build loop last
# assigned, so this only works after one of those cells has been run.
encode_kwargs = dict(padding="max_length", truncation=True, max_length=LEN)
text_processor([inst], **encode_kwargs)

{'input_ids': [[0, 30093, 198, 8, 1656, 62, 5, 16745, 4, 9693, 751, 8, 2067, 95, 235, 751, 5, 1883, 4, 1437, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,