In [1]:
import copy
import gzip
import itertools
import json
import pickle
import random
import re
import threading
import time
from collections import Counter

import clip
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from clip.simple_tokenizer import SimpleTokenizer as _Tokenizer
from nltk.tokenize import PunktSentenceTokenizer
from numba import jit
from PIL import Image
from tqdm.notebook import tqdm

plt.rcParams['axes.unicode_minus'] = False
# nltk.download("punkt")
# nltk.download("averaged_perceptron_tagger")
# sns.set()
# plt.style.use('ggplot')

## train, val_seen and val_unseen

In [2]:
splits = ["train","val_seen","val_unseen"]
ep = {}
gt = {}
for split in splits:
    sub_data_path = "../data/datasets/R2R_VLNCE_FSASub/%s/%s_sub.json.gz"%(split,split)
    with gzip.open(sub_data_path, "r") as f:
        ep[split] = json.loads(f.read())
    sub_gt_path = "../data/datasets/R2R_VLNCE_FSASub/%s/%s_gt.json.gz"%(split,split)
    with gzip.open(sub_gt_path, "r") as f:
        gt[split] = json.loads(f.read())

need shift episode id. 
keep trajectory id

ground truth use episode id to index

In [3]:
ep_id = {}
for split in splits:
    ids = []
    for v in ep[split]["episodes"]:
        ids.append(v["episode_id"])
    ep_id[split] = ids
tr_id = {}
for split in splits:
    ids = []
    for v in ep[split]["episodes"]:
        ids.append(v["trajectory_id"])
    tr_id[split] = ids
gt_id = {}
for split in splits:
    ids = []
    for k,v in gt[split].items():
        ids.append(int(k))
    gt_id[split] = ids

In [4]:
train_val_ep = []
train_val_gt = {}

for v in ep["train"]["episodes"]:
    train_val_ep.append(v)
for k,v in gt["train"].items():
    train_val_gt[k]=v
    
shift=max(ep_id["train"])
print(shift)
for v in ep["val_seen"]["episodes"]:
    v["episode_id"]+=shift
    train_val_ep.append(v)
for k,v in gt["val_seen"].items():
    k = str(int(k)+shift)
    assert k not in train_val_gt
    train_val_gt[k]=v

shift=max(ep_id["train"])+max(ep_id["val_seen"])
print(shift)
for v in ep["val_unseen"]["episodes"]:
    v["episode_id"]+=shift
    train_val_ep.append(v)
for k,v in gt["val_unseen"].items():
    k = str(int(k)+shift)
    assert k not in train_val_gt
    train_val_gt[k]=v

10837
11615


In [5]:
import os
os.makedirs("../data/datasets/R2R_VLNCE_FSASub/train_val")

FileExistsError: [Errno 17] File exists: '../data/datasets/R2R_VLNCE_FSASub/train_val'

In [6]:
data = {"episodes":train_val_ep, "instruction_vocab":ep["train"]["instruction_vocab"]}
sub_data_path = "../data/datasets/R2R_VLNCE_FSASub/train_val/train_val_sub.json.gz"
with gzip.open(sub_data_path, "w") as f:
    f.write(json.dumps(data).encode("utf-8"))
sub_gt_path = "../data/datasets/R2R_VLNCE_FSASub/train_val/train_val_gt.json.gz"
with gzip.open(sub_gt_path, "w") as f:
    f.write(json.dumps(train_val_gt).encode("utf-8"))

In [13]:
len(train_val_ep)

13436

In [7]:
len(ep["val_seen"]["instruction_vocab"]["word_list"])

2504

In [28]:
1819+778

2597

In [16]:
import math
math.sqrt(2.5)

1.5811388300841898

## val_seen and val_unseen

In [17]:
splits = ["val_seen","val_unseen"]
ep = {}
gt = {}
for split in splits:
    sub_data_path = "../data/datasets/R2R_VLNCE_NRSub_T/%s/%s_sub.json.gz"%(split,split)
    with gzip.open(sub_data_path, "r") as f:
        ep[split] = json.loads(f.read())
    sub_gt_path = "../data/datasets/R2R_VLNCE_NRSub_T/%s/%s_gt.json.gz"%(split,split)
    with gzip.open(sub_gt_path, "r") as f:
        gt[split] = json.loads(f.read())

In [18]:
ep_id = {}
for split in splits:
    ids = []
    for v in ep[split]["episodes"]:
        ids.append(v["episode_id"])
    ep_id[split] = ids
tr_id = {}
for split in splits:
    ids = []
    for v in ep[split]["episodes"]:
        ids.append(v["trajectory_id"])
    tr_id[split] = ids
gt_id = {}
for split in splits:
    ids = []
    for k,v in gt[split].items():
        ids.append(int(k))
    gt_id[split] = ids

In [19]:
train_val_ep = []
train_val_gt = {}

for v in ep["val_seen"]["episodes"]:
    train_val_ep.append(v)
for k,v in gt["val_seen"].items():
    train_val_gt[k]=v

shift=max(ep_id["val_seen"])
print(shift)
for v in ep["val_unseen"]["episodes"]:
    v["episode_id"]+=shift
    train_val_ep.append(v)
for k,v in gt["val_unseen"].items():
    k = str(int(k)+shift)
    assert k not in train_val_gt
    train_val_gt[k]=v

778


In [20]:
import os
os.makedirs("../data/datasets/R2R_VLNCE_NRSub_T/val_all")

In [21]:
data = {"episodes":train_val_ep, "instruction_vocab":ep["val_seen"]["instruction_vocab"]}
sub_data_path = "../data/datasets/R2R_VLNCE_NRSub_T/val_all/val_all_sub.json.gz"
with gzip.open(sub_data_path, "w") as f:
    f.write(json.dumps(data).encode("utf-8"))
sub_gt_path = "../data/datasets/R2R_VLNCE_NRSub_T/val_all/val_all_gt.json.gz"
with gzip.open(sub_gt_path, "w") as f:
    f.write(json.dumps(train_val_gt).encode("utf-8"))

## train seen

In [32]:
splits = ["val_seen","train"]
ep = {}
gt = {}
for split in splits:
    sub_data_path = "../data/datasets/R2R_VLNCE_NRSub_T/%s/%s_sub.json.gz"%(split,split)
    with gzip.open(sub_data_path, "r") as f:
        ep[split] = json.loads(f.read())
    sub_gt_path = "../data/datasets/R2R_VLNCE_NRSub_T/%s/%s_gt.json.gz"%(split,split)
    with gzip.open(sub_gt_path, "r") as f:
        gt[split] = json.loads(f.read())

In [33]:
ep_id = {}
for split in splits:
    ids = []
    for v in ep[split]["episodes"]:
        ids.append(v["episode_id"])
    ep_id[split] = ids
tr_id = {}
for split in splits:
    ids = []
    for v in ep[split]["episodes"]:
        ids.append(v["trajectory_id"])
    tr_id[split] = ids
gt_id = {}
for split in splits:
    ids = []
    for k,v in gt[split].items():
        ids.append(int(k))
    gt_id[split] = ids

In [34]:
ep["train"]["episodes"] = random.sample(ep["train"]["episodes"],1500)

In [35]:
train_val_ep = []
train_val_gt = {}

for v in ep["train"]["episodes"]:
    train_val_ep.append(v)
for k,v in gt["train"].items():
    train_val_gt[k]=v

shift=max(ep_id["train"])
print(shift)
for v in ep["val_seen"]["episodes"]:
    v["episode_id"]+=shift
    train_val_ep.append(v)
for k,v in gt["val_seen"].items():
    k = str(int(k)+shift)
    assert k not in train_val_gt
    train_val_gt[k]=v

10837


In [36]:
import os
os.makedirs("../data/datasets/R2R_VLNCE_NRSub_T/train_seen")

FileExistsError: [Errno 17] File exists: '../data/datasets/R2R_VLNCE_NRSub_T/train_seen'

In [37]:
data = {"episodes":train_val_ep, "instruction_vocab":ep["val_seen"]["instruction_vocab"]}
sub_data_path = "../data/datasets/R2R_VLNCE_NRSub_T/train_seen/train_seen_sub.json.gz"
with gzip.open(sub_data_path, "w") as f:
    f.write(json.dumps(data).encode("utf-8"))
sub_gt_path = "../data/datasets/R2R_VLNCE_NRSub_T/train_seen/train_seen_gt.json.gz"
with gzip.open(sub_gt_path, "w") as f:
    f.write(json.dumps(train_val_gt).encode("utf-8"))

In [38]:
len(train_val_ep)

2278

In [29]:
import random
random.sample([1,2,3,4,5,],2)

[2, 3]