# Visualizing Our Datasets

In [None]:
import os
import sys
import csv
import json
from tqdm import tqdm
import numpy as np
import random
import pprint
import shutil

import cv2
import pprint
from IPython.display import Video
from visualization_utils import *

sys.path.insert(0, "../data_processing")
from preconds_utils import *
from sequencing_utils import show_one_sampled_data as show_one_sampled_data_wikihow
from sequencing_data_processors import WikiHowGeneralProcessor

# Auto-reload external modules so that edits to imported modules are picked up
# live, without having to restart the kernel.
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

## Multimodal WikiHow

### Raw Dataset Structure

Our raw dataset is a JSON Lines (jsonl) file.
(See the `_read_json` function in `data_processing/sequencing_data_processors.py` for how it is read in.)

In [None]:
# Location of the raw test-split file, assembled from its directory and its
# file name.
raw_wikihow_jsonl = os.path.join(
    "/local1/telinwu/research/resources/wikihow/",
    "wikihow-acl22_human_multiref_multimodal-test.json",
)

# Read every record from the file and report how many were loaded.
raw_wikihow = read_jsonl_file(raw_wikihow_jsonl)
print("Data count: {}".format(len(raw_wikihow)))

# The first record serves as an illustration of the data schema.
print("Below is the data schema")
pprint.pprint(raw_wikihow[0])

### Data Partitions
Here we decide which data subsets we are inspecting and we will use a processor to process the data.  
(This will also be easier for training later on.)

After processing, each data point is a `MultimodalWikiHowExample` instance  
(See `data_processing/sequencing_data_processors.py` for details.)

In [None]:
# Root folder of the WikiHow resources; it is passed below as both the data
# directory and the images directory.
data_root_dir = "/local1/telinwu/research/resources/wikihow/"

# The "version text" is the part of a data file name that sits between
# "wikihow" and "train/test/dev.json". For instance, the version text of
# "wikihow-acl22_human_multiref_multimodal-test.json" is
# "acl22_human_multiref_multimodal".
version_text_to_use = "acl22_human_multiref_multimodal"

wikihow_processor = WikiHowGeneralProcessor(
    data_dir=data_root_dir,
    images_dir=data_root_dir,
    version_text=version_text_to_use,
    paired_with_image=True,
    # Both bounds are set to 5 (presumably keeps only 5-step stories;
    # confirm against the processor implementation).
    min_story_length=5,
    max_story_length=5,
    save_missing_images_info=False,
)

# Only the test split is loaded here. The train and dev splits are left as
# empty lists; to include them, assign wikihow_processor.get_train_examples()
# and wikihow_processor.get_dev_examples() instead.
data_wikihow_train, data_wikihow_dev = [], []
data_wikihow_test = wikihow_processor.get_test_examples()

# Pool the splits in train, dev, test order and report the combined size.
data_wikihow = data_wikihow_train + data_wikihow_dev + data_wikihow_test
print("Total Valid Data Sequences: {}".format(len(data_wikihow)))

### Show One Sample

In [None]:
# Draw a random index into the pooled examples and keep a handle on the
# example it points to.
rand_idx = np.random.randint(len(data_wikihow))
wikihow_sample = data_wikihow[rand_idx]

# Report which example was drawn and the ground-truth order references
# stored on it.
print("Data Index: {}".format(rand_idx))
print("GT Order References: {}".format(wikihow_sample.multiref_gt))

# Display the drawn example with the imported WikiHow visualization helper.
show_one_sampled_data_wikihow(
    wikihow_sample,
    show_url=True,
    scrambled=False,
    title_max_len=250,
    font_size=9,
    img_size=4,
    # unimodal="text",
)

We can also print out each step's text and image path for the sampled data point.

In [None]:
# Walk through the sampled example step by step, printing the 1-based step
# number, the step's text, and the image path at the same position, with a
# divider line before the first step and after every step.
print("-" * 50)
for step_idx, step_text in enumerate(data_wikihow[rand_idx].text_seq):
    print("Step: {}".format(step_idx + 1))
    print(step_text)
    print(data_wikihow[rand_idx].img_path_seq[step_idx])
    print("-" * 50)

# END