In [3]:
# !pip install datasets ijson huggingface-hub

In [12]:
from huggingface_hub import HfApi, hf_hub_download
import json

# Initialize the API client
api = HfApi()
dataset_name = "neulab/PangeaInstruct"

# Retrieve and download all files in the dataset
files = api.list_repo_files(repo_id=dataset_name, repo_type="dataset")

for file in files:
    if file.startswith("PangeaIns.json"):
        hf_hub_download(repo_id=dataset_name, filename=file, repo_type="dataset", cache_dir = "downloads")
        print(f"File downloaded: {file}")

File downloaded: PangeaIns.json


In [13]:
def is_valid_sample(sample, tasks, target_languages=['en']):
    """
    Check if any task in `tasks` is a substring of `sample['id']`
    and if `sample['language']` is in the list of target languages.

    Parameters:
    - sample (dict): The sample to validate.
    - tasks (list of str): List of task substrings to look for in the ID.
    - target_languages (list of str): Accepted language values.

    Returns:
    - bool: True if conditions are met, False otherwise.
    """
    # Validate structure
    if not isinstance(sample, dict): #or 'id' not in sample or 'language' not in sample:
        return False

    sample_id = str(sample.get('image', ''))
    sample_lang = str(sample.get('language', ''))

    if not isinstance(sample_id, str) or not isinstance(sample_lang, str):
        return False

    # Normalize inputs
    sample_id = sample_id.lower()
    sample_lang = sample_lang.lower()
    tasks = [lang.lower() for lang in tasks]
    target_languages = [lang.lower() for lang in target_languages]

    # Check if any task is a substring of the sample ID
    id_contains_task = any(task.lower() in sample_id for task in tasks)

    # Check if language is in target languages
    is_target_language = sample_lang in target_languages

    return id_contains_task and is_target_language


In [14]:
import ijson
import os
from tqdm import tqdm
json_path = "downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/PangeaIns.json"

# Task and language settings
# tasks = ['ChartQA', 'doc-vqa', 'table-vqa', 'allava_laion', 'cambrian', 'laion_gpt4v', 'GQA']
tasks = ['cambrian', 'ALLaVA-4V','allava_vflan', 'MTVQA',  'nvlr2-llava', "translation", 'ChartQA', 'Viet-ShareGPT-4o-Text-VQA', 'Viet-OCR-VQA', 'Viet-Doc-VQA', 'table-vqa', 'doc-vqa',
        "laion-caption","NuminaMath-CoT", "OpenHermes-2.5", 'text_only' , 'ocr', 'cultural/laion-cultural-150k']
target_language = ['arabic','bengali','bn','hindi', 'ja', 'hi', 'russian', 'ru', 'spanish', 'es','vietnamese', 'vi', 'zh_simplified','ar','en','english', 'fr', 'Japanese', 'French']

# Load and filter items
filtered_items = []
with open(json_path, "r", encoding="utf-8") as f:
    objects = ijson.items(f, "item")
    for obj in tqdm(objects, desc="Filtering"):
        # calculate_language_distributions(obj, )
        if is_valid_sample(obj, tasks, target_languages = target_language):
            filtered_items.append(obj)
        # print(obj)
        # break

# Now `filtered_items` contains only the valid objects
print(f"Total valid items: {len(filtered_items)}")

Filtering: 5433377it [01:30, 59865.64it/s] 


KeyboardInterrupt: 

In [6]:
output_path = "filtered_PangeaIns.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(filtered_items, f, ensure_ascii=False, indent=2)

print(f"Saved {len(filtered_items)} filtered items to {output_path}")

Saved 1588164 filtered items to filtered_PangeaIns.json


In [8]:
paths = [
    "general/cambrian/",
    "general/ALLAVA-4V/",
    "general/allava_vflan/",
    "general/MTVQA/",
    "general/nvlr2-llava/",
    "translation/",
    "doc+chart/ChartQA/",
    "general/Viet-ShareGPT-4o-Text-VQA/",
    "doc+chart/Viet-OCR-VQA/",
    "doc+chart/Viet-Doc-VQA/",
    "doc+chart/table-vqa/",
    "doc+chart/doc-vqa/",
    "text-only/NuminaMath-CoT/",
    "text-only/Openhermes-2.5/",
    "text-only/",  # generic path for text_only task
    "ocr/webui_multilingual_ocr/",
    "cultural/laion-cultural-150k/"
]


In [None]:
from huggingface_hub import HfApi, hf_hub_download
import os

# Initialize the API client
api = HfApi()
dataset_name = "neulab/PangeaInstruct"

# Retrieve all files in the dataset
files = api.list_repo_files(repo_id=dataset_name, repo_type="dataset")

# List of target subdirectories
target_paths = [
    "general/cambrian/",
    "general/ALLaVA-4V/",
    "general/allava_vflan/",
    "general/MTVQA/",
    "general/nvlr2-llava/",
    "translation/",
    "doc+chart/ChartQA/",
    "general/Viet-ShareGPT-4o-Text-VQA/",
    "doc+chart/Viet-OCR-VQA/",
    "doc+chart/Viet-Doc-VQA/",
    "doc+chart/Viet-DOC-VQA-II/",
    "doc+chart/table-vqa/",
    "doc+chart/doc-vqa/",
    "text-only/NuminaMath-CoT/",
    "text-only/Openhermes-2.5/",
    "ocr/webui_multilingual_ocr/",
    "cultural/laion-cultural-150k/"
]

# Download non-JSON files only from specified paths
for file in files:
    if any(file.startswith(path) for path in target_paths) and not file.endswith(".json"):
        local_file = hf_hub_download(
            repo_id=dataset_name,
            filename=file,
            repo_type="dataset",
            cache_dir="downloads"
        )
        print(f"Downloaded: {file} -> {local_file}")


In [4]:
print('done')

done


In [5]:
import os
import tarfile
import re
from glob import glob
from tqdm import tqdm

root_dir = "downloads"

# Regex patterns for identifying split tar files
split_patterns = [
    re.compile(r"(.+\.tar)\.part\d+$"),         # e.g., file.tar.part01
    re.compile(r"(.+\.tar\.gz)\.\d+$"),         # e.g., file.tar.gz.001
    re.compile(r"(.+\.tar)\.part[a-z]{2}$"),    # e.g., file.tar.partaa
]

def find_and_reconstruct_splits():
    seen = set()

    print("🔍 Scanning for tarballs and split archives...")
    for dirpath, _, filenames in tqdm(os.walk(root_dir), desc="Walking directories"):
        for filename in tqdm(filenames, leave=False, desc="Checking files"):
            full_path = os.path.join(dirpath, filename)

            for pattern in split_patterns:
                match = pattern.match(filename)
                if match:
                    base_name = match.group(1)
                    if (dirpath, base_name) in seen:
                        continue

                    seen.add((dirpath, base_name))
                    base_path = os.path.join(dirpath, base_name)

                    # Collect all matching parts
                    parts = sorted([
                        f for f in os.listdir(dirpath)
                        if f.startswith(os.path.basename(base_name)) and "combined" not in f
                    ])
                    parts = [os.path.join(dirpath, f) for f in parts]

                    print(f"\n📦 Reconstructing: {base_name} from {len(parts)} parts")
                    combined_path = base_path + ".combined"

                    with open(combined_path, "wb") as outfile:
                        for part in tqdm(parts, desc="Merging parts", leave=False):
                            with open(part, "rb") as infile:
                                outfile.write(infile.read())

                    extract_tar(combined_path, dirpath)
                    os.remove(combined_path)
                    for part in parts:
                        os.remove(part)
                    break

            # Handle normal tarballs
            if filename.endswith((".tar", ".tar.gz", ".tgz")):
                extracted = extract_tar(full_path, dirpath)
                if extracted:
                    os.remove(full_path)

def extract_tar(tar_path, extract_to):
    try:
        print(f"📂 Extracting: {tar_path}")
        with tarfile.open(tar_path, "r:*") as tar:
            members = tar.getmembers()
            for member in members:
                tar.extract(member, path=extract_to)
        print(f"✅ Done: {extract_to}")
        return True
    except Exception as e:
        print(f"❌ Failed to extract {tar_path}: {e}")
        return False

# Run it
find_and_reconstruct_splits()


🔍 Scanning for tarballs and split archives...


Walking directories: 0it [00:00, ?it/s]
Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files:   0%|          | 0/29 [00:00<?, ?it/s][A
                                                      [A
Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files:   0%|          | 0/1 [00:00<?, ?it/s][A
                                                     [A
Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files:   0%|          | 0/1 [00:00<?, ?it/s][A
                                                     [A
Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking

📂 Extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images/textvqa.tar.gz


Walking directories: 41it [00:19, 21.97it/s]
Checking files:  12%|█▎        | 1/8 [01:51<12:59, 111.41s/it][A

✅ Done: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images
📂 Extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images/ocr_vqa.tar.gz



Checking files:  25%|██▌       | 2/8 [06:23<20:37, 206.22s/it][A

✅ Done: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images
📂 Extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images/gqa.tar.gz



Checking files:  38%|███▊      | 3/8 [12:02<22:14, 266.85s/it][A

✅ Done: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images
📂 Extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images/dvqa.tar.gz



Checking files:  50%|█████     | 4/8 [24:26<30:20, 455.15s/it][A

✅ Done: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images
📂 Extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images/docvqa.tar.gz



Checking files:  62%|██████▎   | 5/8 [25:59<16:13, 324.34s/it][A

✅ Done: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images
📂 Extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images/coco.tar.gz



Checking files:  75%|███████▌  | 6/8 [35:25<13:33, 406.58s/it][A

✅ Done: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images
📂 Extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images/chartqa.tar.gz



Checking files:  88%|████████▊ | 7/8 [36:43<04:59, 299.05s/it][A

✅ Done: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images
📂 Extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images/ai2d.tar.gz



Checking files: 100%|██████████| 8/8 [37:13<00:00, 213.45s/it][A
Walking directories: 44it [37:14, 205.95s/it]                 [A

✅ Done: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/cambrian/images



Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files:   0%|          | 0/80000 [00:00<?, ?it/s][A
Checking files:  90%|████████▉ | 71751/80000 [00:00<00:00, 717493.58it/s][A
Walking directories: 46it [37:15, 163.03s/it]                            [A
Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files:   0%|          | 0/21953 [00:00<?, ?it/s][A
                                                         [A
Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files:   0%|          | 0/33074 [00:00<?, ?it/s][A
Walking directories: 51it [37:15, 93.46s/it]             [A
Checking files:   0%|          | 0/53343 [00:00<?, ?it/s][A
                                                         [A
Checking files:   0%|          | 0/1 [00:00<?, ?it/s][A
                                                     [A


📂 Extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/Viet-ShareGPT-4o-Text-VQA/images.tar


Walking directories: 51it [37:30, 93.46s/it]
Checking files: 100%|██████████| 1/1 [03:19<00:00, 199.69s/it][A
Walking directories: 54it [40:35, 86.27s/it]                  [A

✅ Done: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/Viet-ShareGPT-4o-Text-VQA



Checking files:   0%|          | 0/1 [00:00<?, ?it/s][A
                                                     [A
Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files:   0%|          | 0/1 [00:00<?, ?it/s][A
                                                     [A
Checking files:   0%|          | 0/1 [00:00<?, ?it/s][A
                                                     [A
Checking files:   0%|          | 0/1 [00:00<?, ?it/s][A

📂 Extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/doc+chart/Viet-OCR-VQA/images.tar



Checking files: 100%|██████████| 1/1 [09:52<00:00, 592.68s/it][A
Walking directories: 59it [50:27, 98.84s/it]                  [A

✅ Done: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/doc+chart/Viet-OCR-VQA



Checking files:   0%|          | 0/1 [00:00<?, ?it/s][A
                                                     [A
Checking files:   0%|          | 0/1 [00:00<?, ?it/s][A

📂 Extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/doc+chart/ChartQA/images.tar



Checking files: 100%|██████████| 1/1 [01:38<00:00, 98.76s/it][A
Walking directories: 61it [52:06, 89.84s/it]                 [A

✅ Done: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/doc+chart/ChartQA



Checking files: 0it [00:00, ?it/s][A
                                  [A
Checking files:   0%|          | 0/1 [00:00<?, ?it/s][A

📂 Extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/cultural/laion-cultural-150k/images.tar



Checking files: 100%|██████████| 1/1 [16:55<00:00, 1015.42s/it][A
Walking directories: 63it [1:09:02, 176.07s/it]                [A

✅ Done: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/cultural/laion-cultural-150k



Checking files:   0%|          | 0/29 [00:00<?, ?it/s][A
Walking directories: 64it [1:09:02, 64.72s/it]        [A


In [7]:
print('done')

done


In [17]:
import zipfile

def extract_zip_files(root_dir="downloads"):
    print("🔍 Scanning for zip files...")
    for dirpath, _, filenames in tqdm(os.walk(root_dir), desc="Walking directories for ZIPs"):
        for filename in tqdm(filenames, leave=False, desc="Checking ZIP files"):
            if filename.endswith(".zip"):
                zip_path = os.path.join(dirpath, filename)
                print(f"\n📦 Extracting ZIP: {zip_path}")

                try:
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        members = zip_ref.infolist()
                        for member in members:
                            zip_ref.extract(member, path=dirpath)

                    print(f"✅ Done extracting: {zip_path}")
                    os.remove(zip_path)
                except Exception as e:
                    print(f"❌ Failed to extract {zip_path}: {e}")
                    
extract_zip_files()            # zip extractor

🔍 Scanning for zip files...


Walking directories for ZIPs: 0it [00:00, ?it/s]
Checking ZIP files: 0it [00:00, ?it/s][A
                                      [A
Checking ZIP files: 0it [00:00, ?it/s][A
                                      [A
Checking ZIP files:   0%|          | 0/29 [00:00<?, ?it/s][A
                                                          [A
Checking ZIP files: 0it [00:00, ?it/s][A
                                      [A
Checking ZIP files:   0%|          | 0/1 [00:00<?, ?it/s][A
                                                         [A
Checking ZIP files: 0it [00:00, ?it/s][A
                                      [A
Checking ZIP files:   0%|          | 0/1 [00:00<?, ?it/s][A
                                                         [A
Checking ZIP files: 0it [00:00, ?it/s][A
                                      [A
Checking ZIP files: 0it [00:00, ?it/s][A
                                      [A
Checking ZIP files: 0it [00:00, ?it/s][A
                                     


📦 Extracting ZIP: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/allava_vflan/images.zip


Walking directories for ZIPs: 74it [00:16, 27.22it/s]
Checking ZIP files: 100%|██████████| 1/1 [14:54<00:00, 894.66s/it][A
Walking directories for ZIPs: 77it [14:56, 50.07s/it]             [A

✅ Done extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/allava_vflan/images.zip



Checking ZIP files:   0%|          | 0/181393 [00:00<?, ?it/s][A
Walking directories for ZIPs: 78it [14:57, 46.41s/it]         [A
Checking ZIP files: 0it [00:00, ?it/s][A
                                      [A
Checking ZIP files:   0%|          | 0/42778 [00:00<?, ?it/s][A
                                                             [A
Checking ZIP files:   0%|          | 0/1 [00:00<?, ?it/s][A


📦 Extracting ZIP: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/MTVQA/images.zip


Walking directories for ZIPs: 78it [15:16, 46.41s/it]
Checking ZIP files: 100%|██████████| 1/1 [00:47<00:00, 47.10s/it][A
Walking directories for ZIPs: 81it [15:44, 39.02s/it]            [A

✅ Done extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/general/MTVQA/images.zip



Checking ZIP files: 0it [00:00, ?it/s][A
                                      [A
Checking ZIP files:   0%|          | 0/1 [00:00<?, ?it/s][A


📦 Extracting ZIP: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/doc+chart/table-vqa/images.zip



Checking ZIP files: 100%|██████████| 1/1 [01:00<00:00, 60.12s/it][A
Walking directories for ZIPs: 83it [16:44, 37.35s/it]            [A

✅ Done extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/doc+chart/table-vqa/images.zip



Checking ZIP files:   0%|          | 0/1 [00:00<?, ?it/s][A


📦 Extracting ZIP: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/doc+chart/doc-vqa/images.zip



Checking ZIP files: 100%|██████████| 1/1 [00:49<00:00, 49.13s/it][A
Walking directories for ZIPs: 84it [17:33, 38.74s/it]            [A

✅ Done extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/doc+chart/doc-vqa/images.zip



Checking ZIP files: 0it [00:00, ?it/s][A
                                      [A
Checking ZIP files:   0%|          | 0/137098 [00:00<?, ?it/s][A
Walking directories for ZIPs: 86it [17:34, 29.03s/it]         [A
Checking ZIP files:   0%|          | 0/1 [00:00<?, ?it/s][A


📦 Extracting ZIP: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/doc+chart/Viet-Doc-VQA/images.zip


Walking directories for ZIPs: 86it [17:46, 29.03s/it]
Checking ZIP files: 100%|██████████| 1/1 [03:43<00:00, 223.84s/it][A
Walking directories for ZIPs: 87it [21:17, 58.69s/it]             [A

✅ Done extracting: downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4/doc+chart/Viet-Doc-VQA/images.zip



Checking ZIP files: 0it [00:00, ?it/s][A
                                      [A
Checking ZIP files:   0%|          | 0/28299 [00:00<?, ?it/s][A
                                                             [A
Checking ZIP files: 0it [00:00, ?it/s][A
                                      [A
Checking ZIP files: 0it [00:00, ?it/s][A
                                      [A
Checking ZIP files:   0%|          | 0/232478 [00:00<?, ?it/s][A
Walking directories for ZIPs: 92it [21:18, 28.16s/it]         [A
Checking ZIP files:   0%|          | 0/29 [00:00<?, ?it/s][A
Walking directories for ZIPs: 93it [21:18, 13.75s/it]     [A


In [9]:
import json
import ijson
filtered_items = []
with open('filtered_PangeaIns.json', "r", encoding="utf-8") as f:
    objects = ijson.items(f, "item")
    for obj in objects:
        filtered_items.append(obj)
output_path = "sample_PangeaIns.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(filtered_items[-5:-3], f, ensure_ascii=False, indent=2)

print(f"Saved {len(filtered_items)} filtered items to {output_path}")

Saved 1588164 filtered items to sample_PangeaIns.json


In [2]:
# import os
# import zipfile

# # Base path where your files are located
# base_dir = "downloads/datasets--neulab--PangeaInstruct/snapshots/d0819917abe1cae38c008de0ca172f885f1f26a4"

# # Paths of the two target images relative to base_dir
# files_to_include = [
#     "cambrian/images/coco/train2017/000000132137.jpg",
#     "cambrian/images/vg/VG_100K/2319635.jpg"
# ]

# # Output zip file name
# output_zip = "selected_images.zip"

# # Create the zip
# with zipfile.ZipFile(output_zip, "w") as zipf:
#     for file_rel_path in files_to_include:
#         full_path = os.path.join(base_dir, file_rel_path)
#         if os.path.exists(full_path):
#             zipf.write(full_path, arcname=file_rel_path)
#         else:
#             print(f"File not found: {full_path}")

# print(f"Created zip: {output_zip}")
