In [1]:
# Notebook: Data Inspector
import numpy as np
import pickle
import json
from pathlib import Path

# =============================================================================
# Quick inspect functions
# =============================================================================

def peek_npy(file_path):
    """Load a .npy file, print a short summary, and return the array.

    The summary shows the file name, shape, dtype, in-memory size in MB,
    the value range and mean (numeric, non-empty arrays only), and a small
    sample of the contents.

    Args:
        file_path: Path (str or ``pathlib.Path``) of the .npy file.

    Returns:
        The array returned by ``np.load``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # NOTE: allow_pickle=True lets np.load unpickle object arrays, which can
    # execute arbitrary code. Only use this on files from a trusted source.
    data = np.load(file_path, allow_pickle=True)
    print(f"📄 {Path(file_path).name}")
    print(f"   Shape: {data.shape} | Dtype: {data.dtype} | Size: {data.nbytes/(1024**2):.1f}MB")
    # min()/max() raise ValueError on zero-size arrays, so skip the stats then.
    if data.size and np.issubdtype(data.dtype, np.number):
        print(f"   Range: [{data.min():.3f}, {data.max():.3f}] | Mean: {data.mean():.3f}")
    # A 0-d array cannot be sliced; show the scalar itself instead.
    sample = data if data.ndim == 0 else data[:2]
    print(f"   Sample: {sample}\n")
    return data

def peek_pkl(file_path):
    """Load a .pkl file, print a short summary, and return the object.

    For a dict the summary shows the number of keys plus one sample
    key/value pair; for a list it shows the length and the first element.
    Empty containers are reported as ``Empty`` instead of raising.
    Other object types only get their type name printed.

    Args:
        file_path: Path (str or ``pathlib.Path``) of the pickle file.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # NOTE: pickle.load can execute arbitrary code while deserializing.
    # Only use this on files from a trusted source.
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    print(f"📄 {Path(file_path).name}")
    print(f"   Type: {type(data).__name__}")

    if isinstance(data, dict):
        print(f"   Keys: {len(data)} entries")
        if data:
            # First key in iteration order; avoids copying every key to a list.
            sample_key = next(iter(data))
            print(f"   Sample key: {sample_key}")
            print(f"   Sample value: {data[sample_key]}\n")
        else:
            print("   Sample: Empty\n")
    elif isinstance(data, list):
        print(f"   Length: {len(data)}")
        print(f"   Sample: {data[0] if data else 'Empty'}\n")
    return data

def peek_json(file_path):
    """Load a .json file, print a short summary, and return the parsed data.

    For a JSON object the summary lists all keys and describes the value of
    the first key (its type and, when it has a length, its item count); for
    a JSON array it shows the length and the first item. Empty containers
    are reported as ``Empty`` instead of raising.

    Args:
        file_path: Path (str or ``pathlib.Path``) of the UTF-8 JSON file.

    Returns:
        The parsed JSON value.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"📄 {Path(file_path).name}")
    print(f"   Type: {type(data).__name__}")

    if isinstance(data, dict):
        keys = list(data)
        print(f"   Keys: {keys}")
        if keys:
            first_key = keys[0]
            first_value = data[first_key]
            # Numbers, booleans and null have no length; report 'N/A' for those.
            item_count = len(first_value) if hasattr(first_value, '__len__') else 'N/A'
            print(f"   '{first_key}': {type(first_value).__name__} with {item_count} items\n")
        else:
            print("   First key: Empty\n")
    elif isinstance(data, list):
        print(f"   Length: {len(data)}")
        print(f"   First item: {data[0] if data else 'Empty'}\n")
    return data

# =============================================================================
# Inspect all files
# =============================================================================

# Relative path: it is resolved against the current working directory.
base = Path("data/processed")
clip_dir = base / "coco_clip_new"
banner_rule = "=" * 60

# NPY files
print(banner_rule)
print("🔍 NUMPY FILES (.npy)")
print(banner_rule)
img_features = peek_npy(
    clip_dir / "coco_clip_vitb16_train2017_aokvqa_convertedidx_image.npy"
)
q_features = peek_npy(
    clip_dir / "coco_clip_vitb16_train2017_aokvqa_question.npy"
)

# PKL files
print(banner_rule)
print("🔍 PICKLE FILES (.pkl)")
print(banner_rule)
obj_sim = peek_pkl(
    base / "object_similarity" / "train_object_select_aokvqa_answer.pkl"
)

# JSON files
print(banner_rule)
print("🔍 JSON FILES (.json)")
print(banner_rule)
line2sample = peek_json(clip_dir / "aokvqa_qa_line2sample_idx_train2017.json")
captions = peek_json(base / "captions_train2017.json")

# Each loaded object stays bound at module level for later inspection.
print("✅ Done! Variables: img_features, q_features, obj_sim, line2sample, captions")

🔍 NUMPY FILES (.npy)


FileNotFoundError: [Errno 2] No such file or directory: 'data\\processed\\coco_clip_new\\coco_clip_vitb16_train2017_aokvqa_convertedidx_image.npy'