In [13]:
import numpy as np
import pandas as pd
from pathlib import Path
import torch

In [2]:
# Directory that the audit loop below globs for per-subject `*.npy` embedding files.
# NOTE(review): hard-coded absolute path — edit this when running on another machine.
EMBED_DIR = Path("/scratch/vjh9526/bdml_2025/project/datasets/ZuCo2/2urht/osfstorage/task1 - NR/npy_file/embeds")

In [14]:
def shape_tuple(emb):
    """Give the shape of ``emb`` as a plain Python tuple.

    NumPy arrays and torch tensors are supported; any other input
    (``None`` included) produces ``None``.
    """
    is_array = isinstance(emb, np.ndarray)
    if is_array or torch.is_tensor(emb):
        # ndarray.shape is already a tuple; torch.Size needs converting.
        return emb.shape if is_array else tuple(emb.shape)
    return None

def is_all_zero(emb):
    """Return True if every element of ``emb`` is zero.

    Parameters
    ----------
    emb : np.ndarray, torch.Tensor, or None
        Embedding to inspect.

    Returns
    -------
    bool
        ``True`` when ``emb`` is ``None`` or a non-empty array/tensor whose
        elements all compare equal to zero. ``False`` when any element is
        non-zero, when the array/tensor is empty, or when ``emb`` is of an
        unsupported type. Always a plain Python ``bool``.
    """
    if emb is None:
        return True
    if isinstance(emb, np.ndarray):
        # An empty array is deliberately not reported as "all zero".
        return bool(emb.size > 0 and np.all(emb == 0))
    if torch.is_tensor(emb):
        # Same empty-input rule as the ndarray branch; without the numel()
        # guard, .all() on an empty tensor would be vacuously True.
        return bool(emb.numel() > 0 and (emb == 0).all().item())
    return False

In [15]:
# Expected embedding shape per model; a sample whose embedding has any other
# shape is counted as "bad_shape".
CBRAMOD_SHAPE = (1, 600)   # CBraMod embeddings
LABRAM_SHAPE = (1, 200)    # LaBraM embeddings


def _embedding_status(sample, key, expected_shape):
    """Classify ``sample[key]`` as 'missing', 'bad_shape', 'all_zero' or 'ok'.

    A falsy sample (e.g. ``None`` or ``{}``) or a sample lacking ``key`` is
    'missing'. The all-zero check only runs when the shape is as expected.
    """
    if not sample or key not in sample:
        return "missing"
    emb = sample[key]
    if shape_tuple(emb) != expected_shape:
        return "bad_shape"
    if is_all_zero(emb):
        return "all_zero"
    return "ok"


# One summary record per embeddings file; turned into a DataFrame afterwards.
records = []
for fp in sorted(EMBED_DIR.glob("*.npy")):
    # NOTE(security): allow_pickle=True unpickles arbitrary Python objects.
    # Only load files from a trusted source.
    data = np.load(fp, allow_pickle=True).item()
    if not data:
        # An empty dict would otherwise crash next(iter(...)) with StopIteration.
        print(f"Skipping {fp.name}: no subject entries found")
        continue

    # assume each file is { subject_key: [ sample_dict, ... ] }
    subject_key = next(iter(data))
    if len(data) > 1:
        # Make the single-subject assumption visible instead of silently
        # dropping the remaining keys.
        print(f"Warning: {fp.name} has {len(data)} top-level keys; "
              f"only {subject_key!r} is checked")
    samples = data[subject_key]
    print(f"Checking for {subject_key}...")

    total = len(samples)
    cbra_status = [_embedding_status(s, 'embeds_cbramod', CBRAMOD_SHAPE) for s in samples]
    labram_status = [_embedding_status(s, 'embeds_labram', LABRAM_SHAPE) for s in samples]

    missing_labram = labram_status.count("missing")
    missing_cbra = cbra_status.count("missing")
    missing_content = sum(1 for s in samples if not s or 'content' not in s)

    bad_shape_cbra = cbra_status.count("bad_shape")
    all_zero_cbra = cbra_status.count("all_zero")
    bad_shape_labram = labram_status.count("bad_shape")
    all_zero_labram = labram_status.count("all_zero")

    records.append({
        "file":             fp.name,
        "subject":          subject_key,
        "total_samples":      total,
        "missing_labram":     missing_labram,
        "missing_cbra":       missing_cbra,
        "missing_content":    missing_content,
        "bad_shape_cbra":     bad_shape_cbra,
        "all_zero_cbra":      all_zero_cbra,
        "bad_shape_labram":   bad_shape_labram,
        "all_zero_labram":    all_zero_labram,
    })


Checking for YAC...
Checking for YAG...
Checking for YAK...
Checking for YDG...
Checking for YDR...
Checking for YFR...
Checking for YFS...
Checking for YHS...
Checking for YIS...
Checking for YLS...
Checking for YMD...
Checking for YMS...
Checking for YRH...
Checking for YRK...
Checking for YRP...
Checking for YSD...
Checking for YSL...
Checking for YTL...


In [16]:
# Collect the per-file audit records into one table whose row index is labelled "idx".
df = pd.DataFrame(records).rename_axis("idx")
df

Unnamed: 0_level_0,file,subject,total_samples,missing_labram,missing_cbra,missing_content,bad_shape_cbra,all_zero_cbra,bad_shape_labram,all_zero_labram
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,task1-NR-2.0-YAC-dataset_embeds.npy,YAC,349,102,102,99,0,0,0,0
1,task1-NR-2.0-YAG-dataset_embeds.npy,YAG,349,3,3,3,0,0,0,0
2,task1-NR-2.0-YAK-dataset_embeds.npy,YAK,349,96,96,95,0,0,0,0
3,task1-NR-2.0-YDG-dataset_embeds.npy,YDG,349,2,2,2,0,0,0,0
4,task1-NR-2.0-YDR-dataset_embeds.npy,YDR,349,5,5,2,0,0,0,0
5,task1-NR-2.0-YFR-dataset_embeds.npy,YFR,349,108,108,6,0,0,0,0
6,task1-NR-2.0-YFS-dataset_embeds.npy,YFS,349,14,14,1,0,0,0,0
7,task1-NR-2.0-YHS-dataset_embeds.npy,YHS,349,1,1,1,0,0,0,0
8,task1-NR-2.0-YIS-dataset_embeds.npy,YIS,349,1,1,1,0,0,0,0
9,task1-NR-2.0-YLS-dataset_embeds.npy,YLS,349,8,8,2,0,0,0,0
