In [1]:
from pathlib import Path

import json
import re
import pandas as pd

In [2]:
# Common prefix of every input path below. Deriving the paths from it means the
# notebook can be pointed at another location by editing a single line.
GEST_ROOT = Path("/workspaces/GEST")

# CSV that is compared against the captions JSON files.
GEST_CSV = GEST_ROOT / "data" / "gest.csv"

# Directory that holds the three ActivityNet Captions JSON files.
CAPTIONS_DIR = GEST_ROOT / "miscellaneous" / "datasets" / "ActivityNet Captions"

TRAIN_JSON = CAPTIONS_DIR / "train.json"
VAL1_JSON = CAPTIONS_DIR / "val_1.json"
VAL2_JSON = CAPTIONS_DIR / "val_2.json"

# Number of matched rows to preview per JSON file; a falsy value (0) turns the
# preview off.
PREVIEW_ROWS = 3

In [3]:
def normalize_whitespace(s: str) -> str:
    """Return `s` with each whitespace run replaced by one space and both ends trimmed."""
    # Collapse first, then trim: a leading/trailing run becomes a single space
    # that the final strip removes.
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()


def flatten_sentences(sentences):
    """Strip each sentence and join the results with a single space."""
    trimmed = []
    for sentence in sentences:
        trimmed.append(sentence.strip())
    return " ".join(trimmed)


def load_json_as_df(json_path: Path) -> pd.DataFrame:
    """Load a captions JSON file into a DataFrame with one row per usable entry.

    The file must contain a JSON object mapping ids to items. An item is kept
    only when it is a dict whose "sentences" value is a list of strings; any
    other item is skipped without an error.

    Args:
        json_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        A DataFrame with the columns "id", "json_flat_text" (the sentences
        joined by `flatten_sentences`) and "json_text_norm" (that text passed
        through `normalize_whitespace`). The columns are present even when no
        item qualifies, so the result can always be merged on "id".

    Raises:
        ValueError: If the top-level JSON value is not an object.
    """
    columns = ["id", "json_flat_text", "json_text_norm"]

    with json_path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        raise ValueError(f"{json_path} must be a JSON object of (id, item) dicts")

    rows = []
    for _id, item in data.items():
        if not isinstance(item, dict):
            continue
        sentences = item.get("sentences")
        if not isinstance(sentences, list):
            continue
        if not all(isinstance(s, str) for s in sentences):
            continue
        flat_raw = flatten_sentences(sentences)
        rows.append(
            {
                "id": _id,
                "json_flat_text": flat_raw,
                "json_text_norm": normalize_whitespace(flat_raw),
            }
        )

    # Passing `columns` keeps the schema stable when `rows` is empty; without
    # it the frame would have no columns and a later merge on "id" would fail.
    return pd.DataFrame(rows, columns=columns)

In [4]:
# Read gest.csv, fail fast when an expected column is absent, then add a
# whitespace-normalized copy of the text for the comparison further down.
required_cols = {"dataset", "id", "text", "gest"}
df = pd.read_csv(GEST_CSV, encoding="utf-8-sig")

missing = required_cols.difference(df.columns)
if missing:
    raise ValueError(f"{GEST_CSV} missing required columns: {sorted(missing)}")

# Missing text becomes "" so normalize_whitespace always receives a string.
df = df.assign(text=lambda frame: frame["text"].fillna(""))
df = df.assign(text_norm=lambda frame: frame["text"].map(normalize_whitespace))

print(f"Loaded gest.csv rows: {len(df):,}")

Loaded gest.csv rows: 13,786


In [5]:
# For every captions JSON file that exists: inner-join gest.csv with it on
# "id" and keep the rows whose whitespace-normalized text is equal on both
# sides. The per-file matches are stored in `results` under the file's label.
pairs = [
    ("train.json", TRAIN_JSON),
    ("val_1.json", VAL1_JSON),
    ("val_2.json", VAL2_JSON),
]
results = {}
kept_columns = ["id", "dataset", "gest", "text", "json_flat_text"]

for label, jpath in pairs:
    if jpath.exists():
        jdf = load_json_as_df(jpath)
        merged = pd.merge(df, jdf, on="id", how="inner")
        same_text = merged["text_norm"].eq(merged["json_text_norm"])
        matches = merged.loc[same_text, kept_columns].copy()
        results[label] = matches

        print(f"[{label}] JSON entries: {len(jdf):6d} | Matches: {len(matches):6d}")
    else:
        print(f"[{label}] file not found -> skipped: {jpath}")

[train.json] JSON entries:  10009 | Matches:   9154
[val_1.json] JSON entries:   4917 | Matches:   4486
[val_2.json] JSON entries:   4885 | Matches:    146


In [6]:
# Show the first PREVIEW_ROWS matched rows for each JSON file that produced at
# least one match; a falsy PREVIEW_ROWS skips the preview entirely.
if PREVIEW_ROWS:
    for label, matches in results.items():
        if len(matches) == 0:
            continue
        print(f"\nPreview of common rows for {label} (first {PREVIEW_ROWS}):")
        display(matches.head(PREVIEW_ROWS))


Preview of common rows for train.json (first 3):


Unnamed: 0,id,dataset,gest,text,json_flat_text
0,v_ehGHCYKzyZ8,ActivityNet Captions,"{""actor0"":{""action"":""Exists"",""entities"":[""acto...",The video starts with a title logo sequence. A...,The video starts with a title logo sequence. A...
1,v_fzp5ooc727c,ActivityNet Captions,"{""actor0"":{""action"":""Exists"",""entities"":[""acto...",Two lines of young men are walking side by sid...,Two lines of young men are walking side by sid...
2,v_TmI9MxCDBMw,ActivityNet Captions,"{""actor0"":{""action"":""Exists"",""entities"":[""acto...",A murky river is shown next to a row of trees....,A murky river is shown next to a row of trees....



Preview of common rows for val_1.json (first 3):


Unnamed: 0,id,dataset,gest,text,json_flat_text
0,v_uqiMw7tQ1Cc,ActivityNet Captions,"{""actor0"":{""action"":""Exists"",""entities"":[""acto...",A weight lifting tutorial is given. The coach ...,A weight lifting tutorial is given. The coach ...
1,v_bXdq2zI1Ms0,ActivityNet Captions,"{""actor0"":{""action"":""Exists"",""entities"":[""acto...",A man is seen speaking to the camera and pans ...,A man is seen speaking to the camera and pans ...
2,v_K6Tm5xHkJ5c,ActivityNet Captions,"{""actor0"":{""action"":""Exists"",""entities"":[""acto...",A woman is seen speaking to the camera while h...,A woman is seen speaking to the camera while h...



Preview of common rows for val_2.json (first 3):


Unnamed: 0,id,dataset,gest,text,json_flat_text
4410,v_r-iXUXMP4DY,ActivityNet Captions,"{""actor0"":{""action"":""Exists"",""entities"":[""acto...",A man is wiping sweat from his forehead. He is...,A man is wiping sweat from his forehead. He is...
4411,v_0Zg-7EgFiC8,ActivityNet Captions,"{""actor0"":{""action"":""Exists"",""entities"":[""acto...",A group of people are running around inside a ...,A group of people are running around inside a ...
4412,v_cgPt46YiXNo,ActivityNet Captions,"{""actor0"":{""action"":""Exists"",""entities"":[""acto...",The young man is sitting on a bench. The young...,The young man is sitting on a bench. The young...
