In [10]:
import os, sys, platform, subprocess, torch
!pip -q install --upgrade pip
!pip -q install pyarrow fastparquet polars pandas


Python: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
OS: Linux-6.6.56+-x86_64-with-glibc2.35
CUDA available: True
GPU count: 2
GPU[0]: Tesla T4
GPU[1]: Tesla T4


In [11]:
from pathlib import Path

ASL_DIR = Path("/kaggle/input/asl-fingerspelling")
TRAIN_LANDMARKS_DIR = ASL_DIR / "train_landmarks"
SUPP_LANDMARKS_DIR  = ASL_DIR / "supplemental_landmarks"
TRAIN_CSV           = ASL_DIR / "train.csv"
CHARMAP_JSON        = ASL_DIR / "character_to_prediction_index.json"

print("ASL_DIR exists:", ASL_DIR.exists())
print("TRAIN_LANDMARKS_DIR exists:", TRAIN_LANDMARKS_DIR.exists())
print("SUPP_LANDMARKS_DIR exists:", SUPP_LANDMARKS_DIR.exists())
print("TRAIN_CSV exists:", TRAIN_CSV.exists())
print("CHARMAP_JSON exists:", CHARMAP_JSON.exists())


ASL_DIR exists: True
TRAIN_LANDMARKS_DIR exists: True
SUPP_LANDMARKS_DIR exists: True
TRAIN_CSV exists: True
CHARMAP_JSON exists: True


In [12]:
import pandas as pd, json

df_train = pd.read_csv(TRAIN_CSV)
print(df_train.shape)
df_train.head(3)


(67208, 5)


Unnamed: 0,path,file_id,sequence_id,participant_id,phrase
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier


In [13]:
with open(CHARMAP_JSON, "r") as f:
    char_map = json.load(f)
print("Character map size:", len(char_map))
list(char_map.items())[:5]


Character map size: 59


[(' ', 0), ('!', 1), ('#', 2), ('$', 3), ('%', 4)]

In [14]:
import polars as pl
from glob import glob

train_files = sorted(glob(str(TRAIN_LANDMARKS_DIR / "*.parquet")))[:2]
supp_files  = sorted(glob(str(SUPP_LANDMARKS_DIR  / "*.parquet")))[:2]

print("Sample train_landmarks files:", train_files)
print("Sample supplemental_landmarks files:", supp_files)

if train_files:
    df_landmarks = pl.read_parquet(train_files[0], n_rows=1000)
    print("train_landmarks sample:", df_landmarks.shape)
    print(df_landmarks.head(3))

if supp_files:
    df_supp = pl.read_parquet(supp_files[0], n_rows=1000)
    print("supplemental_landmarks sample:", df_supp.shape)
    print(df_supp.head(3))


Sample train_landmarks files: ['/kaggle/input/asl-fingerspelling/train_landmarks/1019715464.parquet', '/kaggle/input/asl-fingerspelling/train_landmarks/1021040628.parquet']
Sample supplemental_landmarks files: ['/kaggle/input/asl-fingerspelling/supplemental_landmarks/1032110484.parquet', '/kaggle/input/asl-fingerspelling/supplemental_landmarks/1047404576.parquet']
train_landmarks sample: (1000, 1631)
shape: (3, 1_631)
┌───────┬──────────┬──────────┬──────────┬───┬─────────────┬─────────────┬────────────┬────────────┐
│ frame ┆ x_face_0 ┆ x_face_1 ┆ x_face_2 ┆ … ┆ z_right_han ┆ z_right_han ┆ z_right_ha ┆ sequence_i │
│ ---   ┆ ---      ┆ ---      ┆ ---      ┆   ┆ d_18        ┆ d_19        ┆ nd_20      ┆ d          │
│ i16   ┆ f32      ┆ f32      ┆ f32      ┆   ┆ ---         ┆ ---         ┆ ---        ┆ ---        │
│       ┆          ┆          ┆          ┆   ┆ f32         ┆ f32         ┆ f32        ┆ i64        │
╞═══════╪══════════╪══════════╪══════════╪═══╪═════════════╪═════════════

In [None]:
from __future__ import annotations
import os, json, math, re, gc
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np
import polars as pl
import pandas as pd
import warnings; 
warnings.filterwarnings("ignore", category=FutureWarning); 
warnings.filterwarnings("ignore", message="Resolving the schema of a LazyFrame")

DATA_DIR = Path("/kaggle/input/asl-fingerspelling")
TRAIN_LANDMARKS = DATA_DIR / "train_landmarks"
SUPP_LANDMARKS = DATA_DIR / "supplemental_landmarks"
TRAIN_CSV = DATA_DIR / "train.csv"
OUT_DIR = Path("/kaggle/working/asl_phase1")
SEQS_DIR = OUT_DIR / "sequences"
OUT_DIR.mkdir(parents=True, exist_ok=True)
SEQS_DIR.mkdir(parents=True, exist_ok=True)

POSE_KEEP = [11, 12, 13, 14, 15, 16]

HAND_PREFIXES = [
    ("left_hand", list(range(21))),
    ("right_hand", list(range(21))),
]

COL_CACHE: Dict[str, List[str]] = {}

def discover_columns(parquet_path: Path) -> List[str]:
    if parquet_path.suffix.lower() != ".parquet":
        raise ValueError("Expected a .parquet file")
    df_schema = pl.scan_parquet(str(parquet_path)).schema
    return list(df_schema.keys())

_XYZ = ["x", "y", "z"]

def _landmark_cols(prefix: str, idx: int, cols: List[str]) -> List[str]:
    cands = []
    for axis in _XYZ:
        patts = [
            fr"^{axis}_{re.escape(prefix)}_{idx}$",
            fr"^{re.escape(prefix)}_{axis}_{idx}$",
            fr"^{axis}_(?:hand_)?{re.escape(prefix)}_{idx}$",
            fr"^{axis}_{re.escape(prefix)}{idx}$",
        ]
        for p in patts:
            found = [c for c in cols if re.match(p, c)]
            if found:
                cands.append(found[0])
                break
    return cands if len(cands) == 3 else []


def select_feature_columns(cols: List[str]) -> Tuple[List[str], Dict[str, Tuple[str,str,str]]]:
    logical2cols: Dict[str, Tuple[str,str,str]] = {}

    for hand, idxs in HAND_PREFIXES:
        for i in idxs:
            triple = _landmark_cols(hand, i, cols)
            if triple:
                logical2cols[f"{hand}_{i}"] = tuple(triple)  # type: ignore

    for i in POSE_KEEP:
        triple = _landmark_cols("pose", i, cols)
        if triple:
            logical2cols[f"pose_{i}"] = tuple(triple)  # type: ignore

    ordered = [k for k in [*(f"pose_{i}" for i in POSE_KEEP),
                           *(f"left_hand_{i}" for i in range(21)),
                           *(f"right_hand_{i}" for i in range(21))] if k in logical2cols]
    feat_cols = list(itertools.chain.from_iterable([list(logical2cols[k]) for k in ordered]))
    return feat_cols, logical2cols

import itertools

def interpolate_groupwise(df_pd: pd.DataFrame, group_col: str, value_cols: List[str]) -> pd.DataFrame:
    def _interp(g: pd.DataFrame) -> pd.DataFrame:
        g = g.sort_values("frame").copy()
        g[value_cols] = g[value_cols].astype("float32")
        g[value_cols] = g[value_cols].interpolate(method="linear", limit_direction="both", axis=0)
        g[value_cols] = g[value_cols].fillna(method="ffill").fillna(method="bfill")
        return g
    return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


def compute_reference_points(df_pl: pl.DataFrame, logical2cols: Dict[str, Tuple[str,str,str]]):
    has_pose = all(f"pose_{i}" in logical2cols for i in [11,12])
    def xyz_expr(key: str):
        xs, ys, zs = logical2cols[key]
        return pl.struct([
            pl.col(xs).alias("x"),
            pl.col(ys).alias("y"),
            pl.col(zs).alias("z"),
        ])

    if has_pose:
        lsh = xyz_expr("pose_11")
        rsh = xyz_expr("pose_12")
        center = (lsh + rsh) / 2  # struct arithmetic works elementwise in Polars >=0.20
        shoulder_dist = ((lsh.struct.field("x") - rsh.struct.field("x")).pow(2)
                         + (lsh.struct.field("y") - rsh.struct.field("y")).pow(2)
                         + (lsh.struct.field("z") - rsh.struct.field("z")).pow(2)).sqrt()
        ref = df_pl.with_columns([
            center.struct.field("x").alias("cx"),
            center.struct.field("y").alias("cy"),
            center.struct.field("z").alias("cz"),
            shoulder_dist.alias("scale"),
        ])
    else:
        lw = xyz_expr("left_hand_0") if "left_hand_0" in logical2cols else None
        rw = xyz_expr("right_hand_0") if "right_hand_0" in logical2cols else None
        if lw is None and rw is None:
            ref = df_pl.with_columns([
                pl.lit(0.0).alias("cx"), pl.lit(0.0).alias("cy"), pl.lit(0.0).alias("cz"), pl.lit(1e-6).alias("scale")
            ])
        else:
            if lw is None: lw = rw
            if rw is None: rw = lw
            center = (lw + rw) / 2
            hand_span = ((lw.struct.field("x") - rw.struct.field("x")).pow(2)
                         + (lw.struct.field("y") - rw.struct.field("y")).pow(2)
                         + (lw.struct.field("z") - rw.struct.field("z")).pow(2)).sqrt()
            ref = df_pl.with_columns([
                center.struct.field("x").alias("cx"),
                center.struct.field("y").alias("cy"),
                center.struct.field("z").alias("cz"),
                hand_span.alias("scale"),
            ])

    ref = ref.with_columns([
        pl.when(pl.col("scale") < 1e-6).then(1e-6).otherwise(pl.col("scale")).alias("scale")
    ])
    return ref.select(["cx","cy","cz","scale"])  


def normalize_features(df_pl: pl.DataFrame, feat_cols: List[str], logical2cols: Dict[str, Tuple[str,str,str]]):
    ref = compute_reference_points(df_pl, logical2cols)
    df = pl.concat([df_pl, ref], how="horizontal")
    out_cols = []
    for i in range(0, len(feat_cols), 3):
        x, y, z = feat_cols[i:i+3]
        nx = ((pl.col(x) - pl.col("cx")) / pl.col("scale")).alias(f"n_{x}")
        ny = ((pl.col(y) - pl.col("cy")) / pl.col("scale")).alias(f"n_{y}")
        nz = ((pl.col(z) - pl.col("cz")) / pl.col("scale")).alias(f"n_{z}")
        out_cols += [nx, ny, nz]
    norm = df.select(["sequence_id","frame", *out_cols])
    return norm

def list_parquets() -> List[Path]:
    files = []
    if TRAIN_LANDMARKS.exists():
        files += sorted(TRAIN_LANDMARKS.glob("*.parquet"))
    if SUPP_LANDMARKS.exists():
        files += sorted(SUPP_LANDMARKS.glob("*.parquet"))
    return files


def process_parquet(pq_path: Path) -> Dict:
    cols = discover_columns(pq_path)
    meta_cols = [c for c in ("frame","sequence_id","participant_id","signer_id","hand") if c in cols]
    feat_cols, logical2cols = select_feature_columns(cols)
    if not feat_cols:
        return {"ok": False, "path": str(pq_path), "reason": "No matching landmark columns"}

    df = pl.read_parquet(str(pq_path), columns=meta_cols + feat_cols)

    df = df.with_columns([
        pl.all().exclude(["sequence_id","participant_id","signer_id","hand"]).cast(pl.Float32, strict=False)
    ])

    norm = normalize_features(df, feat_cols, logical2cols)

    norm_pd = norm.to_pandas()
    value_cols = [c for c in norm_pd.columns if c not in ("sequence_id","frame")]
    norm_pd = interpolate_groupwise(norm_pd, group_col="sequence_id", value_cols=value_cols)

    groups = norm_pd.groupby("sequence_id")
    rows = []
    for seq_id, g in groups:
        g = g.sort_values("frame")
        frames = g[value_cols].to_numpy(dtype=np.float32)
        fidx = g["frame"].astype(np.int16).to_numpy()
        out_path = SEQS_DIR / f"{seq_id}.npz"
        np.savez_compressed(out_path, frames=frames, frame_index=fidx)
        rows.append({
            "sequence_id": int(seq_id),
            "n_frames": int(frames.shape[0]),
            "n_features": int(frames.shape[1]),
            "npz_path": str(out_path),
            "source": str(pq_path),
        })
    del df, norm, norm_pd
    gc.collect()
    return {"ok": True, "path": str(pq_path), "n_sequences": len(rows), "rows": rows}


def build_manifest(results: List[Dict]):
    all_rows = list(itertools.chain.from_iterable([r.get("rows", []) for r in results if r.get("ok")]))
    man = pd.DataFrame(all_rows)
    man.to_csv(OUT_DIR / "manifest.csv", index=False)
    if all_rows:
        pass
        
def main(max_files: int | None = 10):
    files = list_parquets()
    if max_files is not None:
        files = files[:max_files]
    results = []
    for i, pq in enumerate(files, 1):
        print(f"[{i}/{len(files)}] Processing {pq.name}…")
        res = process_parquet(pq)
        if res.get("ok"):
            print(f"  -> OK: {res['n_sequences']} sequences")
        else:
            print(f"  -> SKIP: {res.get('reason')}")
        results.append(res)
    build_manifest(results)
    print("Done. Manifest at:", OUT_DIR / "manifest.csv")


if __name__ == "__main__":
    main(max_files=None) 


[1/121] Processing 1019715464.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 998 sequences
[2/121] Processing 1021040628.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[3/121] Processing 105143404.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[4/121] Processing 1098899348.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[5/121] Processing 1099408314.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[6/121] Processing 1133664520.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[7/121] Processing 1134756332.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[8/121] Processing 1255240050.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[9/121] Processing 128822441.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[10/121] Processing 1320204318.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[11/121] Processing 1341528257.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 998 sequences
[12/121] Processing 1358493307.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[13/121] Processing 1365275733.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 995 sequences
[14/121] Processing 1365772051.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 997 sequences
[15/121] Processing 1405046009.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[16/121] Processing 1448136004.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[17/121] Processing 1497621680.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 997 sequences
[18/121] Processing 149822653.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[19/121] Processing 152029243.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[20/121] Processing 1552432300.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[21/121] Processing 1557244878.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 997 sequences
[22/121] Processing 1562234637.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 997 sequences
[23/121] Processing 1643479812.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 998 sequences
[24/121] Processing 1647220008.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[25/121] Processing 1662742697.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[26/121] Processing 1664666588.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[27/121] Processing 169560558.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[28/121] Processing 1726141437.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[29/121] Processing 175396851.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[30/121] Processing 1785039512.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[31/121] Processing 1865557033.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[32/121] Processing 1880177496.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[33/121] Processing 1905462118.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 998 sequences
[34/121] Processing 1906357076.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 997 sequences
[35/121] Processing 1920330615.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[36/121] Processing 1967755728.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 998 sequences
[37/121] Processing 1969985709.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[38/121] Processing 1997878546.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 998 sequences
[39/121] Processing 2026717426.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[40/121] Processing 2036580525.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 998 sequences
[41/121] Processing 2072296290.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[42/121] Processing 2072876091.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 997 sequences
[43/121] Processing 2118949241.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 998 sequences
[44/121] Processing 234418913.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[45/121] Processing 296317215.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[46/121] Processing 349393104.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[47/121] Processing 388576474.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 998 sequences
[48/121] Processing 425182931.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 995 sequences
[49/121] Processing 433948159.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[50/121] Processing 450474571.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 287 sequences
[51/121] Processing 474255203.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[52/121] Processing 495378749.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 998 sequences
[53/121] Processing 522550314.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[54/121] Processing 527708222.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[55/121] Processing 532011803.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[56/121] Processing 5414471.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[57/121] Processing 546816846.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[58/121] Processing 566963657.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[59/121] Processing 568753759.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 998 sequences
[60/121] Processing 614661748.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 996 sequences
[61/121] Processing 638508439.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[62/121] Processing 649779897.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[63/121] Processing 654436541.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[64/121] Processing 683666742.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[65/121] Processing 871280215.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[66/121] Processing 882979387.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 997 sequences
[67/121] Processing 933868835.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[68/121] Processing 939623093.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[69/121] Processing 1032110484.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[70/121] Processing 1047404576.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[71/121] Processing 1112747136.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[72/121] Processing 1118603411.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[73/121] Processing 1144115867.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[74/121] Processing 1176508147.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[75/121] Processing 1249944812.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 999 sequences
[76/121] Processing 1279694894.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[77/121] Processing 131312512.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[78/121] Processing 1407656790.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[79/121] Processing 1471096258.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[80/121] Processing 1471341722.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[81/121] Processing 1505488209.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[82/121] Processing 1579345709.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[83/121] Processing 1624527344.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[84/121] Processing 1650637630.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 958 sequences
[85/121] Processing 1682915129.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[86/121] Processing 1727438550.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[87/121] Processing 1755047076.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[88/121] Processing 1756773911.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[89/121] Processing 1779786322.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[90/121] Processing 1857374937.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[91/121] Processing 1881515495.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[92/121] Processing 193950599.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[93/121] Processing 2057261717.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[94/121] Processing 2100073719.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[95/121] Processing 236903981.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[96/121] Processing 285528514.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[97/121] Processing 293101677.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[98/121] Processing 333606065.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[99/121] Processing 33432165.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[100/121] Processing 369584223.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[101/121] Processing 371169664.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[102/121] Processing 440362090.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[103/121] Processing 442061898.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[104/121] Processing 471766624.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[105/121] Processing 595441814.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[106/121] Processing 597469033.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[107/121] Processing 636900267.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[108/121] Processing 639454452.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[109/121] Processing 676340265.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[110/121] Processing 680303484.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[111/121] Processing 697480828.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[112/121] Processing 716508881.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[113/121] Processing 736978972.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[114/121] Processing 756566775.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[115/121] Processing 775880548.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[116/121] Processing 778903889.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


  -> OK: 1000 sequences
[117/121] Processing 86446671.parquet…


  return df_pd.groupby(group_col, as_index=False, group_keys=False).apply(_interp)


In [78]:
from pathlib import Path
import pandas as pd, numpy as np, os, json

ASL_DIR  = Path("/kaggle/input/asl-fingerspelling")
OUT_DIR  = Path("/kaggle/working/asl_phase1")
SEQS_DIR = OUT_DIR / "sequences"
TRAIN_CSV    = ASL_DIR / "train.csv"

assert SEQS_DIR.exists(), f"{SEQS_DIR} missing — run Phase 1 exporter first"
assert TRAIN_CSV.exists(), f"{TRAIN_CSV} missing"

df_train = pd.read_csv(TRAIN_CSV)[["sequence_id","phrase"]]
df_train["sequence_id"] = df_train["sequence_id"].astype(int)

rows = []
npz_files = sorted(SEQS_DIR.glob("*.npz"))
print(f"Found {len(npz_files)} sequence artifacts under {SEQS_DIR}")

for f in npz_files:
    s = f.stem
    if not s.isdigit():
        continue
    sid = int(s)
    try:
        with np.load(f, allow_pickle=False) as z:
            arr = z["frames"]
            T, F = int(arr.shape[0]), int(arr.shape[1])
    except Exception as e:
        T, F = np.nan, np.nan
    rows.append({"sequence_id": sid, "npz_path": str(f), "n_frames": T, "n_features": F})

man_all = pd.DataFrame(rows).drop_duplicates("sequence_id").reset_index(drop=True)
man_all = man_all.merge(df_train, on="sequence_id", how="left")
man_all["has_label"] = man_all["phrase"].notna()

OUT_DIR.mkdir(parents=True, exist_ok=True)
man_all.to_csv(OUT_DIR / "manifest_all.csv", index=False)

man = man_all[man_all["has_label"]].copy()
man = man.drop(columns=["has_label"]).reset_index(drop=True)
man.to_csv(OUT_DIR / "manifest.csv", index=False)

print(f"manifest_all.csv: {len(man_all)} rows  (labeled: {man_all['has_label'].sum()})")
print(f"manifest.csv (labeled only): {len(man)} rows  -> {OUT_DIR/'manifest.csv'}")


Found 120165 sequence artifacts under /kaggle/working/asl_phase1/sequences
✅ manifest_all.csv: 120165 rows  (labeled: 67208)
✅ manifest.csv (labeled only): 67208 rows  -> /kaggle/working/asl_phase1/manifest.csv


In [79]:
import os, math, time, json, random
from pathlib import Path
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.set_float32_matmul_precision("medium")  # enable TF32 where available
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ASL_DIR  = Path("/kaggle/input/asl-fingerspelling")
OUT_DIR  = Path("/kaggle/working/asl_phase1")
SEQS_DIR = OUT_DIR / "sequences"
MANIFEST_ALL = OUT_DIR / "manifest_all.csv"
MANIFEST = OUT_DIR / "manifest.csv"
CHARMAP_JSON = ASL_DIR / "character_to_prediction_index.json"

assert MANIFEST.exists(), "Run the manifest rebuild cell first."


In [80]:
with open(CHARMAP_JSON, "r") as f:
    base_char2idx = json.load(f)

PAD, BOS, EOS = "<pad>", "<bos>", "<eos>"

has_pad = (PAD in base_char2idx) or (0 in set(base_char2idx.values()))
if has_pad and (PAD in base_char2idx):
    char2idx = dict(base_char2idx)
    char2idx[PAD] = base_char2idx[PAD]
    start_max = max(char2idx.values())
else:
    min_id = min(base_char2idx.values())
    shift = 1 if min_id == 0 else 0
    char2idx = {c: i+shift for c, i in base_char2idx.items()}
    char2idx[PAD] = 0
    start_max = max(char2idx.values())

if BOS not in char2idx: char2idx[BOS] = start_max + 1; start_max += 1
if EOS not in char2idx: char2idx[EOS] = start_max + 1

idx2char = {i: c for c, i in char2idx.items()}
vocab_size = max(char2idx.values()) + 1

def text_to_ids(s: str) -> List[int]:
    return [char2idx[BOS]] + [char2idx[c] for c in s.lower() if c in char2idx] + [char2idx[EOS]]

def ids_to_text(ids: List[int]) -> str:
    return "".join(idx2char[i] for i in ids if i not in (char2idx[PAD], char2idx[BOS], char2idx[EOS]))


In [81]:
df = pd.read_csv(MANIFEST)
need = {"sequence_id","npz_path","n_frames","n_features","phrase"}
missing = need - set(df.columns)
assert not missing, f"Manifest missing columns: {missing}"

missing_mask = ~df["npz_path"].map(os.path.exists)
if missing_mask.any():
    print(f"Dropping {int(missing_mask.sum())} rows with missing files")
    df = df[~missing_mask].reset_index(drop=True)

df = df.sort_values("n_frames").reset_index(drop=True)
lens = df["n_frames"].to_numpy()
quant = np.quantile(lens, [0.2, 0.4, 0.6, 0.8]) if len(df) > 10 else [0,0,0,0]
def bucket(l):
    return int(l > quant[0]) + int(l > quant[1]) + int(l > quant[2]) + int(l > quant[3])
df["len_bucket"] = [bucket(l) for l in lens]

rng = np.random.default_rng(42)
train_idx, val_idx = [], []
for _, grp in df.groupby("len_bucket", group_keys=False):
    idx = grp.index.to_numpy()
    rng.shuffle(idx)
    cut = max(1, int(0.90 * len(idx)))
    train_idx += idx[:cut].tolist()
    val_idx   += idx[cut:].tolist()

@dataclass
class Sample:
    npz_path: str
    n_frames: int
    phrase: str
    seq_id: int

class ASLFingerSeqDataset(torch.utils.data.Dataset):
    def __init__(self, frame_rows: pd.DataFrame):
        self.rows = [
            Sample(r["npz_path"], int(r["n_frames"]), str(r["phrase"]), int(r["sequence_id"]))
            for _, r in frame_rows.iterrows()
        ]
    def __len__(self): return len(self.rows)
    def __getitem__(self, i: int):
        r = self.rows[i]; p = r.npz_path
        with np.load(p, allow_pickle=False) as z:
            frames = z["frames"]  # (T, F)
        if frames.ndim != 2:
            raise RuntimeError(f"Bad ndim={frames.ndim} in {p}")
        frames = frames.astype(np.float32, copy=False)
        if not np.isfinite(frames).all():
            np.nan_to_num(frames, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
        frames = np.clip(frames, -1e6, 1e6, out=frames)
        frames_t = torch.from_numpy(frames).contiguous()
        tgt_ids  = torch.as_tensor(text_to_ids(r.phrase), dtype=torch.long)
        return frames_t, tgt_ids, r.n_frames, r.seq_id

ds_train = ASLFingerSeqDataset(df.loc[train_idx])
ds_val   = ASLFingerSeqDataset(df.loc[val_idx])

class BucketedBatchSampler(torch.utils.data.Sampler):
    def __init__(self, lengths, batch_size, shuffle=True):
        self.batch_size = batch_size; self.shuffle = shuffle
        order = np.argsort(lengths)
        if shuffle:
            chunks = np.array_split(order, max(1, len(order)//(batch_size*20)))
            rng2 = np.random.default_rng(0)
            order = np.concatenate([rng2.choice(c, size=len(c), replace=False) for c in chunks if len(c)>0])
        self.batches = [order[i:i+batch_size] for i in range(0, len(order), batch_size)]
    def __iter__(self):
        if self.shuffle:
            rng3 = np.random.default_rng(1)
            rng3.shuffle(self.batches)
        return iter(self.batches)
    def __len__(self): return len(self.batches)

def collate_fn(batch):
    frames, tgts, lens, seq_ids = zip(*batch)
    B = len(batch); Fdim = frames[0].shape[1]
    T = max(x.shape[0] for x in frames)
    L = max(y.shape[0] for y in tgts)

    xpad = torch.zeros(B, T, Fdim, dtype=torch.float32)
    xmask = torch.ones(B, T, dtype=torch.bool)
    for i, x in enumerate(frames):
        t = x.shape[0]
        xpad[i, :t] = x
        xmask[i, :t] = False

    ypad = torch.full((B, L), fill_value=char2idx[PAD], dtype=torch.long)
    ymask = torch.ones(B, L, dtype=torch.bool)
    for i, y in enumerate(tgts):
        l = y.shape[0]
        ypad[i, :l] = y
        ymask[i, :l] = False

    return xpad, xmask, ypad, ymask, torch.tensor(lens), torch.tensor(seq_ids)

BASE_BATCH = 64
n_features = int(df["n_features"].dropna().iloc[0])

In [82]:
import multiprocessing as mp
world_size = torch.cuda.device_count() if torch.cuda.is_available() else 1

BATCH_SIZE = max(1, BASE_BATCH * world_size)  

train_sampler = BucketedBatchSampler(df.loc[train_idx, "n_frames"].to_numpy(), BATCH_SIZE, shuffle=True)
val_sampler   = BucketedBatchSampler(df.loc[val_idx,   "n_frames"].to_numpy(), BATCH_SIZE, shuffle=False)

num_workers     = 4
use_persistent  = True
prefetch_factor = 4
pin_mem         = True
timeout_seconds = 0  
mp_ctx = mp.get_context("fork")  

dl_train = torch.utils.data.DataLoader(
    ds_train, batch_sampler=train_sampler,
    num_workers=num_workers, pin_memory=pin_mem, collate_fn=collate_fn,
    persistent_workers=use_persistent, prefetch_factor=prefetch_factor,
    timeout=timeout_seconds, multiprocessing_context=mp_ctx
)
dl_val = torch.utils.data.DataLoader(
    ds_val, batch_sampler=val_sampler,
    num_workers=num_workers, pin_memory=pin_mem, collate_fn=collate_fn,
    persistent_workers=use_persistent, prefetch_factor=prefetch_factor,
    timeout=timeout_seconds, multiprocessing_context=mp_ctx
)

print(f"Workers={num_workers}, prefetch={prefetch_factor}, pin_memory={pin_mem}, start='fork', GPUs={world_size}")

for _ in range(3):
    _ = next(iter(dl_train))
print("DataLoader warmup complete")


Workers=4, prefetch=4, pin_memory=True, start='fork', GPUs=2
✅ DataLoader warmup complete


In [83]:
class SinusoidalPositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_len: int = 4000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe)
    def forward(self, x: torch.Tensor):
        T = x.size(1)
        return x + self.pe[:T].unsqueeze(0)

class Landmark2TextTransformer(nn.Module):
    def __init__(self, in_feat: int, vocab_size: int,
                 d_model=512, nhead=8, num_enc=6, num_dec=6, d_ff=2048, dropout=0.1):  # bigger model
        super().__init__()
        self.input_proj = nn.Linear(in_feat, d_model)
        self.pos_enc_in = SinusoidalPositionalEncoding(d_model)
        self.embed_out  = nn.Embedding(vocab_size, d_model, padding_idx=char2idx[PAD])
        self.pos_enc_out = SinusoidalPositionalEncoding(d_model)
        self.tf = nn.Transformer(
            d_model=d_model, nhead=nhead,
            num_encoder_layers=num_enc, num_decoder_layers=num_dec,
            dim_feedforward=d_ff, dropout=dropout, batch_first=True
        )
        self.norm = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)
    def forward(self, src, src_key_padding_mask, tgt, tgt_key_padding_mask):
        src = self.pos_enc_in(self.input_proj(src))
        tgt = self.pos_enc_out(self.embed_out(tgt))
        L = tgt.size(1)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(L, device=tgt.device)
        out = self.tf(src=src, tgt=tgt,
                      src_key_padding_mask=src_key_padding_mask,
                      tgt_key_padding_mask=tgt_key_padding_mask,
                      tgt_mask=tgt_mask)
        out = self.norm(out)
        return self.head(out)

model = Landmark2TextTransformer(in_feat=n_features, vocab_size=vocab_size).to(DEVICE)

if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs via DataParallel")
    model = torch.nn.DataParallel(model)


Using 2 GPUs via DataParallel


In [84]:
def _levenshtein(a: List[int], b: List[int]) -> int:
    n, m = len(a), len(b)
    dp = list(range(m+1))
    for i in range(1, n+1):
        prev, dp[0] = dp[0], i
        for j in range(1, m+1):
            prev, dp[j] = dp[j], min(
                dp[j] + 1,
                dp[j-1] + 1,
                prev + (a[i-1] != b[j-1])
            )
    return dp[m]

def cer(ref_texts: List[str], hyp_texts: List[str]) -> float:
    edits = total = 0
    for r, h in zip(ref_texts, hyp_texts):
        r_ids = [ord(c) for c in r]
        h_ids = [ord(c) for c in h]
        edits += _levenshtein(r_ids, h_ids)
        total += max(1, len(r_ids))
    return edits / total

EPOCHS = 30                        
LR     = 2e-4                     
criterion = nn.CrossEntropyLoss(ignore_index=char2idx[PAD], label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01, betas=(0.9, 0.98))
steps_per_epoch = len(dl_train)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LR, epochs=EPOCHS, steps_per_epoch=steps_per_epoch)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

def greedy_decode(model, src, src_mask, max_len=128):
    model.eval()
    B = src.size(0)
    ys = torch.full((B, 1), fill_value=char2idx[BOS], dtype=torch.long, device=src.device)
    with torch.no_grad():
        for _ in range(max_len-1):
            ymask = torch.zeros_like(ys, dtype=torch.bool, device=src.device)
            logits = model(src, src_mask, ys, ymask)
            next_tok = logits[:, -1].argmax(-1, keepdim=True)
            ys = torch.cat([ys, next_tok], dim=1)
            if torch.all(next_tok.squeeze(-1) == char2idx[EOS]):
                break
    return ys

def run_eval(model, dl):
    model.eval()
    refs, hyps = [], []
    with torch.no_grad():
        for xb, xmask, yb, ymask, _, _ in dl:
            xb, xmask = xb.to(DEVICE), xmask.to(DEVICE)
            yb = yb.to(DEVICE)
            preds = greedy_decode(model, xb, xmask, max_len=yb.size(1)+10)
            pred_txt = []
            for row in preds.tolist():
                s = [t for t in row if t not in (char2idx[PAD], char2idx[BOS])]
                if char2idx[EOS] in s: s = s[:s.index(char2idx[EOS])]
                pred_txt.append(ids_to_text(s))
            gold_txt = []
            for row in yb.tolist():
                s = [t for t in row if t not in (char2idx[PAD], char2idx[BOS], char2idx[EOS])]
                gold_txt.append(ids_to_text(s))
            refs.extend(gold_txt); hyps.extend(pred_txt)
    return cer(refs, hyps)


In [85]:
GRAD_ACCUM = 1
CLIP_NORM  = 1.0
SAVE_BEST  = OUT_DIR / "model_phase2_best.pt"

print("GPUs:", torch.cuda.device_count(), "| len(ds_train) =", len(ds_train), "| len(dl_train) =", len(dl_train), "| BATCH_SIZE =", BATCH_SIZE)
print("approx samples/epoch =", len(dl_train) * BATCH_SIZE)

best_cer = float("inf")
for epoch in range(1, EPOCHS+1):
    model.train()
    t0 = time.time(); total_loss = 0.0

    for step, (xb, xmask, yb, ymask, _, _) in enumerate(dl_train, start=1):
        xb, xmask = xb.to(DEVICE, non_blocking=True), xmask.to(DEVICE, non_blocking=True)
        yb, ymask = yb.to(DEVICE, non_blocking=True), ymask.to(DEVICE, non_blocking=True)

        y_in, y_tgt = yb[:, :-1], yb[:, 1:]
        ymask_in    = ymask[:, :-1]

        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            logits = model(xb, xmask, y_in, ymask_in)
            loss = criterion(logits.reshape(-1, logits.size(-1)), y_tgt.reshape(-1))

        scaler.scale(loss / GRAD_ACCUM).backward()
        if step % GRAD_ACCUM == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
            scaler.step(optimizer); scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        total_loss += loss.item()

    val_cer = run_eval(model, dl_val)
    took = time.time() - t0
    print(f"Epoch {epoch:02d} | train_loss={total_loss/len(dl_train):.4f} | val_CER={val_cer:.4f} | time={took/60:.2f} min")

    to_save = model.module if isinstance(model, torch.nn.DataParallel) else model
    if val_cer < best_cer:
        best_cer = val_cer
        torch.save({"model": to_save.state_dict(),
                    "char2idx": char2idx,
                    "idx2char": idx2char,
                    "config": {"in_feat": n_features, "vocab_size": vocab_size}}, SAVE_BEST)
        print(f"  ✓ New best CER {best_cer:.4f}. Saved -> {SAVE_BEST}")


GPUs: 2 | len(ds_train) = 60485 | len(dl_train) = 473 | BATCH_SIZE = 128
approx samples/epoch = 60544




Epoch 01 | train_loss=3.1788 | val_CER=0.9386 | time=5.34 min
  ✓ New best CER 0.9386. Saved -> /kaggle/working/asl_phase1/model_phase2_best.pt
Epoch 02 | train_loss=2.8424 | val_CER=1.0225 | time=6.49 min
Epoch 03 | train_loss=2.7291 | val_CER=1.0275 | time=7.43 min
Epoch 04 | train_loss=2.6391 | val_CER=0.9655 | time=7.34 min
Epoch 05 | train_loss=2.5728 | val_CER=0.9854 | time=7.62 min
Epoch 06 | train_loss=2.5188 | val_CER=0.9434 | time=7.98 min
Epoch 07 | train_loss=2.4664 | val_CER=0.9440 | time=7.89 min
Epoch 08 | train_loss=2.4051 | val_CER=0.9273 | time=7.86 min
  ✓ New best CER 0.9273. Saved -> /kaggle/working/asl_phase1/model_phase2_best.pt
Epoch 09 | train_loss=2.2999 | val_CER=0.8089 | time=8.32 min
  ✓ New best CER 0.8089. Saved -> /kaggle/working/asl_phase1/model_phase2_best.pt
Epoch 10 | train_loss=2.1381 | val_CER=0.7239 | time=8.16 min
  ✓ New best CER 0.7239. Saved -> /kaggle/working/asl_phase1/model_phase2_best.pt
Epoch 11 | train_loss=1.9683 | val_CER=0.6763 | time

In [92]:
import os, math, time, collections
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
OUT_DIR = Path("/kaggle/working/asl_phase1")
CKPT = OUT_DIR / "model_phase2_best.pt"
assert CKPT.exists(), f"Missing checkpoint at {CKPT}; finish Phase 2 training first."

if 'char2idx' not in globals() or 'idx2char' not in globals():
    blob = torch.load(CKPT, map_location="cpu")
    char2idx = blob["char2idx"]
    idx2char = blob["idx2char"]

PAD = next((k for k,v in char2idx.items() if v == 0), "<pad>")
if PAD not in char2idx: char2idx[PAD] = 0
if "<bos>" not in char2idx:
    char2idx["<bos>"] = max(char2idx.values()) + 1
if "<eos>" not in char2idx:
    char2idx["<eos>"] = max(char2idx.values()) + 1
idx2char = {i:c for c,i in char2idx.items()}
PAD_ID, BOS_ID, EOS_ID = char2idx[PAD], char2idx["<bos>"], char2idx["<eos>"]

if 'Landmark2TextTransformer' not in globals():
    class SinusoidalPositionalEncoding(nn.Module):
        def __init__(self, d_model: int, max_len: int = 4000):
            super().__init__()
            pe = torch.zeros(max_len, d_model)
            pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
            div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
            pe[:, 0::2] = torch.sin(pos * div)
            pe[:, 1::2] = torch.cos(pos * div)
            self.register_buffer("pe", pe)
        def forward(self, x: torch.Tensor):
            T = x.size(1)
            return x + self.pe[:T].unsqueeze(0)

    class Landmark2TextTransformer(nn.Module):
        def __init__(self, in_feat: int, vocab_size: int,
                     d_model=512, nhead=8, num_enc=6, num_dec=6, d_ff=2048, dropout=0.1):
            super().__init__()
            self.input_proj = nn.Linear(in_feat, d_model)
            self.pos_enc_in = SinusoidalPositionalEncoding(d_model)
            self.embed_out  = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
            self.pos_enc_out = SinusoidalPositionalEncoding(d_model)
            self.tf = nn.Transformer(
                d_model=d_model, nhead=nhead,
                num_encoder_layers=num_enc, num_decoder_layers=num_dec,
                dim_feedforward=d_ff, dropout=dropout, batch_first=True
            )
            self.norm = nn.LayerNorm(d_model)
            self.head = nn.Linear(d_model, vocab_size)
        def forward(self, src, src_key_padding_mask, tgt, tgt_key_padding_mask):
            src = self.pos_enc_in(self.input_proj(src))
            tgt = self.pos_enc_out(self.embed_out(tgt))
            L = tgt.size(1)
            tgt_mask = nn.Transformer.generate_square_subsequent_mask(L, device=tgt.device)
            out = self.tf(src=src, tgt=tgt,
                          src_key_padding_mask=src_key_padding_mask,
                          tgt_key_padding_mask=tgt_key_padding_mask,
                          tgt_mask=tgt_mask)
            out = self.norm(out)
            return self.head(out)

blob = torch.load(CKPT, map_location="cpu")
cfg = blob.get("config", {})
in_feat = cfg.get("in_feat", None)
vocab_size = cfg.get("vocab_size", len(char2idx))
assert in_feat is not None, "Checkpoint missing in_feat; was it saved?"

model = Landmark2TextTransformer(in_feat=in_feat, vocab_size=vocab_size).to(DEVICE)
model.load_state_dict(blob["model"], strict=True)
model.eval()
print("✓ Loaded Phase 2 best checkpoint:", CKPT)
print("in_feat =", in_feat, "| vocab_size =", vocab_size, "| device =", DEVICE)


✓ Loaded Phase 2 best checkpoint: /kaggle/working/asl_phase1/model_phase2_best.pt
in_feat = 144 | vocab_size = 62 | device = cuda


In [93]:
import cv2
import mediapipe as mp
mp_hands = mp.solutions.hands
mp_pose  = mp.solutions.pose

POSE_KEEP = [11, 12, 13, 14, 15, 16]
LH_IDX = list(range(21))
RH_IDX = list(range(21))

def _safe_arr3(v):
    try:
        return np.array([float(v.x), float(v.y), float(getattr(v, 'z', 0.0))], dtype=np.float32)
    except Exception:
        return None

def extract_landmarks(frame_bgr):
    img = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    img.flags.writeable = False
    pose_results = extract_landmarks.pose.process(img)
    hands_results = extract_landmarks.hands.process(img)
    img.flags.writeable = True

    out = {'pose': {}, 'left': {}, 'right': {}}
    if pose_results.pose_landmarks:
        plms = pose_results.pose_landmarks.landmark
        for i in POSE_KEEP:
            v = _safe_arr3(plms[i])
            if v is not None:
                out['pose'][i] = v

    if hands_results.multi_hand_landmarks and hands_results.multi_handedness:
        for hand_lms, handed in zip(hands_results.multi_hand_landmarks, hands_results.multi_handedness):
            label = handed.classification[0].label.lower()  # 'left' or 'right'
            pts = hand_lms.landmark
            for j in range(21):
                v = _safe_arr3(pts[j])
                if v is not None:
                    out[label][j] = v
    return out

extract_landmarks.pose  = mp_pose.Pose(static_image_mode=False, model_complexity=1, enable_segmentation=False)
extract_landmarks.hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5, min_tracking_confidence=0.5)

def normalize_frame(det):
    have_pose = (11 in det['pose']) and (12 in det['pose'])
    if have_pose:
        lsh, rsh = det['pose'][11], det['pose'][12]
        center = (lsh + rsh) / 2.0
        scale = np.linalg.norm(lsh - rsh)
    else:
        lw = det['left'].get(0, None)
        rw = det['right'].get(0, None)
        if lw is None and rw is None:
            center = np.zeros(3, dtype=np.float32)
            scale  = 1e-6
        else:
            if lw is None: lw = rw
            if rw is None: rw = lw
            center = (lw + rw) / 2.0
            scale  = np.linalg.norm(lw - rw)
    scale = max(scale, 1e-6)

    feats = []
    for i in POSE_KEEP:
        xyz = det['pose'].get(i, np.zeros(3, dtype=np.float32))
        n = (xyz - center) / scale
        feats.extend(n.tolist())
    for j in LH_IDX:
        xyz = det['left'].get(j, np.zeros(3, dtype=np.float32))
        n = (xyz - center) / scale
        feats.extend(n.tolist())
    for j in RH_IDX:
        xyz = det['right'].get(j, np.zeros(3, dtype=np.float32))
        n = (xyz - center) / scale
        feats.extend(n.tolist())

    x = np.asarray(feats, dtype=np.float32)
    np.nan_to_num(x, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
    np.clip(x, -1e6, 1e6, out=x)
    return x

EXPECTED_IN_FEAT = (len(POSE_KEEP) + len(LH_IDX) + len(RH_IDX)) * 3
assert EXPECTED_IN_FEAT == in_feat, f"Feature dim mismatch: expected {EXPECTED_IN_FEAT} from MP, ckpt needs {in_feat}"
print("✓ MediaPipe feature dim OK =", EXPECTED_IN_FEAT)


✓ MediaPipe feature dim OK = 144


W0000 00:00:1760541486.358401  131159 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1760541486.390062  131159 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1760541486.420030  131156 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1760541486.482399  131156 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [94]:
from collections import deque

class TemporalSmoother:
    """EMA/Moving-average smoothing over next-token probabilities to reduce jitter."""
    def __init__(self, vocab_size, k=5, ema=False, alpha=0.4):
        self.vocab_size = vocab_size
        self.k = k
        self.ema = ema
        self.alpha = alpha
        self.buf = deque(maxlen=k)
        self._ema_vec = None

    def update(self, probs: torch.Tensor) -> np.ndarray:
        p = probs.detach().cpu().float().numpy()
        if self.ema:
            if self._ema_vec is None:
                self._ema_vec = p.copy()
            else:
                self._ema_vec = self.alpha * p + (1 - self.alpha) * self._ema_vec
            return self._ema_vec
        else:
            self.buf.append(p)
            return np.mean(self.buf, axis=0) if len(self.buf) else p

class RealTimeDecoder:
    def __init__(self, model, bos_id, eos_id, pad_id, window=64, smooth_k=5, commit_thresh=0.60, commit_patience=3):
        self.model = model
        self.window = window
        self.queue = deque(maxlen=window)
        self.bos_id, self.eos_id, self.pad_id = bos_id, eos_id, pad_id
        self.smoother = TemporalSmoother(vocab_size=vocab_size, k=smooth_k)
        self.commit_thresh = commit_thresh
        self.commit_patience = commit_patience
        self._cand_id = None
        self._cand_count = 0
        self.committed = []  

    def reset_text(self):
        self.committed.clear()
        self._cand_id, self._cand_count = None, 0
        self.smoother.buf.clear()

    def push_frame(self, x_feat: np.ndarray):
        assert x_feat.shape[0] == in_feat
        self.queue.append(x_feat)

    @torch.no_grad()
    def step(self):
        if len(self.queue) < 4:
            return None, "".join(idx2char.get(t, "") for t in self.committed)

        frames = np.stack(self.queue, axis=0)
        src = torch.from_numpy(frames).unsqueeze(0).to(DEVICE)  
        src_mask = torch.zeros((1, src.size(1)), dtype=torch.bool, device=DEVICE)  

        ys = torch.tensor([[self.bos_id] + self.committed], dtype=torch.long, device=DEVICE)
        ymask = torch.zeros_like(ys, dtype=torch.bool, device=DEVICE)

        logits = self.model(src, src_mask, ys, ymask)     
        next_logits = logits[0, -1]                       
        probs = F.softmax(next_logits, dim=-1)            
        smoothed = self.smoother.update(probs)           

        top_id = int(np.argmax(smoothed))
        top_p  = float(smoothed[top_id])

        if top_id == self._cand_id:
            self._cand_count += 1
        else:
            self._cand_id = top_id
            self._cand_count = 1

        committed_changed = False
        if top_id not in (self.pad_id, self.bos_id) and top_p >= self.commit_thresh and self._cand_count >= self.commit_patience:
            if not self.committed or self.committed[-1] != top_id:
                if top_id == self.eos_id:
                    pass
                else:
                    self.committed.append(top_id)
                    committed_changed = True
            self._cand_count = 0

        text = "".join(idx2char.get(t, "") for t in self.committed if t not in (self.pad_id, self.bos_id, self.eos_id))
        return (top_id, top_p, committed_changed), text

decoder = RealTimeDecoder(model, BOS_ID, EOS_ID, PAD_ID, window=64, smooth_k=5, commit_thresh=0.60, commit_patience=3)
print("✓ RealTimeDecoder ready (window=64, smoothing K=5, thresh=0.60, patience=3)")


✓ RealTimeDecoder ready (window=64, smoothing K=5, thresh=0.60, patience=3)


In [95]:
VIDEO_PATH = 0  
FONT = cv2.FONT_HERSHEY_SIMPLEX

def run_realtime(source=VIDEO_PATH, flip=True, target_fps=20):
    cap = cv2.VideoCapture(source)
    if not cap.isOpened():
        raise RuntimeError("Could not open video source. Set VIDEO_PATH to a video file path.")

    decoder.reset_text()
    last = time.time()
    frame_interval = 1.0 / max(1, target_fps)

    try:
        while True:
            ok, frame = cap.read()
            if not ok: break
            if flip: frame = cv2.flip(frame, 1)

            det = extract_landmarks(frame)
            x = normalize_frame(det)
            decoder.push_frame(x)
            _, text = decoder.step()

            cv2.putText(frame, text, (20, 40), FONT, 1.0, (0, 255, 0), 2, cv2.LINE_AA)

            now = time.time()
            if now - last >= frame_interval:
                last = now
                cv2.imshow("ASL Fingerspelling (Phase 3 - smoothed)", frame)
            if cv2.waitKey(1) & 0xFF == 27:  # ESC
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

print("✓ Live loop ready. Call run_realtime() to test (webcam) or run_realtime('/path/to/video.mp4').")


✓ Live loop ready. Call run_realtime() to test (webcam) or run_realtime('/path/to/video.mp4').


In [96]:
def infer_on_image(bgr_img, warmup=32):
    decoder.reset_text()
    det = extract_landmarks(bgr_img)
    x = normalize_frame(det)
    for _ in range(warmup):
        decoder.push_frame(x)
        decoder.step()
    _, text = decoder.step()
    return text

print("✓ Single-frame function ready: infer_on_image(bgr_img)")


✓ Single-frame function ready: infer_on_image(bgr_img)
