<a href="https://colab.research.google.com/github/SattamAltwaim/SaSOKE/blob/DHM-tech/notebooks/3_train_soke(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SOKE Stage 2: Train Sign Language Generator
Trains the mBART-based multilingual sign language generator using tokenized poses.


In [1]:
# Clone repo if not present
import os
if not os.path.exists('/content/SaSOKE'):
    !git clone https://github.com/SattamAltwaim/SaSOKE.git
    %cd /content/SaSOKE
    !git checkout DHM-tech
else:
    %cd /content/SaSOKE
    !git checkout DHM-tech
    !git pull origin DHM-tech

# Install dependencies
!pip install -r requirements_colab.txt

# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

drive_data = '/content/drive/MyDrive/GraduationProject/CodeFiles/SaSOKE'

# Link dependencies from Drive
!rm -rf deps checkpoints smpl-x
!ln -sf {drive_data}/deps deps
!ln -sf {drive_data}/smpl-x smpl-x
!ln -sf {drive_data}/checkpoints checkpoints

print("✓ Setup complete!")
print("Code:", os.getcwd())
print("Data:", drive_data)

Cloning into 'SaSOKE'...
remote: Enumerating objects: 506, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 506 (delta 4), reused 0 (delta 0), pack-reused 495 (from 1)[K
Receiving objects: 100% (506/506), 2.47 MiB | 13.99 MiB/s, done.
Resolving deltas: 100% (219/219), done.
/content/SaSOKE
Branch 'DHM-tech' set up to track remote branch 'DHM-tech' from 'origin'.
Switched to a new branch 'DHM-tech'
Collecting pytorch-lightning>=2.0.0 (from -r requirements_colab.txt (line 3))
  Downloading pytorch_lightning-2.5.6-py3-none-any.whl.metadata (20 kB)
Collecting torchmetrics>=1.0.0 (from -r requirements_colab.txt (line 4))
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting shortuuid>=1.0.0 (from -r requirements_colab.txt (line 16))
  Downloading shortuuid-1.0.13-py3-none-any.whl.metadata (5.8 kB)
Collecting ftfy>=6.1.0 (from -r requirements_colab.txt (line 20))
  Downloading ftfy-6.3.1-py3-n

## Prerequisites
Ensure tokenizer is trained or pretrained checkpoint exists at `checkpoints/vae/tokenizer.ckpt`


In [4]:
# Verify tokenizer checkpoint in Drive
assert os.path.exists(f'{drive_data}/checkpoints/vae/tokenizer.ckpt'), "Tokenizer not found in Drive!"
print("Tokenizer checkpoint found in Drive")

Tokenizer checkpoint found in Drive


## Configuration Setup


In [5]:
# Update config for Colab/CUDA
import yaml

with open('configs/soke.yaml', 'r') as f:
    config = yaml.safe_load(f)

# GPU settings
config['ACCELERATOR'] = 'gpu'
config['DEVICE'] = [0]

# Point to Drive for data/models
config['DATASET']['H2S']['ROOT'] = f'{drive_data}/data/How2Sign'
config['DATASET']['H2S']['MEAN_PATH'] = f'{drive_data}/smpl-x/mean.pt'
config['DATASET']['H2S']['STD_PATH'] = f'{drive_data}/smpl-x/std.pt'

# Model paths in Drive
config['TRAIN']['PRETRAINED_VAE'] = f'{drive_data}/checkpoints/vae/tokenizer.ckpt'
config['model']['params']['lm_path'] = f'{drive_data}/deps/mbart-h2s-csl-phoenix'

# Training settings
config['TRAIN']['NUM_WORKERS'] = 2
config['TRAIN']['BATCH_SIZE'] = 16

# Save config
with open('configs/soke_colab.yaml', 'w') as f:
    yaml.dump(config, f)

print("Config updated - GitHub code + Drive data")


Config updated - GitHub code + Drive data


In [7]:
# ============================================
# CRITICAL: Generate GloVe Word Embeddings
# ============================================

import os
import numpy as np
import pickle
from os.path import join as pjoin

glove_dir = "/content/drive/MyDrive/GraduationProject/CodeFiles/SaSOKE/deps/t2m/glove/"
os.makedirs(glove_dir, exist_ok=True)

print("Step 1: Downloading GloVe embeddings...")
# Download GloVe 6B.300d
!wget -q --show-progress http://nlp.stanford.edu/data/glove.6B.zip -O /content/glove.6B.zip

print("\nStep 2: Extracting...")
import zipfile
with zipfile.ZipFile("/content/glove.6B.zip", 'r') as zip_ref:
    zip_ref.extract("glove.6B.300d.txt", "/content/")

print("\nStep 3: Preparing vocabulary for How2Sign dataset...")

# Read the text annotations from How2Sign to build vocabulary
# This creates the word-to-index mapping
from collections import Counter
import json

# Get vocabulary from your How2Sign dataset
h2s_root = f"{drive_data}/data/How2Sign"
vocab_words = set()

# Scan through your dataset to collect all words
for split in ['train', 'val', 'test']:
    split_file = f"{h2s_root}/how2sign_{split}.txt"
    if os.path.exists(split_file):
        with open(split_file, 'r') as f:
            for line in f:
                words = line.strip().lower().split()
                vocab_words.update(words)
        print(f"  Loaded {split} vocabulary")

# Add special tokens
special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
vocab_words.update(special_tokens)

print(f"\nTotal vocabulary size: {len(vocab_words)}")

# Load GloVe embeddings
print("\nStep 4: Loading GloVe vectors...")
glove_file = "/content/glove.6B.300d.txt"
glove_embeddings = {}

with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        if word in vocab_words:
            vector = np.asarray(values[1:], dtype='float32')
            glove_embeddings[word] = vector

print(f"Found {len(glove_embeddings)} words in GloVe")

# Create word-to-index mapping
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocab_words))}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Create embedding matrix
embedding_dim = 300
vocab_size = len(vocab_words)
embedding_matrix = np.random.randn(vocab_size, embedding_dim).astype(np.float32) * 0.01

# Fill in GloVe vectors where available
for word, idx in word_to_idx.items():
    if word in glove_embeddings:
        embedding_matrix[idx] = glove_embeddings[word]

print("\nStep 5: Saving files...")

# Save the files with the correct names
np.save(pjoin(glove_dir, 'our_vab_data.npy'), embedding_matrix)
with open(pjoin(glove_dir, 'our_vab_idx.pkl'), 'wb') as f:
    pickle.dump(word_to_idx, f)
with open(pjoin(glove_dir, 'our_vab_words.pkl'), 'wb') as f:
    pickle.dump(idx_to_word, f)

# Also copy the raw GloVe file
!cp /content/glove.6B.300d.txt {glove_dir}/glove.6B.300d.txt

print("\n✓ GloVe files generated successfully!")
print(f"  - Vocabulary size: {vocab_size}")
print(f"  - Embedding dim: {embedding_dim}")
print(f"  - Matrix shape: {embedding_matrix.shape}")

# Verify the files
data = np.load(pjoin(glove_dir, 'our_vab_data.npy'))
print(f"\n✓ Verification: our_vab_data.npy shape = {data.shape}")
print(f"  File size: {os.path.getsize(pjoin(glove_dir, 'our_vab_data.npy')) / 1024:.2f} KB")

Step 1: Downloading GloVe embeddings...

Step 2: Extracting...

Step 3: Preparing vocabulary for How2Sign dataset...

Total vocabulary size: 4

Step 4: Loading GloVe vectors...
Found 0 words in GloVe

Step 5: Saving files...

✓ GloVe files generated successfully!
  - Vocabulary size: 4
  - Embedding dim: 300
  - Matrix shape: (4, 300)

✓ Verification: our_vab_data.npy shape = (4, 300)
  File size: 4.81 KB


In [12]:
# ============================================
# FIX: Regenerate GloVe with correct format
# ============================================

import os
import numpy as np
import pickle
from os.path import join as pjoin

glove_dir = "/content/drive/MyDrive/GraduationProject/CodeFiles/SaSOKE/deps/t2m/glove/"

print("Reading existing GloVe file...")
glove_file = f"{glove_dir}/glove.6B.300d.txt"

# Load GloVe vectors
glove_dict = {}
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_dict[word] = vector

print(f"Loaded {len(glove_dict)} GloVe vectors")

# Create vocabulary - use top 5000 most common words
vocab_size = 5000
words = list(glove_dict.keys())[:vocab_size]

# Add special tokens at the beginning
special_tokens = ['sos/OTHER', 'eos/OTHER', 'unk/OTHER']
words = special_tokens + words

print(f"Total vocabulary: {len(words)} words")

# Create the correct format:
# word2idx: maps word -> index (integer)
# idx2word: maps index (integer) -> word
word2idx = {word: idx for idx, word in enumerate(words)}
idx2word = {idx: word for idx, word in enumerate(words)}

# Create embedding matrix
embedding_dim = 300
vocab_len = len(words)
embedding_matrix = np.zeros((vocab_len, embedding_dim), dtype=np.float32)

# Fill embeddings
for idx, word in enumerate(words):
    if word in glove_dict:
        embedding_matrix[idx] = glove_dict[word]
    else:
        # Random init for special tokens
        embedding_matrix[idx] = np.random.randn(embedding_dim).astype(np.float32) * 0.01

print("\nSaving corrected vocabulary files...")

# Save with correct structure
np.save(pjoin(glove_dir, 'our_vab_data.npy'), embedding_matrix)

with open(pjoin(glove_dir, 'our_vab_idx.pkl'), 'wb') as f:
    pickle.dump(word2idx, f)  # word -> idx mapping

with open(pjoin(glove_dir, 'our_vab_words.pkl'), 'wb') as f:
    pickle.dump(idx2word, f)  # idx -> word mapping

print("\n✅ Fixed vocabulary structure!")
print(f"  - Vocabulary size: {vocab_len}")
print(f"  - Embedding shape: {embedding_matrix.shape}")
print(f"  - word2idx type: {type(word2idx)}")
print(f"  - Sample word2idx: {list(word2idx.items())[:5]}")
print(f"  - idx2word type: {type(idx2word)}")
print(f"  - Sample idx2word: {list(idx2word.items())[:5]}")

# Verify
data = np.load(pjoin(glove_dir, 'our_vab_data.npy'))
with open(pjoin(glove_dir, 'our_vab_idx.pkl'), 'rb') as f:
    loaded_w2i = pickle.load(f)
with open(pjoin(glove_dir, 'our_vab_words.pkl'), 'rb') as f:
    loaded_i2w = pickle.load(f)

print(f"\n✓ Verification successful!")
print(f"  - Embedding data shape: {data.shape}")
print(f"  - word2idx has {len(loaded_w2i)} entries")
print(f"  - idx2word has {len(loaded_i2w)} entries")

print("\n✅ Ready to train! Run the training command again.")

Reading existing GloVe file...
Loaded 400000 GloVe vectors
Total vocabulary: 5003 words

Saving corrected vocabulary files...

✅ Fixed vocabulary structure!
  - Vocabulary size: 5003
  - Embedding shape: (5003, 300)
  - word2idx type: <class 'dict'>
  - Sample word2idx: [('sos/OTHER', 0), ('eos/OTHER', 1), ('unk/OTHER', 2), ('the', 3), (',', 4)]
  - idx2word type: <class 'dict'>
  - Sample idx2word: [(0, 'sos/OTHER'), (1, 'eos/OTHER'), (2, 'unk/OTHER'), (3, 'the'), (4, ',')]

✓ Verification successful!
  - Embedding data shape: (5003, 300)
  - word2idx has 5003 entries
  - idx2word has 5003 entries

✅ Ready to train! Run the training command again.


## Train SOKE Model


In [13]:
# Start training
!python -m train --cfg configs/soke_colab.yaml --nodebug

2025-11-27 16:19:56,883 SEED_VALUE: 1234
DEBUG: false
FULL_CONFIG: false
PRECISION: null
TRAIN:
  SPLIT: train
  NUM_WORKERS: 2
  BATCH_SIZE: 16
  END_EPOCH: 150
  RESUME: ''
  PRETRAINED_VAE: /content/drive/MyDrive/GraduationProject/CodeFiles/SaSOKE/checkpoints/vae/tokenizer.ckpt
  PRETRAINED: ''
  OPTIM:
    target: AdamW
    params:
      lr: 0.0002
      betas:
      - 0.9
      - 0.99
      weight_decay: 0.0
  LR_SCHEDULER:
    target: CosineAnnealingLR
    params:
      T_max: ${TRAIN.END_EPOCH}
      eta_min: 1.0e-06
  STAGE: lm_pretrain
EVAL:
  SPLIT: val
  BATCH_SIZE: 1
  NUM_WORKERS: 16
TEST:
  CHECKPOINTS: null
  SPLIT: test
  BATCH_SIZE: 1
  NUM_WORKERS: 16
  SAVE_PREDICTIONS: true
  COUNT_TIME: false
  REPLICATION_TIMES: 1
  REP_I: 0
  FOLDER: results
model:
  target: mGPT.models.mgpt.MotionGPT
  params:
    condition: text
    task: t2m
    lm: ${lm.mbart_h2s_csl_phoenix}
    motion_vae: ${vq.re96}
    stage: ${TRAIN.STAGE}
    debug: ${DEBUG}
    codebook_size: ${model.p

## Monitor Training


In [None]:
# Load tensorboard
%load_ext tensorboard
%tensorboard --logdir experiments/mgpt/SOKE/


## Test Model
Run inference after training completes.


In [None]:
# Run inference on test set
!python -m test --cfg configs/soke_colab.yaml --task t2m
