<a href="https://colab.research.google.com/github/SattamAltwaim/SaSOKE/blob/main/notebooks/5_text_to_sign_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Text-to-Sign Language Inference
Generate sign language from custom text input using SOKE model.


## 1. Setup Environment


In [1]:
# Clone repo if not present
import os
if not os.path.exists('/content/SaSOKE'):
    !git clone https://github.com/SattamAltwaim/SaSOKE.git
%cd /content/SaSOKE

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

drive_data = '/content/drive/MyDrive/GraduationProject/CodeFiles/SaSOKE'
print("✓ Code:", os.getcwd())
print("✓ Data:", drive_data)


Cloning into 'SaSOKE'...
remote: Enumerating objects: 364, done.[K
remote: Counting objects: 100% (364/364), done.[K
remote: Compressing objects: 100% (270/270), done.[K
remote: Total 364 (delta 113), reused 327 (delta 87), pack-reused 0 (from 0)[K
Receiving objects: 100% (364/364), 2.43 MiB | 12.38 MiB/s, done.
Resolving deltas: 100% (113/113), done.
/content/SaSOKE
Mounted at /content/drive
✓ Code: /content/SaSOKE
✓ Data: /content/drive/MyDrive/GraduationProject/CodeFiles/SaSOKE


In [2]:
# Install dependencies (if needed)
# Install dependencies
%pip install -q pytorch_lightning torchmetrics omegaconf shortuuid transformers diffusers einops wandb rich matplotlib
%pip install -q smplx h5py scikit-image spacy ftfy more-itertools natsort tensorboard sentencepiece
%pip install -q gdown pandas



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/831.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m737.3/831.6 kB[0m [31m22.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m831.6/831.6 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

## 2. Verify GPU


In [3]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠️ No GPU detected! Go to Runtime → Change runtime type → GPU")


CUDA available: True
GPU: NVIDIA A100-SXM4-40GB
GPU Memory: 42.47 GB


## 3. Enter Your Custom Text


In [4]:
# Enter your text here - you can modify this!
custom_texts = [
    "Hello, how are you today?",
    "Thank you for your help.",
    "I am learning sign language."
]

# Or enter a single text
# custom_texts = ["Your custom text here"]

print("Input texts:")
for i, text in enumerate(custom_texts, 1):
    print(f"{i}. {text}")


Input texts:
1. Hello, how are you today?
2. Thank you for your help.
3. I am learning sign language.


## 4. Run Inference on Your Text


In [5]:
# Configuration and Argument Parsing (Run this cell ONLY ONCE per runtime session)
import sys
import yaml
import os
from omegaconf import OmegaConf
from mGPT.config import parse_args

# Define the expected path and the actual path in Google Drive
drive_data = '/content/drive/MyDrive/GraduationProject/CodeFiles/SaSOKE' # Ensure drive_data is defined
expected_smpl_path = 'deps/smpl_models'
actual_smpl_path = f'{drive_data}/deps/smpl_models'

# Create a symbolic link if it doesn't exist
if not os.path.exists(expected_smpl_path):
    print(f"Creating symbolic link from '{expected_smpl_path}' to '{actual_smpl_path}'")
    # Ensure the parent directory for the symlink exists
    os.makedirs(os.path.dirname(expected_smpl_path), exist_ok=True)
    os.symlink(actual_smpl_path, expected_smpl_path)
    print("Symbolic link created.")
else:
    print(f"Symbolic link or directory already exists at '{expected_smpl_path}'")

# Configure paths
with open('configs/soke.yaml', 'r') as f:
    config = yaml.safe_load(f)

config['ACCELERATOR'] = 'gpu'
config['DEVICE'] = [0]
config['DATASET']['H2S']['ROOT'] = f'{drive_data}/data/How2Sign'
config['DATASET']['H2S']['MEAN_PATH'] = f'{drive_data}/smpl-x/mean.pt'
config['DATASET']['H2S']['STD_PATH'] = f'{drive_data}/smpl-x/std.pt'
config['TRAIN']['PRETRAINED_VAE'] = f'{drive_data}/checkpoints/vae/tokenizer.ckpt'

with open('configs/text_inference.yaml', 'w') as f:
    yaml.dump(config, f)

# Update assets with the correct path from Google Drive
with open('configs/assets.yaml', 'r') as f:
    assets = yaml.safe_load(f)

# Use the expected_smpl_path which is now a symlink to the drive path
assets['RENDER']['SMPL_MODEL_PATH'] = os.path.join(expected_smpl_path, 'smpl')
assets['RENDER']['MODEL_PATH'] = expected_smpl_path
assets['METRIC']['TM2T']['t2m_path'] = f'{drive_data}/deps/t2m/t2m/'

with open('configs/assets_inference.yaml', 'w') as f:
    yaml.dump(assets, f)

# Parse config (This is where the resolver is registered/used)
sys.argv = ['', '--cfg', 'configs/text_inference.yaml', '--cfg_assets', 'configs/assets_inference.yaml']

# Check if 'eval' resolver is already registered before registering (still good practice)
if not OmegaConf.has_resolver("eval"):
    OmegaConf.register_new_resolver("eval", eval)

cfg = parse_args(phase="test")
cfg.FOLDER = cfg.TEST.FOLDER

print("✓ Configuration and arguments parsed!")

Creating symbolic link from 'deps/smpl_models' to '/content/drive/MyDrive/GraduationProject/CodeFiles/SaSOKE/deps/smpl_models'
Symbolic link created.


ValueError: resolver 'eval' is already registered

In [None]:
# Model Setup and Loading (You can rerun this cell as needed)
import torch
import pytorch_lightning as pl
from mGPT.models.build_model import build_model
from mGPT.data.build_data import build_data
from mGPT.utils.load_checkpoint import load_pretrained_vae, load_pretrained
from mGPT.utils.logger import create_logger

# Assuming 'cfg' and 'drive_data' are defined from the previous cell

# Seed
pl.seed_everything(cfg.SEED_VALUE)

# Build data and model
print("Loading model...")
datamodule = build_data(cfg)
model = build_model(cfg, datamodule)

# Load checkpoints
logger = create_logger(cfg, phase="test")
if cfg.TRAIN.PRETRAINED_VAE:
    load_pretrained_vae(cfg, model, logger)

# Check for trained checkpoint
ckpt_path = f'{drive_data}/experiments/mgpt/SOKE/checkpoints/last.ckpt'
if os.path.exists(ckpt_path):
    print(f"Loading trained checkpoint from {ckpt_path}")
    cfg.TEST.CHECKPOINTS = ckpt_path
    load_pretrained(cfg, model, logger, phase="test")
else:
    print("Using pretrained mBART (no fine-tuned checkpoint found)")

model = model.cuda()
model.eval()

print("✓ Model ready!")

In [None]:
# Helper function to convert features to SMPL-X parameters
def feats_to_smplx(features, mean, std):
    """Convert 133-dim compressed features to SMPL-X parameters."""
    # Denormalize features
    features = features * std + mean

    # Add zero root pose (36 dims) to get 169 dims total
    T = features.shape[0]
    zero_pose = torch.zeros(T, 36).to(features)
    features_full = torch.cat([zero_pose, features], dim=-1)  # (T, 169)

    # Extract SMPL-X parameters
    smplx_params = {
        'root_pose': features_full[:, 0:3].cpu().numpy(),
        'body_pose': features_full[:, 3:66].cpu().numpy(),
        'lhand_pose': features_full[:, 66:111].cpu().numpy(),
        'rhand_pose': features_full[:, 111:156].cpu().numpy(),
        'jaw_pose': features_full[:, 156:159].cpu().numpy(),
        'expression': features_full[:, 159:169].cpu().numpy(),
    }
    return smplx_params

# Generate sign language poses
output_dir = 'text_sign_results'
os.makedirs(output_dir, exist_ok=True)
print(f"\nGenerating sign language for {len(custom_texts)} text(s)....\n")

# Get mean and std for denormalization
mean = datamodule.hparams.mean.cuda()
std = datamodule.hparams.std.cuda()

with torch.no_grad():
    for idx, text in enumerate(custom_texts):
        print(f"[{idx+1}/{len(custom_texts)}] Processing: '{text}'")

        # Prepare input
        batch = {
            'text': [text]
        }

        try:
            # Generate FULL SEQUENCE
            output = model.t2m_eval(batch)

            # Extract features
            feats = output['feats'][0] if 'feats' in output else None

            if feats is None:
                print(f"  ✗ No features generated")
                continue

            # Convert to SMPL-X parameters (full sequence)
            smplx_params = feats_to_smplx(feats, mean, std)

            # Save result (NO TOKENS, only SMPL-X params)
            filename = f"text_{idx+1}.pkl"
            filepath = os.path.join(output_dir, filename)

            result = {
                'text': text,
                'smplx_params': smplx_params,  # Full sequence of SMPL-X poses
                'num_frames': smplx_params['body_pose'].shape[0]
            }

            with open(filepath, 'wb') as f:
                pickle.dump(result, f)

            print(f"  ✓ Saved: {filepath}")
            print(f"    - Frames: {result['num_frames']}")
            print(f"    - SMPL-X parameters saved (no tokens)")

        except Exception as e:
            print(f"  ✗ Error: {e}")
            import traceback
            traceback.print_exc()
            continue

print(f"\nComplete! Predictions saved in '{output_dir}'")
print(f"\nTo play the animations, download results and use:")
print(f"  python3 generate_animation_html.py text_sign_results/text_1.pkl")

## 5. View Results


In [None]:
# List generated files
print("Generated predictions:")
!ls -lh {output_dir}

# Load and display results
pkl_files = sorted([f for f in os.listdir(output_dir) if f.endswith('.pkl')])

for pkl_file in pkl_files:
    filepath = os.path.join(output_dir, pkl_file)

    with open(filepath, 'rb') as f:
        result = pickle.load(f)

    print(f"\n{pkl_file}:")
    print(f"  Text: {result['text']}")
    print(f"  Frames: {result['num_frames']}")

    # Display SMPL-X parameters info
    if result.get('smplx_params') is not None:
        smplx = result['smplx_params']
        print(f"  SMPL-X Parameters:")
        print(f"- root_pose: {smplx['root_pose'].shape} (global orientation)")
        print(f"- body_pose: {smplx['body_pose'].shape} (21 body joints × 3)")
        print(f"- lhand_pose: {smplx['lhand_pose'].shape} (15 left hand joints × 3)")
        print(f"- rhand_pose: {smplx['rhand_pose'].shape} (15 right hand joints × 3)")
        print(f"- jaw_pose: {smplx['jaw_pose'].shape} (jaw rotation)")
        print(f"- expression: {smplx['expression'].shape} (facial expression)")

## 6. Download Results


In [None]:
# Zip results for easy download
!zip -r text_sign_results.zip {output_dir}/

# Download
from google.colab import files
files.download('text_sign_results.zip')

print("✓ Results packaged and ready to download")


## Notes

- **GPU Required**: Make sure you're using a GPU runtime (Runtime → Change runtime type → GPU → T4/V100/A100)
- **First Time**: Run notebook 1 first to download all dependencies to your Google Drive
- **Custom Text**: Simply modify the `custom_texts` list in cell 8 with your own text
- **Output**: Each text generates a `.pkl` file containing predicted sign language poses (3D coordinates)
- **Format**: Poses are in SMPL-X format and can be visualized using 3D animation tools

### Troubleshooting
- **OOM Error**: Reduce text length or batch size
- **Missing files**: Make sure notebook 1 was run successfully to download models
- **Slow generation**: Normal on T4 GPU, faster on V100/A100
