<a href="https://colab.research.google.com/github/Siddamsetti-Venkata-Pavan/DeepLearning/blob/main/sign_language.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers sentencepiece datasets opencv-python moviepy tqdm

In [None]:
import torch, sys
# Report which accelerator (if any) PyTorch can see in this runtime.
gpu_ready = torch.cuda.is_available()
if not gpu_ready:
    print("No GPU available. Make sure you selected a GPU runtime (Runtime -> Change runtime type).")
else:
    print("CUDA available. Device:", torch.cuda.get_device_name(0))


CUDA available. Device: Tesla T4


In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# folder is looked up under this mount point in the next cell.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Locate the dataset folder on the mounted Drive by trying a few known spellings.
DRIVE_ROOT = "/content/drive/MyDrive"
# change this name if your folder is named differently
PREFERRED_FOLDER_NAMES = ["sign language project", "sign_language_project", "SignLanguageProject", "Sign Language Project"]

# The first candidate that exists as a directory wins; None when nothing matches.
found = next(
    (
        path
        for path in (os.path.join(DRIVE_ROOT, name) for name in PREFERRED_FOLDER_NAMES)
        if os.path.isdir(path)
    ),
    None,
)

if found is not None:
    DATA_ROOT = found
    print("\nFound dataset folder at:", DATA_ROOT)
    print("Contents (first 50 entries):")
    print(os.listdir(DATA_ROOT)[:50])
else:
    # list top-level drive folders to help you find the folder name
    print("\nCould not auto-find your dataset folder under MyDrive.")
    print("List of folders in /content/drive/MyDrive/:")
    print(os.listdir(DRIVE_ROOT))
    print("\nPlease note the exact folder name and set DATA_ROOT variable accordingly in the next cell.")
    DATA_ROOT = None


Found dataset folder at: /content/drive/MyDrive/sign language project
Contents (first 50 entries):
['how2sign_realigned_test.csv', 'how2sign_realigned_train.csv', 'how2sign_realigned_val.csv', 'train_rgb_front_clips.zip', 'test_rgb_front_clips.zip', 'val_rgb_front_clips.zip']


In [None]:
# Runtime-local working directory (on the Colab VM, not on Drive); the clip
# archives are extracted underneath it by a later cell.
WORK_DIR = "/content/SignLanguageProject"
os.makedirs(WORK_DIR, exist_ok=True)  # exist_ok: re-running the cell is harmless
print("\nWorking extraction directory (runtime):", WORK_DIR)


Working extraction directory (runtime): /content/SignLanguageProject


In [None]:
import pandas as pd

# Sanity-check the dataset folder: show its contents and peek at one annotation file.
print("DATA_ROOT =", DATA_ROOT)
print("\nFiles inside dataset folder:")
all_files = os.listdir(DATA_ROOT)
for entry in all_files:
    print(" -", entry)

# Annotation files are recognised by extension (case-insensitive).
csv_files = [entry for entry in all_files if entry.lower().endswith(".csv")]
print("\nCSV files found:", csv_files)

# Preview the first annotation file; despite the .csv suffix it is parsed as tab-separated.
if not csv_files:
    print("⚠️ No CSV files found.")
else:
    first_csv = csv_files[0]
    sample_csv_path = os.path.join(DATA_ROOT, first_csv)
    df = pd.read_csv(sample_csv_path, sep="\t")
    print("\nPreview of", first_csv)
    print(df.head())


DATA_ROOT = /content/drive/MyDrive/sign language project

Files inside dataset folder:
 - how2sign_realigned_test.csv
 - how2sign_realigned_train.csv
 - how2sign_realigned_val.csv
 - train_rgb_front_clips.zip
 - test_rgb_front_clips.zip
 - val_rgb_front_clips.zip

CSV files found: ['how2sign_realigned_test.csv', 'how2sign_realigned_train.csv', 'how2sign_realigned_val.csv']

Preview of how2sign_realigned_test.csv
      VIDEO_ID               VIDEO_NAME    SENTENCE_ID  \
0  -fZc293MpJk  -fZc293MpJk-1-rgb_front  -fZc293MpJk_0   
1  -fZc293MpJk  -fZc293MpJk-1-rgb_front  -fZc293MpJk_2   
2  -fZc293MpJk  -fZc293MpJk-1-rgb_front  -fZc293MpJk_3   
3  -fZc293MpJk  -fZc293MpJk-1-rgb_front  -fZc293MpJk_4   
4  -fZc293MpJk  -fZc293MpJk-1-rgb_front  -fZc293MpJk_5   

               SENTENCE_NAME  START_REALIGNED  END_REALIGNED  \
0  -fZc293MpJk_0-1-rgb_front             0.26           6.79   
1  -fZc293MpJk_2-1-rgb_front             7.27          20.30   
2  -fZc293MpJk_3-1-rgb_front            21.

In [None]:
import shutil
import zipfile

# Define paths: one clip archive per split, stored in the Drive dataset folder.
zip_files = {
    "train": os.path.join(DATA_ROOT, "train_rgb_front_clips.zip"),
    "val": os.path.join(DATA_ROOT, "val_rgb_front_clips.zip"),
    "test": os.path.join(DATA_ROOT, "test_rgb_front_clips.zip"),
}

# Each split is unpacked into its own "<split>_clips" directory under WORK_DIR.
extract_paths = {
    split: os.path.join(WORK_DIR, f"{split}_clips")
    for split in zip_files.keys()
}

# Extract if not already done.
# The archive is unpacked into a "<out_dir>.partial" staging directory and only
# renamed to its final name once extraction has finished. Without this, an
# interrupted run would leave a half-filled out_dir behind that every later run
# would mistake for a complete extraction.
for split, zip_path in zip_files.items():
    out_dir = extract_paths[split]
    if os.path.exists(out_dir):
        print(f"{split} data already extracted at {out_dir}")
        continue

    print(f"Extracting {split} data...")
    staging_dir = out_dir + ".partial"
    shutil.rmtree(staging_dir, ignore_errors=True)  # drop leftovers of an interrupted run
    with zipfile.ZipFile(zip_path, "r") as zf:
        zf.extractall(staging_dir)
    os.rename(staging_dir, out_dir)  # publish the finished extraction

# Verify extraction
for split, out_dir in extract_paths.items():
    print(f"\nListing first 10 files in {split} set:")
    print(os.listdir(out_dir)[:10])

train data already extracted at /content/SignLanguageProject/train_clips
val data already extracted at /content/SignLanguageProject/val_clips
test data already extracted at /content/SignLanguageProject/test_clips

Listing first 10 files in train set:
['raw_videos']

Listing first 10 files in val set:
['raw_videos']

Listing first 10 files in test set:
['raw_videos']


In [None]:
import pandas as pd

# Path to the CSVs (one annotation file per split, stored next to the archives)
csv_paths = {
    "train": os.path.join(DATA_ROOT, "how2sign_realigned_train.csv"),
    "val": os.path.join(DATA_ROOT, "how2sign_realigned_val.csv"),
    "test": os.path.join(DATA_ROOT, "how2sign_realigned_test.csv"),
}

# Load train CSV (tab-separated file)
train_df = pd.read_csv(csv_paths["train"], sep="\t")

print("Train CSV columns:", train_df.columns)
print("\nSample rows from train.csv:")
print(train_df.head())

# Path to extracted train raw videos
train_videos_path = os.path.join(WORK_DIR, "train_clips/raw_videos")

# Check if SENTENCE_NAME matches files in raw_videos
sample_files = train_df["SENTENCE_NAME"].head(10).tolist()
print("\nFirst 10 SENTENCE_NAME entries from CSV:")
print(sample_files)
# Add .mp4 extension to SENTENCE_NAME
sample_files_mp4 = [f + ".mp4" for f in sample_files]

# List the clip directory once and test membership against a set, instead of
# re-listing the whole directory for every candidate file name.
train_video_files = set(os.listdir(train_videos_path))
existing_mp4 = [f for f in sample_files_mp4 if f in train_video_files]

print("Checking with .mp4 extension...")
print("Found matching files:", existing_mp4)


Train CSV columns: Index(['VIDEO_ID', 'VIDEO_NAME', 'SENTENCE_ID', 'SENTENCE_NAME',
       'START_REALIGNED', 'END_REALIGNED', 'SENTENCE'],
      dtype='object')

Sample rows from train.csv:
      VIDEO_ID               VIDEO_NAME     SENTENCE_ID  \
0  --7E2sU6zP4  --7E2sU6zP4-5-rgb_front  --7E2sU6zP4_10   
1  --7E2sU6zP4  --7E2sU6zP4-5-rgb_front  --7E2sU6zP4_11   
2  --7E2sU6zP4  --7E2sU6zP4-5-rgb_front  --7E2sU6zP4_12   
3  --7E2sU6zP4  --7E2sU6zP4-5-rgb_front  --7E2sU6zP4_13   
4  --7E2sU6zP4  --7E2sU6zP4-5-rgb_front   --7E2sU6zP4_5   

                SENTENCE_NAME  START_REALIGNED  END_REALIGNED  \
0  --7E2sU6zP4_10-5-rgb_front           129.06         142.48   
1  --7E2sU6zP4_11-5-rgb_front           142.49         169.40   
2  --7E2sU6zP4_12-5-rgb_front           169.45         182.57   
3  --7E2sU6zP4_13-5-rgb_front           183.12         189.01   
4   --7E2sU6zP4_5-5-rgb_front            55.95          65.19   

                                            SENTENCE  
0  And I

In [None]:
import os
import pandas as pd

# Paths
# This cell relies on names created by earlier cells:
#   DATA_ROOT     - defined in cell vZHQnbTRQCCa
#   extract_paths - defined in cell KPrYw2LZQCKe
#   WORK_DIR      - defined in cell ctGuOLUAQCFC

# Annotation file for each split (all live directly under DATA_ROOT)
train_csv = os.path.join(DATA_ROOT, "how2sign_realigned_train.csv")
val_csv   = os.path.join(DATA_ROOT, "how2sign_realigned_val.csv")
test_csv  = os.path.join(DATA_ROOT, "how2sign_realigned_test.csv")

# Load CSVs (using sep='\t' as seen in cell Wkk58kYiQCHh)
def load_tsv(path):
    """Read a tab-separated annotation file into a DataFrame of at most 7 columns.

    If the file has more than 7 columns, everything from the 7th column onward
    is joined with single spaces into the ``SENTENCE`` column and the surplus
    columns are dropped.

    Missing cells are skipped while joining. Converting them with ``astype(str)``
    would put the literal text "nan" into the sentence of every row that does
    not use all overflow columns.

    Args:
        path: Path of the tab-separated file to read.

    Returns:
        pandas.DataFrame with the file's first 7 columns (or fewer, if the file
        has fewer).
    """
    df = pd.read_csv(path, sep="\t", quotechar='"', engine="python")

    # If extra columns exist beyond the expected 7, merge them into SENTENCE
    if df.shape[1] > 7:
        overflow = df.iloc[:, 6:]
        df['SENTENCE'] = [
            ' '.join(str(cell) for cell in cells if pd.notna(cell))
            for cells in overflow.itertuples(index=False, name=None)
        ]
        df = df.iloc[:, :7]  # keep only the first 7 cols

    return df

# Read the three annotation files in train / val / test order.
df_train, df_val, df_test = (
    load_tsv(split_csv) for split_csv in (train_csv, val_csv, test_csv)
)

# Function to link CSV with actual video paths
def link_videos(df, split, videos_dir=None):
    """Pair each annotated sentence with its clip file, keeping only clips that exist.

    Args:
        df: Annotation frame with ``SENTENCE_NAME`` and ``SENTENCE`` columns.
        split: Split label written to the ``split`` column of the result. When
            ``videos_dir`` is not given it is also the key used to look up the
            extraction directory in the notebook-level ``extract_paths``.
        videos_dir: Optional directory holding the ``<SENTENCE_NAME>.mp4``
            files. Defaults to ``<extract_paths[split]>/raw_videos``.

    Returns:
        pandas.DataFrame with columns ``video_path``, ``sentence`` and ``split``,
        one row per annotation whose clip file exists on disk.
    """
    if videos_dir is None:
        # Construct the raw video directory for the split from the extraction map
        videos_dir = os.path.join(extract_paths[split], "raw_videos")

    video_paths, sentences = [], []
    # Walk the two needed columns directly; iterrows() would build a Series per row.
    for name, sentence in zip(df["SENTENCE_NAME"], df["SENTENCE"]):
        fpath = os.path.join(videos_dir, f"{name}.mp4")
        if os.path.exists(fpath):   # only keep valid files
            video_paths.append(fpath)
            sentences.append(sentence)
    return pd.DataFrame({"video_path": video_paths, "sentence": sentences, "split": split})

# Build the per-split index frames (train, val, test order).
split_tables = {"train": df_train, "val": df_val, "test": df_test}
train_data, val_data, test_data = (
    link_videos(table, split_name) for split_name, table in split_tables.items()
)

# Stack the three splits into one frame with a fresh 0..n-1 index.
dataset_index = pd.concat([train_data, val_data, test_data], ignore_index=True)

print("Dataset index created ✅")
display(dataset_index.head(10))  # rich table rendering in the notebook
print("Total samples:", len(dataset_index))
print("Train:", len(train_data), "Val:", len(val_data), "Test:", len(test_data))

Dataset index created ✅


Unnamed: 0,video_path,sentence,split
0,/content/SignLanguageProject/train_clips/raw_v...,And I call them decorative elements because ba...,train
1,/content/SignLanguageProject/train_clips/raw_v...,So they don't really have much of a symbolic m...,train
2,/content/SignLanguageProject/train_clips/raw_v...,"Now this is very, this is actually an insert o...",train
3,/content/SignLanguageProject/train_clips/raw_v...,"This is all the you know, take off on the idea...",train
4,/content/SignLanguageProject/train_clips/raw_v...,It's almost has a feathery like posture to it.,train
5,/content/SignLanguageProject/train_clips/raw_v...,"And so, it's used in architecture as a decorat...",train
6,/content/SignLanguageProject/train_clips/raw_v...,And so what's happened with the idea of acanth...,train
7,/content/SignLanguageProject/train_clips/raw_v...,And it has been wildly colored so you can look...,train
8,/content/SignLanguageProject/train_clips/raw_v...,"Here, actually, I have some samples of traced ...",train
9,/content/SignLanguageProject/train_clips/raw_v...,Hi.,train


Total samples: 35129
Train: 31047 Val: 1739 Test: 2343


In [None]:
import cv2
import numpy as np
import tensorflow as tf

# Parameters
IMG_SIZE = 224
FPS = 25
MAX_FRAMES = 64  # you can try 128 if GPU allows

def load_video(path, max_frames=MAX_FRAMES, resize=(IMG_SIZE, IMG_SIZE)):
    """Decode a video into a fixed-length, normalised RGB frame array.

    Frames are sampled every ``int(source_fps // FPS)`` frames (at least every
    frame), converted from BGR to RGB, resized, scaled to ``[0, 1]``, and
    zero-padded or truncated to exactly ``max_frames`` frames.

    Args:
        path: Path of the video file.
        max_frames: Number of frames in the returned array.
        resize: Target frame size, passed to ``cv2.resize`` as ``(width, height)``.

    Returns:
        ``numpy.ndarray`` of dtype float32 and shape
        ``(max_frames, resize[1], resize[0], 3)``. A file that cannot be opened
        or yields no frames produces an all-zero array instead of raising.
    """
    width, height = resize  # cv2.resize takes (width, height); arrays are (height, width)
    frames = []
    frame_count = 0

    cap = cv2.VideoCapture(path)
    try:
        # Get original FPS; guard against 0/NaN, which cannot be converted to a step
        orig_fps = cap.get(cv2.CAP_PROP_FPS)
        step = max(1, int(orig_fps // FPS)) if orig_fps > 0 else 1  # sampling step

        # Stop decoding once enough frames are collected: anything beyond
        # max_frames would be discarded anyway.
        while len(frames) < max_frames and cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % step == 0:  # sample frames
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(cv2.resize(frame, (width, height)))

            frame_count += 1
    finally:
        # Release the capture even if decoding raised
        cap.release()

    # Start from the zero padding and copy the decoded frames over the front;
    # with no decoded frames the padding alone is returned.
    clip = np.zeros((max_frames, height, width, 3), dtype=np.float32)
    if frames:
        clip[:len(frames)] = np.asarray(frames, dtype=np.float32) / 255.0  # normalize

    return clip

# Smoke test: decode the first training clip and confirm the fixed output shape.
is_train = dataset_index['split'] == "train"
sample_path = dataset_index.loc[is_train, 'video_path'].iloc[0]
frames = load_video(sample_path)
print("Video shape:", frames.shape)  # (MAX_FRAMES, 224, 224, 3)

Video shape: (64, 224, 224, 3)


In [None]:
import tensorflow as tf

# Input-pipeline parameters
BATCH_SIZE = 4  # clips per batch
AUTOTUNE = tf.data.AUTOTUNE  # handed to prefetch() when the dataset is built below

# Function to load video and sentence
def preprocess_row(row):
    """Turn one dataset-index row into a ``(frames, sentence)`` pair.

    Args:
        row: Mapping-like row providing ``'video_path'`` and ``'sentence'``.

    Returns:
        Tuple of the decoded clip returned by ``load_video`` for the row's
        video path, and the row's sentence unchanged.
    """
    clip_path, text = row['video_path'], row['sentence']
    return load_video(clip_path), text

# Draw a reproducible subset of the training split (at most SUBSET_SIZE rows)
# to build the TensorFlow dataset from.
SUBSET_SIZE = 2000
train_rows = dataset_index[dataset_index['split'] == 'train']
# DataFrame.sample raises when asked for more rows than exist (without
# replacement), so cap the request at the number of linked training clips.
subset_df = train_rows.sample(min(SUBSET_SIZE, len(train_rows)), random_state=42).reset_index(drop=True)

# Wrap dataframe rows into a generator
def data_generator(df):
    """Lazily yield one ``preprocess_row`` result per row of ``df``, in row order."""
    yield from (preprocess_row(record) for _, record in df.iterrows())

# Element spec: one fixed-size float32 clip plus its scalar string sentence.
clip_spec = tf.TensorSpec(shape=(MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3), dtype=tf.float32)
sentence_spec = tf.TensorSpec(shape=(), dtype=tf.string)

# Stream (clip, sentence) pairs from the Python generator over subset_df.
train_ds = tf.data.Dataset.from_generator(
    lambda: data_generator(subset_df),
    output_signature=(clip_spec, sentence_spec),
)

# Shuffle, batch, prefetch.
# NOTE(review): the shuffle buffer holds 100 decoded clips; at
# 64 x 224 x 224 x 3 float32 values each that is roughly 3.9 GB of host memory.
train_ds = train_ds.shuffle(100)
train_ds = train_ds.batch(BATCH_SIZE)
train_ds = train_ds.prefetch(AUTOTUNE)

# Pull a single batch to check shapes and see two example sentences.
for videos, texts in train_ds.take(1):
    print("Batch video shape:", videos.shape)
    print("Batch text example:", texts[:2].numpy())


Batch video shape: (4, 64, 224, 224, 3)
Batch text example: [b"Again it's a smaller goalie box, so these are a little bit rarer than the one V one penalty kicks."
 b"I'm going to do some and go the opposite way."]


In [None]:
from tensorflow.keras import layers, models

# Video transformer hyperparameters
EMBED_DIM = 512  # width of the per-frame embedding and of the model output
NUM_HEADS = 4  # attention heads per MultiHeadAttention layer
FF_DIM = 512  # hidden width of the feed-forward sublayer
NUM_LAYERS = 2  # number of stacked encoder blocks

def build_video_transformer(input_shape=(MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3)):
    """Build a frozen-CNN + transformer-encoder model that embeds a video clip.

    Pipeline: per-frame EfficientNetB0 features -> dense projection to
    ``EMBED_DIM`` -> learned positional embedding -> ``NUM_LAYERS`` encoder
    blocks (self-attention and feed-forward, each with a residual connection
    and layer normalisation) -> average over time -> dense projection.

    Args:
        input_shape: ``(frames, height, width, channels)`` of one clip. The
            number of positional embeddings is taken from ``input_shape[0]``,
            so clips of a length other than ``MAX_FRAMES`` are supported.

    Returns:
        A Keras ``Model`` named ``"video_transformer"`` mapping a batch of clips
        to vectors of size ``EMBED_DIM``.
    """
    num_frames = input_shape[0]
    inputs = layers.Input(shape=input_shape)

    # Step 1: CNN feature extractor (per frame)
    # NOTE(review): Keras EfficientNet models appear to include their own input
    # rescaling and expect raw [0, 255] pixels, while load_video in this
    # notebook divides frames by 255 - confirm the intended input range.
    cnn_base = tf.keras.applications.EfficientNetB0(
        include_top=False, weights='imagenet', pooling='avg'
    )
    cnn_base.trainable = False  # Freeze CNN weights

    # Apply CNN to each frame
    time_distributed = layers.TimeDistributed(cnn_base)(inputs)  # (batch, frames, features=1280)

    # Project CNN features to EMBED_DIM
    projected_features = layers.Dense(EMBED_DIM, activation='relu')(time_distributed)  # (batch, frames, EMBED_DIM)

    # Step 2: Positional encoding for frames. Sized from input_shape rather than
    # the MAX_FRAMES global so it always matches the clip length actually used.
    positions = tf.range(start=0, limit=num_frames, delta=1)
    pos_embed = layers.Embedding(input_dim=num_frames, output_dim=EMBED_DIM)(positions)  # (frames, EMBED_DIM)

    # Add positional encoding to projected features (broadcast over the batch axis)
    x = projected_features + pos_embed

    # Step 3: Transformer Encoder layers
    for _ in range(NUM_LAYERS):
        # NOTE(review): key_dim is the size of each head; EMBED_DIM // NUM_HEADS
        # is the more common choice - confirm the full EMBED_DIM is intended.
        attn_output = layers.MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMBED_DIM)(x, x)
        x = layers.Add()([x, attn_output])
        x = layers.LayerNormalization()(x)
        ff_output = layers.Dense(FF_DIM, activation='relu')(x)
        ff_output = layers.Dense(EMBED_DIM)(ff_output)
        x = layers.Add()([x, ff_output])
        x = layers.LayerNormalization()(x)

    # Step 4: Global average pooling across time
    x = layers.GlobalAveragePooling1D()(x)

    # Step 5: Final dense projection
    outputs = layers.Dense(EMBED_DIM, activation='relu')(x)

    model = models.Model(inputs, outputs, name="video_transformer")
    return model

# Instantiate the encoder with its default input shape and print the layer summary.
video_transformer = build_video_transformer()
video_transformer.summary()