In [14]:
import os
import glob
import string
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from ml2en import ml2en # Import the Malayalam to Manglish library

# Import the LipFormer model from your script
from my_model import LipFormer

In [15]:
# --- 1. Configuration ---
# Dataset folders. Each is a raw string and keeps its doubled trailing backslash.
_LANDMARKS_DIR = r"C:\Users\TejasRanjith\Desktop\FINAL MAIN\LipReading\dataset\Extracted_landmarks_model_ready\\"
_LIP_ROIS_DIR = r"C:\Users\TejasRanjith\Desktop\FINAL MAIN\LipReading\dataset\Extracted_lip_crosssection\\"
_TRANSCRIPTS_DIR = r"C:\Users\TejasRanjith\Desktop\FINAL MAIN\LipReading\dataset\Transcripts\\"

_DATA_PATHS = {
    "landmarks": _LANDMARKS_DIR,
    "lip_rois": _LIP_ROIS_DIR,
    "transcripts": _TRANSCRIPTS_DIR,
}

# Everything a reader might tune lives in this single dictionary.
CONFIG = {
    "data": _DATA_PATHS,
    "checkpoint_dir": "checkpoints",
    # NOTE(review): an earlier note described this as an increased epoch count,
    # but the value is 1 - confirm the intended number before a real run.
    "epochs": 1,
    # NOTE(review): an earlier note suggested dropping to 1 on memory issues;
    # the value is already 1.
    "batch_size": 1,
    "learning_rate": 1e-4,
    "teacher_forcing_ratio": 0.5,
    "lambda_val": 0.7,
    "image_size": (80, 160),
    # 10% of data for validation.
    "validation_split": 0.1,
}


## 2. Vocabulary Definitions

In [16]:
# --- Vocabulary for Manglish ---
# Ids 0-3 are reserved for the control tokens; characters are numbered from 4.
(
    MANGLISH_PAD_TOKEN,
    MANGLISH_SOS_TOKEN,
    MANGLISH_EOS_TOKEN,
    MANGLISH_UNK_TOKEN,
) = range(4)
MANGLISH_CHARS = string.ascii_lowercase + " .'-"

# Characters go in first and the control tokens last, so the table iterates
# in the order characters -> control tokens.
manglish_to_int = dict(zip(MANGLISH_CHARS, range(4, 4 + len(MANGLISH_CHARS))))
manglish_to_int.update(
    {
        "<pad>": MANGLISH_PAD_TOKEN,
        "<sos>": MANGLISH_SOS_TOKEN,
        "<eos>": MANGLISH_EOS_TOKEN,
        "<unk>": MANGLISH_UNK_TOKEN,
    }
)

# Inverse lookup (id -> token) and the total number of entries.
int_to_manglish = dict((index, token) for token, index in manglish_to_int.items())
MANGLISH_VOCAB_SIZE = len(manglish_to_int)

# --- Vocabulary for Malayalam ---
# Control-token ids occupy 0-3.
MALAYALAM_PAD_TOKEN, MALAYALAM_SOS_TOKEN, MALAYALAM_EOS_TOKEN, MALAYALAM_UNK_TOKEN = 0, 1, 2, 3

# Starts with the control tokens only; build_malayalam_vocab adds the characters.
malayalam_to_int = dict(
    zip(
        ("<pad>", "<sos>", "<eos>", "<unk>"),
        (
            MALAYALAM_PAD_TOKEN,
            MALAYALAM_SOS_TOKEN,
            MALAYALAM_EOS_TOKEN,
            MALAYALAM_UNK_TOKEN,
        ),
    )
)

# Inverse table (id -> token); empty until the vocabulary has been built.
int_to_malayalam = dict()

In [17]:
def build_malayalam_vocab(transcript_dir):
    """Scan all transcript files and (re)build the Malayalam character vocabulary.

    Every ``*.txt`` file directly inside ``transcript_dir`` is read as UTF-8.
    Each line is split on whitespace; for lines with more than two fields the
    last field is kept, and the kept fields of one file are joined with single
    spaces. Every distinct character found is given an id in sorted order,
    starting at 4 (ids 0-3 belong to the control tokens).

    Side effects:
        Updates the module-level ``malayalam_to_int`` in place and rebinds the
        module-level ``int_to_malayalam`` to its inverse. Character entries
        left over from an earlier call are removed first, so re-running the
        cell on different transcripts cannot leave stale characters or
        colliding ids behind.

    Args:
        transcript_dir: Directory containing the transcript ``.txt`` files.

    Returns:
        Total number of entries in ``malayalam_to_int`` (control tokens plus
        characters).
    """
    global int_to_malayalam

    first_char_id = 4  # ids 0-3 are reserved for <pad>/<sos>/<eos>/<unk>

    # Forget characters registered by a previous run; keep the control tokens.
    stale_chars = [ch for ch, idx in malayalam_to_int.items() if idx >= first_char_id]
    for ch in stale_chars:
        del malayalam_to_int[ch]

    vocab = set()
    transcript_files = glob.glob(os.path.join(transcript_dir, "*.txt"))
    for file_path in tqdm(transcript_files, desc="Building Malayalam Vocab"):
        with open(file_path, 'r', encoding='utf-8') as f:
            fields_per_line = [line.split() for line in f]
        # Only lines with more than two fields contribute their last field.
        full_text = " ".join(
            fields[-1] for fields in fields_per_line if len(fields) > 2
        )
        vocab.update(full_text)

    # Sorted order keeps the character ids deterministic across runs.
    for offset, char in enumerate(sorted(vocab)):
        malayalam_to_int[char] = first_char_id + offset

    int_to_malayalam = {idx: char for char, idx in malayalam_to_int.items()}

    return len(malayalam_to_int)

In [18]:
# Build the vocabulary from the configured transcript folder. The returned size
# counts the four control tokens as well as the characters found.
MALAYALAM_VOCAB_SIZE = build_malayalam_vocab(CONFIG["data"]["transcripts"])

Building Malayalam Vocab: 100%|██████████| 7406/7406 [00:00<00:00, 16831.76it/s]


In [19]:
# Display the vocabulary size (control tokens + distinct transcript characters).
MALAYALAM_VOCAB_SIZE

78

In [26]:
# Inspect the id -> character table.
# NOTE(review): the recorded listing ends with two CJK characters (瑞, 阿), which
# suggests stray non-Malayalam text in the transcripts - confirm and clean the data.
int_to_malayalam

{0: '<pad>',
 1: '<sos>',
 2: '<eos>',
 3: '<unk>',
 4: ' ',
 5: '"',
 6: '-',
 7: '.',
 8: 'ം',
 9: 'ഃ',
 10: 'അ',
 11: 'ആ',
 12: 'ഇ',
 13: 'ഈ',
 14: 'ഉ',
 15: 'ഊ',
 16: 'ഋ',
 17: 'എ',
 18: 'ഏ',
 19: 'ഐ',
 20: 'ഒ',
 21: 'ഓ',
 22: 'ഔ',
 23: 'ക',
 24: 'ഖ',
 25: 'ഗ',
 26: 'ഘ',
 27: 'ങ',
 28: 'ച',
 29: 'ഛ',
 30: 'ജ',
 31: 'ഞ',
 32: 'ട',
 33: 'ഠ',
 34: 'ഡ',
 35: 'ണ',
 36: 'ത',
 37: 'ഥ',
 38: 'ദ',
 39: 'ധ',
 40: 'ന',
 41: 'പ',
 42: 'ഫ',
 43: 'ബ',
 44: 'ഭ',
 45: 'മ',
 46: 'യ',
 47: 'ര',
 48: 'റ',
 49: 'ല',
 50: 'ള',
 51: 'ഴ',
 52: 'വ',
 53: 'ശ',
 54: 'ഷ',
 55: 'സ',
 56: 'ഹ',
 57: 'ാ',
 58: 'ി',
 59: 'ീ',
 60: 'ു',
 61: 'ൂ',
 62: 'ൃ',
 63: 'െ',
 64: 'േ',
 65: 'ൈ',
 66: 'ൊ',
 67: 'ോ',
 68: 'ൌ',
 69: '്',
 70: 'ൗ',
 71: 'ൺ',
 72: 'ൻ',
 73: 'ർ',
 74: 'ൽ',
 75: 'ൾ',
 76: '瑞',
 77: '阿'}

In [25]:
# Inspect the Manglish character -> id table. Control tokens are listed last
# because they are inserted after the characters.
manglish_to_int

{'a': 4,
 'b': 5,
 'c': 6,
 'd': 7,
 'e': 8,
 'f': 9,
 'g': 10,
 'h': 11,
 'i': 12,
 'j': 13,
 'k': 14,
 'l': 15,
 'm': 16,
 'n': 17,
 'o': 18,
 'p': 19,
 'q': 20,
 'r': 21,
 's': 22,
 't': 23,
 'u': 24,
 'v': 25,
 'w': 26,
 'x': 27,
 'y': 28,
 'z': 29,
 ' ': 30,
 '.': 31,
 "'": 32,
 '-': 33,
 '<pad>': 0,
 '<sos>': 1,
 '<eos>': 2,
 '<unk>': 3}

In [24]:
# Inspect the Malayalam character -> id table: control tokens first, then the
# transcript characters in sorted order.
malayalam_to_int

{'<pad>': 0,
 '<sos>': 1,
 '<eos>': 2,
 '<unk>': 3,
 ' ': 4,
 '"': 5,
 '-': 6,
 '.': 7,
 'ം': 8,
 'ഃ': 9,
 'അ': 10,
 'ആ': 11,
 'ഇ': 12,
 'ഈ': 13,
 'ഉ': 14,
 'ഊ': 15,
 'ഋ': 16,
 'എ': 17,
 'ഏ': 18,
 'ഐ': 19,
 'ഒ': 20,
 'ഓ': 21,
 'ഔ': 22,
 'ക': 23,
 'ഖ': 24,
 'ഗ': 25,
 'ഘ': 26,
 'ങ': 27,
 'ച': 28,
 'ഛ': 29,
 'ജ': 30,
 'ഞ': 31,
 'ട': 32,
 'ഠ': 33,
 'ഡ': 34,
 'ണ': 35,
 'ത': 36,
 'ഥ': 37,
 'ദ': 38,
 'ധ': 39,
 'ന': 40,
 'പ': 41,
 'ഫ': 42,
 'ബ': 43,
 'ഭ': 44,
 'മ': 45,
 'യ': 46,
 'ര': 47,
 'റ': 48,
 'ല': 49,
 'ള': 50,
 'ഴ': 51,
 'വ': 52,
 'ശ': 53,
 'ഷ': 54,
 'സ': 55,
 'ഹ': 56,
 'ാ': 57,
 'ി': 58,
 'ീ': 59,
 'ു': 60,
 'ൂ': 61,
 'ൃ': 62,
 'െ': 63,
 'േ': 64,
 'ൈ': 65,
 'ൊ': 66,
 'ോ': 67,
 'ൌ': 68,
 '്': 69,
 'ൗ': 70,
 'ൺ': 71,
 'ൻ': 72,
 'ർ': 73,
 'ൽ': 74,
 'ൾ': 75,
 '瑞': 76,
 '阿': 77}