# In [None]:
import os
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Dataset locations. The defaults are the original hard-coded paths; set the
# ASL_DATA_PATH / ASL_JSON_PATH environment variables to run the script on
# another machine without editing this file.
DATA_PATH = os.environ.get(
    "ASL_DATA_PATH",
    "/home/smayan/Desktop/ASL/dataset/pk-hfad-1.landmarks-mediapipe-world-csv/",
)
JSON_PATH = os.environ.get("ASL_JSON_PATH", "./pk-dictionary-mapping.json")
# Number of time steps every sequence is padded or truncated to.
MAX_SEQ_LENGTH = 100
# Column count of the landmark CSVs; filled in from the first CSV that loads.
NUM_FEATURES = None

print("Step 1: Loading labels and creating mappings...")

# Read the mapping file as UTF-8 explicitly so the result does not depend on
# the platform's default locale encoding when labels contain non-ASCII text.
with open(JSON_PATH, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Collect every 'label' value found under any entry's 'mapping' list.
# Entries without a 'mapping' key and items without a 'label' key are ignored.
all_labels = []
for entry in json_data:
    if 'mapping' not in entry:
        continue
    all_labels.extend(item['label'] for item in entry['mapping'] if 'label' in item)

# Fail here, with the real cause, instead of building a zero-class encoder
# that only surfaces later as "No sequences were loaded".
if not all_labels:
    raise ValueError(f"No labels were found in {JSON_PATH}. Check the JSON structure.")

unique_labels = sorted(set(all_labels))

# The encoder assigns each distinct label an integer class index.
label_encoder = LabelEncoder()
label_encoder.fit(unique_labels)
NUM_CLASSES = len(label_encoder.classes_)

print(f"Found {len(unique_labels)} unique labels.")
print(f"Number of classes: {NUM_CLASSES}")

print("\nStep 2: Loading landmark data from CSV files...")

sequences = []
labels = []
# Sort the listing: os.listdir() returns entries in arbitrary order, which
# would make the sample order (and therefore the seeded train/test split)
# differ from one machine to another.
available_files = sorted(os.listdir(DATA_PATH))

# Build the label -> class-index lookup once instead of scanning classes_
# and calling transform() for every file. The index of a label in classes_
# is the value LabelEncoder.transform() returns for it.
label_to_index = {label: index for index, label in enumerate(label_encoder.classes_)}

# NOTE(review): a file's label is its base name, so this directory can
# contribute at most one sequence per label — confirm that matches the
# dataset layout, since it leaves a single sample per class.
for filename in available_files:
    if not filename.endswith(".csv"):
        continue

    file_label = os.path.splitext(filename)[0]
    if file_label not in label_to_index:
        continue

    # Only the read itself is guarded; OSError covers missing/unreadable
    # files and ValueError covers pandas' empty-data, parser and decode errors.
    try:
        df = pd.read_csv(os.path.join(DATA_PATH, filename), header=None)
    except (OSError, ValueError) as e:
        print(f"Could not read or process file {filename}: {e}")
        continue

    if NUM_FEATURES is None:
        NUM_FEATURES = df.shape[1]
        print(f"Dynamically determined number of features: {NUM_FEATURES}")
    elif df.shape[1] != NUM_FEATURES:
        # A sequence with a different column count would make pad_sequences
        # fail for the whole dataset, so drop just this file.
        print(f"Skipping file {filename}: expected {NUM_FEATURES} features, found {df.shape[1]}")
        continue

    sequences.append(df.values)
    labels.append(label_to_index[file_label])

print(f"Loaded {len(sequences)} sequences.")

if not sequences:
    raise ValueError("No sequences were loaded. Check the DATA_PATH and file contents.")
    
print("\nStep 3: Preprocessing data (padding and encoding)...")

# One-hot encode the integer class indices into NUM_CLASSES columns.
y = to_categorical(labels, num_classes=NUM_CLASSES)

# Bring every sequence to exactly MAX_SEQ_LENGTH time steps: shorter ones are
# zero-padded at the end, longer ones lose their trailing frames.
X = pad_sequences(
    sequences,
    maxlen=MAX_SEQ_LENGTH,
    dtype='float32',
    padding='post',
    truncating='post',
)

print(f"Shape of data tensor X: {X.shape}")
print(f"Shape of labels tensor y: {y.shape}")

print("\nStep 4: Splitting data into training and testing sets...")

# A stratified split needs at least two samples of every class (and a test
# set at least as large as the number of classes); scikit-learn raises
# ValueError otherwise. Labels here come from unique file names, so classes
# can easily have a single sample — fall back to a plain random split in
# that case instead of aborting the run.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42, stratify=y
    )
except ValueError as e:
    print(f"Warning: stratified split not possible ({e}); falling back to a random split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

print("\nStep 5: Building the Hybrid CNN-LSTM model...")

# Architecture: two Conv1D -> MaxPooling1D -> Dropout stages, two stacked
# LSTMs, then a dense head ending in a softmax over NUM_CLASSES.
# The input shape is declared with an explicit Input object rather than the
# `input_shape=` layer argument, which current Keras versions deprecate for
# Sequential models.
model = Sequential([
    tf.keras.Input(shape=(MAX_SEQ_LENGTH, NUM_FEATURES)),

    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    # NOTE(review): activation='relu' replaces the LSTM default (tanh). Keras
    # only selects its cuDNN kernel with tanh, and an unbounded activation in
    # the recurrence can be numerically unstable — confirm this is intended.
    # The first LSTM returns the full sequence so the second one can consume it.
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),

    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

print("\nStep 6: Compiling the model...")

# Categorical cross-entropy matches the one-hot targets and softmax output;
# accuracy is tracked alongside the loss.
compile_settings = {
    'optimizer': 'adam',
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_settings)

# Print the layer-by-layer overview of the compiled network.
model.summary()

print("\nStep 7: Starting model training...")
EPOCHS = 50
BATCH_SIZE = 32

# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation metrics and the Step 8 test metrics come from the
# same samples. Harmless while nothing is tuned on them — confirm.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
)

print("\nTraining finished.")

print("\nStep 8: Evaluating model performance on the test set...")

# With a single compiled metric, evaluate() yields the loss followed by the
# accuracy; verbose=0 suppresses the progress bar.
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)

print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Test Loss: {loss:.4f}")

# Optional: persist the trained model (left disabled).
# model.save('sign_language_cnn_lstm.h5')
# print("Model saved as 'sign_language_cnn_lstm.h5'")