## BABY SIGN LANGUAGE MODEL


### TRAINING THE MODEL

In [None]:
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet18
from torchvision.transforms import Compose, ToTensor, Normalize
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

# Configuration
data_dir = r"C:\Users\roksh\OneDrive\Desktop\AI LAB\SignLanguage\Preprocessed_Frame_Hands"  # Dataset directory
num_classes = len(os.listdir(data_dir)) # Number of classes in the dataset
input_shape = (3, 224, 224)
batch_size = 32
epochs = 15
num_folds = 3  # Number of folds for K-Fold Cross-Validation ( we mostly used 2 or 3 folds for our experiments)

def load_image(image_path): 
    image = cv2.imread(image_path)
    return image

# Dataset class for loading data from the directory

class SignLanguageDataset(Dataset):
    def __init__(self, data_dir, transform=None):
        self.data = []  # List of (frame_path, class_idx) tuples
        self.transform = transform

        class_mapping = {cls: idx for idx, cls in enumerate(sorted(os.listdir(data_dir)))}
        for label_dir in os.listdir(data_dir):
            label_path = os.path.join(data_dir, label_dir)
            if os.path.isdir(label_path):
                for video_dir in os.listdir(label_path):
                    video_path = os.path.join(label_path, video_dir)
                    if os.path.isdir(video_path):
                        for frame_file in os.listdir(video_path):
                            if frame_file.endswith(".jpg"):  # Filter for image files
                                frame_path = os.path.join(video_path, frame_file)
                                self.data.append((frame_path, class_mapping[label_dir]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        frame_path, class_idx = self.data[idx]
        frame = load_image(frame_path)

        if self.transform:
            frame = self.transform(frame)

        return frame, class_idx

# ResNet-18 pretrained model where the final classification layer is removed and replaced with a custom classifier
class ResNet18CustomClassifier(nn.Module):
    def __init__(self, num_classes):
        super(ResNet18CustomClassifier, self).__init__()
        self.resnet18 = resnet18(pretrained=True)
        in_features = self.resnet18.fc.in_features

        # Remove the final classification layer
        self.resnet18.fc = nn.Identity()

        # Add a custom classifier
        self.custom_classifier = nn.Sequential(
            nn.Linear(in_features, 256),  
            nn.ReLU(),
            nn.Dropout(0.5), 
            nn.Linear(256, num_classes)  
        )

    def forward(self, x):
        x = self.resnet18(x)
        x = self.custom_classifier(x)
        return x

# Load and preprocess data
transform = Compose([
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

dataset = SignLanguageDataset(data_dir, transform=transform)
kf = KFold(n_splits=num_folds, shuffle=True)

train_acc_history = []
val_acc_history = []

# K-Fold Cross-Validation Loop
fold = 0
for train_indices, val_indices in kf.split(range(len(dataset))):
    print(f"Fold {fold}")

    # dataloaders for the current fold
    train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
    val_sampler = torch.utils.data.SubsetRandomSampler(val_indices)

    train_dataloader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    val_dataloader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)

    # Create a new model instance for each fold
    model = ResNet18CustomClassifier(num_classes)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    best_val_accuracy = 0.0
    best_model_state = None

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        all_labels = []
        all_predictions = []

        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            all_labels.extend(labels.tolist())

            _, predictions = torch.max(outputs, 1)
            all_predictions.extend(predictions.tolist())

        epoch_loss = total_loss / len(train_dataloader)
        epoch_accuracy = accuracy_score(all_labels, all_predictions)

        # Validation loop
        model.eval()
        val_loss = 0.0
        all_val_labels = []
        all_val_predictions = []

        with torch.no_grad():
            for val_inputs, val_labels in val_dataloader:
                val_outputs = model(val_inputs)
                val_batch_loss = criterion(val_outputs, val_labels)
                val_loss += val_batch_loss.item()

                _, val_predictions = torch.max(val_outputs, 1)
                all_val_labels.extend(val_labels.tolist())
                all_val_predictions.extend(val_predictions.tolist())

            val_epoch_loss = val_loss / len(val_dataloader)
            val_epoch_accuracy = accuracy_score(all_val_labels, all_val_predictions)

        print(f"Fold {fold+1}, Epoch [{epoch+1}/{epochs}], "
              f"Training Loss: {epoch_loss:.4f}, Training Accuracy: {epoch_accuracy:.4f}, "
              f"Validation Loss: {val_epoch_loss:.4f}, Validation Accuracy: {val_epoch_accuracy:.4f}")
        
        # Append accuracy to history lists
        train_acc_history.append(epoch_accuracy)
        val_acc_history.append(val_epoch_accuracy)

        if val_epoch_accuracy > best_val_accuracy:
            best_val_accuracy = val_epoch_accuracy
            best_model_state = model.state_dict()
    
    print(f"Fold {fold+1} finished.")
    fold += 1
    if best_model_state is not None:
        fold_model_path = f'best_resnet18_fold_{fold+1}.pth'
        torch.save(best_model_state, fold_model_path)
        print(f"Best model for fold {fold+1} saved to {fold_model_path}")

plt.figure(figsize=(10, 5))
plt.plot(train_acc_history, label='Training Accuracy')
plt.plot(val_acc_history, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Add fold labels
for i in range(num_folds):
    plt.text(i * epochs, max(val_acc_history), f'Fold {i+1}', fontsize=12, va='bottom', ha='left', backgroundcolor='w')

plt.title('Training and Validation Accuracy History with Fold Labels')
plt.grid()
plt.show()

print("K-Fold Cross-Validation finished.")


### TESTING THE MODEL

In [2]:
import cv2
from cvzone.HandTrackingModule import HandDetector
import numpy as np
import torch
from torchvision.models import resnet18
import torch.nn as nn
from torchvision import transforms

class ResNet18CustomClassifier(nn.Module):
    def __init__(self, num_classes):
        super(ResNet18CustomClassifier, self).__init__()
        self.resnet18 = resnet18(pretrained=True)
        in_features = self.resnet18.fc.in_features

        self.resnet18.fc = nn.Identity()

        self.custom_classifier = nn.Sequential(
            nn.Linear(in_features, 256),  
            nn.ReLU(),
            nn.Dropout(0.5),  
            nn.Linear(256, num_classes) 
        )

    def forward(self, x):
        x = self.resnet18(x)
        x = self.custom_classifier(x)
        return x


labels = ["dont_know", "down", "drink","eat", "frustrated", "i_love_you", "mad_grumpy", "milk", "mine", "mom", "potty", "sorry"]

num_classes = len(labels)  # Number of classes in the dataset
model = ResNet18CustomClassifier(num_classes)

# Load the saved weights from the .pth file
model.load_state_dict(torch.load(r"C:\Users\roksh\OneDrive\Desktop\AI LAB\BabySignLanguageProject\signlanguage_model.pth"))
model.eval()  # Set the model to evaluation mode

cap = cv2.VideoCapture(0)
detector = HandDetector(maxHands=1)

offset = 20
imgSize = 224  


font_color = (68,214,44)  
font_scale = 1.4  
font_thickness = 2  


while True:
    success, img = cap.read()
    imgOutput = img.copy()
    hands, img = detector.findHands(img)
    
    if hands:
        hand = hands[0]
        x, y, w, h = hand['bbox']

        imgCrop = img[y - offset:y + h + offset, x - offset:x + w + offset]

        # Resize the cropped hand region to match the input size of ResNet-18
        imgResize = cv2.resize(imgCrop, (imgSize, imgSize))

        # Normalize the image data 
        imgResize = transforms.ToTensor()(imgResize)
        imgResize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(imgResize)

        # Make predictions using the ResNet-18 model
        with torch.no_grad():
            predictions = model(imgResize.unsqueeze(0))  # Add the batch dimension
            predicted_label_index = torch.argmax(predictions, dim=1).item()
            prediction = labels[predicted_label_index]

            # Apply softmax to get confidence scores
            softmax_scores = torch.softmax(predictions, dim=1)
            confidence_score = softmax_scores[0][predicted_label_index].item()

        confidence_percentage = confidence_score * 100
        # Calculate the text size
        text_to_display = f"{prediction} ({confidence_percentage:.2f})"
        text_size, _ = cv2.getTextSize(text_to_display, cv2.FONT_HERSHEY_DUPLEX, font_scale, font_thickness)

        # Calculate the width of the first rectangle based on the text size
        rect_width = text_size[0] + 20  # Add some padding

        # Draw the first rectangle
        cv2.rectangle(imgOutput, (x - offset, y - offset - 40),
                    (x - offset + rect_width, y - offset - 50 + 50), font_color, cv2.FILLED)

        # Draw the text
        cv2.putText(imgOutput, text_to_display, (x - offset + 10, y - 26), cv2.FONT_HERSHEY_DUPLEX, font_scale, (255, 255, 255), font_thickness)

        # Draw the second rectangle
        cv2.rectangle(imgOutput, (x - offset, y - offset),
                    (x + w + offset, y + h + offset), font_color, 4)


    cv2.imshow("Image", imgOutput)
    key = cv2.waitKey(1) 
    if key == ord('q'):
        break



AttributeError: 'NoneType' object has no attribute 'copy'

## EXTRA CODE FOR PREPROCESSING ( BEFORE TRAINING)

### FRAME EXTRACTION

In [None]:
import os
import cv2
import glob

videos_dir = r'c:\Users\roksh\OneDrive\Desktop\AI LAB\SignLanguage\BigDataset'

frames_output_dir = r'C:\Users\roksh\OneDrive\Desktop\AI LAB\SignLanguage\AddingExtracted'
os.makedirs(frames_output_dir, exist_ok=True)

# Create a mapping of category names to labels
category_to_label = {category_name: label for label, category_name in enumerate(os.listdir(videos_dir))}

# Iterate through video files and extract frames
for category_name in os.listdir(videos_dir):
    category_dir = os.path.join(videos_dir, category_name)
    label = category_to_label[category_name]
    
    video_files = glob.glob(os.path.join(category_dir, '*.mp4'))
    for video_file in video_files:
        cap = cv2.VideoCapture(video_file)
        frame_count = 0
        
        # Create a subdirectory for the video's frames
        video_frames_dir = os.path.join(frames_output_dir, category_name, os.path.basename(video_file)[:-4])
        os.makedirs(video_frames_dir, exist_ok=True)
        
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            
            frame_path = os.path.join(video_frames_dir, f'frame_{frame_count:04d}.jpg')
            cv2.imwrite(frame_path, frame)
            
            frame_count += 1
        
        cap.release()

print("Frame extraction and labeling completed.")


### DATA AUGMENTATION

In [None]:
import os
import cv2
import random
import numpy as np

input_directory = r"C:\Users\roksh\OneDrive\Desktop\AI LAB\SignLanguage\FramesExtracted"  
output_directory = r"C:\Users\roksh\OneDrive\Desktop\AI LAB\SignLanguage\Augmented3"  

def apply_color_jitter(img):
    # Apply slight color jitter
    jitter_amount = np.random.randint(-10, 10)
    img = img.astype(np.int16)
    img += jitter_amount
    img = np.clip(img, 0, 255).astype(np.uint8)
    return img

def apply_gaussian_noise(img):
    # Add Gaussian noise
    noise = np.random.normal(0, 10, img.shape).astype(np.int16)
    img = img.astype(np.int16) + noise
    img = np.clip(img, 0, 255).astype(np.uint8)
    return img

def apply_horizontal_flip(img):
    return cv2.flip(img, 1)

def apply_vertical_flip(img):
    return cv2.flip(img, 0)

def apply_background_removal(img):
    fgbg = cv2.createBackgroundSubtractorMOG2()
    fgmask = fgbg.apply(img)
    masked_frame = cv2.bitwise_and(img, img, mask=fgmask)
    return masked_frame

def apply_random_rotation(img):
    angle = random.randint(-60, 60)  # Rotate by random angle between -30 and 30 degrees
    rows, cols, _ = img.shape
    rotation_matrix = cv2.getRotationMatrix2D((cols/2, rows/2), angle, 1)
    return cv2.warpAffine(img, rotation_matrix, (cols, rows))

def apply_brightness_contrast(img):
    alpha = random.uniform(0.8, 1.2)  # Brightness adjustment
    beta = random.randint(-20, 20)    # Contrast adjustment
    return cv2.convertScaleAbs(img, alpha=alpha, beta=beta)

augmentation_functions = [
    lambda img: apply_color_jitter(img),
    lambda img: apply_gaussian_noise(img)
    #lambda img: apply_background_removal(img)
    #Add more augmentation functions here
]

for class_name in os.listdir(input_directory):
    class_path = os.path.join(input_directory, class_name)
    output_class_path = os.path.join(output_directory, class_name)
    os.makedirs(output_class_path, exist_ok=True)
    
    for video_folder in os.listdir(class_path):
        video_folder_path = os.path.join(class_path, video_folder)
        output_video_folder_path = os.path.join(output_class_path, video_folder)
        os.makedirs(output_video_folder_path, exist_ok=True)
        
        for frame_name in os.listdir(video_folder_path):
            frame_path = os.path.join(video_folder_path, frame_name)
            frame = cv2.imread(frame_path)
            
            # Apply random augmentation function to the frame
            augmentation_func = random.choice(augmentation_functions)
            augmented_frame = augmentation_func(frame)
            
            # Save augmented frame
            output_frame_path = os.path.join(output_video_folder_path, f"{frame_name.split('.')[0]}_augmented.jpg")
            cv2.imwrite(output_frame_path, augmented_frame)

print("Data augmentation complete.")


### CROP HANDS OUT OF FRAMES

In [None]:
import cv2
from cvzone.HandTrackingModule import HandDetector
import numpy as np
import math
import os
import time

# Initialize the HandDetector
detector = HandDetector(maxHands=1)

# Offset and image size parameters
offset = 20
imgSize = 300

root_dir = r"C:\Users\roksh\OneDrive\Desktop\AI LAB\SignLanguage\provaa" # CHANGE THIS
output_root = r"C:\Users\roksh\OneDrive\Desktop\AI LAB\SignLanguage\Preprocessed_Frame_Hands" #CHANGE THIS
os.makedirs(output_root, exist_ok=True)

categories = os.listdir(root_dir)

for category in categories:
    category_path = os.path.join(root_dir, category)

    # Process each video within the category
    videos = os.listdir(category_path)

    for video in videos:
        video_path = os.path.join(category_path, video)

        # Create an output folder for the current video
        output_category_folder = os.path.join(output_root, category)
        output_video_folder = os.path.join(output_category_folder, video)
        os.makedirs(output_video_folder, exist_ok=True)

        # Process each frame in the video
        frames = os.listdir(video_path)

        for frame_filename in frames:
            frame_path = os.path.join(video_path, frame_filename)

            # Load the image
            img = cv2.imread(frame_path)

            # Find hands in the image
            hands, img = detector.findHands(img)

            if hands:
                hand = hands[0]
                x, y, w, h = hand['bbox']

                imgWhite = np.ones((imgSize, imgSize, 3), np.uint8) * 255
                imgCrop = img[y - offset:y + h + offset, x - offset:x + w + offset]

                # Resize and place the cropped hand image into a white canvas
                imgCropShape = imgCrop.shape
                aspectRatio = h / w

                if aspectRatio > 1:
                    k = imgSize / h
                    wCal = math.ceil(k * w)
                    imgResize = cv2.resize(imgCrop, (wCal, imgSize))
                    imgResizeShape = imgResize.shape
                    wGap = math.ceil((imgSize - wCal) / 2)
                    imgWhite[:, wGap:wCal + wGap] = imgResize
                else:
                    k = imgSize / w
                    hCal = math.ceil(k * h)
                    imgResize = cv2.resize(imgCrop, (imgSize, hCal))
                    imgResizeShape = imgResize.shape
                    hGap = math.ceil((imgSize - hCal) / 2)
                    imgWhite[hGap:hCal + hGap, :] = imgResize

                # Save the cropped hand image
                output_filename = os.path.splitext(frame_filename)[0] + "_hand.jpg"
                output_path = os.path.join(output_video_folder, output_filename)
                cv2.imwrite(output_path, imgWhite)

                print(f"Saved cropped hand image: {output_path}")

print("All frames processed and cropped.")