This *.ipynb* file is used to 
1. create
2. modify
3. manage

the *.py* files within the /src directory.

In [1]:
import os

# resolving project folders: the project root is the parent of the current working directory
root_dir = os.path.dirname(os.getcwd())
src_dir, data_dir = (os.path.join(root_dir, folder) for folder in ('src', 'data'))

print(f'project_directory: {root_dir}', f'src_directory: {src_dir}', sep='\n')

# back-slashes are swapped for forward slashes; src_dir is later interpolated into the %%writefile targets
src_dir = '/'.join(src_dir.split('\\'))

project_directory: C:\Users\sadeg\OneDrive\Desktop\Thesis\python_codes\SignLanguageProject
src_directory: C:\Users\sadeg\OneDrive\Desktop\Thesis\python_codes\SignLanguageProject\src


# making \_\_init\_\_.py file

In [2]:
%%writefile $src_dir/__init__.py
# Explanation: This file will mark the source directory as a python package

Overwriting C:/Users/sadeg/OneDrive/Desktop/Thesis/python_codes/SignLanguageProject/src/__init__.py


# making prepare\_datasets.py file

In [17]:
%%writefile $src_dir/prepare_datasets.py
# Explanation: This python file will contain functions that are used to extract landmarks from LSA64 and WLASL100 datasets and prepare data.
# importing libraries for working with directories and file paths
import os
from pathlib import Path
from natsort import natsorted 

# importing OpenCV and Mediapipe to read videos and extract landmarks
import cv2                                                                         
import mediapipe as mp  

# importing numpy to work with arrays
import numpy as np                                                                
                                                            
# importing tqdm for progression bar and typing for writing input types for each function                                                      
from tqdm.auto import tqdm                                                         
from typing import Callable, List

# A list of all class names in WLASL100.
# NOTE: the order of the entries matters. When this list is passed as `class_names`,
# get_landmarks_WLASL100 maps the 1-based position of each entry to the label of the video folder
# with that number, so entries must not be re-sorted (the last three are intentionally out of
# alphabetical order as far as this file shows -- TODO confirm against the dataset folder numbering).
wlasl100class_names= ["accident", "africa", "all", "apple", "basketball", "bed", "before", "bird", "birthday",
                      "black", "blue", "bowling", "brown", "but", "can", "candy", "chair", "change", "cheat", "city",
                      "clothes", "color", "computer", "cook", "cool", "corn", "cousin", "cow", "dance", "dark",
                      "deaf", "decide", "doctor", "dog", "drink", "eat", "enjoy", "family", "fine", "finish",
                      "fish", "forget", "full", "give", "go", "graduate", "hat", "hearing", "help", "hot",
                      "how", "jacket", "kiss", "language", "last", "letter", "like", "man", "many", "meet",
                      "mother", "need", "no", "now", "orange", "paint", "paper", "pink", "pizza", "play",
                      "pull", "purple", "right", "same", "school", "secretary", "shirt", "short", "son", "study",
                      "table", "tall", "tell", "thanksgiving", "thin", "thursday", "time", "walk", "want", "what",
                      "white", "who", "woman", "work", "wrong", "year", "yes", "book", "later", "medicine"]

# A list of all class names in LSA64.
# NOTE: the order of the entries matters. When this list is passed as `class_names`,
# get_landmarks_LSA64 maps the 1-based position of each entry to the numeric prefix of the
# video file name (the part before the first '_'), so entries must not be re-sorted.
lsa64class_names= ['Opaque', 'Red', 'Green', 'Yellow', 'Bright', 'Light-blue', 'Colors', 'Pink',
                   'Women', 'Enemy', 'Son', 'Man', 'Away', 'Drawer', 'Born', 'Learn',
                   'Call', 'Skimmer', 'Bitter', 'Sweet milk', 'Milk', 'Water', 'Food', 'Argentina',
                   'Uruguay', 'Country', 'Last name', 'Where', 'Mock', 'Birthday', 'Breakfast', 'Photo',
                   'Hungry', 'Map', 'Coin', 'Music', 'Ship', 'None', 'Name', 'Patience',
                   'Perfume', 'Deaf', 'Trap', 'Rice', 'Barbecue', 'Candy', 'Chewing-gum', 'Spaghetti',
                   'Yogurt', 'Accept', 'Thanks', 'Shut down', 'Appear', 'To land', 'Catch', 'Help',
                   'Dance', 'Bathe', 'Buy', 'Copy', 'Run', 'Realize', 'Give', 'Find']

#---------------------------------------------------------------LSA 64--------------------------------------------------------------------------
# function to get landmarks from LSA64 dataset.
def get_landmarks_LSA64(root: str,
                        class_names: List[str],
                        frame_numbers: int):
    """
    Extracts MediaPipe holistic landmarks from every ``.mp4`` video found (recursively) under ``root``.

    The function collects all video paths in natural sort order, builds a mapping from the numbers
    1..len(class_names) to the entries of ``class_names`` and uses the numeric prefix of each video file name
    (the part before the first ``_``) to look up the label. For every video, ``frame_numbers`` evenly spaced
    frames are read and processed; each processed frame yields one flat landmark array.
    Args:
        root: Path to LSA64 video dataset directory.
        class_names: List of all words in the dataset (position i, 1-based, is the label of videos prefixed "i_").
        frame_numbers: number of frames we want to take from each video in the dataset.
    Returns:
        A tuple of (detections, labels, len(all_video_paths), len(none_cv2_video_paths)) where:
        detections is a list with one entry per readable video; each entry is a list of per-frame landmark arrays.
        labels is a list of labels corresponding to each video detection.
        len(all_video_paths) is the number of videos in the dataset.
        len(none_cv2_video_paths) is the number of videos that OpenCV can't open.
    Raises:
        ValueError: if a video file name does not start with an integer followed by '_'.
        KeyError: if the numeric prefix of a video has no entry in the class mapping.
    Note:
        A frame that OpenCV fails to read is stored as an empty list (same convention as get_landmarks_WLASL100),
        so that it can be filled afterwards with fill_empty_detections instead of aborting the whole extraction.
        Each processed frame detection has the following structure:
        indexes (0 to 131) correspond to the 33 pose landmarks. each pose landmark has: x, y, z, visibility
        indexes (132 to 1535) correspond to the 468 face landmarks. each face landmark has: x, y, z
        indexes (1536 to 1598) correspond to the 21 left hand landmarks. each left hand landmark has x, y, z
        indexes (1599 to 1661) correspond to the 21 right hand landmarks. each right hand landmark has x, y, z
        in total 1662 values that correspond to 543 landmark objects. Body parts that were not detected are zeros.
    Example use:
        results= get_landmarks_LSA64(root= root, class_names= lsa64class_names, frame_numbers= 30)
        detections, labels, number_of_all_vidoes, number_of_bad_videos= results[0], results[1], results[2], results[3]
    """
    labels = []                # a list to store video labels
    detections = []            # a list to store all video detections
    none_cv2_video_paths = []  # a list to store video paths that cv2 can't capture

    # natsort works with strings, so the Path objects are converted before sorting
    all_video_paths = natsorted([str(path) for path in Path(root).glob("**/*.mp4")])

    # this mapping is used to change the video titles to labels
    vid_idx_to_label = {i + 1: label for i, label in enumerate(class_names)}

    with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for video_path in tqdm(all_video_paths, desc="Processing videos"):
            cap = cv2.VideoCapture(video_path)  # capture each video using OpenCV
            try:
                if not cap.isOpened():          # OpenCV can't capture the video path
                    none_cv2_video_paths.append(video_path)
                    continue

                # resolve the label first so a badly named file fails before any frame is decoded
                video_idx = int(os.path.basename(video_path).split('_')[0])
                label = vid_idx_to_label[video_idx]

                total_frames_number = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))  # the property is returned as float
                frame_idxs_to_process = np.linspace(0, total_frames_number - 1, frame_numbers, dtype=int)

                video_detections = []  # a list to store video detections
                for idx in frame_idxs_to_process:
                    cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))  # set the video to the desired frame index
                    ret, frame = cap.read()

                    if not ret:
                        # an unreadable frame would make cv2.cvtColor raise on None; keep a placeholder instead
                        print(f"Unable to read frame {idx}, of video {video_path} of length {total_frames_number} frames.")
                        video_detections.append([])
                        continue

                    result = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # Mediapipe works with RGB
                    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in result.pose_landmarks.landmark]).flatten() if result.pose_landmarks else np.zeros(33*4)
                    face = np.array([[res.x, res.y, res.z] for res in result.face_landmarks.landmark]).flatten() if result.face_landmarks else np.zeros(468*3)
                    lh = np.array([[res.x, res.y, res.z] for res in result.left_hand_landmarks.landmark]).flatten() if result.left_hand_landmarks else np.zeros(21*3)
                    rh = np.array([[res.x, res.y, res.z] for res in result.right_hand_landmarks.landmark]).flatten() if result.right_hand_landmarks else np.zeros(21*3)
                    video_detections.append(np.concatenate((pose, face, lh, rh)))

                labels.append(label)
                detections.append(video_detections)
            finally:
                cap.release()  # always free the capture, even when processing raised

    return detections, labels, len(all_video_paths), len(none_cv2_video_paths)

#---------------------------------------------------------------WLASL 100--------------------------------------------------------------------------
# function to get landmarks from WLASL100 dataset.
def get_landmarks_WLASL100(root: str,
                           class_names: List[str],
                           frame_numbers: int):
    """
    Extracts MediaPipe holistic landmarks from every ``.mp4`` video found (recursively) under ``root``.

    The function collects all video paths in natural sort order, builds a mapping from the numbers
    1..len(class_names) to the entries of ``class_names`` and uses the name of the folder that contains a video
    (which must be an integer) to look up the label. For every video, ``frame_numbers`` evenly spaced frames are
    read. Since some of the videos have faulty frames, an unreadable frame is retried with the frame before and
    then the frame after it; if those are unreadable as well, an empty list is stored for that frame.
    Args:
        root: Path to video dataset directory.
        class_names: List of all words in the dataset (position i, 1-based, is the label of folder "i").
        frame_numbers: number of frames we want to take from the entire video.
    Returns:
        A tuple of (detections, labels, len(all_video_paths), len(none_cv2_video_paths)) where:
        detections is a list with one entry per readable video; each entry is a list of per-frame landmark arrays
        (or empty lists for frames that could not be read).
        labels is a list of labels corresponding to each video detection.
        len(all_video_paths) is the number of videos in the dataset.
        len(none_cv2_video_paths) is the number of videos that OpenCV can't open.
    Raises:
        ValueError: if the folder containing a video is not named with an integer.
        KeyError: if the folder number of a video has no entry in the class mapping.
    Note:
        Empty frame detections are meant to be filled afterwards with fill_empty_detections.
        Each processed frame detection has the following structure:
        indexes (0 to 131) correspond to the 33 pose landmarks. each pose landmark has: x, y, z, visibility
        indexes (132 to 1535) correspond to the 468 face landmarks. each face landmark has: x, y, z
        indexes (1536 to 1598) correspond to the 21 left hand landmarks. each left hand landmark has x, y, z
        indexes (1599 to 1661) correspond to the 21 right hand landmarks. each right hand landmark has x, y, z
        in total 1662 values that correspond to 543 landmark objects. Body parts that were not detected are zeros.
    Example use:
        results= get_landmarks_WLASL100(root= root, class_names= wlasl100class_names, frame_numbers= 30)
        detections, labels, number_of_all_vidoes, number_of_bad_videos= results[0], results[1], results[2], results[3]
    """
    labels = []                # a list to store video labels
    detections = []            # a list to store all video detections
    none_cv2_video_paths = []  # a list to store video paths that cv2 can't capture

    # natsort works with strings, so the Path objects are converted before sorting
    all_video_paths = natsorted([str(path) for path in Path(root).glob("**/*.mp4")])

    # this mapping is used to change the video folder numbers to labels
    vid_idx_to_label = {i + 1: label for i, label in enumerate(class_names)}

    with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for video_path in tqdm(all_video_paths, desc="Processing videos"):
            cap = cv2.VideoCapture(video_path)  # capture each video using OpenCV
            try:
                if not cap.isOpened():          # OpenCV can't capture the video
                    none_cv2_video_paths.append(video_path)
                    continue

                # resolve the label first so a badly named folder fails before any frame is decoded
                video_idx = int(os.path.basename(os.path.dirname(video_path)))
                label = vid_idx_to_label[video_idx]

                total_frames_number = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))  # the property is returned as float
                frame_idxs_to_process = np.linspace(0, total_frames_number - 1, frame_numbers, dtype=int)

                video_detections = []
                for idx in frame_idxs_to_process:
                    cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))  # set the video to the desired frame index
                    ret, frame = cap.read()

                    if not ret:
                        # if the return value is False: meaning the frame was "unreadable".
                        print(f"Failed to grab frame {idx}, of video {video_path} of length {total_frames_number} frames. trying adjacent frames...")
                        for neighbour_idx in (int(idx) - 1, int(idx) + 1):
                            # never seek before the first or past the last frame of the video
                            if not 0 <= neighbour_idx < total_frames_number:
                                continue
                            cap.set(cv2.CAP_PROP_POS_FRAMES, neighbour_idx)
                            ret, frame = cap.read()
                            if ret:
                                break

                    if not ret:
                        # if the return value is still False: the adjacent frames were "unreadable" as well
                        print(f"Unable to retrieve any frames around index {idx}, of video {video_path} of length {total_frames_number} frames.")
                        video_detections.append([])  # empty detection that will be filled later, using interpolation
                        continue

                    result = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # Mediapipe works with RGB
                    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in result.pose_landmarks.landmark]).flatten() if result.pose_landmarks else np.zeros(33*4)
                    face = np.array([[res.x, res.y, res.z] for res in result.face_landmarks.landmark]).flatten() if result.face_landmarks else np.zeros(468*3)
                    lh = np.array([[res.x, res.y, res.z] for res in result.left_hand_landmarks.landmark]).flatten() if result.left_hand_landmarks else np.zeros(21*3)
                    rh = np.array([[res.x, res.y, res.z] for res in result.right_hand_landmarks.landmark]).flatten() if result.right_hand_landmarks else np.zeros(21*3)
                    video_detections.append(np.concatenate((pose, face, lh, rh)))

                detections.append(video_detections)
                labels.append(label)
            finally:
                cap.release()  # always free the capture, even when processing raised

    return detections, labels, len(all_video_paths), len(none_cv2_video_paths)

# function to interpolate two frames of a video.
def interpolate_frames(most_recent_detection, next_coming_detection, alpha):
    """
    Builds a landmark array for a faulty frame from the closest valid detections before and after it.
    Args:
        most_recent_detection: landmarks detected in the previous valid frame, or None if there is none.
        next_coming_detection: landmarks detected in the next valid frame, or None if there is none.
        alpha: interpolation factor (weight of next_coming_detection; 1 - alpha is the weight of most_recent_detection).
    Returns:
        either: (1 - alpha) * most_recent_detection + alpha * next_coming_detection   (both neighbours exist)
        or: next_coming_detection                                                     (no previous valid frame)
        or: most_recent_detection                                                     (no following valid frame)
    Raises:
        ValueError: if both detections are None, i.e. there is nothing to interpolate from.
    Example use:
        video_detection[i]= interpolate_frames(most_recent_detection, next_coming_detection, 0.5)
    """
    if most_recent_detection is None and next_coming_detection is None:
        # previously this fell through to the arithmetic below and failed with an opaque TypeError
        raise ValueError("cannot interpolate a frame: neither a previous nor a next detection is available")
    if most_recent_detection is None:      # first to nth frames are all corrupt
        return next_coming_detection
    if next_coming_detection is None:      # nth to last frames are all corrupt
        return most_recent_detection
    return (1 - alpha) * most_recent_detection + alpha * next_coming_detection

# function to fill the empty detections in the videos using interpolation
def fill_empty_detections(detections):
    """
    Replaces, in place, every empty frame detection of every video with a value obtained from interpolate_frames.

    For an empty frame, the closest non-empty detection before it and the first non-empty detection after it
    are passed to interpolate_frames with an interpolation factor of 0.5. A frame that has just been filled
    counts as the "previous" detection for the frames that follow it.
    Args:
        detections: all video detections from mediapipe (one sequence of frame detections per video)
    Returns:
        detections (the same object that was passed in, with its empty frames filled)
    Example use:
        detections= fill_empty_detections(detections)
    """
    for frames in detections:
        previous_valid = None
        for position, current in enumerate(frames):
            if len(current) != 0:
                previous_valid = current
                continue

            # look ahead for the first frame that carries landmarks (None when there is none)
            upcoming_valid = next((later for later in frames[position + 1:] if len(later) != 0), None)

            frames[position] = interpolate_frames(previous_valid, upcoming_valid, 0.5)
            previous_valid = frames[position]

    return detections


Overwriting C:/Users/sadeg/OneDrive/Desktop/Thesis/python_codes/SignLanguageProject/src/prepare_datasets.py


# making preprocess_utils.py file

In [28]:
%%writefile $src_dir/preprocess_utils.py
#Explanation: This python file contains functions for preprocessing our data.
# importing libraries for preprocessing data
import random 

# importing numpy to work with arrays
import numpy as np

# import torch
import torch

# importing defaultdict for making dictionaries and sorting videos under labels
from collections import defaultdict

# importing train_test_split for splitting data
from sklearn.model_selection import train_test_split

#importing tqdm for progression bar and typing and numpy.typing for writing input types for each function
from tqdm.auto import tqdm 
from typing import List, Tuple
from numpy.typing import NDArray

#---------------------------------------------------------------Interpolation--------------------------------------------------------------------------

# function to implement interpolation on two videos
def interpolate_video_detections(video_detection_1: NDArray[np.float64], 
                                 video_detection_2: NDArray[np.float64], 
                                 frame_structure: List[Tuple[int, int]],
                                 alpha: float):
    """
    Blends two video detection arrays frame by frame, one body part at a time.

    For every frame index of the first video and every (start, end) range in frame_structure the two
    corresponding slices are compared: a slice that contains only zeros is treated as "body part not present".
    Args:
        video_detection_1: First video detection array (its first axis is iterated as frames).
        video_detection_2: Second video detection array, indexed with the same frame indexes.
        frame_structure: represents the start and end index for each landmark class: pose, face, lh, rh.
        alpha: interpolation factor (weight of the second video; 1 - alpha is the weight of the first).
    Returns:
        an array shaped like video_detection_1 where each body part slice is
        zeros if the part is missing in both frames, the other video's slice if it is missing in only one,
        and (1 - alpha) * part_1 + alpha * part_2 otherwise.
    Example usage: 
        inter_vid_detection = interpolate_video_detections(video_detection_1= v1, video_detection_2= v2, frame_structure= frame_structure, alpha= 0.5)
    """
    blended_video = np.zeros_like(video_detection_1)

    for frame_idx in range(video_detection_1.shape[0]):
        first_frame = video_detection_1[frame_idx]
        second_frame = video_detection_2[frame_idx]
        blended_frame = np.zeros_like(first_frame)

        for start, end in frame_structure:
            part_1 = first_frame[start:end]
            part_2 = second_frame[start:end]
            part_1_missing = bool(np.all(part_1 == 0))
            part_2_missing = bool(np.all(part_2 == 0))

            if part_1_missing:
                # nothing in the first video: take the second one, or zeros when that is missing too
                merged_part = np.zeros(end - start) if part_2_missing else part_2
            elif part_2_missing:
                merged_part = part_1
            else:
                merged_part = (1 - alpha) * part_1 + alpha * part_2

            blended_frame[start:end] = merged_part

        blended_video[frame_idx] = blended_frame

    return blended_video

# function that applies interpolation accross the entire dataset
def interpolate_dataset(detections: NDArray[np.float64],
                        labels: List[str],
                        alpha: float= 0.5,
                        num_interpolation_samples: int= 10):
    """
    Augments a dataset with interpolated samples. Interpolation only happens between videos that share a label.

    Videos are grouped by label; for each label all index pairs (i, j) with i < j are enumerated and at most
    num_interpolation_samples of them are drawn at random (random.sample). Every drawn pair is blended with
    interpolate_video_detections. The output lists, label by label (in order of first appearance), the
    original videos followed by the interpolated ones.
    Args:
        detections: array of all video detections from LSA64 or WLASL100 dataset
        labels: list of all video labels in the dataset
        alpha: interpolation factor.
        num_interpolation_samples: maximum number of interpolated samples that are produced for each label
    Returns:
        a tuple of (np.array(x), y) where np.array(x) is the detections and y is the labels
    Example usage:
        detections, labels = interpolate_dataset(detections, labels, alpha= 0.5, num_interpolation_samples= 13)
    """
    # index ranges of the concatenated pose, face, lh, rh values inside one frame detection
    frame_structure = [(0, 132), (132, 1536), (1536, 1599), (1599, 1662)]

    # label -> list of all original videos carrying that label
    videos_by_label = defaultdict(list)
    for position, label in enumerate(labels):
        videos_by_label[label].append(detections[position])

    x = []  # augmented detections
    y = []  # augmented labels

    for label, originals in videos_by_label.items():
        count = len(originals)
        candidate_pairs = [(i, j) for i in range(count) for j in range(i + 1, count)]

        # using every combination would be too much, so only a random subset of the pairs is blended
        chosen_pairs = random.sample(candidate_pairs, min(num_interpolation_samples, len(candidate_pairs)))
        synthetic = [
            interpolate_video_detections(originals[i], originals[j], frame_structure, alpha)
            for i, j in chosen_pairs
        ]

        # originals first, then the interpolated samples of the same label
        for sample in originals + synthetic:
            x.append(sample)
            y.append(label)

    return np.array(x), y

#---------------------------------------------------------------Split Data--------------------------------------------------------------------------

def convert(detections: NDArray[np.float64],
            labels: List[str],
            class_names: List[str]):
    """
    Prepares detections and labels for training by turning them into tensors.

    Detections are converted to a float32 tensor and every label is replaced by its 0-based position
    in class_names, stored in a long tensor.
    Args:
        detections: array of all video detections
        labels: labels for each video detection
        class_names: list of all class names within the dataset
    Returns:
        a tuple of (X, y) where X holds the detections as a torch.float32 tensor and y holds the label
        numbers as a torch.long tensor.
    Raises:
        KeyError: if a label does not appear in class_names.
    Example use:
        X, y= convert(detections= detections, labels= labels, class_names= wlasl100class_names)
    """
    features = torch.tensor(detections, dtype=torch.float32)

    # position of each class name in class_names is its numeric label
    class_index = {name: position for position, name in enumerate(class_names)}
    targets = torch.tensor([class_index[label] for label in labels], dtype=torch.long)

    return features, targets


def split_dataset(detections: NDArray[np.float64],
                  labels: List[str],
                  class_names: List[str],
                  test_size: float):
    """
    Splits the dataset into train and test parts and converts both parts into tensors with convert
    (ex: with lsa64class_names the label "Red" becomes the number 1).

    The split is stratified by label and uses a fixed random_state of 42.
    Args:
        detections: video detections for the entire dataset.
        labels: list of all video labels for the entire dataset.
        class_names: list of all class names in the dataset
        test_size: forwarded to train_test_split; determines how the data is split
    Returns:
        a tuple of (X_train, X_test, y_train, y_test) 
    Example usage:
        xtrain, xtest, ytrain, ytest= split_dataset(detections, labels, class_names, 0.2)
    """
    train_features, test_features, train_labels, test_labels = train_test_split(
        detections,
        labels,
        test_size=test_size,
        random_state=42,
        stratify=labels,
    )

    # each part is converted separately: features -> float32 tensor, labels -> long tensor
    X_train, y_train = convert(train_features, train_labels, class_names)
    X_test, y_test = convert(test_features, test_labels, class_names)

    return X_train, X_test, y_train, y_test
        

Overwriting C:/Users/sadeg/OneDrive/Desktop/Thesis/python_codes/SignLanguageProject/src/preprocess_utils.py


# making models.py

In [5]:
%%writefile $src_dir/models.py
# Explanation: this file contains classes that are used to make the LSTM and transformer model variations.
# importing necessary libraries
import torch 
from torch import nn
import math
# importing typing for writing function input types
from typing import List, Callable

#-----------------------------------------------------------------functions for building transformer-----------------------------------------------------------
#normal positional encoding
class PositionalEncoding(nn.Module):
  """
  Sinusoidal positional encoding that is added to the input.

  A table of shape (1, seq_len, d_model) is precomputed and registered as the (non-trainable) buffer 'pe':
  even feature columns hold sin(position * div_term), odd feature columns hold cos(position * div_term).
  Args:
      d_model: size of the last (feature) dimension of the inputs. Both even and odd values are supported.
      seq_len: maximum sequence length the table is built for.
  """
  def __init__(self, d_model, seq_len):
    super().__init__()

    pe = torch.zeros(seq_len, d_model)
    position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * -(math.log(10000.0)/d_model))
    angles = position * div_term  # (seq_len, ceil(d_model / 2))
    pe[:, 0::2] = torch.sin(angles)
    # for an odd d_model there is one cosine column less than sine columns, so the extra angle is dropped
    pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
    self.register_buffer('pe', pe.unsqueeze(0))

  def forward(self, x):
    # x is indexed as (batch, sequence, feature); only the first x.shape[1] positions of the table are used
    return x + self.pe[:, :x.shape[1]]

#multihead attention layer
class MultiHeadAttention(nn.Module):
  """
  Multi-head scaled dot-product attention.

  Inputs are projected with separate linear layers for queries, keys and values, split into num_heads
  heads of size d_model // num_heads, attended per head, recombined and projected with an output layer.
  Args:
      d_model: feature size of the inputs and outputs; must be divisible by num_heads.
      num_heads: number of attention heads.
  """
  def __init__(self, d_model, num_heads):
    super().__init__()
    assert d_model % num_heads == 0, "d_model should be divisible by num_heads"
    self.d_model = d_model
    self.num_heads = num_heads
    self.d_k = d_model // num_heads  # feature size of a single head

    self.w_q = nn.Linear(d_model, d_model)
    self.w_k = nn.Linear(d_model, d_model)
    self.w_v = nn.Linear(d_model, d_model)
    self.w_o = nn.Linear(d_model, d_model)

  def scaled_dot_product_attention(self, Q, K, V):
    # Q, K, V: (batch, num_heads, seq_len, d_k) -> scores: (batch, num_heads, seq_len_q, seq_len_k)
    attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
    # the weights of each query must sum to 1 over the key positions, i.e. over the last dimension
    # (dim=1 would normalise across the heads instead)
    attn_probs = torch.softmax(attn_scores, dim=-1)
    output = torch.matmul(attn_probs, V)
    return output

  def split_heads(self, x):
    # (batch, seq_len, d_model) -> (batch, num_heads, seq_len, d_k)
    batch_size, seq_len, d_model = x.shape
    return x.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

  def combine_heads(self, x):
    # (batch, num_heads, seq_len, d_k) -> (batch, seq_len, d_model)
    batch_size, num_heads, seq_len, d_k = x.shape
    return x.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)

  def forward(self, Q, K, V):
    # Q, K, V: (batch, seq_len, d_model); returns a tensor of shape (batch, seq_len, d_model)
    Q = self.split_heads(self.w_q(Q))
    K = self.split_heads(self.w_k(K))
    V = self.split_heads(self.w_v(V))

    attn_output = self.scaled_dot_product_attention(Q, K, V)
    output = self.w_o(self.combine_heads(attn_output))
    return output

# feed forward layer
class PositionWiseFeedForward(nn.Module):
  """
  Two-layer feed forward network: Linear(d_model -> d_ff), ReLU, Linear(d_ff -> d_model).
  Args:
      d_model: feature size of the input and of the output.
      d_ff: feature size of the hidden layer.
  """
  def __init__(self, d_model, d_ff):
    super().__init__()
    self.fc1 = nn.Linear(in_features=d_model, out_features=d_ff)
    self.fc2 = nn.Linear(in_features=d_ff, out_features=d_model)
    self.relu = nn.ReLU()

  def forward(self, x):
    hidden = self.fc1(x)          # expand to d_ff
    activated = self.relu(hidden)
    return self.fc2(activated)    # project back to d_model

# encoder
class EncoderLayer(nn.Module):
  """
  One encoder layer: self-attention followed by a feed forward network. Each sub-layer is wrapped as
  LayerNorm(input + Dropout(sub_layer(input))).
  Args:
      d_model: feature size of the inputs and outputs.
      num_heads: number of attention heads passed to MultiHeadAttention.
      d_ff: hidden size passed to PositionWiseFeedForward.
      dropout: dropout probability applied to the output of both sub-layers.
  """
  def __init__(self, d_model, num_heads, d_ff, dropout):
    super().__init__()
    self.self_attn = MultiHeadAttention(d_model, num_heads)
    self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    # sub-layer 1: self-attention (queries, keys and values are all x) with residual connection
    attended = self.norm1(x + self.dropout(self.self_attn(x, x, x)))
    # sub-layer 2: feed forward network with residual connection
    transformed = self.feed_forward(attended)
    return self.norm2(attended + self.dropout(transformed))

#------------------------------------------------------------------Transformer Models---------------------------------------------------------------------------
# encoder based transformer model for classification, no positional encoding
class Transformer(nn.Module):
    """Encoder-only transformer classifier that applies no positional encoding to its input."""
    def __init__(self, class_names: List[str], seq_len: int, d_model: int, nhead: int, d_ff: int = 2048, num_layers: int = 2, dropout: float = 0.1):
        """
        Transformer model for sign language classification
        Parameters:
            class_names : list of all the classes in the dataset; its length sets the number of output logits.
            seq_len : length of the input sequences (number of frames of a video sample). Not used by this
                      class itself; the subclasses below pass it on and use it for their positional encodings.
            d_model : dimension of the model inputs (number of features).
            nhead : number of attention heads in each multi-head attention layer.
            d_ff : dimension of the feed-forward network.
            num_layers : number of encoder layers. Default is 2.
            dropout : dropout probability.
        """
        super().__init__()
        self.model_type = 'transformer'
        self.class_names = class_names
        stack = [EncoderLayer(d_model, nhead, d_ff, dropout) for _ in range(num_layers)]
        self.encoder_layers = nn.ModuleList(stack)
        self.classifier = nn.Linear(in_features=d_model, out_features=len(self.class_names))

    def forward(self, src: torch.Tensor):
        """Runs src through all encoder layers, averages over dim 1 and returns the class logits."""
        hidden = src
        for layer in self.encoder_layers:
            hidden = layer(hidden)

        pooled = hidden.mean(dim=1)  # mean over the sequence axis
        return self.classifier(pooled)

# encoder based transformer model for classification, with positional encoding
class PETransformer(Transformer):
    """Transformer classifier that passes its input through a PositionalEncoding module first."""
    def __init__(self, class_names: List[str], seq_len: int, d_model: int, nhead: int, d_ff: int = 2048, num_layers: int = 2, dropout: float = 0.1):
        """Takes the same parameters as Transformer; d_model and seq_len also configure the positional encoding."""
        super().__init__(class_names, seq_len, d_model, nhead, d_ff, num_layers, dropout)
        self.model_type = 'PEtransformer'
        self.positional_encoding = PositionalEncoding(d_model, seq_len)

    def forward(self, src: torch.Tensor):
        """Applies the positional encoding, then the encoder stack, mean pooling and classifier of Transformer."""
        encoded = self.positional_encoding(src)
        return super().forward(encoded)

# encoder based transformer model for classification, with a learnable parameter for positional encoding
class ParamTransformer(Transformer):
    """Transformer classifier with a learnable positional encoding."""
    def __init__(self, class_names: List[str], seq_len: int, d_model: int, nhead: int, d_ff: int = 2048, num_layers: int = 2, dropout: float = 0.1):
        """
        Transformer model with learnable parameter as encoding.
        Takes the same parameters as Transformer; the encoding is a randomly initialised
        nn.Parameter of shape (1, seq_len, d_model).
        """
        super().__init__(class_names, seq_len, d_model, nhead, d_ff, num_layers, dropout)
        self.model_type = 'paramtransformer'
        self.positional_encoding = nn.Parameter(torch.randn(1, seq_len, d_model))

    def forward(self, src: torch.Tensor):
        """Adds the learned encoding to src (broadcast over the batch), then runs the Transformer forward pass."""
        encoded = src + self.positional_encoding
        return super().forward(encoded)

# encoder based transformer model for classification, with 1D CNN for positional encoding
class ConvoTransformer(Transformer):
    """Transformer classifier that maps the raw features to d_model with a 1D convolution first."""
    def __init__(self, class_names: List[str], seq_len: int, d_model: int = 129, nhead: int = 3, d_ff: int = 2048, num_layers: int = 2, input_shape: int = 1662, kernel_size: int = 1, dropout: float = 0.1):
        """
        Takes the same parameters as Transformer, plus:
            input_shape : number of features per frame of the raw input (in_channels of the convolution).
            kernel_size : kernel size of the convolution.
        """
        super().__init__(class_names, seq_len, d_model, nhead, d_ff, num_layers, dropout)
        self.model_type = 'convotransformer'
        self.positional_encoding = nn.Conv1d(in_channels=input_shape, out_channels=d_model, kernel_size=kernel_size)

    def forward(self, src: torch.Tensor):
        """Convolves src along dim 1, then runs the Transformer forward pass on the result."""
        # Conv1d convolves over the last axis, so swap the last two axes before and after it
        channels_first = src.permute(0, 2, 1)
        embedded = self.positional_encoding(channels_first).permute(0, 2, 1)
        return super().forward(embedded)

#------------------------------------------------------------------------------------LSTM Model-----------------------------------------------------------------
class LstmModel(nn.Module):
    """Stack of single-layer LSTMs with an activation after each one, followed by a linear classifier."""
    def __init__(self, class_names: List[str], input_size: int, hidden_size: int, num_layers: int= 1, activition: Callable= nn.ReLU()):
        """
        Parameters:
            class_names : list of all the classes in the dataset; its length sets the number of output logits.
            input_size : number of features per time step of the input.
            hidden_size : hidden size of every LSTM.
            num_layers : number of stacked LSTMs (the first one is always created).
            activition : callable applied to the output sequence of every LSTM.
        """
        super().__init__()
        self.model_type = 'lstm'
        self.num_layers = num_layers
        self.class_names = class_names

        # the first LSTM consumes the raw features, every further one the previous hidden states
        first = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        deeper = [nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True) for _ in range(1, num_layers)]
        self.lstm_layers = nn.ModuleList([first, *deeper])

        self.fc = nn.Linear(in_features=hidden_size, out_features=len(self.class_names))
        self.activition = activition

    def forward(self, src):
        """Returns the class logits computed from the last time step of the final LSTM output."""
        sequence = src
        for layer in self.lstm_layers:
            sequence, _ = layer(sequence)  # the (h_n, c_n) state tuple is not needed
            sequence = self.activition(sequence)

        last_step = sequence[:, -1, :]
        return self.fc(last_step)


Overwriting C:/Users/sadeg/OneDrive/Desktop/Thesis/python_codes/SignLanguageProject/src/models.py


# making train_utils.py file

In [23]:
%%writefile $src_dir/train_utils.py
#Explanation: This python file contains functions for implementing the training step
#importing libraries for training models
import torch  
from torch.utils.data import DataLoader # for writing input types
# importing tqdm for progression bar
from tqdm.auto import tqdm 
# importing typing for writing input types for the functions
from typing import Callable, List

#function for resetting the model parameters if needed
def reset_model_parameters(model):
    """
    Re-initialises the parameters of a model in place.
    Visits the model itself and every sub-module at any depth, and calls reset_parameters()
    on each module that defines it. Only looking at the direct children would miss layers
    nested inside containers (e.g. nn.ModuleList) or custom modules, leaving their trained
    weights in place.
    Args:
        model: the torch.nn.Module whose layers should be re-initialised
    Note:
        tensors registered directly as nn.Parameter on a module that has no reset_parameters()
        method are not touched.
    Example usage:
        reset_model_parameters(model)
    """
    for module in model.modules():
        reset = getattr(module, 'reset_parameters', None)
        if callable(reset):
            reset()
            
# function to calculate accuracy
def accuracy_fn(y_logits: torch.Tensor, y: torch.Tensor):
    """
    Computes the fraction of samples whose predicted label equals the true label.
    Args:
        y_logits: model outputs; the predicted label is the argmax along dim 1
        y: true labels
    Returns:
        accuracy as a python float (number of correct predictions / number of predictions)
    Example usage:
        accuracy= accuracy_fn(y_logits, y)
    """
    predicted = y_logits.argmax(dim=1)   # index of the largest logit is the predicted label
    hits = (predicted == y)              # True where the prediction is correct
    n_correct = hits.sum().item()
    return n_correct / hits.shape[0]

# function to train the model
def train_model(num_epochs: int,
                model: torch.nn.Module,
                train_dataloader: DataLoader,
                test_dataloader: DataLoader,
                optimizer: torch.optim.Optimizer,
                loss_fn: torch.nn.Module,
                device: torch.device):
    """
    Trains a model on the train data, evaluates it on the test data after every epoch
    and returns the average loss and accuracy of each epoch.
    Args:
        num_epochs: number of times (epochs) the model is trained with the entire dataset
        model: model object
        train_dataloader: DataLoader object of train dataset
        test_dataloader: DataLoader object of test dataset.
        optimizer: optimizing entity that updates the weights of the model
        loss_fn: function to calculate loss
        device: Cuda or CPU
    Returns:
        A tuple of (train_losses, test_losses, train_accuracies, test_accuracies, y_trues, y_preds) where:
        train_losses is a list that contains avg train loss of all batches, for every epoch.
        test_losses is a list that contains avg test loss of all batches, for every epoch.
        train_accuracies is a list that contains avg train accuracy of all batches, for every epoch.
        test_accuracies is a list that contains avg test accuracy of all batches, for every epoch.
        y_trues and y_preds are used to draw the confusion matrix. They are overwritten in each epoch,
        so the values of the last epoch are returned (empty lists when num_epochs is 0).
    Note:
        the per-epoch values are plain means over the batches, so a smaller last batch weighs as much
        as a full one. A dataloader that yields no batches raises ZeroDivisionError.
    Example usage:
        results= train_model(num_epochs, model, train_dataloader, test_dataloader, optimizer, loss_fn, device)
        train_losses, test_losses, train_accuracies, test_accuracies, y_trues, y_preds= results
    """

    train_losses= []
    test_losses= []
    train_accuracies= []
    test_accuracies= []
    # initialised up-front so that the return statement is valid even when num_epochs is 0
    y_trues= []
    y_preds= []

    for epoch in tqdm(range(num_epochs), desc="Training Epoch"):
        model.train()
        train_loss= [] # a list to store loss of every batch
        train_acc= []  # a list to store acc of every batch

        for X, y in train_dataloader:
            # sending detections and labels to device
            X= X.to(device)
            y= y.to(device)

            # train the model
            optimizer.zero_grad()
            y_logits = model(X)
            loss = loss_fn(y_logits, y)        # batch loss
            loss.backward()
            optimizer.step()

            accuracy= accuracy_fn(y_logits, y) # batch accuracy

            # add loss and accuracy of the batch to the lists
            train_loss.append(loss.item())
            train_acc.append(accuracy)

        # adding average loss and accuracy for the epoch
        train_losses.append(sum(train_loss) / len(train_loss))
        train_accuracies.append(sum(train_acc) / len(train_acc))

        # evaluation mode switches layers such as dropout to inference behaviour;
        # the torch.no_grad() block below is what disables gradient tracking
        model.eval()

        y_trues= []       # true labels of this epoch
        y_preds= []       # predicted labels of this epoch
        test_loss= []     # list to store loss of every batch
        test_acc= []      # list to store accuracy of every batch

        with torch.no_grad():
            for X, y in test_dataloader:
                X = X.to(device)
                y = y.to(device)

                y_logits = model(X)
                loss = loss_fn(y_logits, y)        # test batch loss
                accuracy= accuracy_fn(y_logits, y) # test batch accuracy

                test_loss.append(loss.item())
                test_acc.append(accuracy)
                y_pred= torch.argmax(y_logits, 1)                 # predicted labels

                y_trues.extend(y.flatten().cpu().numpy())          # Store true labels
                y_preds.extend(y_pred.flatten().cpu().numpy())     # Store predictions

        test_losses.append(sum(test_loss) / len(test_loss))
        test_accuracies.append(sum(test_acc) / len(test_acc))

    return train_losses, test_losses, train_accuracies, test_accuracies, y_trues, y_preds


Overwriting C:/Users/sadeg/OneDrive/Desktop/Thesis/python_codes/SignLanguageProject/src/train_utils.py


# making plot_utils.py file

In [21]:
%%writefile $src_dir/plot_utils.py
# Explanation: This python file contains functions for plotting training results and other important data.
# importing libraries for plotting data                                                 
import cv2 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import confusion_matrix

# for writing input types for the functions                                                                                
from typing import List
from numpy.typing import NDArray

#---------------------------------------------------------------Preprocessing-----------------------------------------------------------------------------------
# functions for drawing video landmarks
def draw_circles(frame: np.ndarray,
                   frame_detection: NDArray[np.float64],
                   frame_structure: List[tuple]):
    """
    Draws a filled green circle on the frame for every landmark, using its x and y values.
    Args:
        frame: the image that is drawn on (modified in place)
        frame_detection: flat array with the landmark values of one frame that was processed by mediapipe.
        frame_structure: list of (start, end) index pairs, one per landmark class: pose, face, lh, rh.
    Returns:
        the same frame object, with the circles drawn on it
    Example usage:
        frame = draw_circles(frame, frame_detection, [(0, 132), (132, 1536), (1536, 1599), (1599, 1662)])
    """
    for (start, end) in frame_structure:
        values = frame_detection[start:end]
        # the first pair (pose) stores 4 values per landmark, all the others store 3;
        # in both layouts x and y are the first two values
        stride = 4 if (start, end) == frame_structure[0] else 3
        for offset in range(0, len(values), stride):
            x, y = values[offset], values[offset + 1]
            # scale x by the frame width and y by the frame height to get pixel positions
            center = (int(x * frame.shape[1]), int(y * frame.shape[0]))
            cv2.circle(frame, center, 3, (0, 255, 0), -1)
    return frame

# function to show the video detections
def show_video_detections(video_detection: NDArray[np.float64]):
    """
    Plays back the Mediapipe landmarks that were detected from a video, one frame at a time, in an OpenCV window.
    Only the (x, y) values of each landmark are drawn; z and the pose visibility are ignored.
    Playback stops after the last frame or when ESC is pressed.
    Args:
        video_detection: an array that represents video detections, one row of landmark values per frame
    """
    window_name = "video detection"
    height, width = 720, 1280
    # start/end indices of pose, face, left hand and right hand inside one frame detection
    frame_structure = [(0, 132), (132, 1536), (1536, 1599), (1599, 1662)]

    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
    cv2.resizeWindow(window_name, width= width, height= height)
    try:
        for frame_detection in video_detection:
            blank = np.zeros((height, width, 3), dtype=np.uint8)  # black canvas for this frame
            canvas = draw_circles(blank, frame_detection, frame_structure)
            cv2.imshow(window_name, canvas)
            pressed = cv2.waitKey(100) & 0xFF  # wait up to 100 ms for a key press
            if pressed == 27:  # ESC key
                break
    finally:
        # close the window even if drawing fails
        cv2.destroyAllWindows()

#---------------------------------------------------------------Training-----------------------------------------------------------------------------------

# function for drawing loss and accuracy of a training session
def plot_loss_accuracy(train_losses: List[float],
                       test_losses: List[float],
                       train_accuracies: List[float],
                       test_accuracies: List[float],
                       batch_size: int):
    """
    Plots the loss curves (left) and accuracy curves (right) of a training session side by side.
    The title of each panel also shows the last test value.
    Args:
        train_losses: list of train losses, one value per epoch
        test_losses: list of test losses, one value per epoch
        train_accuracies: list of train accuracies, one value per epoch
        test_accuracies: list of test accuracies, one value per epoch
        batch_size: batch size (only shown in the titles)
    Example usage:
        plot_loss_accuracy(train_losses, test_losses, train_accuracies, test_accuracies, 64)
    """
    plt.figure(figsize=(18, 9))

    # left panel: loss
    loss_ax = plt.subplot(1, 2, 1)
    loss_ax.plot(train_losses, label='Train Loss')
    loss_ax.plot(test_losses, label='Test Loss')
    loss_ax.set_title(f'Loss over Epochs(batch size= {batch_size}), Last Loss:{test_losses[-1]}')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    # right panel: accuracy
    acc_ax = plt.subplot(1, 2, 2)
    acc_ax.plot(train_accuracies, label='Train Accuracy')
    acc_ax.plot(test_accuracies, label='Test Accuracy')
    acc_ax.set_title(f'Accuracy over Epochs(batch size= {batch_size}), Last Accuracy: {test_accuracies[-1]}')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.legend()

    plt.tight_layout()
    plt.show()


# function for drawing confusion matrix
def plot_confusion_matrix(y_trues: List[int],
                          y_preds: List[int],
                          class_names: List[str],
                          num_epochs: int):
    """
    Plots confusion matrix of a model using true values and model predictions.
    Args:
        y_trues: true labels; assumed to be integer indices into class_names (confirm against the caller)
        y_preds: model predictions, encoded the same way as y_trues
        class_names: list of all class names in the dataset
        num_epochs: number of epochs (only shown in the title)
    Example usage:
        plot_confusion_matrix(y_trues, y_preds, class_names, num_epochs)
    """
    # Fix the label set to every class index. Without it the matrix only covers the labels that
    # occur in y_trues/y_preds, so a class that is missing from the evaluated samples would shrink
    # the matrix and shift it against the class_names tick labels.
    label_ids = list(range(len(class_names))) or None
    conf_matrix = confusion_matrix(y_trues, y_preds, labels=label_ids)
    plt.figure(figsize=(18, 15))

    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')

    plt.title(f'Confusion Matrix after {num_epochs} epoches')
    plt.show()

# function to draw the training results in tensor board
def draw_in_tensorboard(train_losses: List[float],
                        test_losses: List[float],
                        train_accuracies: List[float],
                        test_accuracies: List[float],
                        save_directory: str):

    """
    Writes the loss and accuracy of every epoch of a training run as tensorboard scalars.
    Args:
        train_losses: train loss values for all epochs
        test_losses: test loss values for all epochs
        train_accuracies: train accuracy values for all epochs
        test_accuracies: test accuracy values for all epochs
        save_directory: the directory in which the tensorboard event files are saved
    Example usage:
        draw_in_tensorboard(train_losses, test_losses, train_accuracies, test_accuracies, save_directory)
    """
    # scalar tags, in the same order as the value lists zipped below
    tags = ('Loss/train', 'Loss/test', 'Accuracy/train', 'Accuracy/test')

    with SummaryWriter(log_dir= save_directory) as writer:
        per_epoch_values = zip(train_losses, test_losses, train_accuracies, test_accuracies)
        for epoch, values in enumerate(per_epoch_values):
            for tag, value in zip(tags, values):
                writer.add_scalar(tag, value, epoch)
            

Overwriting C:/Users/sadeg/OneDrive/Desktop/Thesis/python_codes/SignLanguageProject/src/plot_utils.py


# making train.py file

In [3]:
%%writefile $src_dir/train.py
# Explanation: this python file carries out the whole training process, from building the dataset to plotting the results.
# importing numpy, torch, nn, typing: for writing input types for the functions
import numpy as np 
from numpy.typing import NDArray
from typing import Callable, List, Tuple, Literal
import torch
from torch import nn
# from tqdm.auto import tqdm  

import torch.optim as optim                   #optimizer
from torch.utils.data import Dataset          # dataset calss
from torch.utils.data import DataLoader       #data loader
    
from sklearn.utils import resample         # used for bootstrapping
from sklearn.model_selection import KFold  # for K fold cross validation if necessary

# connecting the steps
from preprocess_utils import interpolate_dataset, split_dataset, convert
from plot_utils import draw_in_tensorboard, plot_confusion_matrix, plot_loss_accuracy
from train_utils import train_model, reset_model_parameters # for K fold cross validation if necessary

# a simple dataset class, adapted from the CustomImageDataset example on pytorch.org
class CustomDataset(Dataset):
    """Minimal map-style dataset that pairs features[i] with labels[i]."""
    def __init__(self,features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        # the number of samples is taken from the labels
        return len(self.labels)

    def __getitem__(self, index):
        # one (feature, label) pair
        return self.features[index], self.labels[index]
#------------------------------------------------------------------train--------------------------------------------------------------------------------
# function to configure the training enviroment
def configure(detections: NDArray[np.float64],
              labels: List[str],
              class_names: List[str],
              test_size: float,
              batch_size: int,
              num_epochs: int,
              model: torch.nn.Module,
              lr: float,
              device: torch.device,
              dir: Literal['LSA64', 'WLASL100'], # only allows one or the other
              results_root: str = 'C:/Users/sadeg/OneDrive/Desktop/Thesis/python_codes/SignLanguageProject/experiment_results'):
    """
    This function configures the parameters for the training of the model and runs the training.
    Args:
        detections: array of all video detections
        labels: list of all video labels
        class_names: a list containing unique class names in the dataset
        test_size: passed on to split_dataset to control the size of the test split
        batch_size: batch size
        num_epochs: how many times to train the model
        model: the model to train; its model_type attribute is used in the results path
        lr: determines the learning rate or rate with which we apply changes to model parameters
        device: Cuda or CPU
        dir: directory of experiment results -> LSA64 or WLASL100
        results_root: root directory of the experiment results. The tensorboard files are written to
                      {results_root}/{dir}/{model.model_type}/runs/. Defaults to the original project
                      location; pass another path to run on a different machine.
    Returns:
        nothing. Trains the model, plots loss and accuracy for both train and test dataset in tensor board
        and also plots the confusion matrix.
    """
    save_directory =f'{results_root}/{dir}/{model.model_type}/runs/'
    X_train, X_test, y_train, y_test= split_dataset(detections, labels, class_names, test_size) # split the dataset
    # making datasets and dataloaders
    train_dataset= CustomDataset(X_train, y_train)
    test_dataset= CustomDataset(X_test, y_test)
    train_loader = DataLoader(dataset=train_dataset, batch_size= batch_size, num_workers=0, shuffle=True) # train
    test_loader = DataLoader(dataset=test_dataset, batch_size= batch_size, num_workers=0, shuffle=False)  # test
    # send model to device: gpu or cpu
    model= model.to(device)
    loss_fn = nn.CrossEntropyLoss() # loss
    optimizer = optim.Adam(model.parameters(), lr= lr) # optimizer
    # training the model
    results = train_model(num_epochs, model, train_loader, test_loader, optimizer, loss_fn, device)
    train_losses, test_losses, train_accuracies, test_accuracies, y_trues, y_preds = results
    # drawing the train results in tensorboard
    draw_in_tensorboard(train_losses, test_losses, train_accuracies, test_accuracies, save_directory)
    # draw confusion matrix
    plot_confusion_matrix(y_trues, y_preds, class_names, num_epochs)

#--------------------------------------------------------------------k fold------------------------------------------------------------------------------
def configure_Kfold(detections: NDArray[np.float64],
                    labels: List[str],
                    class_names: List[str],
                    n_splits: int,
                    batch_size: int,
                    num_epochs: int,
                    model: torch.nn.Module,
                    lr: float,
                    device: torch.device):
    """
    Runs K-fold cross validation: the model is reset, trained and evaluated once per fold.
    Args:
        detections: array of all video detections
        labels: list of all video labels
        class_names: a list containing unique class names in the dataset
        n_splits: number of folds the data is divided into
        batch_size: batch size
        num_epochs: how many times to train the model in every fold
        model: the model to train; it is moved to device and its parameters are reset before each fold
        lr: determines the learning rate or rate with which we apply changes to model parameters
        device: Cuda or CPU
    Returns:
        nothing. Plots loss and accuracy of both train and test dataset for each fold
    """
    # convert detections and labels to the right format and wrap them in one dataset
    X, y= convert(detections, labels, class_names)
    dataset= CustomDataset(X, y)
    # shuffled K-fold splitter that yields (train indices, test indices) per fold
    kf= KFold(n_splits=n_splits, shuffle=True)
    model= model.to(device)

    for fold_number, (train_idx, test_idx) in enumerate(kf.split(dataset), start=1):
        print("------------------------------------------------------------------------------------------------------------------------------")
        print(f"Fold {fold_number}")
        # start every fold from re-initialised parameters
        reset_model_parameters(model)
        # both loaders read from the same dataset; the samplers restrict them to the fold indices
        train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
        test_sampler = torch.utils.data.SubsetRandomSampler(test_idx)
        train_loader = DataLoader(dataset=dataset, batch_size= batch_size, sampler=train_sampler)
        test_loader = DataLoader(dataset=dataset, batch_size= batch_size, sampler=test_sampler)
        # fresh loss and optimizer for every fold
        loss_fn = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr= lr)
        # training the model
        fold_results = train_model(num_epochs, model, train_loader, test_loader, optimizer, loss_fn, device)
        train_losses, test_losses, train_accuracies, test_accuracies, y_trues, y_preds = fold_results
        # plot the curves of this fold
        plot_loss_accuracy(train_losses, test_losses, train_accuracies, test_accuracies, batch_size)

Overwriting C:/Users/sadeg/OneDrive/Desktop/Thesis/python_codes/SignLanguageProject/src/train.py


# making analyse_layer.py file

In [4]:
%%writefile $src_dir/analyse_layer.py
# Explanation: this python file contains functions for analysing layer attentions
# importing necessary libraries
import os
import cv2
import torch
from torch import Tensor
import mediapipe as mp
import numpy as np
import matplotlib.pyplot as plt

# module-level shortcuts to the mediapipe sub-modules used for landmark analysis
mp_holistic= mp.solutions.holistic     # mediapipe holistic solution (provides the Holistic model class)
mp_drawing= mp.solutions.drawing_utils # mediapipe drawing utilities, for drawing the landmarks of a result object

#--------------------------------------------------------------preparing data----------------------------------------------------------------------
def _flatten_landmarks(landmark_list, size: int, with_visibility: bool = False):
    """Returns the landmark values as a flat array, or zeros of length `size` when nothing was detected."""
    if not landmark_list:
        return np.zeros(size)
    if with_visibility:
        return np.array([[lm.x, lm.y, lm.z, lm.visibility] for lm in landmark_list.landmark]).flatten()
    return np.array([[lm.x, lm.y, lm.z] for lm in landmark_list.landmark]).flatten()


def _pixel_coordinates(landmark_list, count: int, width: int, height: int):
    """Returns the (x, y) pixel position of every landmark, or `count` times (0, 0) when nothing was detected."""
    if not landmark_list:
        return [(0, 0)] * count
    return [(int(lm.x * width), int(lm.y * height)) for lm in landmark_list.landmark]


def get_landmarks_from_vid(video_path: str, frame_numbers: int = 30):
    """
    This function applies the mediapipe holistic model to evenly spaced frames of a video of the LSA64 dataset
    and collects the mediapipe result objects, the flattened landmark values and the landmark pixel coordinates.
    (can also be modified slightly to handle WLASL100 videos)
    Args:
        video_path: Path to video.
        frame_numbers: number of frames we want to take from the entire video.
    Returns:
        A tuple of (results, video_detections, coordinates, vid_index) where :
        results is a list of mediapipe result objects, one per processed frame. It is used later for drawing
        mediapipe landmarks with mp_drawing.
        video_detections is a list with one flattened array of mediapipe detections per processed frame.
        It is later fed to the model for layer analysis.
        coordinates is a list with, per processed frame, the (x, y) pixel coordinates of all 543 landmarks.
        It is used later for drawing circles with OpenCV, on specific landmarks.
        vid_index is the number before the first '_' of the file name minus 1 (range 0 to 63 for LSA64); it
        corresponds to the label of the video.
    Raises:
        IOError: if OpenCV can not open the video.

    Note:
        every array in video_detections has the following structure:
        indexes (0 to 131) correspond to the 33 pose landmarks : x, y, z, visibility
        indexes (132 to 1535) correspond to the 468 face landmarks: x, y, z
        indexes (1536 to 1598) correspond to the 21 left hand landmarks: x, y, z
        indexes (1599 to 1661) correspond to the 21 right hand landmarks: x, y, z
        Landmark groups that were not detected in a frame are filled with zeros.
        Reading stops at the first frame that can not be read, so fewer than frame_numbers frames may be returned.
    """

    #'Example video path: C:/Users/sadeg/OneDrive/Desktop/Thesis/python_codes/lsa64_raw/all/001_001_001.mp4'
    results, coordinates, video_detections= [], [], []

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened(): # OpenCV could not read the video
            raise IOError(f"ERROR in opening the video path{video_path}")

        total_frames_number = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # evenly spaced frame indices over the whole video
        frame_idxs_to_process = np.linspace(0, total_frames_number-1, frame_numbers, dtype=int)

        with mp.solutions.holistic.Holistic(min_detection_confidence= 0.5, min_tracking_confidence=0.5) as holistic:
            for idx in frame_idxs_to_process:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                ret, frame= cap.read()
                if not ret:
                    print("unreadble frame detected")
                    break
                # mediapipe expects RGB images, OpenCV delivers BGR
                result= holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                results.append(result) # appending list of result objects

                # making frame_detection by concatenating pose, face, lh, rh
                frame_detection= np.concatenate((
                    _flatten_landmarks(result.pose_landmarks, 33*4, with_visibility=True),
                    _flatten_landmarks(result.face_landmarks, 468*3),
                    _flatten_landmarks(result.left_hand_landmarks, 21*3),
                    _flatten_landmarks(result.right_hand_landmarks, 21*3),
                ))
                video_detections.append(frame_detection) # appending video detection

                # storing pixel coordinates in the same pose, face, lh, rh order
                height, width = frame.shape[0], frame.shape[1]
                coordinate= (_pixel_coordinates(result.pose_landmarks, 33, width, height)
                             + _pixel_coordinates(result.face_landmarks, 468, width, height)
                             + _pixel_coordinates(result.left_hand_landmarks, 21, width, height)
                             + _pixel_coordinates(result.right_hand_landmarks, 21, width, height))
                coordinates.append(coordinate)
    finally:
        # release the capture even when opening or processing fails
        cap.release()

    # LSA64 file names start with the class number, e.g. 001_001_001.mp4 -> index 0
    vid_index= int(os.path.basename(video_path).split('_')[0]) - 1
    return results, video_detections, coordinates, vid_index

# function for calculating mean attentions
def calculate_means(attributions: Tensor):
    """
    Calculates the mean layer attribution of every landmark. Each landmark has x, y, z (and in case of
    pose landmarks visibility) values, and every value has its own attribution. The mean over the values
    of a landmark acts as a measure of how much that landmark affects the output of the model.
    Args:
        attributions: a tensor of shape (batch, frame_number, 1662)
    Returns:
        means: a tensor of shape (batch, frame_number, 543): 33 pose landmarks followed by the
        510 face, left hand and right hand landmarks
    """
    batch, frames = attributions.shape[0], attributions.shape[1]
    # pose -> x, y, z, vis: the first 132 values, 4 per landmark
    pose_part = attributions[:, :, :132].reshape(batch, frames, -1, 4)  # batch, seq_len, landmark, values_per_landmark
    pose_means = pose_part.mean(dim=3)
    # face, lh, rh -> x, y, z: the remaining values, 3 per landmark
    rest = attributions[:, :, 132:].reshape(batch, frames, -1, 3)
    rest_means = rest.mean(dim=3)
    # concatenate along the landmark axis: batch, seq_len, landmark
    means = torch.cat((pose_means, rest_means), dim=2)
    return means

#-------------------------------------------------------------------plotting data ------------------------------------------------------------------

# function for drawing the attention heatmap
def plot_atts_heatmap(attributions: Tensor, title="Video"):
    """
    Plots one heatmap per video in the batch, with frames on the x axis and
    features on the y axis.

    Args:
        attributions: a 3-dimensional tensor indexed as (video, frame, feature).
        title: prefix of the figure title; the sample number is appended to it.
    """
    # matplotlib needs a numpy array that lives on the CPU
    heatmaps = attributions.detach().cpu().numpy()
    _, frame_count, feature_count = heatmaps.shape

    for sample_number, heatmap in enumerate(heatmaps):
        plt.figure(figsize=(20, 15))
        # transpose so that frames run along x and features along y
        plt.imshow(heatmap.T, cmap='cividis', aspect='auto', origin='lower')
        plt.colorbar()

        # x axis: one tick for every frame
        plt.xlim(0, frame_count - 1)
        plt.xticks(range(0, frame_count))
        plt.xlabel("frames")

        # y axis: one tick for every 100 features
        plt.ylim(0, feature_count - 1)
        plt.yticks(range(0, feature_count, 100))
        plt.ylabel("features")

        plt.title(f"{title} - Sample {sample_number}")
        plt.show()


#------------------------------------------------------------------plotting data on video---------------------------------------------------------
# TODO: rework these functions so that they are cleaner and more efficient.
def make_idx_tr_pairs(indices: Tensor, means: Tensor):
    """
    Pairs every ranked landmark index with its transparency score.

    Args:
        indices: a tensor of shape (batch, frame_number, k) that contains indices of most to least
            significant landmarks for each frame
        means: a tensor of shape (batch, frame_number, landmark) with the mean attribution of each
            landmark (e.g. the output of calculate_means)

    Returns:
        idx_trs: one list per frame of (landmark_index, transparency) tuples, in the same order as
            indices. transparency is the mean scaled by the largest absolute mean, multiplied by 10
            and truncated to an integer, so it lies between -10 and 10.

    Note:
        only the first element of the batch is processed.
        the reason idx_trs is an ordered list is so that we can draw landmarks from least to most important.
        this is useful when for example a left hand landmark that is important hovers over a face landmark.
        in this case it is drawn on top of the less important landmark.
    """
    # prepare transparency_level: scale by the largest magnitude
    peak = torch.max(abs(means))
    if peak == 0:
        # all attributions are zero: avoid 0 / 0 = NaN, whose integer conversion is undefined
        transparency_level = torch.zeros_like(means)
    else:
        transparency_level = means / peak
        transparency_level = transparency_level * 10

    ranked_indices = indices.tolist()[0]
    scores = transparency_level.int().tolist()[0]

    # look up the score of every ranked landmark, frame by frame
    idx_trs = [
        [(index, scores[f][index]) for index in frame_indices]
        for f, frame_indices in enumerate(ranked_indices)
    ]
    return idx_trs


def plot_mp_landmarks(frame, result):
    """
    Draws the pose, face, left-hand and right-hand landmarks with their connections on a frame.
    Args:
        frame: video frame that we want to draw on (it is modified in place).
        result: the detected media pipe object corresponding to the frame.

    """
    green = (0, 255, 0)
    # landmark attribute of the result -> connection set used to join those landmarks
    body_parts = (
        ("pose_landmarks", mp_holistic.POSE_CONNECTIONS),
        ("face_landmarks", mp_holistic.FACEMESH_CONTOURS),
        ("left_hand_landmarks", mp_holistic.HAND_CONNECTIONS),
        ("right_hand_landmarks", mp_holistic.HAND_CONNECTIONS),
    )

    for attribute_name, connections in body_parts:
        # points and connecting lines share the same green style
        point_style = mp_drawing.DrawingSpec(color=green, thickness=1, circle_radius=2)
        line_style = mp_drawing.DrawingSpec(color=green, thickness=1, circle_radius=2)
        mp_drawing.draw_landmarks(frame, getattr(result, attribute_name), connections,
                                  point_style, line_style)



def plot_circle(frame, coor, idx_tr):
    """
    Marks the most significant landmarks of a frame with filled circles.
    Args:
        frame: video frame that we want to draw on.
        coor: list of all (x, y) coordinates of the landmarks detected in the frame
        idx_tr: ordered (index, transparency) tuples. each index points to the coordinates of a
        landmark in the coor list.
    Note:
        only the first 75 entries of idx_tr are drawn, last entry first, so that the first entry
        ends up on top of the others.
        for more information about landmark indexes take a look at get_landmarks and draw_layer_attr functions.

    """
    strongest_first = idx_tr[:75]

    for idx, tr in strongest_first[::-1]:
        # map the magnitude of the score from 0..10 to 0..255, clamped to that range
        scaled = 255 * abs(tr) / 10
        intensity = int(min(255, max(0, scaled)))
        color = (intensity, 255, 0)

        center = (coor[idx][0], coor[idx][1])
        # thickness=-1 draws a filled circle
        cv2.circle(frame, center, radius=5, color=color, thickness=-1)


def draw_layer_attr(video_path, results, pixel_coor, idx_trs, frame_numbers = 30, wait= 200):
    """
    This function visualizes layer attributions in the video.

    frame_numbers evenly spaced frames are read from the video. On each of them the media pipe
    landmarks and the attribution circles are drawn, and the frame is shown in a window at 60%
    of its original size. Pressing ESC stops the playback.

    Args:
        video_path: path to the video.
        results: one detected media pipe object per displayed frame (passed to plot_mp_landmarks).
        pixel_coor: one list of landmark (x, y) pixel coordinates per displayed frame
            (passed to plot_circle as coor).
        idx_trs: one ordered list of (landmark index, transparency) tuples per displayed frame.
        frame_numbers: number of evenly spaced frames that are sampled from the video.
        wait: time in milliseconds that each frame stays on screen.

    Note:
        results, pixel_coor and idx_trs are walked in parallel with the sampled frame indices,
        so the playback stops at the shortest of them.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"ERROR in opening the video path{video_path}")
        return

    try:
        total_frames_number = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_idxs_to_process = np.linspace(0, total_frames_number - 1, frame_numbers, dtype=int)

        for frame_idx, result, coor, idx_tr in zip(frame_idxs_to_process, results, pixel_coor, idx_trs):
            # jump straight to the sampled frame
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                break
            plot_mp_landmarks(frame, result)
            plot_circle(frame, coor, idx_tr)

            # shrink the frame to 60% of its size for display
            width = int(frame.shape[1] * 0.60)
            height = int(frame.shape[0] * 0.60)
            resized_frame = cv2.resize(frame, (width, height))

            cv2.imshow("Video", resized_frame)

            # keep the frame on screen for `wait` milliseconds; exit on ESC key
            if cv2.waitKey(wait) & 0xFF == 27:
                break
    finally:
        # always free the capture and close the window, even if drawing or display fails
        cap.release()
        cv2.destroyAllWindows()

Overwriting C:/Users/sadeg/OneDrive/Desktop/Thesis/python_codes/SignLanguageProject/src/analyse_layer.py
