# Models training
In this notebook we train and analyze LSTM models, compare them and tune them to get the best results. We will work with a small subset due to low hardware availability.

## 0.1 Load variable
This notebook is a direct continuation of data_preprocessing.ipynb. We start by loading the necessary libraries variables from that.

In [38]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fastdtw import fastdtw
from collections import Counter
from sklearn.cluster import KMeans
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from landmarks_augmentator import produce_augmentations

with open('data/landmarks_subset.pkl', 'rb') as file:
    subset_landmarks = pickle.load(file)
for key in list(subset_landmarks.keys()):
    if len(subset_landmarks[key]) == 0:
        del subset_landmarks[key]

data_info = pd.read_csv('data/video_labels.csv',dtype={'video_id': object},index_col=0)
data_info = data_info.loc[data_info.video_id.isin(subset_landmarks.keys())]
subset_words = data_info.loc[data_info.video_id.isin(subset_landmarks.keys()),'word'].unique()

POSE = np.hstack((np.ones(33), np.zeros(21+21+468))) == 1
LH = np.hstack((np.zeros(33), np.ones(21), np.zeros(21+468))) == 1
RH = np.hstack((np.zeros(33+21), np.ones(21), np.zeros(468))) == 1
FACE = np.hstack((np.zeros(33+21+21), np.ones(468))) == 1

## 0.2 Test-train split

List of hyperparameters:
- **Dimensions** Check wether 3D is significanly more sufficient than 2D
- **Face landmarks** Do the face landmarks contribute much? If yes, how many of them should we take?


In [39]:
PICKED_FRAMES = 24 # min([len(video) for video in landmarks.values()])
DIMENSTIONS = 3
USE_AUGMENTATIONS = True# Use augmented videos
    

In [None]:
#train_signer_ids = [118, 31, 59, 11, 115, 94, 6, 21, 10, 38, 56, 41, 4, 45, 32, 46, 13,
#                    42, 39, 17, 89, 60, 35, 15, 3, 92, 93, 34, 107, 28, 99, 37, 8, 97,
#                    70, 19, 91, 106, 63, 29, 26, 117, 66, 119, 50, 103, 120, 95, 78, 27,
#                    108, 57, 53, 75, 104, 43, 40, 77, 1, 33, 22, 105, 48, 73, 23]
#val_signer_ids = [2, 52, 12, 98, 88]
#test_signer_ids = [59, 115, 90, 4, 116, 100, 101, 102, 96, 90]
#
#num_of_words = data_info[data_info["word"].isin(subset_words)]["word"].nunique()
#subset_data = data_info[data_info["word"].isin(subset_words)]
#subset_train_data = subset_data[subset_data["signer_id"].isin(train_signer_ids)]
#subset_test_data = subset_data[subset_data["signer_id"].isin(val_signer_ids)]
#subset_validation_data = subset_data[subset_data["signer_id"].isin(test_signer_ids)]
#
#print(f"subset train data count: {len(subset_train_data)}")
#print(f"subset test data count: {len(subset_test_data)}")
#print(f"subset validation data count: {len(subset_validation_data)}")
#print(f"num of words: {num_of_words}")

In [40]:
# train, val, and test as given by the data authors, should find a better split
train_ids = list(data_info.loc[data_info.split=='train','video_id'])
validation_ids = list(data_info.loc[data_info.split=='val','video_id'])
test_ids = list(data_info.loc[data_info.split=='test','video_id'])

## 0.3 Produce Augmentations 
For train test only

In [41]:
#subset_landmarks = produce_augmentations(subset_landmarks,data_info)
train_landmarks = {id:video for id, video in subset_landmarks.items() if id in train_ids}
train_landmarks = produce_augmentations(train_landmarks,data_info.loc[data_info.video_id.isin(train_landmarks.keys())])

## 0.3 Prepare data to fit in LSTM
Take same amount of frames from each video.
<p style="color:red;">hopefully this is a temporary phase in the development </p>

In [42]:
def pick_frames(video,num_frames):
    ''' Take a subset of the frames, evenly spread over the whole video
    ----------------------------------------------------------------------------------------------------------
        EDIT: After getting the missing data, some video are too short and need more frames instad
              Will it bias the predictions? Nee to consider discarding these examples.
              OR ask chatGPT to write a function to make it a smooth "slow motion", I don't have time for this
    ----------------------------------------------------------------------------------------------------------
    '''
    if len(video) < num_frames:
        video_longer = video.copy()
        for _ in range(len(video),num_frames):
            video_longer = np.append(video_longer,video_longer[-1])
        return video_longer
    step_size = len(video) // num_frames
    video_shorter = video[::step_size][:num_frames]
    return video_shorter

In [43]:
words = subset_words 

#lstm_landmarks = {}
#for word in words:
#    videos = data_info.loc[data_info.word==word,'video_id']
#    for vid in videos:
#        lstm_landmarks[vid] = pick_frames(subset_landmarks[vid][:,POSE+LH+RH,:DIMENSTIONS].copy(),PICKED_FRAMES)
#        for key in subset_landmarks:
#            if key.split('_')[0] == vid:
#                lstm_landmarks[vid] = pick_frames(subset_landmarks[vid][:,POSE+LH+RH,:DIMENSTIONS].copy(),PICKED_FRAMES)
for id, video in list(subset_landmarks.items()):
    if id not in train_ids:
        subset_landmarks[id] = pick_frames(video[:,POSE+LH+RH,:DIMENSTIONS],PICKED_FRAMES)
    else: # Clear up space, maybe irrelevant because we soon delete the whole subset_landmarks object
        del subset_landmarks[id]

for id, video in train_landmarks.items():
        train_landmarks[id] = pick_frames(video[:,POSE+LH+RH,:DIMENSTIONS],PICKED_FRAMES)
        original_id = id.split('_')[0]
        if original_id != id and id not in data_info.video_id: # if augmented type, add its label to the data_info table
            augmented_row = data_info.loc[data_info.video_id==original_id,:]
            augmented_row.video_id = id
            data_info = pd.concat([data_info, augmented_row])

print(f'{len(subset_landmarks)+len(train_landmarks)} videos in in total for {len(words)} words')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  augmented_row.video_id = id


16971 videos in in total for 494 words


In [31]:
foo_tr = {}
for video_id, video in train_landmarks.items():
    foo_tr[video_id] = np.array([frame.flatten() for frame in video])

## 0.4 Train test split

In [50]:
for video_id, video in train_landmarks.items():
    train_landmarks[video_id] = np.array([frame.flatten() for frame in video])
for video_id, video in subset_landmarks.items():
    subset_landmarks[video_id] = np.array([frame.flatten() for frame in video])

X_train = np.array([video for video in train_landmarks.values()])
y_train = [data_info.loc[data_info.video_id==id,'word'].item() for id in train_landmarks.keys()]

X_val = np.array([video for id, video in subset_landmarks.items() if id in validation_ids])
y_val = [data_info.loc[data_info.video_id==id,'word'].item() for id in subset_landmarks.keys() if id in validation_ids]

X_test = np.array([video for id, video in subset_landmarks.items() if id in test_ids])
y_test = [data_info.loc[data_info.video_id==id,'word'].item() for id in subset_landmarks.keys() if id in test_ids]

del subset_landmarks, train_landmarks
# Suffle the train set
num_instances = X_train.shape[0]
shuffled_indices = np.arange(num_instances)
np.random.shuffle(shuffled_indices)
X_train = X_train[shuffled_indices]
y_train = np.array(y_train)[shuffled_indices]

  X_train = np.array([video for video in train_landmarks.values()])
  X_val = np.array([video for id, video in subset_landmarks.items() if id in validation_ids])
  X_test = np.array([video for id, video in subset_landmarks.items() if id in test_ids])


## 1.2 LSTM

In [54]:
from tensorflow import keras
from keras.utils import to_categorical
import os

MODEL_VERSION = 1

def load_tf_model(folder_path):
    if not os.path.exists(folder_path):
        return False
    for file in os.listdir(folder_path):
        if file.endswith('.keras'):
            return keras.saving.load_model(os.path.join(folder_path,file))
    return False
folder_path = os.path.join("serving","lstm",str(MODEL_VERSION))
model = load_tf_model(folder_path)

index_to_word = {word: i for i, word in enumerate(words)}
y_train_categorical = to_categorical([index_to_word[word] for word in y_train], num_classes=len(words))
if not model:    
    input_shape = (X_train[0].shape)  #Hopefully we can do variable number of frames later
    model = Sequential()
    model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=input_shape))
    model.add(LSTM(128, return_sequences=True, activation='relu'))
    model.add(LSTM(64, return_sequences=False, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(len(words), activation='softmax'))

    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

    model.fit(X_train, y_train_categorical, epochs=100, batch_size=8)
    model.save(os.path.join('serving','lstm',str(MODEL_VERSION),'sign_to_text.keras'))

AttributeError: module 'tensorflow.keras' has no attribute 'saving'

Let's find how many correctly classified instances we get

In [24]:
count_train = Counter(y_train)
def per_stats(X_test = X_test,y_test = y_test):
    correct = {}
    for i,j in enumerate(model.predict(X_test).argmax(axis=1)):
        if y_test[i] == words[j]:
            correct[y_test[i]] = correct.get(y_test[i],0) + 1
    count_test = Counter(y_test)
    pred_stats = []
    for word, count in correct.items():
        row_data = {"word": word, "correct_count": count, "appeared_test": count_test.get(word, 0), "train_instances": count_train.get(word, 0)}
        pred_stats.append(row_data)

    # Concatenate the list of dictionaries to the DataFrame
    pred_stats = pd.concat([ pd.DataFrame(columns=["word","correct_count","appeared_test","train_instances"]), pd.DataFrame(pred_stats)], ignore_index=True)
    return pred_stats
display(per_stats(X_test = X_val,y_test = y_val))
print("The mean and median amount of instances per word in the train set are:",np.mean(list(count_train.values())),np.median(list(count_train.values())))



Unnamed: 0,word,correct_count,appeared_test,train_instances
0,tall,4,8,40
1,man,4,8,40
2,cold,8,12,28
3,pizza,2,8,32
4,dark,4,4,36
5,wear,5,10,15


The mean and median amount of instances per word in the train set are: 27.225806451612904 28.0


We see that to get better reults we need more augmentations.

In [144]:
# Helper function for self use, get video ids for a word in a particular set
def find_video_id(word,set = validation_ids):
    df = pd.concat([data_info.loc[data_info.video_id==id,['video_id','word']] for id in lstm_landmarks.keys() if id in set])
    return df.loc[df.word == word]
#use like this:
#find_video_id('short',validation_ids)

## 1.3 LSTM Version 2

In [25]:
MODEL_VERSION = 2
folder_path = os.path.join("serving","lstm",str(MODEL_VERSION))
model = load_tf_model(folder_path)

if not model:    
    input_shape = (lstm_landmarks[list(lstm_landmarks.keys())[0]].shape)  #Hopefully we can do variable number of frames later
    model = Sequential()
    model.add(LSTM(128, return_sequences=True, activation='relu', input_shape=input_shape))
    model.add(LSTM(256, return_sequences=True, activation='relu'))
    model.add(LSTM(256, return_sequences=True, activation='relu'))
    model.add(LSTM(128, return_sequences=False, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(len(words), activation='softmax'))

    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

    model.fit(X_train, y_train_categorical, epochs=100, batch_size=8)
    model.save(os.path.join('serving','lstm',str(MODEL_VERSION),'sign_to_text.keras'))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [31]:
display(per_stats(X_test = X_val,y_test = y_val))
print("The mean and median amount of instances per word in the train set are:",np.mean(list(count_train.values())),np.median(list(count_train.values())))



Unnamed: 0,word,correct_count,appeared_test,train_instances
0,tall,4,8,40
1,man,4,8,40
2,taste,5,5,25
3,dark,4,4,36
4,room,4,8,28
5,dress,5,10,20


The mean and median amount of instances per word in the train set are: 27.225806451612904 28.0
