# Imports

In [None]:
import os

# Show the entries currently present in the /kaggle/working directory
print(os.listdir("/kaggle/working"))

In [None]:
!pip install dlib torchinfo torchmetrics pytorch-nlp
!pip uninstall numpy
!pip install numpy==1.26.4

In [None]:
# Pytorch imports
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch import nn
import torchmetrics
import torchinfo

# Utils imports
import numpy as np
import os
import matplotlib.pyplot as plt

# GNLDataLoader

In [None]:
import os
import dlib
import cv2
import numpy as np
import torch
from torch.utils.data import Dataset
from torchnlp.encoders import LabelEncoder

# Module-level debug switch; nothing in the visible code reads it
# (GNLDataLoader takes its own `debug` constructor argument instead)
debug_dl = True

class GNLDataLoader(Dataset):
    """Dataset of mouth-region video clips paired with encoded sentence labels.

    Videos are read from `data_path`; the label of a video is derived from the
    *name* of the file found at the same sorted position in `labels_path`
    (the content of the label file is never opened).

    Class attributes:
        - `face_detector`, `landmark`: dlib face detector and the landmark predictor loaded from
          `shape_predictor_68_face_landmarks_GTX.dat`, shared by every instance;
        - `alphabet`, `encoder`: the characters allowed in a label and the encoder built from them;
        - `CROPMARGIN`: number of pixels added around the mouth landmarks when cropping;
        - `FRAMES`, `FRAME_HEIGHT`, `FRAME_WIDTH`: fixed shape of every returned clip;
        - `LABEL_LENGTH`: labels are right-padded with spaces up to this many characters;
        - `LABEL_ENCODING`: per-position expansion of the code found in a label file name.
    """
    face_detector = dlib.get_frontal_face_detector()
    landmark = dlib.shape_predictor("shape_predictor_68_face_landmarks_GTX.dat")

    alphabet = list("abcdefghijklmnopqrstuvwxyz0123456789 ")
    encoder = LabelEncoder(alphabet, reserved_labels=['unknown'], unknown_index=0)
    CROPMARGIN = 20

    FRAMES = 75
    FRAME_HEIGHT = 100
    FRAME_WIDTH = 150
    LABEL_LENGTH = 37

    # One entry per character of the code; the string "letter" means the
    # character is kept as it is instead of being expanded to a word.
    LABEL_ENCODING = (
        {"b": "bin", "l": "lay", "p": "place", "s": "set"},
        {"b": "blue", "g": "green", "r": "red", "w": "white"},
        {"a": "at", "b": "by", "i": "in", "w": "with"},
        "letter",
        {"z": "zero", "1": "one", "2": "two", "3": "three", "4": "four", "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine"},
        {"a": "again", "n": "now", "p": "please", "s": "soon"},
    )

    def __init__(self, labels_path: str, data_path: str, transform = None, train_test_percent: int = 75, debug: bool = False) -> None:
        """
        Creates a dataset given the path to the labels directory and the videos directory

        Parameters:
            - `labels_path`: the path to the directory containing the label files;
            - `data_path`: the path to the directory containing the videos;
            - `transform`: stored on the instance; it is not applied anywhere in this class;
            - `train_test_percent`: accepted for compatibility; it is not used by this class;
            - `debug`: when `True`, prints diagnostic messages while loading.
        """
        super().__init__()
        self.debug: bool = debug

        if self.debug:
            print(f"[DEBUG] The data dir has{' ' if os.path.isdir(data_path) else ' not '}been recognized")
            print(f"[DEBUG] The label dir has{' ' if os.path.isdir(labels_path) else ' not '}been recognized")

        self.data_path, self.labels_path = data_path, labels_path
        # Both listings are sorted so that position i in one matches position i in the other
        self.data_dir, self.labels_dir = sorted(os.listdir(data_path)), sorted(os.listdir(labels_path))
        self.transform = transform


    def __len__(self) -> int:
        """
        Returns the number of entries in the data folder

        Returns:
            - `length` (`int`): the number of entries in the data folder
        """
        return len(self.data_dir)


    def __getitem__(self, index: int, straight: bool = False) -> tuple[tuple[torch.Tensor, torch.Tensor], ...]:
        """
        Get the item(s) at the given position(s) in the dataset

        Parameters:
            - `index`: the position of the item to retrieve; a slice is also accepted and selects several items;
            - `straight`: accepted for compatibility; it is not used.

        Returns:
            - `items` (`tuple`): one `(video, label)` pair per selected item. An integer index therefore
              yields a tuple holding a single pair.

        Raises:
            - `IndexError`: if `index` is out of range, or if the selection has more videos than labels.
        """

        if self.debug:
            print(f"[DEBUG] Index of the dataloader: {index}")
            print(f"[DEBUG] Data folder: {self.data_dir[index]}")
            print(f"[DEBUG] Labels folder: {self.labels_dir[index]}")

        selected_data = self.data_dir[index]
        selected_labels = self.labels_dir[index]

        # Indexing with an int gives one name, indexing with a slice gives a list of names
        datas = selected_data if isinstance(selected_data, list) else [selected_data]
        labels = selected_labels if isinstance(selected_labels, list) else [selected_labels]

        if len(labels) < len(datas):
            raise IndexError("fewer labels than videos for the requested index")

        return tuple(
            (self.__load_video__(video_name), self.__load_label__(label_name))
            for video_name, label_name in zip(datas, labels)
        )


    def __load_video__(self, video_path: str) -> torch.Tensor:
        """
        Loads a video from the dataset given its file name

        Every frame is reduced to a normalised crop around the mouth. Frames that are
        missing (video shorter than `FRAMES`), unreadable, or in which no face is
        detected are left as all-zero images; frames beyond `FRAMES` are ignored.

        Parameters:
            - `video_path`: the name of the video file, relative to the data directory

        Returns:
            - `video` (`torch.Tensor`): `float32` tensor of shape `(1, FRAMES, FRAME_HEIGHT, FRAME_WIDTH)`
        """
        video_path = os.path.join(self.data_path, video_path)
        cap = cv2.VideoCapture(video_path)
        if self.debug:
            print(f"[DEBUG] Trying to open the video at path {video_path}")

        # Zero-initialised on purpose: any frame that is not filled below stays blank
        to_return = np.zeros((self.FRAMES, self.FRAME_HEIGHT, self.FRAME_WIDTH))

        try:
            # Never read more frames than the buffer can hold
            frame_count = min(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), self.FRAMES)
            for i in range(frame_count):
                ok, frame = cap.read()
                if not ok or frame is None:
                    # The container reported more frames than can actually be decoded
                    break
                mouth = self._crop_mouth(frame)
                if mouth is not None:
                    to_return[i] = mouth
        finally:
            # Release the capture even if decoding or face detection raises
            cap.release()

        if self.debug:
            print(f"[DEBUG] Video {video_path} opened")
            print(f"[DEBUG] Shape of video: {to_return.shape}")

        # Add the leading channel dimension
        return torch.tensor(to_return[np.newaxis], dtype=torch.float32)


    def _crop_mouth(self, frame):
        """
        Extracts the normalised mouth region of a single BGR frame

        Parameters:
            - `frame`: the frame as returned by `cv2.VideoCapture.read`

        Returns:
            - `mouth`: a `(FRAME_HEIGHT, FRAME_WIDTH)` array with zero mean and unit standard deviation,
              or `None` when no face is detected or the crop is empty.
        """
        gframe = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY).astype('uint8')  # the detector works on 8-bit images

        faces = self.face_detector(gframe)
        if len(faces) == 0:
            return None

        face_landmarks = self.landmark(gframe, faces[0])
        # Clamp at 0: a negative start would make the slice wrap around to the other side of the image
        xleft = max(face_landmarks.part(48).x - self.CROPMARGIN, 0)
        ytop = max(face_landmarks.part(50).y - self.CROPMARGIN, 0)
        xright = face_landmarks.part(54).x + self.CROPMARGIN
        ybottom = face_landmarks.part(57).y + self.CROPMARGIN

        mouth = gframe[ytop:ybottom, xleft:xright]
        if mouth.size == 0:
            return None

        # cv2.resize takes the target size as (width, height)
        mouth = cv2.resize(mouth, (self.FRAME_WIDTH, self.FRAME_HEIGHT)).astype(np.float64)
        centred = mouth - np.mean(mouth)
        std_dev = np.std(mouth)
        # A perfectly flat crop has zero deviation; dividing would fill the frame with NaN
        return centred / std_dev if std_dev > 0 else centred


    def __load_label__(self, label_path: str) -> torch.Tensor:
        """
        Builds the encoded label of an item from the name of its label file

        The code is the part of the file name after the last underscore and before the
        first dot. Each of its characters is expanded through `LABEL_ENCODING`, every
        word is preceded by a space, and the result is right-padded with spaces up to
        `LABEL_LENGTH` characters before being encoded.

        Parameters:
            - `label_path`: the name of the label file;

        Returns:
            - `label` (`torch.Tensor`): the encoded characters as a float tensor

        Raises:
            - `IndexError`: if the code has more characters than `LABEL_ENCODING` has entries;
            - `KeyError`: if a character of the code is not known for its position.
        """
        code = label_path.split(".")[0].split("_")[-1]

        sentence = []
        for position, letter in enumerate(code):
            mapping = self.LABEL_ENCODING[position]
            word = letter if mapping == "letter" else mapping[letter]
            sentence.append(" ")
            sentence.extend(word)

        # Adapting the labels to be all of equal size
        sentence.extend(" " * (self.LABEL_LENGTH - len(sentence)))

        # NOTE(review): the label is returned as float; nn.CTCLoss works on integer targets — confirm the cast is wanted
        enl = self.encoder.batch_encode(sentence).float()
        if self.debug: print(f"[DEBUG] Label: {enl}\n[DEBUG] Sentence: {sentence}\n[DEBUG] Length: {len(sentence)}\n")
        return enl


# CNN

In [None]:
import torch
from torch import nn
import torchinfo

class SelectItem(nn.Module):
    """Module that picks a single element out of an indexable input.

    Useful inside `nn.Sequential` after a layer that returns a tuple, to forward
    only one of the returned values to the next layer.
    """

    def __init__(self, item_index):
        """Remember which position of the input has to be forwarded."""
        super().__init__()
        self.item_index = item_index
        self._name = 'selectitem'

    def forward(self, inputs):
        """Return `inputs[item_index]`."""
        selected = inputs[self.item_index]
        return selected
    
class LabialCNN(nn.Module):
    """3D-convolutional front end followed by a bidirectional GRU classifier.

    The network takes one unbatched clip shaped `(channels=1, frames, height, width)`
    and returns, for every frame, the log-probabilities of 38 classes, i.e. a
    tensor of shape `(frames, 38)`.

    The GRU expects 1728 features per frame, which is what the convolutional
    stack produces (32 channels x 6 x 9) for 100 x 150 pixel frames.

    Parameters:
        - `debug`: when `True`, `forward` prints the shape of the convolutional output.
    """

    def __init__(self, debug: bool = False):
        super().__init__()

        self.debug = debug
        # The temporal dimension is never strided or pooled, so the number of frames is preserved
        self.cnn = nn.Sequential(
            nn.Conv3d(in_channels=1, out_channels=8, kernel_size=(3, 5, 5), padding=(1, 2, 2), stride=(1, 2, 2)),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)),

            nn.Conv3d(in_channels=8, out_channels=16, kernel_size=(3, 5, 5), padding=(1, 2, 2), stride=(1, 1, 1)),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)),

            nn.Conv3d(in_channels=16, out_channels=32, kernel_size=(3, 5, 5), padding=(1, 2, 2), stride=(1, 1, 1)),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)),
        )
        self.gru = nn.Sequential(
            nn.GRU(input_size=1728, hidden_size=256, num_layers=2, dropout=0.5, bidirectional=True),
            # nn.GRU returns (output, h_n); only the per-step output is classified
            SelectItem(0),

            # 512 = 2 directions x 256 hidden units
            nn.Linear(in_features=512, out_features=38),
            # nn.CTCLoss expects log-probabilities, and the class axis is stated
            # explicitly instead of relying on the deprecated implicit choice
            nn.LogSoftmax(dim=-1)
        )

    def forward(self, x):
        """
        Runs a clip through the network

        Parameters:
            - `x`: unbatched clip of shape `(1, frames, height, width)`

        Returns:
            - `log_probs` (`torch.Tensor`): tensor of shape `(frames, 38)` with the log-probabilities of each class
        """
        x = self.cnn(x)  # -> (channels, frames, h, w)

        sh = x.shape
        # Move the frames first so that each row holds the features of one frame.
        # This must be a permute: a reshape would only reinterpret the memory
        # and mix values coming from different frames and channels.
        x = x.permute(1, 0, 2, 3)
        x = torch.flatten(x, start_dim=1)  # -> (frames, channels * h * w)
        x = self.gru(x)

        if self.debug: print(f"Layer's shape: {sh}")
        if self.debug: print(f"Summary of the layer: a")

        return x

# Loops

In [None]:
import numpy as np
import torchmetrics
import torch
from torch import nn

# Accuracy metric reached by `test_loop` through the module-level name `metric`.
# NOTE: both `metric` and `batch_size` are assigned again in later cells;
# whichever assignment ran last is the one in effect.
metric = torchmetrics.Accuracy(task="multiclass", num_classes=38)
# Number of items per batch
batch_size = 32

def train_loop(device, dataloader, model, loss_fn, optimizer, batch_index: int, epochs: int, epoch: int, debug: bool=True):
    """Trains the model on one batch and applies a single optimizer step

    Every `(video, label)` pair of `dataloader` is run through the model, the
    predictions are stacked and a single loss is computed over the whole batch.

    Parameters:
        - `device`: destination device
        - `dataloader`: sized iterable of `(video, label)` pairs forming the batch
        - `model`: the model used; it must return a `(frames, classes)` tensor per video
        - `loss_fn`: the loss function of the model, called like `nn.CTCLoss`:
          `loss_fn(log_probs, targets, input_lengths, target_lengths)`
        - `optimizer`: the optimizer
        - `batch_index`: the number of the currently processed batch
        - `epochs`: the number of epochs
        - `epoch`: the index of the epoch
        - `debug`: (default `True`): prints debug info

    Returns:
        - `loss` (`float`): the loss of the batch, or `None` when the batch is empty
    """
    model.train()
    size = len(dataloader)

    batch_predictions, batch_labels = [], []
    for x, y in dataloader:
        # Move data to the device used
        video = x.to(device)
        label = y.to(device)

        batch_predictions.append(model(video))
        batch_labels.append(label)

    if not batch_predictions:
        # Nothing to learn from; skipping avoids computing a loss on placeholder data
        if debug: print(f"[DEBUG] Batch {batch_index + 1} is empty, skipping it")
        return None

    # Stacking sizes the tensors on the real number of items, so a short
    # (last) batch is not padded with all-zero predictions and labels
    predictions = torch.stack(batch_predictions)   # (items, frames, classes)
    labels = torch.stack(batch_labels).long()      # (items, label_length); the targets are class indices

    n_items, n_frames = predictions.shape[0], predictions.shape[1]
    input_lengths = torch.full(size=(n_items, ), fill_value=n_frames, dtype=torch.long)
    target_lengths = torch.full(size=(n_items, ), fill_value=labels.shape[1], dtype=torch.long)

    loss = loss_fn(
        predictions.permute(1, 0, 2),   # the loss wants (frames, items, classes)
        labels,
        input_lengths,
        target_lengths
    )

    # Adjust the weights: clear stale gradients, back-propagate, clip, then step.
    # Stepping before `backward` (or zeroing the gradients before stepping)
    # would discard the gradients without ever applying them.
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    if debug: print(f"→ Loss: {loss} [Batch {batch_index + 1}/{size}, Epoch {epoch + 1}/{epochs}]")

    print(f"===     The batch {batch_index + 1}/125 has finished training     ===")

    return loss.item()


def GNLAccuracy(preds, labels) -> float:
    """Computes the mean per-video character accuracy

    For every video the most likely class of each frame is turned into a
    character, spaces are dropped from both the prediction and the label, and the
    two sequences are compared position by position. The score of a video is
    the number of matching positions divided by the length of the prediction.

    Parameters:
        - `preds`: one sequence of per-frame class scores for each video
        - `labels`: one sequence of characters for each video

    Returns:
        - `accuracy` (`float`): the mean of the per-video scores; `0.0` when there are no videos
    """
    # NOTE(review): class i is mapped to alphabet[i]. If the scores come from an
    # encoder that reserves index 0 (e.g. for 'unknown'), every character is
    # shifted by one and the last class is out of range — confirm against the caller.
    alphabet = list("abcdefghijklmnopqrstuvwxyz0123456789 ")

    n_videos = len(preds)
    if n_videos == 0:
        return 0.0

    total = 0.0
    for index, video in enumerate(preds):
        label = [char for char in labels[index] if char != " "]
        decoded = (alphabet[int(torch.argmax(frame))] for frame in video)
        pred_label = [char for char in decoded if char != " "]

        if not pred_label:
            # Nothing but spaces was predicted: the video scores 0 instead of dividing by zero
            continue

        # zip stops at the shorter sequence, so a prediction longer than the label cannot overrun it
        correct = sum(p == t for p, t in zip(pred_label, label))
        total += correct / len(pred_label)

    # Normalise by the videos actually received, not by a fixed batch size
    return total / n_videos


def test_loop(device, dataloader, model, loss_fn, debug=True):
    model.eval()
    size = len(dataloader)

    # Disable the updating of the weights
    with torch.no_grad():
        for item, (x, y) in enumerate(dataloader):
            # Move data to the device used
            video = x.to(device)
            label = y.to(device)

            # Compute the prediction and the loss
            pred = model(video)

            # Get the accuracy score
            acc = metric(pred, label)
            acc = metric.compute()
            if debug: print(f"→ Accuracy for image {item}: {acc}")
    # if debug: print(f"→ Final testing accuracy of the model: {acc}")
    # metric.reset()


# Data Loading

In [None]:
# Build the dataset of the project from the videos and labels directories
path_data = "/kaggle/input/gunileo/matching/fronts" # "data/lombardgrid_front/lombardgrid/front"
path_labels = "/kaggle/input/gunileo/matching/labels" # "data/lombardgrid_alignment/lombardgrid/alignment"

dataset = GNLDataLoader(labels_path=path_labels, data_path=path_data, transform=None, debug=False)

# Report how many entries each directory holds
print("\n".join((
    f"[DEBUG] Items in the data folder: {len(os.listdir(path_data))}",
    f"[DEBUG] Items in the labels folder: {len(os.listdir(path_labels))}",
)))

# Hyperparameters

batch_size = 32

# Model + Hyperparameters

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

model = LabialCNN(debug=False).to(device)

# Print the summary of the model
# torchinfo.summary(model, (1,75, 100, 150), col_names = ("input_size", "output_size", "num_params", "kernel_size", "mult_adds"), verbose = 1)

# Training hyperparameters
epochs = 2
folds = 5
learning_rate = 10 ** (-4)
dropout = 0.5  # not read anywhere in the visible code; LabialCNN sets its own GRU dropout

# 38 classes, matching the size of the model's output layer; the metric lives on
# the same device as the predictions it receives
metric = torchmetrics.Accuracy(task="multiclass", num_classes=38).to(device)

# NOTE(review): `blank=36` must be a class index that never appears in the targets;
# verify it against the index the label encoder assigns to each character.
loss_fn = nn.CTCLoss(reduction="mean", zero_infinity=True, blank=36)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training + Testing

In [None]:
for epoch_ind in range(epochs): # Epochs
    # Running batch position inside the dataset; it restarts at every epoch, so
    # each epoch trains and tests on the same batches
    index = 0
    for fold in range(folds):
        # Each fold trains on the next 125 // folds batches...
        for batch_index in range(125 // folds):    # 125
            print(f"[DEBUG] Loading of batch {index + 1} for training (Index: {index})")
            # Slicing the dataset loads every video/label pair of the batch
            current_batch = dataset[batch_size*index : batch_size*(index + 1)]
            #print(f"[DEBUG] {type(current_batch), len(current_batch)}\n-> {type(current_batch[0]), len(current_batch[0])}\n-> {current_batch[0][0].shape}")

            print(f"[DEBUG] Starting training of batch {batch_index + 1} (Index: {batch_index})")
            train_loop(device, current_batch, model, loss_fn, optimizer, index, epochs, epoch_ind, debug=True)
            index += 1
        print("===          The training has finished          ===")
        # ...and is then evaluated on the following 35 // folds batches
        for batch_index in range(35 // folds):    # 35
            print(f"[DEBUG] Loading of batch {index + 1} for testing (Index: {index})")
            current_batch = dataset[batch_size*index : batch_size*(index + 1)]

            print(f"[DEBUG] Starting testing of batch {index + 1} (Index: {index})")
            test_loop(device, current_batch, model, loss_fn, debug=True)
            index += 1
        print("===          The testing has finished          ===")
        # `fold` is zero-based while `folds` is a count, hence the + 1
        print(f"===              Finished fold {fold + 1}/{folds}              ===")
print("=== === ==> SAVING THE MODEL...<== === ===")
# The whole module object is saved, not only its state_dict
torch.save(model, "/kaggle/working/gunileo.pt")
print("Goodbye, and thank you for all the fish")