$$
\Huge G \cup n \; \sqrt{-1} \; \ell \; e \; \emptyset
$$

<p style="text-align: center">A lipsync project, made by Nil Atabey, Leonardo Biason and Günak Yuzak</p>


---

<h2 align="center"><b>Table of Contents</b></h2>

1. [Code structure](#1-code-structure)
2. [Import of the Packages](#2-import-of-the-packages)
3. [Data Loading](#3-data-loading)
4. [Model Settings](#4-model-settings)

$$
\newcommand{\goto}{\; \longrightarrow \;}
\newcommand{\tdconv}{\text{2D Convolution} }
\newcommand{\relu}{\text{ReLU} }
$$

---

## 1) Code Structure

The code structure is the following:

```python
project
 ├ assets
 │  ├ cnn.py
 │  └ dataloader.py
 └ data
```

---

## 2) Import of the Packages

Standard packages needed that can be installed with either `conda` or `pip`:

In [10]:
# Pytorch imports
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch import nn
import torchmetrics

# Utils imports
import numpy as np
import matplotlib.pyplot as plt
import pandas
import cv2
import os
import dlib
import json
from torchnlp.encoders import LabelEncoder

Custom imports from our libraries:

In [2]:
from assets.gnldataloader import GNLDataLoader
from assets.cnn import LabialCNN

---

##  Data Preprocessing

In [19]:
p1 ="data/lombardgrid_front/lombardgrid/front"
p2 ="data/lombardgrid_alignment/lombardgrid/alignment"
face_detector = dlib.get_frontal_face_detector()
landmark = dlib.shape_predictor("shape_predictor_68_face_landmarks_GTX.dat")
alphabet = [x for x in "abcdefghijklmnopqrstuvwxyz0123456789 "]
encoder = LabelEncoder(alphabet, reserved_labels=['unknown'], unknown_index=0)
CROPMARGIN = 20


def videoload(path):
    cap = cv2.VideoCapture(path)
    print(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret,frame = cap.read()
        gframe = cv2.cvtColor(frame,cv2.COLOR_BGR2GRAY)
        
        facedetect =face_detector(gframe)
        
        #HAVE A CHECK IF THE FACE IS FOUND OR NOT

        face_landmarks = landmark(gframe, facedetect[0])
        xleft = face_landmarks.part(48).x -CROPMARGIN
        xright = face_landmarks.part(54).x +CROPMARGIN
        ybottom = face_landmarks.part(57).y +CROPMARGIN
        ytop = face_landmarks.part(50).y -CROPMARGIN

        mouth = gframe[ytop:ybottom,xleft:xright]
        mouth = cv2.resize(mouth,(150,100))
        
        mean = np.mean(mouth)
        std_dev = np.std(mouth)
        mouth = (mouth - mean) / std_dev
        
        return torch.tensor(mouth)
    cap.release()
print(videoload("data/lombardgrid_front/lombardgrid/front/s2_p_lbis6s.mov"))

def alignload(path):
    encoding =[ {"b":"bin","l":"lay","p":"place","s":"set"},
                {"b":"blue","g":"green","r":"red","w":"white"},
                {"a":"at","b":"by","i":"in","w":"with"},
                "letter",
                {"0":"zero","1":"one","2":"two","3":"three","4":"four","5":"five","6":"six","7":"seven","8":"eight","9":"nine"},
                {"a":"again","n":"now","p":"please","s":"soon"}]
    code = path.split(".")[0].split("_")[-1]
    sentence = []
    for i, letter in enumerate(code):
        corresponding_dict = encoding[i]
        next =""
        if corresponding_dict == "letter":
            next = letter
        else:next = corresponding_dict[letter]
        sentence = sentence + [" "] + [x for x in next]
  

    enl = encoder.batch_encode(sentence)
    return enl

def loadbothdata(p1,p2):
    #MIGHT NEED TO TENSORIZE VIDEO, ALIGN IS ALREADY TENSORIZED
    return videoload(p1),alignload(p2)
""" 
FOR SINGULAR TESTS
testvid = "data/lombardgrid_front/lombardgrid/front/s2_l_bbim3a.mov"
videoload(testvid) """

#FOR MULTIPLE TESTS
""" def test(path):
    for file in os.listdir(path):
        filename = os.path.join(path,file)
        if os.path.isdir(filename):
            test(filename)
        else:
            videoload(filename)
test("data/lombardgrid_front")
 """

62.0
tensor([[ 0.8260,  0.7259,  0.5924,  ...,  0.1252,  0.0919,  0.1252],
        [ 0.7927,  0.6925,  0.5591,  ...,  0.1252,  0.0585,  0.0919],
        [ 0.7927,  0.6925,  0.5591,  ...,  0.1252,  0.0585,  0.0585],
        ...,
        [ 0.2921,  0.3922,  0.4589,  ...,  0.1252,  0.0585,  0.0585],
        [ 0.2921,  0.3255,  0.3588,  ...,  0.0919,  0.0585,  0.0251],
        [ 0.2587,  0.2587,  0.2921,  ...,  0.0919,  0.0585, -0.0083]],
       dtype=torch.float64)


' def test(path):\n    for file in os.listdir(path):\n        filename = os.path.join(path,file)\n        if os.path.isdir(filename):\n            test(filename)\n        else:\n            videoload(filename)\ntest("data/lombardgrid_front")\n '

## 3) Data Loading

In [12]:
# Create the dataloaders of our project
# train_data = GNLDataLoader()

---

## 4) Model Settings

The following settings are applied:
> `device`: specifies where the model must be trained. If an Nvidia GPU is detected, then CUDA will be used;<br>
> `epochs`: the number of epochs;<br>
> `batch_size`: the size of each singular batch of analysed images;<br>
> `learning_rate`: `N/A`;<br>
> `loss_fn`: the loss function of the model;<br>
> `optimizer`: the optimizer of the model. For now it's `AdamW`, which is more performant than `SGD`.

The model has the following layers:

$$
\underbrace{x}_{\text{input}} \goto \underbrace{st_0(3, \; 5, \; 5)}_{\text{ST CNN}} \goto \underbrace{p_0(1, \; 2, \; 2)}_{\text{Normalization Pool}} \goto \underbrace{st_1(3, \; 5, \; 5)}_{\text{ST CNN}} \goto \underbrace{p_1(1, \; 2, \; 2)}_{\text{Normalization Pool}} \goto
$$
$$
\goto \underbrace{st_2(3, \; 5, \; 5)}_{\text{ST CNN}} \goto \underbrace{p_2(1, \; 2, \; 2)}_{\text{Normalization Pool}} \goto \underbrace{y}_{\text{Output}}
$$

In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = LabialCNN(debug=True).to(device)

epochs = 2
batch_size = 16
learning_rate = 0.0001

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)