## Modelling

* count real occlusions over the whole dataset

```
Average length of occlusions: 2.92 frames
Average number of occluded frames per video: 24.15 frames
Average percentage of occluded frames per video: 3.17%
Total number of occluded frames: 2994 out of total for the dataset 101156 
```

* create X (three?) times as many fake occlusions with ground truth data

* "Target" for real occlusions: 

1. random numbers
2. linear interpolations
3. cubic interpolations

* add noise
* ✅ create DataLoader object
* define train-val-test split
* create bi-LSTM object
* create training loop
* create testing loop
* create predict function for one video
* save sample .csv of blendshapes for all models (3 so far)

In [7]:
import os
import datetime
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import ArtistAnimation
from sklearn.preprocessing import StandardScaler
from scipy import interpolate
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


# Standardization functions
def fit_and_standardize(data):
    shape = data.shape
    flat = data.reshape((-1, data.shape[-1]))
    scaler = StandardScaler().fit(flat)
    scaled = scaler.transform(flat).reshape(shape)
    return scaled, scaler


def standardize(data, scaler):
    shape = data.shape
    flat = data.reshape((-1, data.shape[-1]))
    scaled = scaler.transform(flat).reshape(shape)
    return scaled


def inv_standardize(data, scaler):
    shape = data.shape
    flat = data.reshape((-1, data.shape[-1]))
    scaled = scaler.inverse_transform(flat).reshape(shape)
    return scaled


# Data processing function
def process_data(data):
    gaps = np.isnan(data)
    all_null = np.sum(np.sum(gaps, axis=1), axis=1) > gaps.shape[1] * gaps.shape[2] * 0.8
    data = data[~all_null, :, :]
    gaps = gaps[~all_null, :, :]
    keep = ~np.isnan(np.sum(data, axis=(1, 2)))
    return data[keep, :, :], gaps[keep, :, :]

In [None]:
import os
import datetime
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import ArtistAnimation
from sklearn.preprocessing import StandardScaler
from scipy import interpolate
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class TimeSeriesDataset(Dataset):
    def __init__(self, data, labels=None, occ_prob=0.3, noise_std=0.01):
        """
        Args:
            data (np.array): Ground truth data of shape (num_videos, num_frames, blendshapes_dim).
            labels (np.array): Binary occlusion labels of shape (num_videos, num_frames).
            occ_prob (float): Probability of adding synthetic occlusions.
            noise_std (float): Standard deviation of Gaussian noise to add to data.
        """
        self.data = data.copy()
        self.labels = labels.copy()
        self.data, self.labels, self.mask = self.insert_fake_occlussions(self.data, self.labels, occ_prob=occ_prob) 

    def __len__(self):
        return len(self.data)
    
    def insert_fake_occlussions(self):
        data = self.data.copy()
        labels = self.labels.copy()
        # mask = np.ones_like(labels, dtype=int)  # 1 for real, 0 for synthetic
        # create one hot vector that says which data is real 1 and which data is fake 0
        mask = np.ones(len(data))   # masking some data
        for i in range(len(data)):    # iterate over time
            # insert occlussion with probability occ_prob
            if np.random.rand() < self.occ_prob:
                occ_len = int(np.clip(np.random.normal(2.92, 12.05), 2, 24))   # mean and std from the dataset
                print(f"Inserting occlussion of length {occ_len}")
                
                start = np.random.randint(0, len(data[i]) - occ_len)
                end = start + occ_len
                # check if the occlusion is not already there and update mask only where there wasn't an occlusion already
                mask_update_indices = (labels[i, start:end] != 1)
                mask[i, start:end][mask_update_indices] = 0  # update mask
                labels[i, start:end][mask_update_indices] = 1  # update labels to have 1 for occlusions

                mask[i, start:end] = 0  # update mask
                labels[i, start:end] = 1  # update labels to have 1 for occlusions
                # do nothing to the `data` as it stands for ground truth

        # percentage of all occlusions after inserting fake ones
        occ_perc = np.mean([np.mean(data[i] == 1) for i in range(len(data))])
        print(f"\nPercentage of occlusions: {occ_perc}\n")
        return data, labels, mask
    
    def add_noise(self, data):
        noise = np.random.normal(0, self.noise_std, data.shape)
        return data + noise

    def __getitem__(self, idx):
        sample = self.data[idx]

        # Apply noise and transformations
        sample = self.add_noise(sample)

        if self.labels is not None:
            sample_label = self.labels[idx]
        else:
            sample_label = None

        return sample, sample_label
    

def create_dataloader(all_data, batch_size=32, shuffle=True):
    stacked_data = []  # 125 videos, F frames, 51 features
    all_labels = []   # 125 videos, F frames
    for video, video_frames in all_data.items():
        data = []  # F, 51
        labels = []  # F
        for frame in video_frames:
            label = frame.pop("occluded")
            blendshapes = [frame[feature] for feature in frame if feature not in ["Timecode", "BlendshapeCount", "velocity", "smoothed_velocity"]]
            data.append(blendshapes)  # 51 features
            labels.append(label)
        stacked_data.append(data)
        all_labels.append(labels)

    print(len(stacked_data[0]))
    print(stacked_data[0])
    print(len(stacked_data))
    print(len(all_labels))
    dataset = TimeSeriesDataset(stacked_data, all_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [8]:
# read occlusions
with np.load("occlusions_results.npz", allow_pickle=True) as data:
    video_occlusions_frames = data["video_occlusions"].item()
print(len(video_occlusions_frames))

"""# read occlusions
with np.load("occlusions_results_timecodes.npz", allow_pickle=True) as data:
    video_occlusions = data["video_occlusions"].item()
print(len(video_occlusions))"""

# read blendshapes from npz
with np.load("blendshapes_timecodes_velocities_cubic_interpolated.npz", allow_pickle=True) as data:
    video_blendshapes_cubic = data["video_blendshapes"].item()
print(len(video_blendshapes_cubic))

125
125


In [21]:
def add_occlusion_labels(video_blendshapes, occlusions):
    for video_name, frames in video_blendshapes.items():
        occlusion_frames = occlusions.get(video_name, [])  # get occlusions for video_name, if not found return empty list
        occlusion_labels = np.zeros(len(frames), dtype=int)
        
        for start, end in occlusion_frames:
            occlusion_labels[start:end+1] = 1
        
        for i, frame in enumerate(frames):
            frame['occluded'] = occlusion_labels[i]
    
    return video_blendshapes

# Add occlusion labels to video_blendshapes_cubic
video_blendshapes_cubic = add_occlusion_labels(video_blendshapes_cubic, video_occlusions_frames)
video_blendshapes_cubic["varg_002_2_pmil"][0]["occluded"]

0

In [22]:
create_dataloader(video_blendshapes_cubic, batch_size=32, shuffle=True)

55
{'BrowDownLeft': 0.0632593184709549, 'BrowDownRight': 0.11483088880777359, 'BrowInnerUp': 0.012185418047010899, 'BrowOuterUpLeft': 0.020801110193133354, 'BrowOuterUpRight': 0.009751986712217331, 'CheekPuff': 7.73931333242217e-06, 'CheekSquintLeft': 2.073321354600921e-07, 'CheekSquintRight': 4.6928110464250494e-07, 'EyeBlinkLeft': 0.07462386041879654, 'EyeBlinkRight': 0.029695723205804825, 'EyeLookDownLeft': 0.238837331533432, 'EyeLookDownRight': 0.20948433876037598, 'EyeLookInLeft': 0.01261200848966837, 'EyeLookInRight': 0.22557225823402405, 'EyeLookOutLeft': 0.21054740250110626, 'EyeLookOutRight': 0.01838945783674717, 'EyeLookUpLeft': 0.053056828677654266, 'EyeLookUpRight': 0.05352300405502319, 'EyeSquintLeft': 0.35084065794944763, 'EyeSquintRight': 0.237966388463974, 'EyeWideLeft': 0.012582349590957165, 'EyeWideRight': 0.02496442012488842, 'JawForward': 2.069354013656266e-05, 'JawLeft': 0.0007355318521149457, 'JawOpen': 0.00710756191983819, 'JawRight': 3.4368607884971425e-05, 'Mou

<torch.utils.data.dataloader.DataLoader at 0x3660369a0>