## Modelling

* count real occlusions over the whole dataset

```
Average length of occlusions: 2.92 frames
Average number of occluded frames per video: 24.15 frames
Average percentage of occluded frames per video: 3.17%
Total number of occluded frames: 2994 out of total for the dataset 101156 
```

* "Target" for real occlusions: 

1. random numbers
2. linear interpolations
3. --> cubic interpolations

* ✅ add noise
* ✅ create DataLoader object
* scaling
* ✅ define train-val-test split
* ✅ create bi-LSTM object
* ✅ create training loop
* ✅ create testing loop
* ✅ create predict function for one video
* save sample .csv of blendshapes for all models (3 so far)

In [7]:
import os
import datetime
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import ArtistAnimation
from sklearn.preprocessing import StandardScaler
from scipy import interpolate
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


# Standardization functions
def fit_and_standardize(data):
    shape = data.shape
    flat = data.reshape((-1, data.shape[-1]))
    scaler = StandardScaler().fit(flat)
    scaled = scaler.transform(flat).reshape(shape)
    return scaled, scaler

def standardize(data, scaler):
    shape = data.shape
    flat = data.reshape((-1, data.shape[-1]))
    scaled = scaler.transform(flat).reshape(shape)
    return scaled


def inv_standardize(data, scaler):
    shape = data.shape
    flat = data.reshape((-1, data.shape[-1]))
    scaled = scaler.inverse_transform(flat).reshape(shape)
    return scaled


# Data processing function
def process_data(data):
    gaps = np.isnan(data)
    all_null = np.sum(np.sum(gaps, axis=1), axis=1) > gaps.shape[1] * gaps.shape[2] * 0.8
    data = data[~all_null, :, :]
    gaps = gaps[~all_null, :, :]
    keep = ~np.isnan(np.sum(data, axis=(1, 2)))
    return data[keep, :, :], gaps[keep, :, :]

In [8]:
# read blendshapes from npz
with np.load("blendshapes_timecodes_velocities_cubic_interpolated.npz", allow_pickle=True) as data:
    video_blendshapes_cubic = data["video_blendshapes"].item()
print(len(video_blendshapes_cubic))

# Define the rest face blendshapes
calibration_path = "./20241101_face-calib_002_2"
rest_face_blendshapes = {k:v for k, v in video_blendshapes_cubic["face-calib_002_2_pmil"][5].items()} # if k not in ["Timecode", "BlendshapeCount", "velocity", "smoothed_velocity"]}

smoothed_velocity_rest_face = rest_face_blendshapes.pop("smoothed_velocity")
for k in ["Timecode", "BlendshapeCount", "velocity"]:
    rest_face_blendshapes.pop(k, None)
rest_face_frame_as_list = list(rest_face_blendshapes.values())   # F frames, 51 features
len(rest_face_frame_as_list)

125


51

In [3]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class TimeSeriesDataset(Dataset):
    def __init__(self, data, labels=None, smoothed_velocities=None, occ_prob=0.3, noise_std=0.01):
        """
        Args:
            data (np.array): Ground truth data of shape (num_videos, num_frames, blendshapes_dim). 125 videos, F frames, 51 features
            labels (np.array): Binary occlusion labels of shape (num_videos, num_frames).  (125, F)
            smoothed_velocities (np.array): Smoothed velocity for each frame of the video (num_videos, num_frames).  (125, F)
            occ_prob (float): Probability of adding synthetic occlusions.
            noise_std (float): Standard deviation of Gaussian noise to add to data.
        """
        self.data = data.copy()
        self.labels = labels.copy()
        self.smoothed_velocities = smoothed_velocities.copy()
        self.occ_prob = occ_prob
        self.noise_std = noise_std

        # Add Gaussian noise to the entire dataset
        self.data = self.add_noise(self.data)

        self.data, self.labels, self.mask = self.insert_fake_occlussions() 

        # Sort data over second dimension for bucketing
        # print("len(self.data): ", len(self.data))
        # self.data = [sorted(video, key=lambda x: len(x), reverse=True) for video in self.data]

    def __len__(self):
        return len(self.data)
    
    def insert_fake_occlussions(self):
        data = self.data.copy()  # (125 videos, F frames, 51 features)
        labels = self.labels.copy()
        # Create one hot vector that says which data is real 1 and which data is fake 0
        # mask = np.ones_like(labels, dtype=int)  # 1 for real, 0 for synthetic
        mask = [[1] * len(video) for video in data]

        total_frames = sum([len(video) for video in data])
        occ_p_init = sum([sum(video) for video in labels]) / total_frames
        print(f"\nPercentage of REAL occlusions: {occ_p_init}\n")
        
        for video_i, video in enumerate(data):
            len_video = len(video)
            average_velocity = np.mean(self.smoothed_velocities[video_i])
            std_velocity = np.std(self.smoothed_velocities[video_i])
            print(f"Video {video_i} length: {len_video}")
            for frame_n, frame in enumerate(video):

                # if the current frame velocity is higher than the average + 1 std, then add an occlusion with a probability
                if self.smoothed_velocities[video_i][frame_n] > average_velocity + std_velocity:
                    
                    if np.random.rand() < self.occ_prob:
                        occ_len = int(np.clip(np.random.normal(2.92, 12.05), 2, 24))   # mean and std from the dataset
                        # print(f"Occ len: {occ_len}")
                        # start = np.random.randint(0, len_video - occ_len)
                        start = frame_n
                        end = start + occ_len
                        if end + 1 >= len_video:
                            end = len_video - 2
                        # check if the occlusion is not already there and update mask only where there wasn't an occlusion already
                        mask_update_indices = (np.array(labels[video_i][start:end+1]) != 1)
                        # print(f"Mask update indices: {mask_update_indices}")
                        mask[video_i][start:end+1] = [0 if mask_update_indices[k] else el for k, el in enumerate(mask[video_i][start:end+1])]  # update mask with zeros where fake occlusions are
                        labels[video_i][start:end+1] = [1 if mask_update_indices[k] else el for k, el in enumerate(labels[video_i][start:end+1])] # update labels to have 1 for occlusions
                        # do nothing to the `data` as it stands for ground truth

        # total percentage of all occlusions after inserting fake ones 
        occ_p = sum([sum(video) for video in labels]) / total_frames
        print(f"\nPercentage of occlusions: {occ_p}\n")
        # percentage of masked data
        mask_p = 1 - (sum([sum(video) for video in mask]) / total_frames)
        print(f"\nPercentage of masked data: {mask_p}\n")
        return data, labels, mask
    
    def add_noise(self, data):
        # Add Gaussian noise across all frames in each video
        all_blendshapes = []
        for video in data:
            blendshapes_in_video = []
            for frame in video:
                # order the keys in dict `frame` alphabetically
                frame = dict(sorted(frame.items()))
                blendshapes = frame.values()  # 51 features
                if len(blendshapes) != 51:
                    print("ERROR: Number of blendshapes is not 51")
                    blendshapes = [0.0] * 51
                blendshapes = np.array(list(blendshapes))
                blendshapes_in_video.append(blendshapes)
            # Apply noise across all frames in the video
            blendshapes_in_video = np.array(blendshapes_in_video)  # F, 51
            print(blendshapes_in_video.shape)
            noise = np.random.normal(0, self.noise_std, blendshapes_in_video.shape)
            blendshapes_in_video = blendshapes_in_video + noise
            all_blendshapes.append(blendshapes_in_video)
        # noise = np.random.normal(0, self.noise_std, data_blendshapes_lists.shape)
        return all_blendshapes

    def __getitem__(self, idx):
        sample = self.data[idx]
        mask = self.mask[idx]

        if self.labels is not None:
            sample_label = self.labels[idx]
        else:
            sample_label = None

        return torch.tensor(np.array(sample), dtype=torch.float32), \
               torch.tensor(np.array(sample_label), dtype=torch.float32), \
               torch.tensor(np.array(mask), dtype=torch.float32)
        

def collate_fn(batch):
    data = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    mask = [item[2] for item in batch]

    sequence_lengths = torch.tensor([len(seq) for seq in data])

    # padding the sorted sequences and keeping their lengths
    data = nn.utils.rnn.pad_sequence(data, batch_first=True, padding_value=0.0)  # rest_face_frame_as_list)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=0)
    mask = nn.utils.rnn.pad_sequence(mask, batch_first=True, padding_value=1)
    return data, labels, mask, sequence_lengths


def create_dataloader(all_data, batch_size=8, shuffle=True):
    stacked_data = []  # 125 videos, F frames, 51 features
    all_labels = []   # 125 videos, F frames
    smoothed_velocities = []  # 125 videos, F frames
    for video, video_frames in all_data.items():
        data = []  # F, 51
        labels = []  # F
        smoothed_velocities_per_video = []  # F
        for frame in video_frames:
            label = frame.pop("occluded")
            smoothed_velocity = frame.pop("smoothed_velocity")  # to create fake occlusions mostly where the changes are and not in the rest frames (window=5)
            for k in ["Timecode", "BlendshapeCount", "velocity"]:
                frame.pop(k, None)
            data.append(frame)  # 51 features
            labels.append(label)
            smoothed_velocities_per_video.append(smoothed_velocity)
        stacked_data.append(data)
        all_labels.append(labels)
        smoothed_velocities.append(smoothed_velocities_per_video)  # 125 videos, F frames

        print(f"Video {video} length: {len(video_frames)}")

    print(len(stacked_data[0]))
    # print(stacked_data[0])
    print(len(stacked_data))
    print(len(all_labels))
    dataset = TimeSeriesDataset(stacked_data, all_labels, smoothed_velocities)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

Using device: cuda


In [10]:
# read occlusions
with np.load("occlusions_results.npz", allow_pickle=True) as data:
    video_occlusions_frames = data["video_occlusions"].item()
print(len(video_occlusions_frames))

"""# read occlusions
with np.load("occlusions_results_timecodes.npz", allow_pickle=True) as data:
    video_occlusions = data["video_occlusions"].item()
print(len(video_occlusions))"""

print(video_occlusions_frames['varg_002_2_pmil'])

print(len(video_blendshapes_cubic['varg_002_2_pmil']))

125
[(151, 153), (320, 321), (534, 542), (553, 560), (561, 562), (564, 566), (581, 593), (594, 594), (595, 595), (596, 596), (597, 600), (601, 605), (613, 614), (616, 618), (625, 627), (652, 664), (824, 829), (830, 834), (841, 845), (865, 879), (880, 883), (884, 885), (929, 930), (932, 933), (934, 942), (943, 946)]
1054


In [5]:
def add_occlusion_labels(video_blendshapes, occlusions):
    for video_name, frames in video_blendshapes.items():
        occlusion_frames = occlusions.get(video_name, [])  # get occlusions for video_name, if not found return empty list
        occlusion_labels = np.zeros(len(frames), dtype=int)
        
        for start, end in occlusion_frames:
            occlusion_labels[start:end+1] = 1
        
        for i, frame in enumerate(frames):
            frame['occluded'] = occlusion_labels[i]
    
    return video_blendshapes


# train test val split of original npz before creating fake occlusions and before packing it into dataloader
# TODO: separate out real occlusions into test and val, train only on fake occlusions
def train_test_val_split(data, train_p=0.7, test_p=0.15, val_p=0.15):
    # Split by number of videos
    """
    Args:
        data (dict of dicts): List of dictionaries where each dictionary represents a video. 125 videos, 51 features for each frame in each video 
    """
    np.random.seed(42)
    data_values, video_names = list(data.values()), list(data.keys())
    np.random.shuffle(data_values)

    train_data = data_values[:int(len(data_values) * train_p)]
    test_data = data_values[int(len(data_values) * train_p):int(len(data_values) * (train_p + test_p))]
    val_data = data_values[int(len(data_values) * (train_p + test_p)):]

    print(f"Train data: {len(train_data)} videos, Test data: {len(test_data)} videos, Validation data: {len(val_data)} videos")

    # turn the list of dictionaries back into a dictionary of dictionaries
    train_data = {video_names[i]: train_data[i] for i in range(len(train_data))}
    test_data = {video_names[i]: test_data[i] for i in range(len(test_data))}
    val_data = {video_names[i]: val_data[i] for i in range(len(val_data))}

    return train_data, test_data, val_data


# Add occlusion labels to video_blendshapes_cubic
video_blendshapes_cubic = add_occlusion_labels(video_blendshapes_cubic, video_occlusions_frames)
print(video_blendshapes_cubic["varg_002_2_pmil"][0]["occluded"])

# Split the data into train, test, and validation sets
train_data, test_data, val_data = train_test_val_split(video_blendshapes_cubic)

0
Train data: 87 videos, Test data: 19 videos, Validation data: 19 videos


In [6]:
test_loader = create_dataloader(test_data, batch_size=8, shuffle=False)

Video static_ROOF_fingers_contact_at_45_degrees_neutral_location_001_1_pmil length: 592
Video varg_001_1_pmil length: 442
Video bi_005_5_pmil length: 1557
Video fran_001_1_pmil length: 768
Video parti-vill-kalla-grupp-for-terrorister-1-det_001_1_pmil length: 410
Video parti-vill-kalla-grupp-for-terrorister-4-nu_002_2_pmil length: 1289
Video katt_001_1_pmil length: 447
Video bok_002_2_pmil length: 328
Video varg_003_3_pmil length: 1078
Video parti-vill-kalla-grupp-for-terrorister-1-det_003_3_pmil length: 937
Video djur_001_1_pmil length: 442
Video abborre_002_2_pmil length: 62
Video parti-vill-kalla-grupp-for-terrorister-5-bosattarna_003_3_pmil length: 827
Video parti-vill-kalla-grupp-for-terrorister-7-det_001_1_pmil length: 1027
Video hundvalp_001_1_pmil length: 983
Video left_hand_on_top_of_the_battery_neutral_location_001_1_pmil length: 153
Video parti-vill-kalla-grupp-for-terrorister-5-bosattarna_001_1_pmil length: 956
Video touch_chin_with_index_finger_side_of_the_chin_far_from_act

In [7]:
for batch in test_loader:
    print(batch[0].shape)  # sample
    print(batch[1].shape)  # sample_label
    print(batch[2].shape)  # mask
    print(batch[3].shape)
    print(batch[3])  # sequence lengths

torch.Size([8, 1557, 51])
torch.Size([8, 1557])
torch.Size([8, 1557])
torch.Size([8])
tensor([ 592,  442, 1557,  768,  410, 1289,  447,  328])
torch.Size([8, 1078, 51])
torch.Size([8, 1078])
torch.Size([8, 1078])
torch.Size([8])
tensor([1078,  937,  442,   62,  827, 1027,  983,  153])
torch.Size([3, 956, 51])
torch.Size([3, 956])
torch.Size([3, 956])
torch.Size([3])
tensor([956, 824, 747])


In [8]:
len(test_loader)

3

In [9]:
train_loader = create_dataloader(train_data, batch_size=8, shuffle=True)
val_loader = create_dataloader(val_data, batch_size=8, shuffle=False)

Video static_ROOF_fingers_contact_at_45_degrees_neutral_location_001_1_pmil length: 531
Video varg_001_1_pmil length: 656
Video bi_005_5_pmil length: 1136
Video fran_001_1_pmil length: 451
Video parti-vill-kalla-grupp-for-terrorister-1-det_001_1_pmil length: 438
Video parti-vill-kalla-grupp-for-terrorister-4-nu_002_2_pmil length: 366
Video katt_001_1_pmil length: 1191
Video bok_002_2_pmil length: 491
Video varg_003_3_pmil length: 521
Video parti-vill-kalla-grupp-for-terrorister-1-det_003_3_pmil length: 583
Video djur_001_1_pmil length: 955
Video abborre_002_2_pmil length: 132
Video parti-vill-kalla-grupp-for-terrorister-5-bosattarna_003_3_pmil length: 504
Video parti-vill-kalla-grupp-for-terrorister-7-det_001_1_pmil length: 2488
Video hundvalp_001_1_pmil length: 1358
Video left_hand_on_top_of_the_battery_neutral_location_001_1_pmil length: 662
Video parti-vill-kalla-grupp-for-terrorister-5-bosattarna_001_1_pmil length: 1319
Video touch_chin_with_index_finger_side_of_the_chin_far_from_a

In [21]:
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=3, dropout=0.3, bidirectional=True):
        super(BiLSTMModel, self).__init__()
        self.num_directions = 2 if bidirectional else 1
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_size * self.num_directions, output_size)
        self.fc2 = nn.Linear(output_size, input_size)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.ReLU()

    def forward(self, x, seq_lens):
        # print("forward x.shape: ", x.shape, "seq_lens_sort.shape: ", seq_lens.shape)  # forward x.shape:  torch.Size([8, 2060, 51]) seq_lens_sort.shape:  torch.Size([8])
    
        # Check if single sequence
        if x.size(0) == 1:
            # Directly pass the single sequence through the LSTM
            y, _ = self.lstm(x)  # No packing needed
            y = self.fc(y)
            y = self.act(y)
            y = self.dropout(y)
            y = self.fc2(y)
            return y
        
        # sort input by descending length
        _, idx_sort = torch.sort(seq_lens, dim=0, descending=True)
        idx_sort = idx_sort.to(device)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        idx_unsort = idx_unsort.to(device)
        x_sort = torch.index_select(x, dim=0, index=idx_sort)
        x_sort = x_sort.to(device)
        seq_lens_sort = torch.index_select(seq_lens, dim=0, index=idx_sort)
        seq_lens_sort = seq_lens_sort.to(device)

        seq_lens_sort = seq_lens_sort.cpu()
        x_packed = pack_padded_sequence(
            x_sort, seq_lens_sort, batch_first=True)  #, enforce_sorted=False)
        
        # forward x_packed.data.shape:  torch.Size([6653, 51])
        # print("forward x_packed.data.shape: ", x_packed.data.shape)

        # pass through rnn
        y_packed, _ = self.lstm(x_packed)

        # unpack output
        y_sort, length = pad_packed_sequence(y_packed.to(device), batch_first=True)
        # unsort output to original order
        y = torch.index_select(y_sort, dim=0, index=idx_unsort)

        out = self.fc(y)
        out = self.act(out)
        out = self.dropout(out)
        out = self.fc2(out)

        #print("forward out.shape: ", out.shape)
        # forward out.shape:  torch.Size([8, 51])
        return out


def masked_loss(output, target, labels, mask, loss_fn, epoch):
    # refuralize between frame
    loss = loss_fn(output, target)
    # Warm-up for the first 4 epochs on ground truth data
    #if epoch < 4:  
        # loss over only fake occlusions
        # mask = 1 for real, 0 for fake data
        # only fake occlusions = all zeros in mask
        # flip the mask to get only fake occlusions
    #    mask = mask.unsqueeze(-1)
    #    return (loss * (1 - mask)).sum() / (1 - mask).sum()
    # else:
        # loss over all occlusions
        # labels = 1 for occluded, 0 for not occluded
    labels = labels.unsqueeze(-1)
    return (loss * labels).sum() / labels.sum()


def train_model(model, train_loader, val_loader, criterion, optimizer, epochs, writer):
    for epoch in range(epochs):
        model.train()

        running_loss = 0.0
        # data, labels, mask, sequence_lengths
        for batch_features, batch_labels, batch_mask, batch_lengths in train_loader:
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)
            batch_mask = batch_mask.to(device)
            batch_lengths = batch_lengths.to(device)

            # Forward pass 
            #print("batch_features.shape: ", batch_features.shape, "batch_mask.shape: ", batch_mask.shape)
            #print("batch_mask.unsqueeze(-1).shape: ", batch_mask.unsqueeze(-1).shape)
            batch_features_masked = batch_features * batch_mask.unsqueeze(-1)
            batch_features_masked = batch_features_masked.to(device)
            outputs = model(batch_features_masked, batch_lengths)
            #print("outputs.shape: ", outputs.shape)
            
            # Compute masked loss
            #print("batch_labels.shape: ", batch_labels.shape)
            loss = masked_loss(outputs, batch_features, batch_labels, batch_mask, criterion, epoch)
            running_loss += loss.item()

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        running_loss /= len(train_loader)
        print(f"Epoch {epoch + 1}, Average loss: {running_loss:.4f}")
        writer.add_scalar("Loss/train", running_loss, epoch)

        # Validation loss
        model.eval()
        with torch.no_grad():
            val_loss = 0
            for batch_features, batch_labels, batch_mask, batch_lengths in val_loader:
                batch_features = batch_features.to(device)
                batch_labels = batch_labels.to(device)
                batch_mask = batch_mask.to(device)
                batch_lengths = batch_lengths.to(device)
            
                batch_features_masked = batch_features * batch_mask.unsqueeze(-1)
                batch_features_masked = batch_features_masked.to(device)
                # Forward pass on validation data
                outputs = model(batch_features_masked, batch_lengths)

                # Compute masked loss
                loss = masked_loss(outputs, batch_features, batch_labels, batch_mask, criterion, epoch)
                val_loss += loss.item()
            
            val_loss /= len(val_loader)

            print(f"Average validation Loss: {val_loss:.4f}")
            writer.add_scalar("Loss/val", val_loss, epoch)

In [13]:
!pip install tensorboard

Collecting tensorboard
  Using cached tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.68.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Using cached Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting protobuf!=4.24.0,>=3.19.6 (from tensorboard)
  Downloading protobuf-5.29.1-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Using cached tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Downloading werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Using cached tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
Using cached absl_py-2.1.0-py3-none-any.whl (133 kB)
Downloading g

In [44]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()


# Example setup
if __name__ == "__main__":
    # Example data
    #num_samples = 100
    #time_steps = 50
    #features = 10
    #data = np.random.rand(num_samples, time_steps, features)
    #labels = np.random.rand(num_samples, 1)

    # Preprocess data
    # standardized_data, scaler = fit_and_standardize(data)

    # Create DataLoader
    batch_size = 8

    # Initialize model, loss, and optimizer
    input_size = 51
    hidden_size = 128  # 128 on 80 epochs looked alright
    num_layers = 2  # 3 initially
    dropout = 0.4
    output_size = 51

    model = BiLSTMModel(input_size, hidden_size, output_size, num_layers=num_layers, dropout=dropout).to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss(reduction='none')

    # Train model
    epochs = 100
    train_model(model, train_loader, val_loader, criterion, optimizer, epochs, writer)

Epoch 1, Average loss: 21.7245
Average validation Loss: 2.4141
Epoch 2, Average loss: 23.2867
Average validation Loss: 2.0839
Epoch 3, Average loss: 14.5587
Average validation Loss: 1.9374
Epoch 4, Average loss: 18.3226
Average validation Loss: 1.8847
Epoch 5, Average loss: 14.6346
Average validation Loss: 1.8451
Epoch 6, Average loss: 13.4768
Average validation Loss: 1.8387
Epoch 7, Average loss: 11.6035
Average validation Loss: 1.8191
Epoch 8, Average loss: 16.5129
Average validation Loss: 1.8046
Epoch 9, Average loss: 7.6271
Average validation Loss: 1.7767
Epoch 10, Average loss: 7.3603
Average validation Loss: 1.7484
Epoch 11, Average loss: 5.2440
Average validation Loss: 1.7486
Epoch 12, Average loss: 5.0275
Average validation Loss: 1.7254
Epoch 13, Average loss: 4.5403
Average validation Loss: 1.7068
Epoch 14, Average loss: 6.4754
Average validation Loss: 1.6961
Epoch 15, Average loss: 5.3203
Average validation Loss: 1.6539
Epoch 16, Average loss: 4.2665
Average validation Loss: 

In [45]:
def test_model(model, test_loader, criterion, writer):
    model.eval()
    with torch.no_grad():
        test_loss = 0
        for batch_features, batch_labels, batch_mask, batch_lengths in test_loader:
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)
            batch_mask = batch_mask.to(device)
            batch_lengths = batch_lengths.to(device)
            
            batch_features_masked = batch_features * batch_mask.unsqueeze(-1)
            batch_features_masked = batch_features_masked.to(device)
            # Forward pass on test data
            outputs = model(batch_features_masked, batch_lengths)

            # Compute masked loss
            loss = masked_loss(outputs, batch_features, batch_labels, batch_mask, criterion, epoch=None)
            test_loss += loss.item()
            
        test_loss /= len(test_loader)

        print(f"Average test Loss: {test_loss:.4f}")
        writer.add_scalar("Loss/test", test_loss)

test_model(model, test_loader, criterion, writer)
        

Average test Loss: 0.5661


```
input_size = 51
hidden_size = 128 
num_layers = 3
dropout = 0.4
output_size = 51
epochs = 80
```

Average test Loss: 0.6328 in `bilstm-40dropout-80.pt`

```
input_size = 51
hidden_size = 128  # 128 on 80 epochs looked alright
num_layers = 2  # 3 initially
dropout = 0.4
output_size = 51
epochs = 100
```

Average test Loss: 0.5661 in `bilstm-40dropout-2layers-100.pt`

In [8]:
# launch tensorboard
!tensorboard --logdir=runs --bind_all

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

TensorBoard 2.18.0 at http://signbot1:6006/ (Press CTRL+C to quit)
^C


In [46]:
# save model
torch.save(model.state_dict(), f"bilstm-40dropout-{num_layers}layers-{epochs}.pt")

In [22]:
# load saved model from state dict
input_size = 51
hidden_size = 128  # 128 on 80 epochs looked alright
num_layers = 2  # 3 initially
dropout = 0.4
output_size = 51
epochs = 100

model_path = "bilstm-40dropout-2layers-100.pt"
model = BiLSTMModel(input_size, hidden_size, output_size, num_layers=num_layers, dropout=dropout).to(device)
model.load_state_dict(torch.load(model_path))

  model.load_state_dict(torch.load(model_path))


<All keys matched successfully>

In [39]:
# Predict data for real occlusions on all of the data
# Save it as blendshapes csv for each video


def predict_data(model, data, mask, lengths):
    model.eval()
    with torch.no_grad():
        data = torch.tensor(data, dtype=torch.float32).to(device)
        mask = torch.tensor(mask, dtype=torch.float32).to(device)
        lengths = torch.tensor(lengths, dtype=torch.float32).to(device)
        print("data.shape, mask.shape, lengths.shape: ", data.shape, mask.shape, lengths.shape)  # ([354, 51]) ([354]) ([354])
        data_masked = data * mask.unsqueeze(-1)
        data_masked = data_masked.to(device)
        print("data_masked.shape: ", data_masked.shape)  # ([354, 51])
        outputs = model(data_masked, lengths)
    return outputs

def save_blendshapes(outputs, video_name, blendshape_names, real_meta_keys, output_dir):
    # outputs, video_name, blendshape_names, real_meta_keys, "predicted_blendshapes"
    # outputs: F, 51
    outputs = outputs.cpu().numpy()
    frame_as_list = []

    for i, frame in enumerate(outputs[0]):  # only one batch
        frame_dict = {}

        # take the rest of the keys from real data in video_blendshapes_cubic
        frame_dict["Timecode"] = real_meta_keys[i]["Timecode"]
        frame_dict["BlendshapeCount"] = real_meta_keys[i]["BlendshapeCount"]

        for j in range(len(frame)):
            frame_dict[blendshape_names[j]] = frame[j]

        # frame_dict["velocity"] = real_meta_keys[i]["velocity"]
        # frame_dict["smoothed_velocity"] = real_meta_keys[i]["smoothed_velocity"]
        frame_as_list.append(frame_dict)
    
    # save as csv file: list of dicts
    with open(f"{output_dir}/{video_name}_blendshapes.csv", "w") as f:
        # write headers
        f.write(",".join(["Timecode", "BlendshapeCount"] + blendshape_names) + "\n")
        for frame in frame_as_list:
            f.write(",".join([f"{frame[k]}" for k in frame_dict.keys()]) + "\n")


# extract a list of blendshape names
blendshape_names = list(rest_face_blendshapes.keys())
print(blendshape_names)

# read blendshapes from npz
with np.load("blendshapes_timecodes_velocities_cubic_interpolated.npz", allow_pickle=True) as data:
    video_blendshapes_cubic = data["video_blendshapes"].item()
print(len(video_blendshapes_cubic))
# Add occlusion labels to video_blendshapes_cubic
video_blendshapes_cubic = add_occlusion_labels(video_blendshapes_cubic, video_occlusions_frames)
print(video_blendshapes_cubic["varg_002_2_pmil"][0]["occluded"])
print(video_blendshapes_cubic["varg_002_2_pmil"][0].keys())

for video_name, video_frames in video_blendshapes_cubic.items():
    print(f"Predicting occluded blendshapes for video: {video_name}")
    data = []
    mask = []
    lengths = []
    real_meta_keys = []
    for frame in video_frames:
        label = frame.pop("occluded")   # 1 for occluded, 0 for not occluded
        # reverse mask, to mask occluded data and keep the rest, otherwise I have 1 for occluded and 0 for not occluded
        label = 1 - label
        real_timecode = frame.pop("Timecode", None)
        real_blendshape_count = frame.pop("BlendshapeCount", None)
        real_velocity = frame.pop("velocity", None)
        real_smoothed_velocity = frame.pop("smoothed_velocity", None)
        real_meta_keys.append({"Timecode": real_timecode, "BlendshapeCount": real_blendshape_count, "velocity": real_velocity, "smoothed_velocity": real_smoothed_velocity})

        data.append(list(frame.values()))
        mask.append(label)
        lengths.append(len(frame))

    # check data length
    try:
        data = np.array(data)
    except ValueError:
        print(f"Inhomogeneous number of blendshapes in {video_name}")
        new_data = []
        for frame in data:
            if len(frame) != 51:
                frame = [0.0] * 51
            new_data.append(frame)
        data = np.array(new_data)

    # outputs = predict_data(model, data, mask, lengths)
    # Perform inference
    model.eval()
    with torch.no_grad():
        data = torch.tensor(data, dtype=torch.float32).to(device)
        mask = torch.tensor(mask, dtype=torch.float32).to(device)
        lengths = torch.tensor(lengths, dtype=torch.float32).to(device)

        sequence = data.unsqueeze(0).to(device)  # Add batch dimension
        mask = mask.unsqueeze(0).to(device)  # Add batch dimension
        lengths = lengths.unsqueeze(0).to(device)  # Add batch dimension
        masked_sequence = sequence * mask.unsqueeze(-1)
        outputs = model(masked_sequence, lengths).to(device)
    
    save_blendshapes(outputs, video_name, blendshape_names, real_meta_keys, "predicted_blendshapes")

['BrowDownLeft', 'BrowDownRight', 'BrowInnerUp', 'BrowOuterUpLeft', 'BrowOuterUpRight', 'CheekPuff', 'CheekSquintLeft', 'CheekSquintRight', 'EyeBlinkLeft', 'EyeBlinkRight', 'EyeLookDownLeft', 'EyeLookDownRight', 'EyeLookInLeft', 'EyeLookInRight', 'EyeLookOutLeft', 'EyeLookOutRight', 'EyeLookUpLeft', 'EyeLookUpRight', 'EyeSquintLeft', 'EyeSquintRight', 'EyeWideLeft', 'EyeWideRight', 'JawForward', 'JawLeft', 'JawOpen', 'JawRight', 'MouthClose', 'MouthDimpleLeft', 'MouthDimpleRight', 'MouthFrownLeft', 'MouthFrownRight', 'MouthFunnel', 'MouthLeft', 'MouthLowerDownLeft', 'MouthLowerDownRight', 'MouthPressLeft', 'MouthPressRight', 'MouthPucker', 'MouthRight', 'MouthRollLower', 'MouthRollUpper', 'MouthShrugLower', 'MouthShrugUpper', 'MouthSmileLeft', 'MouthSmileRight', 'MouthStretchLeft', 'MouthStretchRight', 'MouthUpperUpLeft', 'MouthUpperUpRight', 'NoseSneerLeft', 'NoseSneerRight']
125
0
dict_keys(['BrowDownLeft', 'BrowDownRight', 'BrowInnerUp', 'BrowOuterUpLeft', 'BrowOuterUpRight', 'Cheek