### Imports

In [1]:
import random
from typing import Any, Dict, List
from pathlib import Path
import json
from importlib import import_module

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as opt
from tqdm.auto import tqdm
from sklearn.metrics import recall_score, precision_score, precision_recall_curve

from src.model import StochasticModel, SimpleFastTextClassifier, SimpleAudioClassifier, ComplicatedAudioClassifier, ResNet, AST
from utils.metrics import binary_weighted_accuracy
from utils.parameters import SEED, LABELS_NAMES

### Device and reproducibility settings

In [2]:
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on machines without CUDA.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Seed every random number generator the notebook can touch.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seeds all visible CUDA devices; silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and disable its autotuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### Data loading

In [3]:
# Locations inside the CMU-MOSEI dataset folder, relative to the working directory.
DATA_DIR = Path("data", "CMU-MOSEI")
LABELS_DIR = DATA_DIR.joinpath("Labels")
AUDIO_DIR = DATA_DIR.joinpath("Audio_chunk")

In [4]:
def _binarize_labels(labels_csv: pd.DataFrame):
    """Binarize every `LABELS_NAMES` column of `labels_csv` in place.

    Each value greater than zero becomes 1 and the column is cast to `int`.

    Returns:
        A frame with one value-count column per label, or `None` when
        `LABELS_NAMES` is empty.
    """
    counts = []
    for label_name in LABELS_NAMES:
        labels_csv.loc[labels_csv[label_name] > 0, label_name] = 1
        labels_csv[label_name] = labels_csv[label_name].astype(int)
        counts.append(labels_csv[label_name].value_counts())
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(counts, axis=1) if counts else None


train_mod_csv = pd.read_csv(LABELS_DIR / "Data_Train_modified.csv")
val_mod_csv = pd.read_csv(LABELS_DIR / "Data_Val_modified.csv")
test_mod_csv = pd.read_csv(LABELS_DIR / "Data_Test_modified.csv")

val_orig_csv = pd.read_csv(LABELS_DIR / "Data_Val_original_without_neg_time.csv")
test_orig_csv = pd.read_csv(LABELS_DIR / "Data_Test_original_without_neg_time.csv")

# Only these three frames are binarized: train and validation use the
# "modified" label files, test uses the "original" one.
train_info = _binarize_labels(train_mod_csv)
val_info = _binarize_labels(val_mod_csv)
test_info = _binarize_labels(test_orig_csv)

In [5]:
def get_text_features(csv: pd.DataFrame, dir_name: str, embedding_dim: int = 300):
    """Load the FastText features of every segment listed in `csv`.

    For each row, two vectors are read from
    `<features root>/<dir_name>/<video>_<start>_<end>.npy` (times formatted with
    four decimals): one from the "markuped" text features and one from the
    "recognised" text features. The two are concatenated along axis 0.

    Args:
        csv: frame with the columns "video", "start_time" and "end_time".
        dir_name: sub-directory (data split) to read the features from.
        embedding_dim: length of the zero vector substituted for a missing file.

    Returns:
        A float32 tensor with one row per row of `csv`.
    """
    csv = csv[["video", "start_time", "end_time"]]
    markuped_features_dir = Path("data/CMU-MOSEI/fasttext_featrues_markuped_text")
    recognised_features_dir = Path("data/CMU-MOSEI/fasttext_featrues_recognised_text")

    def _load_or_zeros(feature_path: Path) -> np.ndarray:
        # A missing file is reported and replaced by zeros so that the output
        # keeps exactly one row per segment.
        try:
            return np.load(feature_path)
        except FileNotFoundError as exception:
            print(exception)
            return np.zeros(embedding_dim)

    features: List[torch.Tensor] = list()
    progress_bar = tqdm(csv.values, desc="Features loading")
    for ytid, start_time, end_time in progress_bar:
        file_name = f"{ytid}_{float(start_time):.4f}_{float(end_time):.4f}.npy"
        markuped_feature = _load_or_zeros(markuped_features_dir / dir_name / file_name)
        recognised_feature = _load_or_zeros(recognised_features_dir / dir_name / file_name)
        feature = np.concatenate((markuped_feature, recognised_feature), axis=0)
        features.append(torch.from_numpy(feature))

    return torch.stack(features).to(torch.float32)

def get_audio_features(csv: pd.DataFrame, dir_name: str):
    """Load the saved audio feature tensor of every segment listed in `csv`.

    Each feature is read from
    `data/CMU-MOSEI/audio_featrues/<dir_name>/<video>_<start>_<end>.pt`
    (times formatted with four decimals).

    A missing file is reported and replaced by an all-zero tensor shaped like
    the loaded features. Skipping it instead would shorten the output and shift
    every later row against the label frame.

    Args:
        csv: frame with the columns "video", "start_time" and "end_time".
        dir_name: sub-directory (data split) to read the features from.

    Returns:
        A float32 tensor whose first dimension matches the number of rows in `csv`.

    Raises:
        RuntimeError: if files are missing and no feature could be loaded at
            all, or (from `torch.stack`) if `csv` has no rows.
    """
    csv = csv[["video", "start_time", "end_time"]]
    features_dir = Path("data/CMU-MOSEI/audio_featrues")
    features: list = list()  # a tensor per row; None marks a missing file
    missing_rows: List[int] = list()

    progress_bar = tqdm(csv.values, desc="Features loading")
    for row_index, (ytid, start_time, end_time) in enumerate(progress_bar):
        file_name = f"{ytid}_{float(start_time):.4f}_{float(end_time):.4f}.pt"
        try:
            features.append(torch.load(features_dir / dir_name / file_name))
        except FileNotFoundError as exception:
            print(exception)
            features.append(None)
            missing_rows.append(row_index)

    if missing_rows:
        # The feature shape is only known from files that did load.
        template = next((feature for feature in features if feature is not None), None)
        if template is None:
            raise RuntimeError(f"No audio feature could be loaded from {features_dir / dir_name}")
        for row_index in missing_rows:
            features[row_index] = torch.zeros_like(template)

    return torch.stack(features).to(torch.float32)

In [6]:
# Validation split: label matrix plus text and audio features, all built from
# the same label frame.
val_targets = torch.tensor(val_mod_csv[LABELS_NAMES].to_numpy(), dtype=torch.float32)
val_text_features = get_text_features(val_mod_csv, dir_name="val_modified")
val_audio_features = get_audio_features(val_mod_csv, dir_name="val_modified")

Features loading:   0%|          | 0/1861 [00:00<?, ?it/s]

Features loading:   0%|          | 0/1861 [00:00<?, ?it/s]

In [7]:
# Test split: label matrix plus text and audio features, all built from the
# same label frame.
test_targets = torch.tensor(test_orig_csv[LABELS_NAMES].to_numpy(), dtype=torch.float32)
test_text_features = get_text_features(test_orig_csv, dir_name="test_original")
test_audio_features = get_audio_features(test_orig_csv, dir_name="test_original")

Features loading:   0%|          | 0/4662 [00:00<?, ?it/s]

[Errno 2] No such file or directory: 'data/CMU-MOSEI/fasttext_featrues_recognised_text/test_original/180971_0.0000_0.5720.npy'
[Errno 2] No such file or directory: 'data/CMU-MOSEI/fasttext_featrues_recognised_text/test_original/194299_0.0000_0.5920.npy'
[Errno 2] No such file or directory: 'data/CMU-MOSEI/fasttext_featrues_recognised_text/test_original/267466_0.0000_0.6520.npy'
[Errno 2] No such file or directory: 'data/CMU-MOSEI/fasttext_featrues_recognised_text/test_original/46495_0.0000_0.6620.npy'
[Errno 2] No such file or directory: 'data/CMU-MOSEI/fasttext_featrues_recognised_text/test_original/DjcZrtcBZi4_0.0000_1.7297.npy'


Features loading:   0%|          | 0/4662 [00:00<?, ?it/s]

### Model loading

In [8]:
def load_model(config_path: str, checkpoint_path: str, device: str):
    """Build a model from a JSON experiment config and restore its weights.

    The config must contain a "model" section with:
      * "source": importable module path that defines the model class,
      * "name":   name of the class inside that module,
      * "prams":  keyword arguments passed to the class constructor
                  (the key is spelled this way in the config files).

    Args:
        config_path: path to the JSON config file.
        checkpoint_path: path to a checkpoint holding a "state_dict" entry.
        device: device the model is moved to and the checkpoint is mapped onto.

    Returns:
        The model in evaluation mode, on `device`.
    """
    config: Dict[str, Dict[str, Any]] = json.loads(Path(config_path).read_text())
    model_config = config["model"]

    module = import_module(model_config["source"])
    ModelClass = getattr(module, model_config["name"])
    model: nn.Module = ModelClass(**model_config["prams"]).to(device)

    # Map the stored tensors onto the requested device; otherwise a checkpoint
    # saved on another device is restored there, or fails to load when that
    # device does not exist.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint["state_dict"])
    return model.eval()

In [9]:
# --- FastText (text) experiments ---
SimpleFastTextClassifier_config_path = "experiments/fasttext_classifier/exp2-simple-model-weighted/config.json"
SimpleFastTextClassifier_checkpoint_path = "experiments/fasttext_classifier/exp2-simple-model-weighted/checkpoints/best_model-36500.pt"
ComplicatedFastTextClassifier_config_path = "experiments/fasttext_classifier/exp3-complicated-model-weighted/config.json"
ComplicatedFastTextClassifier_checkpoint_path = "experiments/fasttext_classifier/exp3-complicated-model-weighted/checkpoints/best_model-28000.pt"

# --- Audio experiments ---
SimpleAudioClassifier_config_path = "experiments/audio_classifier/exp1-simple-model-weighted/config.json"
SimpleAudioClassifier_checkpoint_path = "experiments/audio_classifier/exp1-simple-model-weighted/checkpoints/best_model-25.pt"
ComplicatedAudioClassifier_config_path = "experiments/audio_classifier/exp2-complicated-model-weighted/config.json"
ComplicatedAudioClassifier_checkpoint_path = "experiments/audio_classifier/exp2-complicated-model-weighted/checkpoints/250.pt"
ResNet_config_path = "experiments/audio_classifier/exp3-resnet-weighted/config.json"
ResNet_checkpoint_path = "experiments/audio_classifier/exp3-resnet-weighted/checkpoints/best_model-48.pt"
AST_config_path = "experiments/audio_classifier/exp4-AST-weighted/config.json"
AST_checkpoint_path = "experiments/audio_classifier/exp4-AST-weighted/checkpoints/16.pt"

# --- Multimodal (text + audio) experiment ---
SimpleMultimodalClassifier_config_path = "experiments/multimodal_classifier/exp1-simple-model-weighted/config.json"
# Alternative checkpoint, currently disabled:
# SimpleMultimodalClassifier_checkpoint_path = "experiments/multimodal_classifier/exp1-simple-model-weighted/checkpoints/best_model-105.pt"
SimpleMultimodalClassifier_checkpoint_path = "experiments/multimodal_classifier/exp1-simple-model-weighted/checkpoints/960.pt"

# Display name -> (config path, checkpoint path) for every model restored from
# disk, in the order the models are loaded and registered.
_checkpointed_models = {
    "Simple FastText Classifier": (SimpleFastTextClassifier_config_path, SimpleFastTextClassifier_checkpoint_path),
    "Complicated FastText Classifier": (ComplicatedFastTextClassifier_config_path, ComplicatedFastTextClassifier_checkpoint_path),
    "Simple Audio Classifier": (SimpleAudioClassifier_config_path, SimpleAudioClassifier_checkpoint_path),
    "Complicated Audio Classifier": (ComplicatedAudioClassifier_config_path, ComplicatedAudioClassifier_checkpoint_path),
    "ResNet": (ResNet_config_path, ResNet_checkpoint_path),
    "AST": (AST_config_path, AST_checkpoint_path),
    "Simple Multimodal Classifier": (SimpleMultimodalClassifier_config_path, SimpleMultimodalClassifier_checkpoint_path),
}

# Registry of all models in evaluation mode on DEVICE. The stochastic model is
# built directly from the training label frame; the rest come from checkpoints.
models: Dict[str, nn.Module] = {
    "Stochastic Model": StochasticModel(train_mod_csv).eval().to(DEVICE),
    **{
        model_name: load_model(model_config_path, model_checkpoint_path, DEVICE)
        for model_name, (model_config_path, model_checkpoint_path) in _checkpointed_models.items()
    },
}

Using cache found in /home/sazerlife/.cache/torch/hub/pytorch_vision_v0.10.0


In [10]:
# config_path = Path("experiments/fasttext_classifier/exp2-simple-model-weighted/config.json")
# simple_fasttext_classifier_config: Dict[str, Dict[str, Any]] = json.loads(config_path.read_text())
# config_path = Path("experiments/audio_classifier/exp1-simple-model-weighted/config.json")
# simple_audio_classifier_config: Dict[str, Dict[str, Any]] = json.loads(config_path.read_text())

# config_path = Path("experiments/audio_classifier/exp2-complicated-model-weighted/config.json")
# complicated_audio_classifier_config: Dict[str, Dict[str, Any]] = json.loads(config_path.read_text())

# models: Dict[str, nn.Module] = {
#     "Stochastic Model": StochasticModel(train_mod_csv).eval().to(DEVICE),
    
#     "Simple FastText Classifier": SimpleFastTextClassifier(**simple_fasttext_classifier_config["model"]["prams"]).eval().to(DEVICE),
#     "Simple Audio Classifier": SimpleAudioClassifier(**simple_audio_classifier_config["model"]["prams"]).eval().to(DEVICE),
    
#     "Complicated Audio Classifier": ComplicatedAudioClassifier(**complicated_audio_classifier_config["model"]["prams"]).eval().to(DEVICE),
#     "ResNet": ResNet(**simple_audio_classifier_config["model"]["prams"]).eval().to(DEVICE),
#     "AST": AST(**simple_audio_classifier_config["model"]["prams"]).eval().to(DEVICE),
# }

# simple_fasttext_classifier_checkpoint = torch.load("experiments/fasttext_classifier/exp2-simple-model-weighted/checkpoints/best_model-36500.pt")
# models["Simple FastText Classifier"].load_state_dict(simple_fasttext_classifier_checkpoint["state_dict"])

# simple_audio_classifier_checkpoint = torch.load("experiments/audio_classifier/exp1-simple-model-weighted/checkpoints/best_model-25.pt")
# models["Simple Audio Classifier"].load_state_dict(simple_audio_classifier_checkpoint["state_dict"])

In [11]:
@torch.no_grad()
def predict(fetures: torch.Tensor, model: nn.Module, threshold: float, device: str, batch_size: int = 512):
    """Run `model` over `fetures` in batches and binarize its output.

    Args:
        fetures: input tensor; it is split along dimension 0 into batches.
        model: module called on each batch after the batch is moved to `device`.
        threshold: outputs strictly greater than this become 1, all others 0.
        device: device used for the forward pass.
        batch_size: maximum number of rows per forward pass.

    Returns:
        A CPU tensor of zeros and ones with the dtype of the model output.
    """
    # Move each batch result to the CPU right away so model outputs do not
    # accumulate in GPU memory.
    batched_predicts = [
        model(batched_feature.to(device)).cpu()
        for batched_feature in torch.split(fetures, batch_size, dim=0)
    ]
    scores = torch.vstack(batched_predicts)
    torch.cuda.empty_cache()

    # One comparison instead of two in-place assignments: assigning 1 and then
    # zeroing everything <= threshold wipes the fresh ones when threshold >= 1.
    return (scores > threshold).to(scores.dtype)

def get_binary_weighted_accuracy_row(predicts: torch.Tensor, targets: torch.Tensor):
    """Compute the binary weighted accuracy of every label and their mean.

    Column `i` of `predicts` and `targets` is scored with
    `binary_weighted_accuracy` for the `i`-th entry of `LABELS_NAMES`.

    Returns:
        A list with one score per label followed by the arithmetic mean of
        those scores.
    """
    per_label_scores = [
        binary_weighted_accuracy(predicts[:, column], targets[:, column])
        for column, _ in enumerate(LABELS_NAMES)
    ]
    mean_score = sum(per_label_scores) / len(LABELS_NAMES)
    return per_label_scores + [mean_score]

### Threshold selection

In [12]:
# fig, ax = plt.subplots(2, 3)
# fig.set_figwidth(20)
# fig.set_figheight(15)

# # with torch.no_grad():
# #     predicts: torch.Tensor = models["Simple FastText Classifier"](text_features.to(DEVICE)).cpu().detach()
# #     text_features = text_features.cpu().detach()
# #     torch.cuda.empty_cache()

# with torch.no_grad():
#     predicts: torch.Tensor = models["Simple Audio Classifier"](audio_features.to(DEVICE)).cpu().detach()
#     audio_features = audio_features.cpu().detach()
#     torch.cuda.empty_cache()

# for label_index in range(len(LABELS_NAMES)):
#     thresholds = np.arange(0, 1.0, 0.05)
#     bWAs = list()
#     for threshold in thresholds:
#         tmp_predicts = torch.t_copy(predicts[:,label_index])
#         tmp_predicts[tmp_predicts > threshold] = 1
#         tmp_predicts[tmp_predicts <= threshold] = 0
#         targets = torch.t_copy(predicts[:,label_index])
#         bWA = binary_weighted_accuracy(tmp_predicts, targets)
#         bWAs.append(bWA)
    
#     i, j = label_index // 3, label_index % 3
#     ax[i][j].plot(thresholds, bWAs, color="blue")
#     ax[i][j].set_xlabel(f"{LABELS_NAMES[label_index]} thresholds")
#     ax[i][j].set_ylabel("binary weighted accuracy")
# plt.show()

### Evaluation and reporting

In [13]:
# Decision threshold applied to the per-label output of every model.
DECISION_THRESHOLD = 0.5

wa_rows = list()       # one binary-weighted-accuracy row per evaluated model
models_names = list()  # model names, in the same order as `wa_rows`


def _record_result(model_name: str, predicts: torch.Tensor) -> None:
    """Score binarized `predicts` against `test_targets` and store the row under `model_name`."""
    wa_rows.append(get_binary_weighted_accuracy_row(predicts.to(int), test_targets.to(int)))
    models_names.append(model_name)
    torch.cuda.empty_cache()


# The stochastic baseline is given the target matrix as input, one row per batch.
predicts = predict(test_targets, models["Stochastic Model"], DECISION_THRESHOLD, DEVICE, 1)
_record_result("Stochastic Model", predicts)

for text_model_name in ["Simple FastText Classifier", "Complicated FastText Classifier"]:
    predicts = predict(test_text_features, models[text_model_name], DECISION_THRESHOLD, DEVICE)
    _record_result(text_model_name, predicts)

# Audio inputs are processed in batches of 64 rather than the default size.
for audio_model_name in ["Simple Audio Classifier", "Complicated Audio Classifier", "ResNet", "AST"]:
    predicts = predict(test_audio_features, models[audio_model_name], DECISION_THRESHOLD, DEVICE, 64)
    _record_result(audio_model_name, predicts)

# The multimodal model takes two inputs, so it cannot go through `predict`;
# it is run in a single forward pass over the whole test set.
with torch.no_grad():
    scores = models["Simple Multimodal Classifier"](
        test_text_features.to(DEVICE), test_audio_features.to(DEVICE)
    ).cpu()
predicts = (scores > DECISION_THRESHOLD).to(scores.dtype)
_record_result("Simple Multimodal Classifier", predicts)

# Index by the names recorded during evaluation, so every row is labelled with
# the model that produced it regardless of the registry's ordering.
WA_df = pd.DataFrame(np.asarray(wa_rows), columns=LABELS_NAMES + ["AVERAGE"], index=models_names)

In [14]:
# Show the results table: one row per model, one column per label plus "AVERAGE".
WA_df

Unnamed: 0,anger,disgust,fear,happy,sad,surprise,AVERAGE
Stochastic Model,0.322179,0.345989,0.435221,0.254934,0.304376,0.412698,0.345899
Simple FastText Classifier,0.384277,0.415272,0.458601,0.313707,0.375161,0.452166,0.399864
Complicated FastText Classifier,0.385135,0.413664,0.458709,0.316924,0.378915,0.452703,0.401008
Simple Audio Classifier,0.385135,0.413664,0.458709,0.268662,0.378915,0.452703,0.392964
Complicated Audio Classifier,0.385135,0.413664,0.458709,0.231338,0.378915,0.452703,0.386744
ResNet,0.385135,0.413664,0.458709,0.307701,0.378915,0.452703,0.399471
AST,0.385135,0.413664,0.458709,0.268662,0.378915,0.452703,0.392964
Simple Multimodal Classifier,0.385135,0.413664,0.458709,0.313492,0.378915,0.452703,0.400436


In [15]:
# Save the results table as CSV; the path is relative to the current working directory.
WA_df.to_csv("WA_df.csv")