# Loading data onto the disk

In [1]:
# Mount Google Drive inside the Colab VM; the following cells change into it
# and work relative to 'drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/My Drive

/content/drive/My Drive


In [None]:
# !ls drive/My\ Drive/

In [None]:
# !cp train_ende.zip .
# !cp test.zip .

cp: 'train_ende.zip' and './train_ende.zip' are the same file
cp: 'test.zip' and './test.zip' are the same file


In [None]:
!git clone https://github.com/XL2248/MSCTD

Cloning into 'MSCTD'...
remote: Enumerating objects: 1217, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 1217 (delta 13), reused 7 (delta 3), pack-reused 1190[K
Receiving objects: 100% (1217/1217), 102.24 MiB | 16.05 MiB/s, done.
Resolving deltas: 100% (617/617), done.
Updating files: 100% (934/934), done.


In [None]:
# Python replacement for the shell commands
#   cp MSCTD/MSCTD_data/ende/english_*.txt .
#   cp MSCTD/MSCTD_data/ende/image_index_*.txt .
#   cp MSCTD/MSCTD_data/ende/sentiment_*.txt .

import os
import shutil

# Copy the English utterances, image-index and sentiment files out of the
# cloned repository into the working directory, keeping their file names.
for file in os.listdir('MSCTD/MSCTD_data/ende'):
    if file.startswith(('english_', 'image_index_', 'sentiment_')):
        shutil.copy('MSCTD/MSCTD_data/ende/' + file, file)

In [None]:
!pip install --upgrade --no-cache-dir gdown
!gdown --id 1GAZgPpTUBSfhne-Tp0GDkvSHuq6EMMbj
!gdown --id 1B9ZFmSTqfTMaqJ15nQDrRNLqBvo-B39W

In [None]:
%%bash
# Extract every archive in the working directory. "$x" is quoted so that
# archive names containing spaces are passed to unzip as a single argument.
for x in *.zip
do
  unzip -qq "$x"
done;

In [None]:
# Python replacement for: mkdir dataset; cd dataset; mkdir train test dev

# One folder per split below 'dataset'; exist_ok=True makes the cell re-runnable.
for split_dir in ('dataset', 'dataset/train', 'dataset/test', 'dataset/dev'):
    os.makedirs(split_dir, exist_ok=True)

In [None]:
# Python replacement for the shell commands
#   mv *train* dataset/train
#   mv *test* dataset/test
#   mv *dev* dataset/dev

# Send every entry of the working directory to the first split whose name it
# contains. The splits are tried in the same order as the mv commands above and
# the search stops at the first hit, so an entry matching two split names is
# moved exactly once (a second shutil.move on the old path would fail because
# the entry is no longer there).
for file in os.listdir():
    for split in ('train', 'test', 'dev'):
        if split in file:
            shutil.move(file, 'dataset/' + split)
            break

# Dataset and Dataloader

In [3]:
!pip install mtcnn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mtcnn
  Downloading mtcnn-0.1.1-py3-none-any.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mtcnn
Successfully installed mtcnn-0.1.1


In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms as T
from torchvision.io import read_image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import cv2
from mtcnn.mtcnn import MTCNN
import linecache

In [5]:
class MSCTD_Dataset(Dataset):
    """Multimodal dataset: one utterance, one image and one sentiment label per index.

    Args:
        dataset_dir: root folder of the split; all other paths are relative to it.
        images_dir: sub-folder containing the images, named ``<idx>.jpg``.
        conversation_dir: file whose lines are stored verbatim in ``self.conversations``.
        texts: file with one utterance per line.
        sentiments: file with one integer label per line.
        transform: optional callable applied to the opened PIL image.
        preprocess_func: optional callable applied to the stripped utterance.
        max_len: if given, the preprocessed text is truncated to this length.
        pad_idx: if given together with ``max_len``, the preprocessed text is
            converted to a tensor and right-padded with this value up to ``max_len``.
    """

    def __init__(self, dataset_dir, images_dir, conversation_dir, texts, sentiments,
                 transform=None, preprocess_func=None, pad_idx=None, max_len=None):
        self.dataset_path = Path(dataset_dir)
        self.images_path = self.dataset_path / images_dir
        self.sentiment_path = self.dataset_path / sentiments
        self.text_path = self.dataset_path / texts
        self.conversations_path = self.dataset_path / conversation_dir

        self.transform = transform

        self.preprocess_func = preprocess_func
        self.pad_idx = pad_idx
        self.max_len = max_len

        # Explicit encoding so the result does not depend on the platform default.
        with open(self.text_path, 'r', encoding='utf-8') as f:
            self.texts = f.read().splitlines()

        with open(self.sentiment_path, 'r', encoding='utf-8') as f:
            self.sentiments = np.array(f.read().splitlines()).astype("int32")

        with open(self.conversations_path, 'r', encoding='utf-8') as f:
            self.conversations = np.array(f.read().splitlines())

    def __len__(self):
        """Number of samples, i.e. number of sentiment labels."""
        return len(self.sentiments)

    def _prepare_text(self, text):
        """Apply preprocess_func, then the optional truncation and padding."""
        if self.preprocess_func is None:
            return text
        text = self.preprocess_func(text)
        if self.max_len is not None:
            text = text[:self.max_len]
        # Padding needs a target length: without max_len there is nothing to pad
        # to, and `self.max_len - len(text)` would raise a TypeError.
        if self.pad_idx is not None and self.max_len is not None:
            text = F.pad(torch.tensor(text), (0, self.max_len - len(text)), 'constant', self.pad_idx)
        return text

    def __getitem__(self, idx):
        """Return ``{"text", "image", "sentiment"}`` for sample ``idx``."""
        img_path = self.images_path / f'{idx}.jpg'
        image = Image.open(img_path)
        if self.transform:
            image = self.transform(image)

        text = self._prepare_text(self.texts[idx].strip())
        sentiment = self.sentiments[idx]

        data_dict = {"text": text,
                     "image": image,
                     "sentiment": sentiment}
        return data_dict

In [6]:
class Text_MSCTD(MSCTD_Dataset):
    """Text-only view of an MSCTD split: items are ``(text, label)`` pairs and no image is opened.

    Takes the same files as MSCTD_Dataset; ``images_dir`` defaults to ``''``
    because ``__getitem__`` never touches the image folder.
    """

    def __init__(self, dataset_dir, conversation_dir, texts, sentiments,
                 preprocess_func=None, pad_idx=None, max_len=None, transform=None, images_dir=''):
        # The parent already stores the text options, so forward them instead
        # of assigning the attributes a second time here.
        super().__init__(dataset_dir, images_dir, conversation_dir, texts, sentiments,
                         transform=transform, preprocess_func=preprocess_func,
                         pad_idx=pad_idx, max_len=max_len)

    def __getitem__(self, idx):
        """Return ``(text, label)`` for sample ``idx`` (the raw line is not stripped)."""
        text = self.texts[idx]
        if self.preprocess_func is not None:
            text = self.preprocess_func(text)
            if self.max_len is not None:
                text = text[:self.max_len]
            # Padding needs a target length; with max_len unset the subtraction
            # below would raise a TypeError, so the text is left unpadded.
            if self.pad_idx is not None and self.max_len is not None:
                text = F.pad(torch.tensor(text), (0, self.max_len - len(text)), 'constant', self.pad_idx)
        labels = self.sentiments[idx]
        return text, labels

In [7]:
class IMAGE_Dataset(Dataset):
    """Image-only view of an MSCTD split: items are ``(image, sentiment)`` pairs.

    Args:
        dataset_dir: root folder of the split; all other paths are relative to it.
        images_dir: sub-folder containing the images, named ``<idx>.jpg``.
        conversation_dir: file whose lines are stored verbatim in ``self.conversations``.
        texts: file with one utterance per line (loaded into ``self.texts``, not returned).
        sentiments: file with one integer label per line.
        transform: optional callable applied to the opened PIL image.
    """

    def __init__(self, dataset_dir, images_dir, conversation_dir, texts, sentiments, transform=None):
        self.dataset_path = Path(dataset_dir)
        self.images_path = self.dataset_path / images_dir
        self.sentiment_path = self.dataset_path / sentiments
        self.text_path = self.dataset_path / texts
        self.conversations_path = self.dataset_path / conversation_dir

        self.transform = transform

        with open(self.text_path, 'r', encoding='utf-8') as f:
            self.texts = f.read().splitlines()

        with open(self.sentiment_path, 'r', encoding='utf-8') as f:
            self.sentiments = np.array(f.read().splitlines()).astype("int32")

        with open(self.conversations_path, 'r', encoding='utf-8') as f:
            self.conversations = np.array(f.read().splitlines())

        # Derive the length from the array that __getitem__ indexes rather than
        # reading the sentiment file a second time.
        self.length = len(self.sentiments)

    def __len__(self):
        """Number of samples, i.e. number of sentiment labels."""
        return self.length

    def __getitem__(self, idx):
        """Open ``<idx>.jpg``, apply the optional transform and return ``(image, sentiment)``."""
        img_path = self.images_path / f'{idx}.jpg'
        image = Image.open(img_path)
        if self.transform:
            image = self.transform(image)
        sentiment = self.sentiments[idx]
        return image, sentiment

In [10]:
!pip install pyenchant

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyenchant
  Downloading pyenchant-3.2.2-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.7/55.7 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyenchant
Successfully installed pyenchant-3.2.2


In [11]:
import nltk
import string
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

import enchant
english_dict = enchant.Dict("en_US")

In [None]:
# Download the NLTK resources this notebook asks for. The preprocessing cell
# below calls word_tokenize, stopwords.words('english') and WordNetLemmatizer;
# pos_tag is imported but not called in the visible code, so the tagger
# download may be unnecessary — TODO confirm.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

In [14]:
# Placeholder tokens that sent_preprocess substitutes for numbers and for
# words rejected by the enchant dictionary.
NUM = '<NUM>'
UNK = '<UNK>'

# Module-level so they are built once and reused by every sent_preprocess call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def _is_number(token):
    """True for a digit string, optionally followed by 'th', 'st', 'nd' or 'rd' (e.g. '3', '2nd')."""
    if token.isdigit():
        return True
    return token[:-2].isdigit() and token[-2:] in ('th', 'st', 'nd', 'rd')


def sent_preprocess(sent, lower=True, remove_punct=True, remove_stopwords=True,
                    lemmatize=True, handle_nums=True, handle_unknowns=True):
    """Turn a sentence into a list of normalised word tokens.

    The steps run in this order, each one switchable through its flag:

    Args:
        sent: input sentence.
        lower: lower-case the sentence.
        remove_punct: delete every character of ``string.punctuation``.
        remove_stopwords: drop tokens found in the module-level ``stop_words``.
        lemmatize: lemmatize each token with the module-level ``lemmatizer``.
        handle_nums: replace numbers and ordinals (see ``_is_number``) by ``NUM``.
        handle_unknowns: replace tokens rejected by ``english_dict`` by ``UNK``.

    Returns:
        list of str tokens.
    """
    if lower:
        sent = sent.lower()

    if remove_punct:
        sent = sent.translate(str.maketrans('', '', string.punctuation))

    word_tokens = word_tokenize(sent)

    if remove_stopwords:
        word_tokens = [w for w in word_tokens if w not in stop_words]

    if lemmatize:
        word_tokens = [lemmatizer.lemmatize(w) for w in word_tokens]

    if handle_nums:
        word_tokens = [NUM if _is_number(w) else w for w in word_tokens]

    if handle_unknowns:
        # The NUM placeholder inserted just above is not a dictionary word;
        # without this exemption it would be turned into UNK and the number
        # handling would have no effect.
        word_tokens = [w if (handle_nums and w == NUM) or english_dict.check(w) else UNK
                       for w in word_tokens]

    return word_tokens

# Model

### BERT Configuration

In [15]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.1


In [16]:
from transformers import BertConfig, BertTokenizer
from transformers import BertModel, AutoModel, BertForSequenceClassification
from transformers import AdamW

In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [18]:
# General configuration for the BERT-based pipeline.
# MAX_LEN is the max_length passed to the tokenizer in bert_preprocess below.
MAX_LEN = 30

# NOTE(review): the DataLoader cell further down passes batch_size=32 literally
# instead of these constants — confirm which value is intended.
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
TEST_BATCH_SIZE = 64

EPOCHS = 4
LEARNING_RATE = 5e-5

# Checkpoint name used for both the tokenizer and the BertConfig.
MODEL_NAME = 'bert-base-uncased'

In [19]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
config = BertConfig.from_pretrained(MODEL_NAME)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [20]:
def bert_preprocess(text):
    """Encode one utterance with the module-level ``tokenizer``.

    The tokenizer is asked to add special tokens, truncate and pad to exactly
    ``MAX_LEN`` positions, and return PyTorch tensors together with the
    token-type ids and the attention mask.
    """
    encode_options = dict(
        max_length=MAX_LEN,
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        padding='max_length',
        return_tensors='pt',
    )
    encoded = tokenizer.encode_plus(text, **encode_options)
    return encoded

In [21]:
# Image transform for the multimodal datasets: only PIL -> tensor conversion
# (no resizing or normalisation). Change this if needed.
transform = T.Compose([T.ToTensor()])

# Multimodal (text + image + sentiment) datasets, one per split; the text is
# encoded with bert_preprocess as defined at this point of the notebook.
trainset = MSCTD_Dataset('dataset/train', 'train_ende', 'image_index_train.txt', 'english_train.txt', 'sentiment_train.txt', preprocess_func=bert_preprocess, transform=transform)
# do we have all parts of devset?
# NOTE(review): the image folders here are 'dev_ende' / 'test_ende', while the
# IMAGE_Dataset cell later in this notebook uses 'dev' / 'test'. The images are
# only opened in __getitem__, so a wrong folder name goes unnoticed until the
# first sample is read — confirm which names exist on disk.
devset = MSCTD_Dataset('dataset/dev', 'dev_ende', 'image_index_dev.txt', 'english_dev.txt', 'sentiment_dev.txt', preprocess_func=bert_preprocess, transform=transform)
testset = MSCTD_Dataset('dataset/test', 'test_ende', 'image_index_test.txt', 'english_test.txt', 'sentiment_test.txt', preprocess_func=bert_preprocess, transform=transform)

In [22]:
# Loaders over the multimodal datasets; only the training loader is shuffled.
# NOTE(review): the names train_loader / dev_loader / test_loader are assigned
# again in the embedding section with text-only datasets, so after a top-to-bottom
# run they no longer refer to these multimodal loaders.
train_loader = DataLoader(trainset, batch_size=32, shuffle=True)
dev_loader = DataLoader(devset, batch_size=32, shuffle=False)
test_loader = DataLoader(testset, batch_size=32, shuffle=False)

# Compute and save the model embeddings

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [25]:
train_dataset = Text_MSCTD('dataset/train', 'image_index_train.txt', 'english_train.txt', 'sentiment_train.txt')

In [26]:
dev_dataset = Text_MSCTD('dataset/dev', 'image_index_dev.txt', 'english_dev.txt', 'sentiment_dev.txt')
test_dataset = Text_MSCTD('dataset/test', 'image_index_test.txt', 'english_test.txt', 'sentiment_test.txt')

In [27]:
# Read each text-only split once into a list of (text, label) pairs.
train_data = list(train_dataset)
dev_data = list(dev_dataset)
test_data = list(test_dataset)

# Unzip the pairs into parallel tuples of texts and labels.
train_text, train_labels = zip(*train_data)
dev_text, dev_labels = zip(*dev_data)
test_text, test_labels = zip(*test_data)

# Train and dev pooled together; the test split is kept separate.
all_texts = train_text + dev_text
all_labels = train_labels + dev_labels

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenise train and dev sentences; stop words are kept and numbers are not
# replaced by the NUM placeholder (the remaining sent_preprocess steps use their defaults).
train_sents = [sent_preprocess(text, remove_stopwords=False, handle_nums=False) for text in train_text]
dev_sents = [sent_preprocess(text, remove_stopwords=False, handle_nums=False) for text in dev_text]
all_sents = train_sents + dev_sents
# The sentences are already token lists, so the analyzer is the identity.
# NOTE(review): apart from this fit, `tfidf` is not referenced in the visible
# part of the notebook; `all_sents` is what the next cell uses.
tfidf = TfidfVectorizer(analyzer=lambda x: x)
tfidf.fit(all_sents)

TfidfVectorizer(analyzer=<function <lambda> at 0x7fa04e68ed30>)

In [29]:
# Length, in tokens, of the longest preprocessed train/dev sentence.
max_len = max(map(len, all_sents))
max_len

19

In [30]:
# General configuration for the embedding section.
# NOTE(review): these values repeat the configuration cell of the "Model"
# section (MAX_LEN is not redefined here); consider keeping a single copy.
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
TEST_BATCH_SIZE = 64

EPOCHS = 4
LEARNING_RATE = 5e-5

MODEL_NAME = 'bert-base-uncased'

In [31]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
config = BertConfig.from_pretrained(MODEL_NAME)

In [32]:
def bert_preprocess(text):
    """Encode one utterance with the module-level ``tokenizer``, padded/truncated to ``max_len + 6`` positions.

    This redefines the ``bert_preprocess`` of the "Model" section, which used
    ``MAX_LEN`` instead. Datasets created before this cell keep the function
    object they were given at construction time.
    """
    # NOTE(review): max_len was measured on the word tokens produced by
    # sent_preprocess; the +6 margin presumably leaves room for the special
    # tokens and word-piece splits — confirm it is large enough, since
    # truncation=True silently cuts longer inputs.
    return tokenizer.encode_plus(
        text,
        max_length=max_len + 6,
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        padding='max_length',
        return_tensors='pt',
    )


# Text-only datasets rebuilt with the BERT encoding; items are (encoding, label).
train_dataset = Text_MSCTD('dataset/train', 'image_index_train.txt', 'english_train.txt', 'sentiment_train.txt', preprocess_func=bert_preprocess)
dev_dataset = Text_MSCTD('dataset/dev', 'image_index_dev.txt', 'english_dev.txt', 'sentiment_dev.txt', preprocess_func=bert_preprocess)
test_dataset = Text_MSCTD('dataset/test', 'image_index_test.txt', 'english_test.txt', 'sentiment_test.txt' , preprocess_func=bert_preprocess)

In [35]:
# Image pipeline for the image encoder: tensor conversion, bicubic resize to
# 288x288, then per-channel normalisation (the constants are presumably the
# ImageNet channel statistics — TODO confirm).
# This rebinds `transform`; datasets created earlier keep the transform object
# they were constructed with.
transform = transforms.Compose([transforms.ToTensor()
                                ,transforms.Resize((288,288),transforms.InterpolationMode("bicubic"))
                                ,transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])])     
# Image-only datasets, one per split.
# NOTE(review): the dev/test image folders are 'dev' / 'test' here but
# 'dev_ende' / 'test_ende' in the MSCTD_Dataset cell of the "Model" section —
# confirm which names exist on disk.
trainset_image = IMAGE_Dataset('dataset/train', 'train_ende', 'image_index_train.txt', 'english_train.txt', 'sentiment_train.txt',transform=transform)
devset_image = IMAGE_Dataset('dataset/dev', 'dev', 'image_index_dev.txt', 'english_dev.txt', 'sentiment_dev.txt',transform=transform)
testset_image = IMAGE_Dataset('dataset/test', 'test', 'image_index_test.txt', 'english_test.txt', 'sentiment_test.txt',transform=transform)

In [36]:
# Text-only loaders. Nothing is shuffled so that batch k of a text loader and
# batch k of the matching image loader below cover the same sample indices.
# NOTE(review): these assignments replace the multimodal train_loader /
# dev_loader / test_loader created in the "Model" section.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=False)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False)

# Image-only loaders with the same batch sizes and order as the text loaders.
train_loader_image = torch.utils.data.DataLoader(trainset_image, batch_size=TRAIN_BATCH_SIZE, shuffle=False)
dev_loader_image = torch.utils.data.DataLoader(devset_image, batch_size=VALID_BATCH_SIZE, shuffle=False)
# Wrap the image dataset of the test split (not the text DataLoader, which is
# not an indexable dataset).
test_loader_image = torch.utils.data.DataLoader(testset_image, batch_size=TEST_BATCH_SIZE, shuffle=False)

In [None]:
# BERT sequence classifier that the embedding loops below use as a sentence encoder.
text_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)
text_model.to(device);
# An empty nn.Sequential is an identity, so with the classification head
# replaced `output.logits` is whatever tensor reached the head rather than
# 3-way class scores.
text_model.classifier = nn.Sequential()
# NOTE(review): with the next line commented out, the weights are those loaded
# by from_pretrained above, not a fine-tuned checkpoint — confirm this is intended.
#text_model.load_state_dict(torch.load("models/bert.pth"),strict=False) #Import model here

In [None]:
from torchvision.models import efficientnet_b2, EfficientNet_B2_Weights

# EfficientNet-B2 that the embedding loops below use as an image encoder.
image_model = efficientnet_b2(weights=EfficientNet_B2_Weights.IMAGENET1K_V1).to(device)
# An empty nn.Sequential is an identity: the model now returns the features
# that would have been fed to the classifier.
image_model.classifier = nn.Sequential()
# strict=False tolerates key mismatches between checkpoint and model (for
# example the removed classifier); the returned report lists them.
load_report = image_model.load_state_dict(torch.load("models/face_aug_modal.pth",map_location=torch.device('cpu')),strict=False)
# torchvision builds models in training mode; switch to eval so that dropout,
# stochastic depth and batch-norm statistics are fixed while extracting features.
image_model.eval()
# Show the missing / unexpected keys as the cell output, as the bare
# load_state_dict call did.
load_report

In [43]:
import tqdm
import pickle

# Debug cap: number of batches to embed before stopping (the loop used to
# break at i == 3). Set to None to embed the whole training split.
MAX_BATCHES = 3

# Inference only: make sure dropout / batch-norm are in evaluation mode.
text_model.eval()
image_model.eval()

vectors = []
# Both loaders are unshuffled and share the batch size, so zipping them pairs
# each text batch with the image batch of the same samples. Zipping also avoids
# iterating train_loader a second time just to drive the progress bar.
paired_batches = zip(train_loader, train_loader_image)
with tqdm.tqdm(enumerate(paired_batches), total=len(train_loader)) as pbar:
    with torch.no_grad():
        for i, ((data_i, _), (image_i, _)) in pbar:
            if MAX_BATCHES is not None and i == MAX_BATCHES:
                break
            # Fetch the tensors by key. Unpacking data_i.values() positionally
            # is fragile: the tokenizer emits input_ids, token_type_ids,
            # attention_mask in that order, which swapped the last two here.
            # Each field has an extra dimension of size 1 from
            # return_tensors='pt', removed by squeeze(1).
            input_ids = data_i['input_ids'].squeeze(1).to(device)
            attention_mask = data_i['attention_mask'].squeeze(1).to(device)
            token_type_ids = data_i['token_type_ids'].squeeze(1).to(device)

            output = text_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            rep_text = output.logits
            rep_image = image_model(image_i.to(device).float())

            # One concatenated [text | image] vector per sample, moved to the
            # CPU so the pickle can be loaded on a machine without a GPU.
            vector = torch.cat((rep_text, rep_image), dim=1)
            vectors.extend(vector.cpu())

# The file holds a pickle despite its .txt extension.
with open("models/cat_vectors.txt", 'wb') as f:
    pickle.dump(vectors, f)
# Read the file back as a round-trip check.
with open("models/cat_vectors.txt", 'rb') as f:
    vectors = list(pickle.load(f))

  1%|          | 2/317 [01:17<3:23:25, 38.75s/it]


KeyboardInterrupt: ignored

In [None]:
import tqdm
import pickle

# Inference only: make sure dropout / batch-norm are in evaluation mode.
text_model.eval()
image_model.eval()

vectors = []
# Both loaders are unshuffled and share the batch size, so zipping them pairs
# each text batch with the image batch of the same samples.
paired_batches = zip(test_loader, test_loader_image)
with tqdm.tqdm(enumerate(paired_batches), total=len(test_loader)) as pbar:
    with torch.no_grad():
        for i, ((data_i, _), (image_i, _)) in pbar:
            # Fetch the tensors by key. Unpacking data_i.values() positionally
            # is fragile: the tokenizer emits input_ids, token_type_ids,
            # attention_mask in that order, which swapped the last two here.
            input_ids = data_i['input_ids'].squeeze(1).to(device)
            attention_mask = data_i['attention_mask'].squeeze(1).to(device)
            token_type_ids = data_i['token_type_ids'].squeeze(1).to(device)

            # Use the encoders defined above (text_model / image_model); the
            # names `model` and `net` are not the feature extractors.
            output = text_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            rep_text = output.logits
            rep_image = image_model(image_i.to(device).float())

            # One concatenated [text | image] vector per sample, moved to the
            # CPU so the pickle can be loaded on a machine without a GPU.
            vector = torch.cat((rep_text, rep_image), dim=1)
            vectors.extend(vector.cpu())

# The file holds a pickle despite its .txt extension.
with open("models/cat_vectors_test.txt", 'wb') as f:
    pickle.dump(vectors, f)
# Re-open for reading ('rb'): opening with 'wb' would truncate the file that
# was just written and make pickle.load fail.
with open("models/cat_vectors_test.txt", 'rb') as f:
    vec = pickle.load(f)
print(len(vec))

### Model Architecture

In [None]:
class MultiModalModel(nn.Module):
    """Late-fusion classifier: frozen CNN image features + frozen BERT pooled output -> linear layer.

    Args:
        tokenizer: stored on the instance; not used by ``forward``.
        num_classes: number of output classes of the final linear layer.
        max_len: stored on the instance; not used by ``forward``.
        cnn: optional image backbone mapping an image batch to ``(N, D)``
            features. When omitted, the ImageNet-pretrained EfficientNet-B2 used
            elsewhere in this notebook is built with its classifier replaced by
            an identity.
    """

    def __init__(self, tokenizer=tokenizer, num_classes=3, max_len=MAX_LEN, cnn=None) -> None:
        super().__init__()
        if cnn is None:
            cnn = efficientnet_b2(weights=EfficientNet_B2_Weights.IMAGENET1K_V1)
            cnn.classifier = nn.Sequential()
        self.cnn = cnn
        self.cnn.requires_grad_(False)
        self.bert = BertModel.from_pretrained(
            'bert-base-uncased',
            output_attentions = False,
            output_hidden_states = False,
        )
        self.bert.requires_grad_(False)

        # Size of the concatenated vector: probe the CNN with a dummy image
        # (eval mode, no gradients) and read the width of BERT's pooled output
        # from its config instead of running it on a float tensor, which is not
        # a valid input_ids argument.
        self.cnn.eval()
        with torch.no_grad():
            cnn_features = self.cnn(torch.rand(1, 3, 224, 224)).shape[1]
        self.embedding_size = cnn_features + self.bert.config.hidden_size
        self.fc = nn.Linear(self.embedding_size, num_classes)

        self.tokenizer = tokenizer
        self.max_len = max_len

    def train(self, mode=True):
        """Switch the trainable head between train/eval while the frozen backbones stay in eval mode."""
        super().train(mode)
        # Frozen backbones must not apply dropout or update batch-norm statistics.
        self.cnn.eval()
        self.bert.eval()
        return self

    def forward(self, image, text_data):
        """Return class scores of shape ``(N, num_classes)``.

        Args:
            image: image batch accepted by the CNN backbone.
            text_data: mapping with 'input_ids', 'attention_mask' and
                'token_type_ids'; each may be ``(N, L)`` or ``(N, 1, L)``.
        """
        target_device = self.fc.weight.device

        def as_batch(field):
            # Collapse an optional singleton dimension: (N, 1, L) -> (N, L).
            return field.reshape(-1, field.shape[-1]).to(target_device)

        image_features = self.cnn(image.to(target_device))
        encoded = self.bert(
            input_ids=as_batch(text_data['input_ids']),
            attention_mask=as_batch(text_data['attention_mask']),
            token_type_ids=as_batch(text_data['token_type_ids']),
        )
        pooler_output = encoded['pooler_output']
        fused = torch.cat((image_features, pooler_output), dim=1)
        return self.fc(fused)

# Training

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

In [None]:
import tqdm

def one_epoch(model, loader, criterion, optimizer=None, epoch='', train=True, set_name='Train', metrics=None):
    """Run one pass over ``loader`` and return the loss and metrics.

    Batches are expected to be mappings with the keys 'image', 'text' and
    'sentiment'; the model is called as ``model(image, text)``.

    Args:
        model: module to train or evaluate.
        loader: DataLoader; ``len(loader.dataset)`` is used to average the loss.
        criterion: loss called as ``criterion(predictions, targets)``.
        optimizer: required when ``train`` is True.
        epoch: label shown in the progress bar.
        train: when True gradients are enabled and the optimizer is stepped.
        set_name: label used in the progress bar and the accuracy printout.
        metrics: optional ``{name: func(y_true, y_pred)}`` evaluated at the end.

    Returns:
        dict with 'loss', 'accuracy' and one entry per item of ``metrics``.
    """
    switch_mode = model.train if train else model.eval
    switch_mode()

    n_samples = len(loader.dataset)
    running_loss = 0
    targets = []
    predictions = []

    with torch.set_grad_enabled(train):
        with tqdm.tqdm(enumerate(loader), total=len(loader)) as pbar:
            for _, batch in pbar:
                if train:
                    optimizer.zero_grad()

                image = batch['image'].to(device)
                text = batch['text']
                y = batch['sentiment'].long().to(device)

                p = model(image, text)
                loss = criterion(p, y)

                # Weight the batch loss by the batch size; dividing by the
                # dataset size gives the running average shown in the bar.
                running_loss += loss.item() * len(y)
                pbar.set_description(f'{epoch}: {set_name} Loss: {running_loss / n_samples:.3e}')

                if train:
                    loss.backward()
                    optimizer.step()

                targets.append(y.cpu().numpy())
                predictions.append(p.argmax(dim=-1).cpu().numpy())

    mean_loss = running_loss / n_samples

    Y = np.concatenate(targets)
    Y_pred = np.concatenate(predictions)
    acc = accuracy_score(Y_pred, Y)
    print(f'Accuracy of {set_name} set: {acc}')

    result = {'loss': mean_loss, 'accuracy': acc}
    if metrics is not None:
        for metric, metric_func in metrics.items():
            result[metric] = metric_func(Y, Y_pred)

    return result

In [None]:
def train_model(model, dataloaders, num_epochs, criterion, optimizer, model_name='pytroch-model', scheduler=None):
    """Train for ``num_epochs`` epochs, keep the best checkpoint and plot the history.

    Args:
        model: module to train.
        dataloaders: ``(train_loader, val_loader)`` pair.
        num_epochs: number of epochs.
        criterion: loss passed to ``one_epoch``.
        optimizer: optimizer passed to ``one_epoch`` for the training pass.
        model_name: checkpoint file stem; weights are saved to ``<model_name>.pt``.
        scheduler: optional scheduler stepped with the validation loss.

    Returns:
        ``(model, min_val_loss)`` with the best checkpoint loaded into ``model``.
    """
    train_loader, val_loader = dataloaders
    checkpoint_path = f'{model_name}.pt'
    min_val_loss = np.inf
    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}

    for epoch in range(num_epochs):
        train_result = one_epoch(model, train_loader, criterion, optimizer, epoch, train=True, set_name='Train')
        history['train_loss'].append(train_result['loss'])
        history['train_acc'].append(train_result['accuracy'])

        val_result = one_epoch(model, val_loader, criterion, epoch=epoch, train=False, set_name='Validation')
        val_loss = val_result['loss']
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_result['accuracy'])

        print('\n', '-' * 60)

        # Checkpoint whenever the validation loss improves.
        if val_loss < min_val_loss:
            min_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)

        if scheduler:
            scheduler.step(val_loss)

    curves = (
        ('loss history of training and val sets', history['train_loss'], history['val_loss']),
        ('Accuracy history of training and val sets', history['train_acc'], history['val_acc']),
    )
    for title, train_curve, val_curve in curves:
        plt.plot(train_curve, label='train')
        plt.plot(val_curve, label='val')
        plt.title(title)
        plt.legend()
        plt.show()

    # Restore the weights of the best epoch before returning.
    model.load_state_dict(torch.load(checkpoint_path))
    return model, min_val_loss

In [None]:
model = MultiModalModel().to(device)

In [None]:
# Training configuration for the fusion model.
# LEARNING_RATE replaces the 5e-5 set in the configuration cells above; EPOCH
# is a separate name from the EPOCHS constant defined there.
LEARNING_RATE = 1e-3
EPOCH = 20
# Only parameters that still require gradients are handed to the optimizer.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)
# Halve the learning rate after 2 epochs without improvement of the monitored
# value (train_model steps it with the validation loss).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, verbose=True, factor=0.5)
criterion = nn.CrossEntropyLoss()

# NOTE(review): in a top-to-bottom run, train_loader / dev_loader were last
# assigned in the embedding section from Text_MSCTD datasets, whose batches are
# (text, label) pairs, whereas one_epoch reads batch['image'], batch['text'] and
# batch['sentiment']. Confirm that the multimodal loaders are the ones passed here.
model, min_val_loss = train_model(model, (train_loader, dev_loader), EPOCH, criterion, optimizer, model_name='bert_cnn', scheduler=scheduler)

# Evaluation

In [None]:
# Averaging mode handed to the multi-class precision / recall / F1 scorers.
average_policy = 'macro'


def _averaged(score_func):
    """Wrap a scorer so it is called with the current global ``average_policy``."""
    return lambda y1, y2: score_func(y1, y2, average=average_policy)


# name -> func(y_true, y_pred); consumed by one_epoch through eval_model.
metrics = {
    'accuracy': accuracy_score,
    'precision': _averaged(precision_score),
    'recall': _averaged(recall_score),
    'f1': _averaged(f1_score),
    'confusion_matrix': confusion_matrix,
}

In [None]:
def eval_model(model, loader, metrics=metrics, set_name='Test', plot_confusion_matrix=True):
    """Evaluate ``model`` on ``loader`` and return the metric values.

    Runs ``one_epoch`` in evaluation mode with the module-level ``criterion``.

    Args:
        model: module to evaluate.
        loader: DataLoader of the split to evaluate.
        metrics: ``{name: func(y_true, y_pred)}`` forwarded to ``one_epoch``.
        set_name: label used in the progress bar and printouts.
        plot_confusion_matrix: plot the confusion matrix when one was computed.

    Returns:
        dict of results without the 'confusion_matrix' entry.
    """
    results = one_epoch(model, loader, criterion, train=False, set_name=set_name, metrics=metrics)
    # The matrix is only present when `metrics` has a 'confusion_matrix' entry;
    # a default avoids a KeyError for metric sets without it.
    matrix = results.pop('confusion_matrix', None)
    if plot_confusion_matrix and matrix is not None:
        ConfusionMatrixDisplay(matrix).plot()
    return results

In [None]:
eval_model(model, test_loader)