# Dataset

## - dataset.py

In [None]:
import os
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer

import torch
from torch.utils.data import Dataset

In [2]:
class DepressionDataset(Dataset):
    '''create a training, develop, or test dataset
       and load the participant features if it's called 
    '''
    def __init__(self,
                 root_dir,
                 mode,
                 transform=None):
        super(DepressionDataset, self).__init__()
        
        # only train, develop, test dataset allow
        assert mode in ["train", "dev", "test"],\
            "Argument --mode could only be ['train', 'dev', 'test']"
        
        self.mode = mode
        self.root_dir = root_dir
        self.transform = transform
        self.train_data_path = os.path.join(self.root_dir, 'train_split_Depression_AVEC2017.csv')
        self.dev_data_path = os.path.join(self.root_dir, 'dev_split_Depression_AVEC2017.csv')
        self.test_data_path = os.path.join(self.root_dir, 'full_test_split.csv')
        # load sent2vec model for converting text file to 2D array
        self.sent2vec = SentenceTransformer('all-mpnet-base-v2')  # output dimension 768
        
        # load training data # 107 sessions
        if self.mode == "train":
            # pre-checking and cleaning the data
            self.data_df = self.pre_check(pd.read_csv(self.train_data_path))
            # store ground truth
####################################################################################################
#             self.patientIDs = self.data_df['Participant_ID'].to_numpy()
            self.patientIDs = np.array([303,321,362,363,426]) # for debugging on my laptop
####################################################################################################
            self.phq_binay_gt = self.data_df['PHQ8_Binary'].to_numpy()
            self.phq_score_gt = self.data_df['PHQ8_Score'].to_numpy()
            self.gender_gt = self.data_df['Gender'].to_numpy()
            self.phq_subscores_gt = self.data_df.iloc[:, 4:].to_numpy()
        
        # load development data # 35 sessions
        if self.mode == "dev":
            # pre-checking and cleaning the data
            self.data_df = self.pre_check(pd.read_csv(self.dev_data_path))
            # store ground truth
            self.patientIDs = self.data_df['Participant_ID'].to_numpy()
            self.phq_binay_gt = self.data_df['PHQ8_Binary'].to_numpy()
            self.phq_score_gt = self.data_df['PHQ8_Score'].to_numpy()
            self.gender_gt = self.data_df['Gender'].to_numpy()
            self.phq_subscores_gt = self.data_df.iloc[:, 4:].to_numpy()
        
        # load test data # 47 sessions
        if self.mode == "test":
            # pre-checking and cleaning the data
            self.data_df = self.pre_check(pd.read_csv(self.test_data_path))
            # store ground truth
            self.patientIDs = self.data_df['Participant_ID'].to_numpy()
            self.phq_binay_gt = self.data_df['PHQ_Binary'].to_numpy()
            self.phq_score_gt = self.data_df['PHQ_Score'].to_numpy()
            self.gender_gt = self.data_df['Gender'].to_numpy()
            self.phq_subscores_gt = None
    
    
    def pre_check(self, data):
        '''
        Basic cleaning process to make sure no missing value
        and that the sum of each PHQ subscore equals to PHQ score 
        Argument:
            data: numpy array
        Return:
            data: numpy array with type "int"
        '''
        # make sure no NaN, Inf, -Inf
        if data.isin([np.nan, np.inf, -np.inf]).any(1).sum():
            print('Replacing NaN, Inf, or -Inf ...')
            data = data.replace([np.inf, -np.inf, np.nan], 0).astype('int')
        else: 
            data = data.astype('int')
            
        # compare the sum of each PHQ subscore to PHQ score
        unequal = data.iloc[:, 4:].sum(axis=1) != data.iloc[:,2]
        if unequal.any() and self.mode != 'test':
            lines = np.where(unequal)
            raise ValueError(("The sum of each PHQ subscore at line {} "
                              "is unequal to the PHQ score").format(lines[0]))
        
        return data
    
    
    def __len__(self):
        return len(self.patientIDs)
    
    
    def __iter__(self):
        return iter(self.patientIDs)
    
    
    def __getitem__(self, idx):
        '''
        Essentional function for creating dataset in PyTorch, which will automatically be
        called in Dataloader and load all the extracted features of the patient in the Batch
        based on the index of self.patientIDs
        Argument:
            idx: int, index of the patient ID in self.patientIDs
        Return:
            session: dict, contains all the extracted features and ground truth of a patient/session 
        '''
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        # get the patient session path
        session_num = self.patientIDs[idx]
        session_path = os.path.join(self.root_dir, '{}_P'.format(session_num))
        
        # TODO: if other feature is needed, add more in the following part...
        
        # get text path
        text_path = os.path.join(session_path, '{}_TRANSCRIPT.csv'.format(session_num))
        
        # text feature
        self.text_feature = self.load_sent2vec(text_path, speaker='Participant')
        sentence_embedding = self.text_feature['sentence_embeddings']
        
        # summary
        session = {'patientID': session_num,
                   'session_path': session_path,
                   'sentence_embeddings': sentence_embedding, 
                   'phq_score_gt': self.phq_score_gt[idx], 
                   'phq_binay_gt': self.phq_binay_gt[idx],
                   'phq_subscores_gt': self.phq_subscores_gt[idx],
                   'gender_gt': self.gender_gt[idx]}
        
        if self.transform:
            session = self.transform(session)
        
        return session
    
    
    def load_sent2vec(self, text_path, speaker='Participant'):
        '''
        load the text file and use sent2vec model from SentenceTransformer
        for sentence embeddings, which generates 2D array
        Arguments:
            text_path: string, absolute path to transcipt file
            speaker: certain string, which transcript of the speaker to load
        Return:
            text_feature: dict, contain converted embedding vectors, sentences, etc.
        '''
        
        # only 'Ellie', 'Participant', 'both' are allow
        assert speaker in ['Ellie', 'Participant', 'both'],\
            "Argument --speaker could only be ['Ellie', 'Participant', 'both']"
        
        text_file = pd.read_csv(text_path)
        # tokenize the text file, filter out all \t space and unnecessary columns such as time, participent 
        tokenized_words = self.tokenize_corpus(text_file.values.tolist()[i][0] for i in range(text_file.shape[0]))
        
        sentences = []
        sentences_idx = []
        
        if speaker == 'Ellie':
            for idx, sentence in enumerate(tokenized_words):
                if sentence[2] == 'Ellie':
                    sentences.append(sentence[3:])
                    sentences_idx.append(idx)
        elif speaker == 'Participant':
            for idx, sentence in enumerate(tokenized_words):
                if sentence[2] == 'Participant':
                    sentences.append(sentence[3:])
                    sentences_idx.append(idx)
                    
        else: # speaker == 'both'
            sentences = [tokenized_words[i][3:] for i in range(len(tokenized_words))]
            sentences_idx = list(range(len(tokenized_words)))
    
        # recombine 2D list of words into 1D list of sentence
        final_sentences = [" ".join(sentences[i]).lower() for i in range(len(sentences))]
        # convert sentence to vector with SentenceTransformer pretrained model
        sentence_embeddings = self.sent2vec.encode(final_sentences)
        
        # summary
        text_feature = {'speaker': speaker,
                        'sentence_embeddings': sentence_embeddings,
                        'sentences': final_sentences, 
                        'indices': sentences_idx}
        
        return text_feature
     
        
    def tokenize_corpus(self, corpus):
        '''tokenzie a given list of string into list of words
        Argument:
            corpus: 1D list of string, each element is a sting of sentence
        Return:
            tokens: 2D list of string, each raw is a list of words splitted from sentence 
        '''
        tokens = [x.split() for x in corpus]
        return tokens

    
class Padding(object):
    ''' pad zero to each feature matrix so that they all have the same size '''
    def __init__(self, text_output_size=(386, 768)):
        super(Padding, self).__init__()
        '''
        Each output size could be 'int' or 'tuple'. 
        Integer would be the number of desired rows
        and Tuple would be the desired 2D array size.
        
        Here is recommended to keep the number of columns 
        as they are and only set the number of rows with int
        
        To find the maximum length of rows, please use the 
        'find_max_length' function in utils to search through. 
        
        The value 389 are the maximum length in our case.
        '''
        assert isinstance(text_output_size, (int, tuple))
        self.text_output_size = text_output_size
    
    
    def __call__(self, session):
        sentence_embeddings = session['sentence_embeddings']
        
        # text padding
        if isinstance(self.text_output_size, int):
            shape = sentence_embeddings.shape
            assert self.text_output_size >= shape[0],\
                "audio output size should be bigger than {}".format(shape[0])
            padded_text = np.zeros((self.text_output_size, shape[1]))
            padded_text[:shape[0],:shape[1]] = sentence_embeddings
        else:
            shape = sentence_embeddings.shape
            assert self.text_output_size[0] >= shape[0] and self.text_output_size[1] >= shape[1],\
                "audio output size should be bigger than {}".format(shape)
            padded_text = np.zeros(self.text_output_size)
            padded_text[:shape[0],:shape[1]] = sentence_embeddings
        
        # summary
        padded_session = {'patientID': session['patientID'],
                          'session_path': session['session_path'],
                          'sentence_embeddings': padded_text, 
                          'phq_score_gt': session['phq_score_gt'], 
                          'phq_binay_gt': session['phq_binay_gt'],
                          'phq_subscores_gt': session['phq_subscores_gt'],
                          'gender_gt': session['gender_gt']}
        
        return padded_session
    

class ToTensor(object):
    """Convert ndarrays in sample to Tensors or np.int to torch.tensor."""
    
    def __call__(self, session):
        sentence_embeddings = session['sentence_embeddings']
        
        converted_session = {'patientID': session['patientID'],
                             'session_path': session['session_path'],
                             'sentence_embeddings': torch.from_numpy(session['sentence_embeddings']), 
                             'phq_score_gt': torch.tensor(session['phq_score_gt']), 
                             'phq_binay_gt': torch.tensor(session['phq_binay_gt']),
                             'phq_subscores_gt': session['phq_subscores_gt'],
                             'gender_gt': torch.tensor(session['gender_gt'])}
        
        return converted_session

## - utils.py

In [3]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# from skimage import io, transform
# from mpl_toolkits.mplot3d import Axes3D

In [4]:
# Data-related functions

def minmax_scaler(data):
    '''recale the data, which is a 2D matrix, to 0-1'''
    return (data - data.min(axis=1)[:,np.newaxis])/(data.max(axis=1) - data.min(axis=1))[:, np.newaxis]


def cosine_similarity(u, v):
    '''Calculate the similarity between 1D arrays'''
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))


def similarity_matrix(array):
    '''Calculate the similarity matrix by given a 2D array'''
    shape = array.shape
    similarity = np.zeros((shape[0],shape[0]))

    for i in range(shape[0]):
        for k in range(shape[0]):
            similarity[i][k] = cosine_similarity(array[i], array[k])

    return similarity


def load_text_file(text_path, speaker='Participant'):
    '''load transcript file and extract the text of the given speaker'''
    
    def tokenize_corpus(corpus):
        '''tokenzie a given list of string into list of words'''
        tokens = [x.split() for x in corpus]
        return tokens

    # only 'Ellie', 'Participant', 'both' are allow
    assert speaker in ['Ellie', 'Participant', 'both'],\
        "Argument --speaker could only be ['Ellie', 'Participant', 'both']"

    text_file = pd.read_csv(text_path)
    # tokenize the text file, filter out all \t space and unnecessary columns such as time, participent 
    tokenized_words = tokenize_corpus(text_file.values.tolist()[i][0] for i in range(text_file.shape[0]))

    sentences = []
    sentences_idx = []

    if speaker == 'Ellie':
        for idx, sentence in enumerate(tokenized_words):
            if sentence[2] == 'Ellie':
                sentences.append(sentence[3:])
                sentences_idx.append(idx)
    elif speaker == 'Participant':
        for idx, sentence in enumerate(tokenized_words):
            if sentence[2] == 'Participant':
                sentences.append(sentence[3:])
                sentences_idx.append(idx)

    else: # speaker == 'both'
        sentences = [tokenized_words[i][3:] for i in range(len(tokenized_words))]
        sentences_idx = list(range(len(tokenized_words)))

    # recombine 2D list of words into 1D list of sentence
    final_sentences = [" ".join(sentences[i]).lower() for i in range(len(sentences))]

    return final_sentences


def find_max_length(root_dir):
    '''find out the maximum lenghth of each features among all patients'''

    # initialize each value
    max_length = {'landmarks': 0, 
                  'gaze_samples': 0, 
                  'sentences': 0}

    for name in os.listdir(root_dir):
        name_path = os.path.join(root_dir, name)
        if os.path.isdir(name_path) and name.endswith('_P'):
            session = name.split('_')[0]
            print('searching through patient {} ...'.format(session))

            facial_landmarks_path = os.path.join(name_path, '{}_CLNF_features3D.txt'.format(session))
            gaze_direction_path = os.path.join(name_path, '{}_CLNF_gaze.txt'.format(session))
            text_path = os.path.join(name_path, '{}_TRANSCRIPT.csv'.format(session))

            facial_landmarks = pd.read_csv(facial_landmarks_path)
            if len(facial_landmarks) > max_length['landmarks']:
                max_length['landmarks'] = len(facial_landmarks)

            gaze_direction = pd.read_csv(gaze_direction_path)
            if len(gaze_direction) > max_length['gaze_samples']:
                max_length['gaze_samples'] = len(gaze_direction)

            sentences = load_text_file(text_path, speaker='Participant')
            if len(sentences) > max_length['sentences']:
                max_length['sentences'] = len(sentences)

    if max_length['gaze_samples'] != max_length['landmarks']:
        max_length['gaze_samples'] = max_length['landmarks']

    return max_length


def show_text_correlation(text_feature, start_sent, sent_len):
    """Show the correlation between each sentence.
    Arguments:
        text_feature: dict, one attribute of DepressionDataset, which
                      includes converted sentence embedding vectors (2D numpy.ndarray)
        start_sent: int, start index of the sentence you want
        sent_len: int, number of sentence you want to compare 
                  (size of correlation matrix)
    Return:
        plot the correlation matrix between sentences
    """
    # calculate correlation matrix
    correlation = np.corrcoef(text_feature['sentence_embeddings'][int(start_sent):int(start_sent+sent_len)])
    plt.figure(figsize=(12,12))
    # plot heatmap
    heatmap = sns.heatmap(correlation, annot=True,  fmt='.2g')  # cbar_kws={'label': 'correlation'}
    # set scale label
    heatmap.set_xticklabels(text_feature['indices'][int(start_sent):int(start_sent+sent_len)]) # rotation=-30
    heatmap.set_yticklabels(text_feature['indices'][int(start_sent):int(start_sent+sent_len)], rotation=0)
    # set label
    plt.xlabel("sentence number in conversation")
    plt.ylabel("sentence number in conversation") 
    plt.show()


def show_similarity_matrix(text_feature, start_sent, sent_len):
    '''plot the result of similarity matrix as heatmap'''
    # calculate similarity
    similarity = similarity_matrix(text_feature['sentence_embeddings'][int(start_sent):int(start_sent+sent_len)])
    # plot heatmap
    plt.figure(figsize=(16,16))
    heatmap = sns.heatmap(similarity, annot=True,  fmt='.2g')  # cbar_kws={'label': 'correlation'}
    # set scale label
    heatmap.set_xticklabels(text_feature['indices'][int(start_sent):int(start_sent+sent_len)]) # rotation=-30
    heatmap.set_yticklabels(text_feature['indices'][int(start_sent):int(start_sent+sent_len)], rotation=0)
    # set label
    plt.xlabel("sentence number in conversation")
    plt.ylabel("sentence number in conversation")
    plt.show()

##### test block

In [86]:
from torch.utils.data import DataLoader
from torchvision import transforms

# test 3: try to load the dataset with DataLoader
transformed_dataset = DepressionDataset(os.path.join(os.getcwd(), 'DAIC-WOZ Dataset'), 'train',
                                        transform=transforms.Compose([Padding(), ToTensor()]))
# create dataloader
dataloader = DataLoader(transformed_dataset, 
                        batch_size=2,
                        shuffle=False, 
                        num_workers=0)
# iterate through batches
for i_batch, sample_batched in enumerate(dataloader):
    print('Batch number: ', i_batch, ', sentence embeddings: ', sample_batched['sentence_embeddings'].size())
    print('=================================')

Batch number:  0 , sentence embeddings:  torch.Size([2, 386, 768])
Batch number:  1 , sentence embeddings:  torch.Size([2, 386, 768])
Batch number:  2 , sentence embeddings:  torch.Size([1, 386, 768])


In [None]:
show_similarity_matrix(getattr(transformed_dataset, 'text_feature'), 10, 10)

# Model

## - transformer.py

In [55]:
# d_model = emsize = embedding_dim
# d_hid = d_hid = feedforward_dim

In [2]:
import math
from typing import Tuple

import torch 
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
# from torch.utils.data import dataset
# import torch.nn.functional as F

In [56]:
class PositionalEncoding(nn.Module):

    def __init__(self, embedding_dim: int, dropout: float = 0.1, max_len: int = 500):
        super(PositionalEncoding, self).__init__()
        
        self.dropout = nn.Dropout(p=dropout)
        
        position = torch.arange(max_len).unsqueeze(1)  # shape: max_len x 1
        div_term = torch.exp(torch.arange(0, embedding_dim, 2) * (-math.log(10000.0) / embedding_dim)) # shape, (embedding_dim-1)//2 + 1
        pe = torch.zeros(max_len, 1, embedding_dim)  # shape: max_len x 1 x embedding_dim
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

    
class TransformerModel(nn.Module):
    
    def __init__(self,  
                 input_embedding_dim: int,
                 output_feature_dim:int,
                 nhead: int, 
                 feedforward_dim: int,
                 nlayers: int,
                 dropout: float = 0.5):  # ntoken
        super(TransformerModel, self).__init__()
        
        self.model_type = 'Transformer'
        self.input_embedding_dim = input_embedding_dim
        self.output_feature_dim = output_feature_dim

        self.pos_encoder = PositionalEncoding(input_embedding_dim, dropout)
        encoder_layers = TransformerEncoderLayer(input_embedding_dim, nhead, feedforward_dim, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        
##################################################################
        self.mlp_head = nn.Linear(input_embedding_dim, output_feature_dim)

#         self.mlp_head = nn.Sequential(
#             nn.LayerNorm(input_embedding_dim),
#             nn.Linear(input_embedding_dim, output_feature_dim)
#         )
#         self.decoder = nn.Linear(embedding_dim, ntoken) # ntoken = output dimension, set it yourself
#         self.encoder = nn.Embedding(ntoken, embedding_dim)
##################################################################
        
        self.init_weights()
    
    
    def init_weights(self) -> None:
        init_range = 0.1
        self.mlp_head.bias.data.zero_()
        self.mlp_head.weight.data.uniform_(-init_range, init_range)
#         self.encoder.weight.data.uniform_(-init_range, init_range)
        
    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
#         src = self.encoder(src) * math.sqrt(self.embedding_dim)
        src = self.pos_encoder(src * math.sqrt(self.input_embedding_dim))
        output = self.transformer_encoder(src, src_mask)
        output = self.mlp_head(output)
        
        return output
    
    
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

##### test block

In [57]:
# Initiate an instance
# The model hyperparameters are defined below. The vocab size is equal to the length of the vocab object.
test_input_embedding_dim = 768  # embedding dimension
test_output_feature_dim = 1024  # output feature dimension
test_nhead = 2  # number of heads in nn.MultiheadAttention
test_feedforward_dim = 768  # dimension of the feedforward network model in nn.TransformerEncoder
test_nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
test_dropout = 0.2  # dropout probability

# test_ntokens = 386  # size of vocabulary

test_model = TransformerModel(test_input_embedding_dim,
                              test_output_feature_dim,
                              test_nhead,
                              test_feedforward_dim, 
                              test_nlayers,
                              test_dropout)  # .to(device)

##### Run the model - example code

We use `CrossEntropyLoss` with the `SGD (stochastic gradient descent)` optimizer. The `learning rate` is initially set to 5.0 and follows a `StepLR schedule`. During training, we use `nn.utils.clip_grad_norm_` to prevent gradients from exploding.

In [60]:
import copy
import time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# config ...
# global variable ...
bptt = 35
batch_size = 20
eval_batch_size = 10

# create dataloader
# TODO

# create the model
test_model = TransformerModel(test_ntokens,
                              test_embedding_dim,
                              test_nhead,
                              test_feedforward_dim, 
                              test_nlayers,
                              test_dropout).to(device)

# create criterion, optimizer, scheduler
criterion = nn.CrossEntropyLoss()
lr = 5.0
optimizer = torch.optim.SGD(test_model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [None]:
def train(model: nn.Module) -> None:
    
    # turn in train mode
    test_model.train()
    
    # variable initialization
    total_loss = 0.
    log_interval = 200
    start_time = time.time()
    src_mask = generate_square_subsequent_mask().to(device)
    
    num_batches = len(train_data) // bptt
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
#         data, targets = get_batch(train_data, i)
#         batch_size = data.size(0)
        if batch_size != bptt:  # only on last batch
            src_mask = src_mask[:batch_size, :batch_size]
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()
            
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            batch_size = data.size(0)
            if batch_size != bptt:
                src_mask = src_mask[:batch_size, :batch_size]
            output = model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += batch_size * criterion(output_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

In [None]:
best_val_loss = float('inf')
epochs = 3
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    val_loss = evaluate(model, val_data)
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    scheduler.step()

In [None]:
test_loss = evaluate(best_model, test_data)
test_ppl = math.exp(test_loss)
print('=' * 89)
print(f'| End of training | test loss {test_loss:5.2f} | '
      f'test ppl {test_ppl:8.2f}')
print('=' * 89)

## - evaluator.py (MUSDL)

In [None]:
import torch.nn as nn

# TODO !!!
# from opts import *

# TODO !!!

In [None]:
class MLP_block(nn.Module):

    def __init__(self, feature_dim, output_dim):
        super(MLP_block, self).__init__()
        self.activation = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)
        self.layer1 = nn.Linear(feature_dim, 256)
        self.layer2 = nn.Linear(256, 128)
        self.layer3 = nn.Linear(128, output_dim)

    def forward(self, x):
        x = self.activation(self.layer1(x))
        x = self.activation(self.layer2(x))
        output = self.softmax(self.layer3(x))
        return output


class Evaluator(nn.Module):

    def __init__(self, feature_dim, output_dim, num_subscores=None):
        super(Evaluator, self).__init__()

        self.model_type = 'MUSDL'
        
        assert num_subscores is not None, 'num_subscores is required in MUSDL'
        self.evaluator = nn.ModuleList([MLP_block(feature_dim, output_dim) for _ in range(num_subscores)])

    def forward(self, feats_avg):  # data: NCTHW
        probs = [evaluator(feats_avg) for evaluator in self.evaluator]  # len=num_subscores
        return probs

# Configs

In [None]:
# where to store the outputs either relative path or absolute
OUTPUT_DIR: exp
# where to store the all the checkpoints of the model
CKPTS_DIR: ckpts
# randon seed 
MANUAL_SEED: 1
# A: audio, V: visual, L: linguistic(text), AVL: combination
# choices=['A+MUSDL', 'V+MUSDL', 'L+MUSDL','AV+MUSDL', 'AL+MUSDL', 'AVL+MUSDL']
TYPE: L+MUSDL
LOG_TITLE: L+MUSDL

DATA:
  ROOT_DIR: None
  BATCH_SIZE: 2
  SHUFFLE: False
  NUM_WORKERS: 2
    
MODEL:
  PATH: model_weights # path to folder where stores the best model weights
  WEIGHTS: new  # new, last or custom path
  epochs:
  TRANSFORMER:
    INPUT_EMBEDDING_DIM: 768
    OUTPUT_FEATURE_DIM: 1024
    N_HEAD: 2
    FEEDFORWARD_DIM: 768
    N_LAYERS: 2
    DROPOUT: 0.2
  MUSDL:
    N_CLASSES: 4  # [0, 1, 2, 3]
    N_SUBSCORES: 8
        


In [None]:
import argparse

In [None]:
def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_file',
                        type=str,
                        help="path to yaml file",
                        required=True,
                        default=None)
    
    parser.add_argument('--device',
                        type=str,
                        help="set up torch device: 'cpu' or 'cuda' (GPU)",
                        required=False,
                        default=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    
    # remember to set the gpu device number 
    parser.add_argument('--gpu',
                        type=str,
                        help='id of gpu device(s) to be used',
                        default='2, 3')

    return parser

# utils.py

In [None]:
import os
import sys
import random
import logging
import numpy as np


import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms

# local functions
from dataset import DepressionDataset, Padding, ToTensor
from models.transformer import TransformerModel
from models.evaluator import Evaluator


# import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt

In [62]:
def init_seed(manual_seed):
    """
    Set random seed for torch and numpy.
    """
    random.seed(manual_seed)
    np.random.seed(manual_seed)
    torch.manual_seed(manual_seed)

    torch.cuda.manual_seed_all(manual_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    
def get_logger(filepath, log_title):
    logger = logging.getLogger(filepath)
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler(filepath)
    fh.setLevel(logging.INFO)
    logger.addHandler(fh)
    logger.info('-' * 30 + log_title + '-' * 30)
    return logger


def log_and_print(logger, msg):
    logger.info(msg)
    print(msg)


def worker_init_fn(worker_id):
    """
    Init worker in dataloader.
    """
    np.random.seed(np.random.get_state()[1][0] + worker_id)
    



def get_dataloaders(config):
    
    train_dataset = DepressionDataset(config['ROOT_DIR'], 'train',
                                      transform=transforms.Compose([Padding(), ToTensor()]))
    validation_dataset = DepressionDataset(config['ROOT_DIR'], 'dev',
                                           transform=transforms.Compose([Padding(), ToTensor()]))
    
    dataloaders = {}
    
    dataloaders['train'] = DataLoader(DepressionDataset,
                                      batch_size=config['BATCH_SIZE'],
                                      shuffle=config['SHUFFLE'], 
                                      num_workers=config['NUM_WORKERS'])
    
    dataloaders['validation'] = DataLoader(DepressionDataset, 
                                           batch_size=config['BATCH_SIZE'],
                                           shuffle=config['SHUFFLE'],
                                           num_workers=config['NUM_WORKERS'])
    
    return dataloaders


def get_models(config, args):
    """
    Get the Transformer encoder backbone and the evaluator(MUSDL) with parameters moved to GPU.
    """
    
    transformer = TransformerModel(config['TRANSFORMER']['INPUT_EMBEDDING_DIM'], 
                                   config['TRANSFORMER']['OUTPUT_FEATURE_DIM'],
                                   config['TRANSFORMER']['N_HEAD'],
                                   config['TRANSFORMER']['FEEDFORWARD_DIM'],
                                   config['TRANSFORMER']['N_LAYERS'],
                                   config['TRANSFORMER']['DROPOUT'])
        
    evaluator = Evaluator(config['TRANSFORMER']['OUTPUT_FEATURE_DIM'],
                          config['MUSDL']['N_CLASSES'], 
                          config['MUSDL']['N_SUBSCORES'])

    if len(args.gpu.split(',')) > 1:
        transformer = nn.DataParallel(transformer)
        evaluator = nn.DataParallel(evaluator)
    
    # move to GPU
    transformer = transformer.to(args.device)
    evaluator = evaluator.to(args.device)
    
    return transformer, evaluator



# Main

Overall architecture of system folders and scripts

```
[Text+MUSDL]
 ├── [models]
 │    ├── transformer.py (Transformer model)
 │    └── evaluator.py (MUSDL model)
 ├── [dataset]
 │    ├── dataset.py
 │    └── utils.py
 ├── [cfg]
 │    ├── train_config.yaml
 │    ├── test_config.yaml
 │    └── inference_config.yaml
 ├── utils.py
 ├── train.py
 ├── test.py
 ├── inference.py
 │
 ├── [model_weights]
 │    └── store only one of the best model overall
 ├── [exp]
 │    └── 存所有實驗結果, train, test, inference
 ├── [ckpts]
 │    └── 暫存所有model weights
 └── ...
```

# main: Train

In [None]:
# TODO

import os
import sys

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms

from scipy import stats
from tqdm import tqdm
from autolab_core import YamlConfig

# local functions
from dataset import DepressionDataset, Padding, ToTensor
from models.transformer import TransformerModel
from models.evaluator import Evaluator

######################################################
from utils import *
# from opts import *
# from config import get_parser

# TODO

In [None]:

def compute_score(model_type, probs, data):
    if model_type == 'USDL':
        pred = probs.argmax(dim=-1) * (label_max / (output_dim['USDL']-1))
    else:
        # calculate expectation & denormalize & sort
        judge_scores_pred = torch.stack([prob.argmax(dim=-1) * judge_max / (output_dim['MUSDL']-1)
                                         for prob in probs], dim=1).sort()[0]  # N, 7

        # keep the median 3 scores to get final score according to the rule of diving
        pred = torch.sum(judge_scores_pred[:, 2:5], dim=1) * data['difficulty'].cuda()
    return pred


def compute_loss(model_type, criterion, probs, data):
    if model_type == 'USDL':
        loss = criterion(torch.log(probs), data['soft_label'].cuda())
    else:
        loss = sum([criterion(torch.log(probs[i]), data['soft_judge_scores'][:, i].cuda()) for i in range(num_judges)])
    return loss


def main(dataloaders, i3d, evaluator, base_logger, args):
    # print configuration
    '''
    use: print(config.file_contents)
    also: store the config.save(os.path.join(path, name))
        path: config['OUTPUT_DIR']
        name: config['SAVE_CONFIG_NAME']
    '''
    print('=' * 40)
    for k, v in vars(args).items():
        print(f'{k}: {v}')
    print('=' * 40)

    criterion = nn.KLDivLoss()
    optimizer = torch.optim.Adam([*i3d.parameters()] + [*evaluator.parameters()],
                                 lr=args.lr, weight_decay=args.weight_decay)

    epoch_best = 0
    rho_best = 0
    for epoch in range(args.num_epochs):
        log_and_print(base_logger, f'Epoch: {epoch}  Current Best: {rho_best} at epoch {epoch_best}')

        for split in ['train', 'test']:
            true_scores = []
            pred_scores = []

            if split == 'train':
                i3d.train()
                evaluator.train()
                torch.set_grad_enabled(True)
            else:
                i3d.eval()
                evaluator.eval()
                torch.set_grad_enabled(False)

            for data in tqdm(dataloaders[split]):
                true_scores.extend(data['final_score'].numpy())
                videos = data['video'].cuda()
                videos.transpose_(1, 2)  # N, C, T, H, W

                batch_size, C, frames, H, W = videos.shape
                clip_feats = torch.empty(batch_size, 10, feature_dim).cuda()
                for i in range(9):
                    clip_feats[:, i] = i3d(videos[:, :, 10 * i:10 * i + 16, :, :]).squeeze(2)
                clip_feats[:, 9] = i3d(videos[:, :, -16:, :, :]).squeeze(2)

                probs = evaluator(clip_feats.mean(1))
                preds = compute_score(args.type, probs, data)
                pred_scores.extend([i.item() for i in preds])

                if split == 'train':
                    loss = compute_loss(args.type, criterion, probs, data)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            rho, p = stats.spearmanr(pred_scores, true_scores)

            log_and_print(base_logger, f'{split} correlation: {rho}')

        if rho > rho_best:
            rho_best = rho
            epoch_best = epoch
            log_and_print(base_logger, '-----New best found!-----')
            if args.save:
                torch.save({'epoch': epoch,
                            'i3d': i3d.state_dict(),
                            'evaluator': evaluator.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'rho_best': rho_best}, f'ckpts/{args.type}.pt')

In [None]:
if __name__ == '__main__':
    
    args = get_parser().parse_args()
    
    # set up GPU
    if args.device == 'cuda':
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    
    # load config file into dict() format
    config = YamlConfig(args.config_file)    
    
    # create output folder if not exist
    if not os.path.exists(config['OUTPUT_DIR']):
        os.mkdir(config['OUTPUT_DIR'])
    if not os.path.exists(config['CKPTS_DIR']):
        os.mkdir(config['CKPTS_DIR'])
    if not os.path.exists(config['MODEL']['PATH']):
        os.mkdir(config['MODEL']['PATH'])
    
    # initialize random seed for torch and numpy
    init_seed(config['MANUAL_SEED'])
    
    # get logger os.path.join(config['OUTPUT_DIR'], f'{config['TYPE']}_{config['LOG_TITLE']}.log')
    file_name = os.path.join(config['OUTPUT_DIR'], '{}_{}.log'.format(config['TYPE'], config['LOG_TITLE']))
    base_logger = get_logger(file_name, config['LOG_TITLE'])
    # get dataloaders
    dataloaders = get_dataloaders(config['DATA'])
    # get models
    transformer, evaluator = get_models(config['MODEL'], args)
    

    main(dataloaders, i3d, evaluator, base_logger, args)

# Decide how to store the checkpoints

In [3]:
from datetime import datetime

# 先決定如何存ckpts
if args.save:
    # get the time for the file name
    timestamp = datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')  # exact: strftime('%Y-%m-%d %H%M%S%f')[:-3]
    
    # save
    file_path = os.path.join(config['CKPTS_DIR'], 
                             '{}_{}.pt'.format(config['TYPE'], timestamp))
    torch.save({'epoch': epoch,
                'i3d': i3d.state_dict(),
                'evaluator': evaluator.state_dict(),
                'optimizer': optimizer.state_dict(),
                'rho_best': rho_best}, file_path)

'''EXP
[ckpts (CKPTS_DIR)]
 └── [L+MUSDL (TYPE)]
      ├── L+MUSDL_2021-12-28_114455
      ├── L+MUSDL_2021-12-28_114458
      ├── L+MUSDL_2021-12-28_114501
      ├── L+MUSDL_2021-12-28_114504
      └── ...
'''

In [4]:
datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')  # strftime('%Y-%m-%d %H%M%S%f')[:-3]

'2022-01-04_152837'

In [5]:
# another way to get time

from datetime import datetime
print(datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3])
# >>>> OUTPUT >>>>
# 2020-05-04 10:18:32.926


import time 
print(time.time())

t = time.localtime()
timestamp = time.strftime('%Y-%m-%d_%H%M%S', t)
print(timestamp)

2022-01-04 15:29:04.898
1641310144.906261
2022-01-04_162904


## function for find the last checkpoint

In [None]:
def find_last_ckpts(path, key, date=None):
    """Finds the last checkpoint file of the last trained model in the
    model directory.
    Arguments:
        path: str, path to the checkpoint
        key: str, model type
        date: str, a specific date in string format 'YYYY-MM-DD'
    Returns:
        The path of the last checkpoint file
    """
    ckpts = list(sorted(os.listdir(path)))
    
    if date is not None:
        # match the date format
        date_format = "%Y-%m-%d"
        try:
            datetime.strptime(date, date_format)
            # print("This is the correct date string format.")
            matched = True
        except ValueError:
            # print("This is the incorrect date string format. It should be YYYY-MM-DD")
            matched = False
            
        assert matched, "The given date is the incorrect date string format. It should be YYYY-MM-DD"
        key = '{}_{}'.format(key, date)
    else:
        key = str(key)
    
    # filter the files
    ckpts = list(filter(lambda f:f.startswith(key), ckpts))
    # get whole file path
    last_ckpt = os.path.join(path, ckpts[-1])
    
    return last_ckpt
    

## visualize the function of generate_square_subsequent_mask 

In [2]:
import torch

def generate_square_subsequent_mask(sz: int):
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

result = generate_square_subsequent_mask(35)

In [3]:
result

tensor([[0., -inf, -inf,  ..., -inf, -inf, -inf],
        [0., 0., -inf,  ..., -inf, -inf, -inf],
        [0., 0., 0.,  ..., -inf, -inf, -inf],
        ...,
        [0., 0., 0.,  ..., 0., -inf, -inf],
        [0., 0., 0.,  ..., 0., 0., -inf],
        [0., 0., 0.,  ..., 0., 0., 0.]])

# other useful function for utils

In [None]:
def standard_confusion_matrix(y_test, y_test_pred):
    """
    Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_true : ndarray - 1D
    y_pred : ndarray - 1D

    Returns
    -------
    ndarray - 2D
    """
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred)
    return np.array([[tp, fp], [fn, tn]])

def model_performance(y_test, y_test_pred_proba):
    """
    Evaluation metrics for network performance.
    """
#     y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1]
    y_test_pred = y_test_pred_proba

    # Computing confusion matrix for test dataset
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    return y_test_pred, conf_matrix

def plot_roc_curve(y_test, y_score):
    """
    Plots ROC curve for final trained model. Code taken from:
    https://vkolachalama.blogspot.com/2016/05/keras-implementation-of-mlp-neural.html
    """
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    plt.legend(loc="lower right")
    plt.savefig(prefix+'images/BiLSTM_roc.png')
    plt.close()