In [2]:
# -*- coding: utf-8 -*-
import argparse

"""
Created on Mon Oct 25 14:07:29 2021

@author: chumache
"""
import os
import json
import numpy as np
import torch
from torch import nn, optim
from torch.optim import lr_scheduler

from opts import parse_opts
from model import generate_model
import transforms 
from dataset import get_training_set, get_validation_set, get_test_set
from utils import adjust_learning_rate, save_checkpoint
from train import train_epoch
from validation import val_epoch
import time
import logging
from torch.autograd import Variable
import time
from utils import AverageMeter, calculate_accuracy
def parse_opts(argv=None):
    """Build the option namespace used by the ensemble evaluation script.

    Args:
        argv: Optional list of command-line style tokens to parse, e.g.
            ``['--k1', '10']``. When ``None`` (the default) an empty list is
            parsed, so every option takes its declared default and
            ``sys.argv`` is never consulted.

    Returns:
        argparse.Namespace with one attribute per option declared below.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--phase', default='train', type=str)
    parser.add_argument('--certification_method', default='MMCert', type=str)
    # Integer options carry real int defaults instead of numeric strings.
    parser.add_argument('--k1', default=6, type=int)
    parser.add_argument('--k2', default=900, type=int)
    parser.add_argument('--k', default=3000, type=int)
    parser.add_argument('--annotation_path', default='ravdess_preprocessing/annotations.txt', type=str, help='Annotation file path')
    parser.add_argument('--result_path', default='results', type=str, help='Result directory path')
    parser.add_argument('--store_name', default='model', type=str, help='Name to store checkpoints')
    parser.add_argument('--dataset', default='RAVDESS', type=str, help='Used dataset. Currently supporting Ravdess')
    parser.add_argument('--n_classes', default=8, type=int, help='Number of classes')

    parser.add_argument('--model', default='multimodalcnn', type=str, help='')
    parser.add_argument('--num_heads', default=1, type=int, help='number of heads, in the paper 1 or 4')

    parser.add_argument('--device', default='cuda', type=str, help='Specify the device to run. Defaults to cuda, fallsback to cpu')

    parser.add_argument('--sample_size', default=224, type=int, help='Video dimensions: ravdess = 224 ')
    parser.add_argument('--sample_duration', default=15, type=int, help='Temporal duration of inputs, ravdess = 15')

    parser.add_argument('--learning_rate', default=0.04, type=float, help='Initial learning rate (divided by 10 while training by lr scheduler)')
    parser.add_argument('--momentum', default=0.9, type=float, help='Momentum')
    # NOTE(review): values given explicitly are parsed as float although the
    # help text describes epochs; the default list holds ints.
    parser.add_argument('--lr_steps', default=[40, 55, 65, 70, 200, 250], type=float, nargs="+", metavar='LRSteps', help='epochs to decay learning rate by 10')
    parser.add_argument('--dampening', default=0.9, type=float, help='dampening of SGD')
    parser.add_argument('--weight_decay', default=1e-3, type=float, help='Weight Decay')
    parser.add_argument('--lr_patience', default=10, type=int, help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.')
    parser.add_argument('--batch_size', default=8, type=int, help='Batch Size')
    parser.add_argument('--n_epochs', default=100, type=int, help='Number of total epochs to run')

    parser.add_argument('--begin_epoch', default=1, type=int, help='Training begins at this epoch. Previous trained model indicated by resume_path is loaded.')
    parser.add_argument('--resume_path', default='', type=str, help='Save data (.pth) of previous training')
    parser.add_argument('--pretrain_path', default='Pretrained_EfficientFace.tar', type=str, help='Pretrained model (.pth), efficientface')
    # store_true flags already default to False, so no set_defaults is needed.
    parser.add_argument('--no_train', action='store_true', help='If true, training is not performed.')
    parser.add_argument('--no_val', action='store_true', help='If true, validation is not performed.')
    parser.add_argument('--test', action='store_true', help='If true, test is performed.')
    # NOTE(review): the default is forced to True, so passing --test changes
    # nothing and testing cannot be switched off through this flag.
    parser.set_defaults(test=True)
    parser.add_argument('--test_subset', default='test', type=str, help='Used subset in test (val | test)')

    parser.add_argument('--n_threads', default=1, type=int, help='Number of threads for multi-thread loading')
    parser.add_argument('--video_norm_value', default=255, type=int, help='If 1, range of inputs is [0-255]. If 255, range of inputs is [0-1].')

    parser.add_argument('--manual_seed', default=1, type=int, help='Manually set random seed')
    parser.add_argument('--fusion', default='ia', type=str, help='fusion type: lt | it | ia')
    parser.add_argument('--mask', type=str, help='dropout type : softhard | noise | nodropout', default='softhard')

    # An explicit empty list (rather than None) keeps argparse away from
    # sys.argv; presumably needed because this runs inside a notebook kernel
    # whose argv holds unrelated arguments.
    args = parser.parse_args([] if argv is None else list(argv))

    return args

def val_ensemble(data_loader, model, criterion, opt, dist=None, n_trials=100):
    """Collect class predictions from repeated evaluation passes over a loader.

    The model is switched to eval mode and the whole ``data_loader`` is
    traversed ``n_trials`` times with gradients disabled. For every pass the
    arg-max class of each sample and its ground-truth label are recorded.

    NOTE(review): repeated passes only differ if the model (or the loader) is
    stochastic in eval mode -- presumably the certified model ablates its
    inputs randomly; confirm against the model definition.

    Args:
        data_loader: Iterable yielding ``(inputs_audio, inputs_visual,
            targets)`` batches. ``inputs_visual`` must be 5-D; its dims 1 and
            2 are swapped and the first two dims are then merged before the
            forward pass.
        model: Callable module invoked as ``model(audio, visual)`` returning
            per-class scores with the class axis at dim 1.
        criterion: Unused; kept so existing call sites keep working. The
            loss/accuracy/timing meters this function used to fill were never
            read, so they are no longer computed.
        opt: Unused; kept for signature compatibility.
        dist: Unused; kept for signature compatibility.
        n_trials: Number of full passes over ``data_loader`` (default 100).

    Returns:
        Tuple ``(all_preds, all_gts)``: two lists of length ``n_trials``.
        Each entry is a 1-D CPU tensor with one element per sample seen in
        that pass (every sample of every batch, not only the first one).
    """
    print("val_ensemble")
    model.eval()

    all_preds = []
    all_gts = []
    with torch.no_grad():
        for trial in range(n_trials):
            print(trial)
            trial_preds = []
            trial_gts = []
            for inputs_audio, inputs_visual, targets in data_loader:
                # Swap dims 1 and 2, then fold the new dim 1 into the batch
                # dim (presumably frames folded into the batch -- TODO confirm).
                inputs_visual = inputs_visual.permute(0, 2, 1, 3, 4)
                n, d1, d2, d3, d4 = inputs_visual.shape
                inputs_visual = inputs_visual.reshape(n * d1, d2, d3, d4)

                outputs = model(inputs_audio.float(), inputs_visual.float())

                # Keep the whole batch so batch sizes above 1 lose no samples.
                trial_preds.append(torch.argmax(outputs, dim=1).cpu())
                trial_gts.append(targets.reshape(-1).cpu())

            if trial_preds:
                all_preds.append(torch.cat(trial_preds))
                all_gts.append(torch.cat(trial_gts))
            else:
                # torch.cat rejects an empty list; emit empty label tensors.
                all_preds.append(torch.empty(0, dtype=torch.long))
                all_gts.append(torch.empty(0, dtype=torch.long))
    return all_preds, all_gts
if __name__ == '__main__':
    # Script entry point: load a trained checkpoint, run the repeated
    # evaluation passes on the test set and store all predictions together
    # with the ground-truth labels under ``output/``.

    opt = parse_opts()

    # Fall back to CPU when CUDA was requested but is not available.
    if opt.device != 'cpu':
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # NOTE: from here on ``opt.result_path`` holds the path of the checkpoint
    # FILE to evaluate, not a result directory. The absolute paths below are
    # machine specific and must be adapted to the local setup.
    if opt.certification_method == "MMCert":
        opt.result_path = "C:\\Users\\dongs\\Documents\\multimodal-emotion-recognition-main\\results\\MMCert_multimodalcnn_15_best0.pth"
        output_file = 'output/'+opt.certification_method+"_k1="+str(opt.k1)+"_k2="+str(opt.k2)+'_all_outputs.pth'
    elif opt.certification_method == "randomized_ablation":
        #opt.result_path = "C:\\Users\\dongs\\Documents\\multimodal-emotion-recognition-main\\results\\randomized_ablation_multimodalcnn_15_checkpoint0.pth"
        opt.result_path = "C:\\Users\\dongs\\Documents\\multimodal-emotion-recognition-main\\results\\randomized_ablation_multimodalcnn_15_best1.pth"
        #opt.result_path = "C:\\Users\\dongs\\Documents\\multimodal-emotion-recognition-main\\results\\randomized_ablation_multimodalcnn_15_best0.pth"
        output_file = 'output/'+opt.certification_method+"_k="+str(opt.k)+'_all_outputs.pth'
    else:
        # Previously an unknown method ran the evaluation and then silently
        # discarded the results; fail before doing any work instead.
        raise ValueError(
            "Unsupported certification_method: {!r} "
            "(expected 'MMCert' or 'randomized_ablation')".format(opt.certification_method))

    # A missing checkpoint used to be "fixed" by creating a directory with the
    # checkpoint's name, which made torch.load fail later with a confusing
    # error. Report the real problem up front.
    if not os.path.isfile(opt.result_path):
        raise FileNotFoundError("Checkpoint file not found: " + opt.result_path)

    # Create the output directory before the long evaluation so the final
    # torch.save cannot fail because the directory is missing.
    os.makedirs('output', exist_ok=True)

    opt.arch = '{}'.format(opt.model)
    opt.store_name = '_'.join([opt.dataset, opt.model, str(opt.sample_duration)])

    print(opt)
    torch.manual_seed(opt.manual_seed)
    model, parameters = generate_model(opt)
    print("model_created")
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(opt.device)

    video_transform = transforms.Compose([
        transforms.ToTensor(opt.video_norm_value)])

    test_data = get_test_set(opt, spatial_transform=video_transform)
    print("dataset_created")

    # Load the selected checkpoint; map_location lets a checkpoint saved on
    # GPU be restored when running on the CPU fallback.
    best_state = torch.load(opt.result_path, map_location=opt.device)
    model.load_state_dict(best_state['state_dict'])
    print("model_loaded")

    # batch_size=1 and shuffle=False: one sample per step, in dataset order.
    test_loader = torch.utils.data.DataLoader(
        test_data,
        batch_size=1,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True)
    all_preds, all_gts = val_ensemble(test_loader, model, criterion, opt)

    print(all_preds)
    torch.save((all_preds, all_gts), output_file)
                


Namespace(phase='train', certification_method='MMCert', k1=6, k2=900, k=3000, annotation_path='ravdess_preprocessing/annotations.txt', result_path='C:\\Users\\dongs\\Documents\\multimodal-emotion-recognition-main\\results\\MMCert_multimodalcnn_15_best0.pth', store_name='RAVDESS_multimodalcnn_15', dataset='RAVDESS', n_classes=8, model='multimodalcnn', num_heads=1, device='cuda', sample_size=224, sample_duration=15, learning_rate=0.04, momentum=0.9, lr_steps=[40, 55, 65, 70, 200, 250], dampening=0.9, weight_decay=0.001, lr_patience=10, batch_size=8, n_epochs=100, begin_epoch=1, resume_path='', pretrain_path='Pretrained_EfficientFace.tar', no_train=False, no_val=False, test=True, test_subset='test', n_threads=1, video_norm_value=255, manual_seed=1, fusion='ia', mask='softhard', arch='multimodalcnn')
Initializing efficientnet
Total number of trainable parameters:  1854766
model_created
dataset_created
model_loaded
val_ensemble
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
2