In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [1]:
import importlib
import yaml, itertools
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio

import pandas as pd
import random

import numpy as np

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import os
import glob

import seaborn as sns


from sklearn.metrics import balanced_accuracy_score, confusion_matrix

In [31]:
optimal_batch_size = 8
device = 'cuda'

# MIPS(Million Instructions Per Second)

bundle = torchaudio.pipelines.WAV2VEC2_BASE
base_w2v = bundle.get_model()
base_w2v.to(device)
base_w2v.eval()
print(base_w2v.training)

for sec in [15, 10, 5, 3]:
    dummy_input = torch.randn(optimal_batch_size, int(16000*sec), dtype=torch.float).to(device)
    repetitions=100
    total_time = 0
    with torch.no_grad():
        for rep in range(repetitions):
            starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
            starter.record()
            _ = base_w2v(dummy_input)
            ender.record()
            torch.cuda.synchronize()
            curr_time = starter.elapsed_time(ender)/1000
            total_time += curr_time
    Throughput =  (repetitions*optimal_batch_size)/total_time
    print(sec,' Final Throughput:',Throughput)

False
15  Final Throughput: 79.69875661325088
10  Final Throughput: 125.31364764875822
5  Final Throughput: 247.69535615132457
3  Final Throughput: 413.5603851008161


In [32]:
optimal_batch_size = 8
device = 'cuda'

# MIPS(Million Instructions Per Second)

bundle = torchaudio.pipelines.WAV2VEC2_LARGE
base_w2v = bundle.get_model()
base_w2v.to(device)
base_w2v.eval()
print(base_w2v.training)

for sec in [15, 10, 5, 3]:
    dummy_input = torch.randn(optimal_batch_size, int(16000*sec), dtype=torch.float).to(device)
    repetitions=100
    total_time = 0
    with torch.no_grad():
        for rep in range(repetitions):
            starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
            starter.record()
            _ = base_w2v(dummy_input)
            ender.record()
            torch.cuda.synchronize()
            curr_time = starter.elapsed_time(ender)/1000
            total_time += curr_time
    Throughput =  (repetitions*optimal_batch_size)/total_time
    print(sec,' Final Throughput:',Throughput)

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_large_ls960.pth" to /home/ubuntu/.cache/torch/hub/checkpoints/wav2vec2_fairseq_large_ls960.pth


  0%|          | 0.00/1.18G [00:00<?, ?B/s]

False
15  Final Throughput: 34.51715504113194
10  Final Throughput: 52.420661430209655
5  Final Throughput: 110.9933839696896
3  Final Throughput: 198.5234116959865


In [2]:
def make_color_list(emo_list):
    color = {'neu':'slategrey', 'ang':'crimson', 'hap':'gold', 
             'sad':'darkblue',  'dis':'plum', 
             'sur':'steelblue', 'fea':'olivedrab'}
    
    color_list = []
    for e in emo_list:
        c = color[e[:3]]
        color_list.append(c)
        
    return color_list

def make_center_color(color_list):
    color = {'slategrey':'black', 'crimson':'orangered', 'gold':'yellow', 
             'darkblue':'blue',  'plum':'purple', 
             'steelblue':'dodgerblue', 'olivedrab':'darkolivegreen'}
    
    center_list = []
    for e in color_list:
        c = color[e]
        center_list.append(c)
        
    return center_list

In [3]:
class IEMOCAP_Dataset(Dataset):
    def __init__(self, data, hparams):
        
        self.hparams = hparams
        self.csv = data
        self.output_class = hparams['use_class']
        self.vad = hparams['vad']
        self.sr = 16000

        self.aranged_id_num = list(range(1,11))
        self.aranged_id_num.remove(2*hparams['fold_num']-1)
        self.aranged_id_num.remove(2*hparams['fold_num'])
    
    def __len__(self):
        return len(self.csv)

    def __getitem__(self, idx):

        data = self.csv.iloc[idx]
        ids = data['session']
        ans = self.output_class.index(data['emotion'])

        try:
            pid = self.aranged_id_num.index(data['id_num'])
        except:
            if data['id_num'] % 2 == 0:
                pid = 8
            else:
                pid = 9

        if self.vad:
            wav_path = '/media/ubuntu/SSD2/Dataset/IEMOCAP_VAD/Session'+str(data['fold'])\
                        +'/' + '_'.join(ids.split('_')[:-1]) + '/' + ids +'.wav'
        else:
            wav_path = '/media/ubuntu/SSD2/Dataset/IEMOCAP_full_release/Session'+str(data['fold'])\
                         +'/sentences/wav/' + '_'.join(ids.split('_')[:-1]) + '/' + ids +'.wav'
            if os.getcwd().split('/')[2] == 'cvnar2':
                wav_path = wav_path.replace('/media/ubuntu/SSD2/Dataset', '/home/cvnar2/Desktop/ssd')


        wav, sr = torchaudio.load(wav_path, normalize=True)
        if self.hparams['repeat_3_sec']:
            if wav.shape[-1] / self.sr < 3.0:
                n_repeat = int(3 // (wav.shape[-1] / self.sr))
                wav = wav.repeat((1, n_repeat))
                
        if (data['sec'] > self.hparams['max_sec']):
            max_len = int(16000 * self.hparams['max_sec'])
            offset = random.randint(0, wav.shape[1] - max_len - 1)
            wav = wav[:, offset:offset+max_len]

        inputs = wav.transpose(0, 1).squeeze(1)
        
        return inputs, ans, ids

In [4]:
def Pad_Collate(samples):
    from torch.nn.utils.rnn import pad_sequence
    """
    DataLoader collate_fn
    """
    #for i in range(len(samples)):
    #    print(samples[i][0])
    #    print()
    inputs = [sample[0].squeeze(0) for sample in samples]  
    padded_inputs = pad_sequence(inputs, batch_first=True)

    labels = [sample[1] for sample in samples]
    labels = torch.Tensor(labels).float()
    labels = labels.unsqueeze(1)

    ids = [sample[2] for sample in samples]
        
    return padded_inputs, labels, ids

In [5]:
main_path = 'weight/main_final2_load_emo/freeze'#'Vox_IEMO(3)/main_final2_load_emo_beta/all_finetune_beta(05)'
exp_list = os.listdir(main_path)
exp_list.sort()

bz = 1
if os.path.isdir(main_path + '/plot'):
    pass
else:
    os.mkdir(main_path + '/plot')

In [7]:
exp_list

['1_230413_0649',
 '2_230413_0800',
 '3_230413_0913',
 '4_230413_1020',
 '5_230413_1125',
 'plot']

In [8]:
import loss_class
import pool_module

class ID_Network(nn.Module):
    def __init__(self, hparams, n_class):
        super(ID_Network, self).__init__()

        self.hparams = hparams

        bundle = torchaudio.pipelines.WAV2VEC2_BASE
        self.w2v = bundle.get_model()

        self.pool = pool_module.AttentionalPool(hparams['fin_channel'], 4, 0.1, 'max')
        if self.hparams['id_hs_linear'] == 'linear':
            self.hs = nn.Linear(hparams['fin_channel'], n_class)
        elif self.hparams['id_hs_linear'] == 'hs':
            self.hs = loss_class.HS_Loss(n_class, hparams['id_scale'], hparams['id_margin'], hparams['fin_channel'])
        else:
            raise ValueError('hs_linear value error')
        
    def forward(self, x, ans):

        batch, _ = x.size()
        x, _ = self.w2v(x)

        x = self.pool(x)
        feat = x.view(batch, -1)

        if self.hparams['id_hs_linear'] == 'linear':    
            out = self.hs(feat)
        elif self.hparams['id_hs_linear'] == 'hs':
            out = self.hs(feat, ans.reshape(-1).long())
        else:
            raise ValueError('id_hs_linear value error')

        return out, ans, feat

    def get_feat(self, x):

        batch, _ = x.size()
        x, _ = self.w2v(x)

        x = self.pool(x)
        feat = x.view(batch, -1)

        return feat

    def get_close_id(self, x):

        batch, _ = x.size()
        x, _ = self.w2v(x)

        x = self.pool(x)
        feat = x.view(batch, -1)

        if self.hparams['id_hs_linear'] == 'linear':    
            out = self.hs(feat)

        elif self.hparams['id_hs_linear'] == 'hs':
            out = F.linear(F.normalize(feat), F.normalize(self.hs.fc))
            out = out.clamp(-1, 1)

        return feat, out

class Emotion_Network(nn.Module):
    def __init__(self, hparams):
        super(Emotion_Network, self).__init__()

        self.hparams = hparams

        self.id_net = ID_Network(hparams, 1251)

        bundle = torchaudio.pipelines.WAV2VEC2_BASE
        self.w2v = bundle.get_model()

        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(hparams['fin_channel'], hparams['fin_channel'])

        self.pool_layers = [pool_module.AttentionalPool(hparams['fin_channel'], 4, 0.1, 'max') for _ in range(hparams['pool_head'])]
        self.pool_layers = nn.ModuleList(self.pool_layers)

        if self.hparams['emo_hs_linear'] == 'linear':
            self.hs = nn.Linear(hparams['fin_channel'], 4)
        elif self.hparams['emo_hs_linear'] == 'hs':
            self.hs = loss_class.HS_Loss(4, hparams['emo_scale'], hparams['emo_margin'], hparams['fin_channel'])
        else:
            raise ValueError('emo_hs_linear value error')

        self.id_filter = nn.Parameter(torch.randn(1, int(hparams['pool_head']), 768).float())

    def forward(self, x, ans, ids):

        batch, _ = x.size()
        if self.hparams['id_net_freeze'] == 'freeze':
            with torch.no_grad():
            # id_loss, id_out, id_ans, id_feat
                id_feat, id_out = self.id_net.get_close_id(x)
        else:
            # out, ans, feat
            id_out, _, id_feat = self.id_net(x, ids)

        x, _ = self.w2v(x)

        x = self.relu(x)
        x = self.fc1(x)

        out_list = [id_feat]
        for pool_layer in self.pool_layers:
            tmp = pool_layer(x)
            tmp = tmp.view(batch, -1)
            out_list.append(tmp)

        feat = torch.stack(out_list, dim=1)
        # out = torch.stack((id_feat, n_out, a_out, h_out, s_out), dim=1)

        feat = feat * self.id_filter
        feat = feat.sum(dim=1)

        if self.hparams['emo_hs_linear'] == 'linear':    
            out = self.hs(feat)
        elif self.hparams['emo_hs_linear'] == 'hs':
            out = self.hs(feat, ans.reshape(-1).long())
        else:
            raise ValueError('emo_hs_linear value error')

        return out, id_out
    
    def get_feat(self, x):
        batch, _ = x.size()

        with torch.no_grad():

            id_feat, id_out = self.id_net.get_close_id(x) 
            x, norm_x = self.w2v.extract_features(x)

            x = self.relu(x[-1])
            x = self.fc1(x)

            feat_list = [id_feat]
            for pool_layer in self.pool_layers:
                tmp = pool_layer(x)
                tmp = tmp.view(batch, -1)
                feat_list.append(tmp)

            feat = torch.stack(feat_list, dim=1)
            # out = torch.stack((id_feat, n_out, a_out, h_out, s_out), dim=1)

            feat = feat * self.id_filter
            feat = feat.sum(dim=1)

            if self.hparams['emo_hs_linear'] == 'linear':    
                out = self.hs(feat)
            elif self.hparams['emo_hs_linear'] == 'hs':
                out = F.linear(F.normalize(feat), F.normalize(self.hs.fc))
                out = out.clamp(-1, 1)
            else:
                raise ValueError('emo_hs_linear value error')

            return [out, id_out], feat, feat_list


In [26]:
import my_utils

for p in [1,2,3,4]:
    print()
    print('pool_', p)
    main_path = 'weight/only_emo/pool_' + str(p) #'Vox_IEMO(3)/main_final2_load_emo_beta/all_finetune_beta(05)'
    exp_list = os.listdir(main_path)
    exp_list.sort()

    exp = exp_list[0]
    mode = 'best'
    device = 'cuda'

    optimal_batch_size = 8 

    lib_path = glob.glob(main_path+'/'+exp+'/*only_emo*.py')[0][:-3].replace('/', '.')
    saved_main = importlib.import_module(lib_path)

    u_path = glob.glob(main_path+'/'+exp+'/my_utils.py')[0][:-3].replace('/', '.')
    my_utils = importlib.import_module(u_path)

    torch.cuda.empty_cache()

    with open(main_path+'/'+exp+"/hparams.yaml") as f:
        hparams = yaml.load(f, Loader=yaml.FullLoader)

    seed = hparams['seed']
    my_utils.set_seed(seed)

    net = saved_main.Emotion_Network(hparams)

    weight = torch.load(main_path+'/'+exp+"/"+mode+"_model.pt")
    missing_keys = net.load_state_dict(weight['model_state_dict'], strict=True)
    print(missing_keys)

    net = net.cuda()
    net.eval()

    # MIPS(Million Instructions Per Second)
    for sec in [15, 10, 5, 3]:
        dummy_input = torch.randn(optimal_batch_size, int(16000*sec), dtype=torch.float).to(device)
        repetitions=100
        total_time = 0
        with torch.no_grad():
            for rep in range(repetitions):
                starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
                starter.record()
                _, _, _ = net.get_feat(dummy_input)
                ender.record()
                torch.cuda.synchronize()
                curr_time = starter.elapsed_time(ender)/1000
                total_time += curr_time
        Throughput =  (repetitions*optimal_batch_size)/total_time
        print(sec,' Final Throughput:',Throughput)


pool_ 1
<All keys matched successfully>
15  Final Throughput: 77.19154586201564
10  Final Throughput: 121.32358994750444
5  Final Throughput: 240.50776208573978
3  Final Throughput: 401.2057941055703

pool_ 2
<All keys matched successfully>
15  Final Throughput: 74.54757764696272
10  Final Throughput: 117.33124072091567
5  Final Throughput: 233.66060438224494
3  Final Throughput: 388.391619951946

pool_ 3
<All keys matched successfully>
15  Final Throughput: 72.24856833525553
10  Final Throughput: 113.88599196064663
5  Final Throughput: 227.27003918430196
3  Final Throughput: 378.93564673672876

pool_ 4
<All keys matched successfully>
15  Final Throughput: 70.33185145372038
10  Final Throughput: 111.05684636004726
5  Final Throughput: 221.42185378578756
3  Final Throughput: 368.89824865998315


In [17]:
import my_utils

exp = exp_list[0]
mode = 'best'
device = 'cuda'

optimal_batch_size = 8 

lib_path = glob.glob(main_path+'/'+exp+'/*main*.py')[0][:-3].replace('/', '.')
saved_main = importlib.import_module(lib_path)

u_path = glob.glob(main_path+'/'+exp+'/my_utils.py')[0][:-3].replace('/', '.')
my_utils = importlib.import_module(u_path)

torch.cuda.empty_cache()

with open(main_path+'/'+exp+"/hparams.yaml") as f:
    hparams = yaml.load(f, Loader=yaml.FullLoader)

seed = hparams['seed']
my_utils.set_seed(seed)

net = saved_main.Emotion_Network(hparams)

if hparams['id_net_freeze'] is not None:
    net.id_filter.data = torch.randn(1, hparams['pool_head']+1, hparams['fin_channel']) 

if hparams['id_net_freeze'] != 'freeze':
    net.id_net.hs.fc = nn.Parameter(torch.Tensor(8, hparams['fin_channel']))

weight = torch.load(main_path+'/'+exp+"/"+mode+"_model.pt")
missing_keys = net.load_state_dict(weight['model_state_dict'], strict=True)
print(missing_keys)

net = net.cuda()
net.eval()

# MIPS(Million Instructions Per Second)
for sec in [15, 10, 5, 3]:
    dummy_input = torch.randn(optimal_batch_size, int(16000*sec), dtype=torch.float).to(device)
    repetitions=100
    total_time = 0
    with torch.no_grad():
        for rep in range(repetitions):
            starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
            starter.record()
            _, _, _ = net.get_feat(dummy_input)
            ender.record()
            torch.cuda.synchronize()
            curr_time = starter.elapsed_time(ender)/1000
            total_time += curr_time
    Throughput =  (repetitions*optimal_batch_size)/total_time
    print(sec,' Final Throughput:',Throughput)

<All keys matched successfully>
15  Final Throughput: 36.689831027403386
10  Final Throughput: 57.458186672329624
5  Final Throughput: 113.71012777813627
3  Final Throughput: 189.27038607823263


In [21]:
for sec in [15, 10, 5, 3]:
    dummy_input = torch.randn(optimal_batch_size, int(16000*sec), dtype=torch.float).to(device)
    repetitions=100
    total_time = 0
    with torch.no_grad():
        for rep in range(repetitions):
            starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
            starter.record()
            _ = net.id_net.get_feat(dummy_input)
            ender.record()
            torch.cuda.synchronize()
            curr_time = starter.elapsed_time(ender)/1000
            total_time += curr_time
    Throughput =  (repetitions*optimal_batch_size)/total_time
    print(sec,' Final Throughput:',Throughput)

15  Final Throughput: 76.47565492882147
10  Final Throughput: 120.16652757372611
5  Final Throughput: 238.34744691195974
3  Final Throughput: 397.4481681954243


In [23]:
for sec in [15, 10, 5, 3]:
    dummy_input = torch.randn(optimal_batch_size, int(16000*sec), dtype=torch.float).to(device)
    repetitions=100
    total_time = 0
    with torch.no_grad():
        for rep in range(repetitions):
            starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
            starter.record()
            _, _ = net.w2v.extract_features(dummy_input)
            ender.record()
            torch.cuda.synchronize()
            curr_time = starter.elapsed_time(ender)/1000
            total_time += curr_time
    Throughput =  (repetitions*optimal_batch_size)/total_time
    print(sec,' Final Throughput:',Throughput)

15  Final Throughput: 78.65637441178347
10  Final Throughput: 123.5405113223635
5  Final Throughput: 244.4819497883157
3  Final Throughput: 407.40979517856573
