In [1]:
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
import IPython
from IPython.display import Audio
from IPython.display import Image
import matplotlib.pyplot as plt

from tqdm import tqdm
from pydub import AudioSegment
from ipywidgets import IntProgress
from collections import defaultdict

import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
import operator
import json

In [2]:
class ParallelModel(nn.Module):
    """Two-branch emotion classifier: a 2-D CNN and a Transformer encoder in parallel.

    Both branches read the same input tensor. The flattened CNN feature map
    and the time-averaged Transformer output are concatenated and mapped to
    ``num_emotions`` scores by a single linear layer with 320 inputs, so the
    flattened CNN output plus the 64-dim Transformer embedding must total 320.

    NOTE(review): the layer sizes imply an input of shape
    (batch, 1, 128, time) with a time axis that pools down to 4 columns in
    the CNN branch -- confirm against the training pipeline.
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels, pool):
        """Layers of one CNN stage: 3x3 conv -> batch norm -> ReLU -> max-pool -> dropout."""
        return [
            nn.Conv2d(in_channels=in_channels,
                      out_channels=out_channels,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=pool, stride=pool),
            nn.Dropout(p=0.3),
        ]

    def __init__(self,num_emotions):
        super().__init__()
        # CNN branch: four stages given as (in_channels, out_channels, pool size).
        # The layers are unpacked into one flat Sequential so that sub-module
        # indices (0..19), and therefore the state_dict keys, stay unchanged.
        stage_specs = [(1, 16, 2), (16, 32, 4), (32, 64, 4), (64, 64, 4)]
        cnn_layers = []
        for in_ch, out_ch, pool in stage_specs:
            cnn_layers.extend(self._conv_stage(in_ch, out_ch, pool))
        self.conv2Dblock = nn.Sequential(*cnn_layers)

        # Transformer branch: shrink the input (x2 on axis 2, x4 on axis 3)
        # before feeding it to a 4-layer encoder with model width 64.
        self.transf_maxpool = nn.MaxPool2d(kernel_size=[2,4], stride=[2,4])
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=64,
            nhead=4,
            dim_feedforward=512,
            dropout=0.4,
            activation='relu',
        )
        self.transf_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)

        # Classification head on the concatenated embedding.
        self.out_linear = nn.Linear(320,num_emotions)
        self.dropout_linear = nn.Dropout(p=0)  # p=0 makes this an identity op
        self.out_softmax = nn.Softmax(dim=1)

    def forward(self,x):
        """Return ``(logits, softmax)``, each of shape (batch, num_emotions)."""
        # CNN branch: keep the batch axis, flatten everything else.
        cnn_features = self.conv2Dblock(x).flatten(start_dim=1)

        # Transformer branch: drop the channel axis, then move the last axis
        # to the front because the encoder expects (sequence, batch, embedding).
        pooled = self.transf_maxpool(x).squeeze(1)
        tokens = pooled.permute(2, 0, 1)
        encoded = self.transf_encoder(tokens)
        transf_features = encoded.mean(dim=0)  # average over the sequence axis

        fused = torch.cat([cnn_features, transf_features], dim=1)
        output_logits = self.dropout_linear(self.out_linear(fused))
        output_softmax = self.out_softmax(output_logits)
        return output_logits, output_softmax

In [3]:
def getMELspectrogram(audio, sample_rate):
    """Compute a mel spectrogram of ``audio`` and convert it to decibels.

    Parameters
    ----------
    audio : waveform samples, passed to librosa as ``y``.
    sample_rate : sampling rate of ``audio``; the mel filter bank is capped
        at ``sample_rate / 2``.

    Returns
    -------
    The 128-band mel power spectrogram converted with ``power_to_db``, using
    the spectrogram's own maximum as the 0 dB reference.
    """
    # Short-time analysis settings: 1024-point FFT over a 512-sample Hamming
    # window, advanced 256 samples per frame.
    stft_options = dict(
        n_fft=1024,
        win_length=512,
        window='hamming',
        hop_length=256,
    )
    power_mel = librosa.feature.melspectrogram(
        y=audio,
        sr=sample_rate,
        n_mels=128,
        fmax=sample_rate/2,
        **stft_options
    )
    return librosa.power_to_db(power_mel, ref=np.max)

In [4]:
# Class index produced by the model -> human-readable emotion label.
EMOTIONS = {
    1: 'neutral',
    2: 'calm',
    3: 'happy',
    4: 'sad',
    5: 'angry',
    6: 'fear',
    7: 'disgust',
    0: 'surprise',
}
# Sampling rate (Hz) that audio is loaded at before feature extraction.
SAMPLE_RATE = 48000
# Directory containing the .wav segments to classify (trailing slash included).
AUDIO_PATH = '/Users/winsteadx/Desktop/Podcast/podcasts-segments/all/'

In [14]:
# Directory holding the trained checkpoint. This used to be wrapped in
# os.path.join(os.getcwd(), ...), but os.path.join discards every component
# before an absolute one, so the working directory never took part in the
# result; the path is now stated directly.
LOAD_PATH = '/Users/winsteadx/Desktop/audio_process/feature_extraction'
# Build the checkpoint path once so the load call and the log line cannot drift apart.
MODEL_PATH = os.path.join(LOAD_PATH, 'cnn_transf_parallel_model.pt')

# One output unit per entry in EMOTIONS (8 classes).
model = ParallelModel(len(EMOTIONS))
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints from a trusted source (newer PyTorch versions
# accept weights_only=True to restrict this).
model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device('cpu')))
print('Model is loaded from {}'.format(MODEL_PATH))

Model is loaded from /Users/winsteadx/Desktop/audio_process/feature_extraction/cnn_transf_parallel_model.pt


In [18]:
def _frame_signal(y, sample_rate, window_seconds=3, hop_seconds=1):
    """Cut a 1-D waveform into fixed-length, zero-padded analysis windows.

    Windows are ``window_seconds`` long and start every ``hop_seconds``.
    A waveform shorter than two hops yields a single window holding the whole
    signal; otherwise ``len(y) // hop - 1`` windows are produced, and windows
    that run past the end of the signal are zero-padded on the right.

    Returns a float64 array of shape (n_windows, window_seconds * sample_rate).
    """
    window_len = int(sample_rate * window_seconds)
    hop_len = int(sample_rate * hop_seconds)
    n_samples = y.shape[0]
    if n_samples < 2 * hop_len:
        starts = [0]
    else:
        starts = [k * hop_len for k in range(n_samples // hop_len - 1)]
    windows = np.zeros((len(starts), window_len))
    for row, start in enumerate(starts):
        chunk = y[start:start + window_len]
        windows[row, :len(chunk)] = chunk
    return windows


def predict_emotion(filename, AUDIO_PATH, SAMPLE_RATE, model, scaler=None):
    """Classify the dominant emotion of one audio file by majority vote.

    The file is loaded at ``SAMPLE_RATE``, cut into overlapping 3-second
    windows (1-second hop), each window is turned into a mel spectrogram,
    standardised and passed through ``model``. The class predicted for the
    most windows wins; ties go to the class listed first in ``EMOTIONS``.

    Parameters
    ----------
    filename : name of the audio file inside ``AUDIO_PATH``.
    AUDIO_PATH : directory containing the file (trailing slash optional).
    SAMPLE_RATE : sampling rate used for loading and for all window
        arithmetic. Previously 48000 was hard-coded in the framing, so any
        other rate was framed inconsistently.
    model : callable returning ``(logits, softmax)`` for a batch of shape
        (n_windows, 1, n_mels, n_frames); it is switched to eval mode.
    scaler : optional pre-fitted scaler exposing ``transform``. Pass the
        scaler fitted on the training set so inference matches training.
        When omitted, a ``StandardScaler`` is fitted on this file's windows
        alone (the historical behaviour). In that mode a clip with a single
        window is standardised to all zeros, so prefer passing ``scaler``.

    Returns
    -------
    (label, score) : the winning emotion label and the SUM over all windows
        of the softmax probability assigned to the winning class (not
        normalised by the number of windows). ``('neutral', 0)`` is returned
        for empty audio or when a ``ValueError`` is raised while processing.
    """
    try:
        filepath = os.path.join(AUDIO_PATH, filename)
        y, _ = librosa.load(filepath, sr=SAMPLE_RATE)

        # Nothing to classify: return the same fallback as the error path.
        if y.shape[0] == 0:
            return 'neutral', 0

        windows = _frame_signal(y, SAMPLE_RATE)

        # One mel spectrogram per window, then add the channel axis expected
        # by the model: (n_windows, 1, n_mels, n_frames).
        mel_batch = np.stack(
            [getMELspectrogram(window, sample_rate=SAMPLE_RATE) for window in windows],
            axis=0,
        )
        mel_batch = np.expand_dims(mel_batch, 1)

        # Scalers work on 2-D data: flatten each window, scale, restore shape.
        n_windows, n_channels, n_mels, n_frames = mel_batch.shape
        flat = mel_batch.reshape(n_windows, -1)
        if scaler is None:
            flat = StandardScaler().fit_transform(flat)
        else:
            flat = scaler.transform(flat)
        features = flat.reshape(n_windows, n_channels, n_mels, n_frames)

        batch = torch.tensor(features).float()
        model.eval()
        with torch.no_grad():
            _, output_softmax = model(batch)
        predictions = torch.argmax(output_softmax, dim=1).tolist()
        softmaxs = output_softmax.tolist()

        # Majority vote. Seeding the tally in EMOTIONS order keeps ties
        # resolved in favour of the class listed first.
        votes = dict.fromkeys(EMOTIONS, 0)
        for pred in predictions:
            votes[pred] += 1
        pred_result = max(votes, key=votes.get)
        pred_emotion = EMOTIONS[pred_result]

        # Accumulated confidence of the winning class over every window.
        softmax = sum(row[pred_result] for row in softmaxs)

        return pred_emotion, softmax
    except ValueError:
        # Best-effort fallback kept from the original for unusable audio.
        return 'neutral', 0

In [19]:
def _new_segment_features():
    """Feature-name -> value mapping for one segment; unknown names read as 0.0."""
    return defaultdict(float)


def _new_episode_segments():
    """Segment-number -> feature mapping for one episode."""
    return defaultdict(_new_segment_features)


# Three-level store: ep_features[episode_name][segment_number][feature_name].
# Every level is created on first access.
ep_features = defaultdict(_new_episode_segments)

In [20]:
# Classify every .wav segment and file the result under its episode/segment.
# File names are expected to look like <show>_<show_name>_<episode>_<segment>.wav.
# A name that does not fit is reported and skipped rather than raising, so one
# stray file cannot abort a run that takes many hours.
# sorted() makes the processing order independent of the filesystem.
for filename in tqdm(sorted(os.listdir(AUDIO_PATH))):
    if not filename.endswith(".wav"):
        continue

    name_parts = filename.split('_')
    if len(name_parts) != 4:
        tqdm.write('Skipping {!r}: expected 4 "_"-separated fields, got {}'.format(filename, len(name_parts)))
        continue
    ep_name = name_parts[2]
    try:
        # The last field is "<segment>.wav"; keep the text before the first dot.
        seg_num = int(name_parts[3].split('.')[0])
    except ValueError:
        tqdm.write('Skipping {!r}: segment number is not an integer'.format(filename))
        continue

    emotion, softmax = predict_emotion(filename, AUDIO_PATH=AUDIO_PATH, SAMPLE_RATE=SAMPLE_RATE, model=model)

    ep_features[ep_name][seg_num]['paral_CNN_transf_emotion'] = emotion
    ep_features[ep_name][seg_num]['paral_CNN_transf_softmax'] = softmax

100%|██████████| 68337/68337 [28:10:20<00:00,  1.48s/it]


In [24]:
# Read the existing episode/transcript JSON that the predictions are merged into.
INPUT_JSON_PATH = '/Users/winsteadx/Desktop/audio_process/feature_extraction/episodes_complex_v8_6_0-3.json'
with open(INPUT_JSON_PATH, encoding='utf-8') as f:
    json_file = json.load(f)

In [25]:
# Copy each segment's prediction onto the transcript entry at the same index.
# Lookups use .get() rather than indexing: ep_features is a nested defaultdict,
# so indexing a segment that was never classified would insert it and hand back
# 0.0 as its emotion *label*. Missing segments now get None (JSON null) and are
# counted so the gap is visible.
missing_segments = 0
for ep_dict in tqdm(json_file['episodes_speaker']):
    ep_predictions = ep_features.get(ep_dict['episode_name'], {})
    for seg_idx, trans_dict in enumerate(ep_dict['transcripts']):
        seg_predictions = ep_predictions.get(seg_idx)
        if seg_predictions is None:
            missing_segments += 1
            seg_predictions = {}

        trans_dict['paral_CNN_transf_emotion'] = seg_predictions.get('paral_CNN_transf_emotion')
        trans_dict['paral_CNN_transf_softmax'] = seg_predictions.get('paral_CNN_transf_softmax')

if missing_segments:
    print('{} transcript segment(s) had no emotion prediction and were set to None'.format(missing_segments))

100%|██████████| 855/855 [00:00<00:00, 11950.34it/s]


In [26]:
# Save the enriched episodes as a new JSON version; non-ASCII text is written
# as-is (ensure_ascii=False) with 4-space indentation.
OUTPUT_JSON_PATH = '/Users/winsteadx/Desktop/audio_process/feature_extraction/episodes_complex_v9_6_0-3.json'
with open(OUTPUT_JSON_PATH, 'w', encoding='utf-8') as f:
    json.dump(json_file, f, ensure_ascii=False, indent=4)