In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.utils import shuffle, class_weight
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import librosa


In [2]:
# --- Notebook configuration ---
# Root folder scanned by the dataframe builder functions in this notebook.
# NOTE(review): this path says "v2" while the spectrogram/TFRecord/model names
# below say "v3" — confirm the mismatch is intended.
DATASET_DIR = "Datasets/EMO-IIT_Separate_Speakers_v2"
# Artifact locations and file names; not referenced by the cells visible in this part of the notebook.
SPECTROGRAM_DIR = "Spectrograms/EMO-IIT_Separate_Speakers_v3/Log"
TF_RECORDS_DIR = "TFRecords/EMO-IIT_Separate_Speakers_v3_MobileNetV3Large"
TF_RECORDS_NAME = "EMO-IIT_Separate_Speakers_v3_log_MobileNetV3Large.tfrecords"
MODEL_DIR = "Models"
MODEL_NAME = "EMO-IIT_Separate_Speakers_v3_log_MobileNetV3Large.h5"
# Presumably the number of emotion classes with "irritation" included — TODO confirm.
NUM_CLASSES = 8
# Sampling rate (Hz) passed to librosa.load; also used as the audio block length in samples.
SAMPLE_RATE = 16000
# Presumably training hyper-parameters; not used by the cells visible here.
BATCH_SIZE = 32
EPOCHS = 50
# Seed used when shuffling the metadata dataframes.
RANDOM_SEED = 42

In [3]:
def create_dataframe_emoiit():
    """Build a shuffled (emotion, path) table for the flat EMO-IIT layout.

    Expects ``DATASET_DIR/<emotion>/<file>.wav``. The ``irritation`` folder is
    always skipped, as are entries of ``DATASET_DIR`` that are not directories.

    Returns:
        pandas.DataFrame: columns ``emotion`` (the folder name) and ``path``,
        rows shuffled with ``RANDOM_SEED`` and re-indexed from 0.

    Raises:
        ValueError: if ``DATASET_DIR`` is not one of the two flat EMO-IIT roots.
    """
    allowed_dirs = ("Datasets/EMO-IIT", "Datasets/Converted Datasets/EMO-IIT")
    if DATASET_DIR not in allowed_dirs:
        raise ValueError("DATASET_DIR must be set to 'Datasets/EMO-IIT' or 'Datasets/Converted Datasets/EMO-IIT' for EMO-IIT dataset")
    file_emotion, file_path = [], []
    # os.listdir returns entries in arbitrary order; sorting makes the seeded
    # shuffle below give the same row order on every machine.
    for emotion_dir in sorted(os.listdir(DATASET_DIR)):
        emotion_path = os.path.join(DATASET_DIR, emotion_dir)
        # Skip the excluded class and stray files sitting next to the emotion folders.
        if emotion_dir == "irritation" or not os.path.isdir(emotion_path):
            continue
        for file in sorted(os.listdir(emotion_path)):
            if file.endswith('.wav'):
                file_emotion.append(emotion_dir)
                file_path.append(os.path.join(emotion_path, file))
    emoiit_df = pd.DataFrame({'emotion': file_emotion, 'path': file_path})
    return shuffle(emoiit_df, random_state=RANDOM_SEED).reset_index(drop=True)

In [4]:
def create_dataframe_emoiit_gendered(irritation=True):
    """Build a shuffled metadata table for the gender-split EMO-IIT layout.

    Expects ``DATASET_DIR/<gender>/<emotion>/<file>.wav`` with ``<gender>`` being
    ``male`` or ``female``. Entries that are not directories are skipped.

    Args:
        irritation: when False, folders named ``irritation`` are left out.

    Returns:
        pandas.DataFrame: columns ``emotion``, ``speaker_id``, ``gender`` and
        ``path``, rows shuffled with ``RANDOM_SEED`` and re-indexed from 0.
    """
    genders = ["male", "female"]
    columns = ['emotion', 'speaker_id', 'gender', 'path']
    records = []
    for gender in genders:
        gender_path = os.path.join(DATASET_DIR, gender)
        # os.listdir returns entries in arbitrary order; sorting makes the seeded
        # shuffle below give the same row order on every machine.
        for emotion_dir in sorted(os.listdir(gender_path)):
            emotion_path = os.path.join(gender_path, emotion_dir)
            if not os.path.isdir(emotion_path):
                continue  # stray file next to the emotion folders
            if not irritation and emotion_dir == "irritation":
                continue
            for file in sorted(os.listdir(emotion_path)):
                if not file.endswith(".wav"):
                    continue
                # The speaker id is the file-name prefix: names starting with "b511"
                # keep five characters as written, all others use four upper-cased
                # characters. Presumably the fifth character tells B511 speakers
                # apart — TODO confirm.
                speaker_id = file[:5] if file[:4].lower() == "b511" else file[:4].upper()
                records.append({
                    'emotion': emotion_dir,
                    'speaker_id': speaker_id,
                    'gender': gender,
                    'path': os.path.join(emotion_path, file),
                })
    # Passing the columns explicitly keeps the schema stable when no file was found.
    emoiit_df = pd.DataFrame(records, columns=columns)
    return emoiit_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

In [5]:
def preprocess_dataset(ser_df):
    """Cut every audio file into overlapping blocks and one-hot encode the labels.

    Each file is loaded at ``SAMPLE_RATE``, zero-padded at the end to at least
    ``SAMPLE_RATE`` samples (one second), then sliced into blocks of
    ``SAMPLE_RATE`` samples taken every ``SAMPLE_RATE / 100`` samples (10 ms).
    Every block inherits the emotion label of its file.

    Args:
        ser_df: DataFrame with at least the columns ``path`` and ``emotion``.

    Returns:
        tuple: ``(audio_block_list, emotion_list)`` where ``audio_block_list`` is
        an array of shape ``(n_blocks, SAMPLE_RATE)`` and ``emotion_list`` is the
        dense one-hot label array of shape ``(n_blocks, n_emotions)``; the column
        order is the category order inferred by ``OneHotEncoder``.
    """
    hop_length = int(SAMPLE_RATE / 100)
    audio_block_list = []
    emotion_list = []
    for row in tqdm(ser_df.itertuples(), desc="Preprocessing audio files dataset", total=len(ser_df)):
        data, _ = librosa.load(row.path, sr=SAMPLE_RATE)
        # Clips shorter than one block are right-padded with zeros so they yield exactly one block.
        if data.shape[0] < SAMPLE_RATE:
            data = np.pad(data, (0, SAMPLE_RATE - data.shape[0]), 'constant')
        # Transposed so that each row is one block.
        frames = librosa.util.frame(data, frame_length=SAMPLE_RATE, hop_length=hop_length).T
        audio_block_list.extend(frames)
        emotion_list.extend([row.emotion] * len(frames))
    audio_block_list = np.array(audio_block_list)
    emotion_list = np.array(emotion_list)
    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed the
    # old name; try the new keyword first and fall back for older installations.
    try:
        ohe = OneHotEncoder(categories='auto', sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(categories='auto', sparse=False)
    emotion_list = ohe.fit_transform(emotion_list[:, np.newaxis])
    return audio_block_list, emotion_list

In [6]:
# Metadata for every .wav file except those in the "irritation" folders.
ser_df = create_dataframe_emoiit_gendered(irritation=False)

In [7]:
# Slice the irritation-free dataset into audio blocks with one-hot labels.
audio_block_list, emotion_list = preprocess_dataset(ser_df)
# NOTE: despite its name this holds the number of raw audio blocks; no spectrogram is computed in this cell.
num_spectograms_without_irritation = len(audio_block_list)

Preprocessing audio files dataset: 100%|██████████| 522/522 [00:02<00:00, 225.58it/s]


In [8]:
# The items counted here are the raw audio blocks returned by preprocess_dataset,
# not spectrograms, so the message uses the same wording as the later cells.
print(f"Number of audio blocks without irritation: {len(audio_block_list)}")

Number of spectrograms without irritation: 15054


In [9]:
# Show the metadata frame built without irritation (pandas truncates the middle rows and long paths).
ser_df

Unnamed: 0,emotion,speaker_id,gender,path
0,hapiness,B523,female,Datasets/EMO-IIT_Separate_Speakers_v2\female\h...
1,neutral,B301,female,Datasets/EMO-IIT_Separate_Speakers_v2\female\n...
2,anger,B308,male,Datasets/EMO-IIT_Separate_Speakers_v2\male\ang...
3,neutral,B326,male,Datasets/EMO-IIT_Separate_Speakers_v2\male\neu...
4,disgust,B326,male,Datasets/EMO-IIT_Separate_Speakers_v2\male\dis...
...,...,...,...,...
517,disgust,B313,male,Datasets/EMO-IIT_Separate_Speakers_v2\male\dis...
518,fear,B323,male,Datasets/EMO-IIT_Separate_Speakers_v2\male\fea...
519,anger,B523,female,Datasets/EMO-IIT_Separate_Speakers_v2\female\a...
520,hapiness,B523,female,Datasets/EMO-IIT_Separate_Speakers_v2\female\h...


In [10]:
# Number of files for every (speaker_id, gender) pair.
count_by_speaker_gender = ser_df.groupby(['speaker_id', 'gender']).size()
# NOTE: `sorted_counts` is read by a later cell and then rebound to other tables further down.
sorted_counts = count_by_speaker_gender.sort_index()
sorted_counts

speaker_id  gender
B301        female    24
B303        female    28
B306        female    27
B307        male      26
B308        male      28
B313        male      28
B318        female    26
B319        male      18
B323        male      28
B326        male      28
B329        male      16
B403        female    27
B410        male      24
B424        male      28
B511a       female    21
            male       7
B512        female    28
B518        female    56
B523        female    54
dtype: int64

In [11]:
# Number of files per gender.
ser_df["gender"].value_counts()

female    291
male      231
Name: gender, dtype: int64

In [12]:
# `sorted_counts` is still the (speaker_id, gender) table here, so this is the number of
# speaker/gender pairs. NOTE(review): the saved output lists B511a under both genders,
# so the number of distinct speaker ids may be one lower — verify.
len(sorted_counts)

19

In [13]:
# Number of files per emotion label, ordered by label.
ser_df['emotion'].value_counts().sort_index()

anger       76
boredom     71
disgust     63
fear        80
hapiness    79
neutral     79
sadness     74
Name: emotion, dtype: int64

In [14]:
# Number of files for every (emotion, gender) pair, irritation excluded.
count_by_emotion_gender = ser_df.groupby(['emotion', 'gender']).size()
# NOTE: this rebinds `sorted_counts`, which previously held the per-speaker table.
sorted_counts = count_by_emotion_gender.sort_index()
sorted_counts

emotion   gender
anger     female    43
          male      33
boredom   female    43
          male      28
disgust   female    35
          male      28
fear      female    43
          male      37
hapiness  female    42
          male      37
neutral   female    42
          male      37
sadness   female    43
          male      31
dtype: int64

In [15]:
# Print the emotion label counts of each speaker, visiting speakers in ascending id order.
person_list = ser_df['speaker_id'].unique().tolist()
person_list.sort()
for person in person_list:
    print(f"\nPerson {person}")
    print(ser_df.loc[ser_df['speaker_id'] == person, 'emotion'].value_counts().sort_index())


Person B301
anger       4
boredom     4
fear        4
hapiness    4
neutral     4
sadness     4
Name: emotion, dtype: int64

Person B303
anger       4
boredom     4
disgust     4
fear        4
hapiness    4
neutral     4
sadness     4
Name: emotion, dtype: int64

Person B306
anger       4
boredom     4
disgust     3
fear        4
hapiness    4
neutral     4
sadness     4
Name: emotion, dtype: int64

Person B307
anger       4
boredom     3
disgust     3
fear        4
hapiness    4
neutral     4
sadness     4
Name: emotion, dtype: int64

Person B308
anger       4
boredom     4
disgust     4
fear        4
hapiness    4
neutral     4
sadness     4
Name: emotion, dtype: int64

Person B313
anger       4
boredom     4
disgust     4
fear        4
hapiness    4
neutral     4
sadness     4
Name: emotion, dtype: int64

Person B318
anger       4
boredom     4
disgust     4
fear        4
hapiness    3
neutral     3
sadness     4
Name: emotion, dtype: int64

Person B319
anger       4
fear        4


In [16]:
# Rebuild the metadata with the default irritation=True; this replaces the irritation-free `ser_df`.
ser_df = create_dataframe_emoiit_gendered()

In [17]:
# Slice the full dataset (irritation included); overwrites the earlier audio_block_list / emotion_list.
audio_block_list, emotion_list = preprocess_dataset(ser_df)
# NOTE: despite its name this holds the number of raw audio blocks; no spectrogram is computed in this cell.
num_spectograms_with_irritation = len(audio_block_list)

Preprocessing audio files dataset: 100%|██████████| 562/562 [00:00<00:00, 1162.24it/s]


In [18]:
# Report the block count for the dataset that includes irritation.
print(f"Number of audio blocks with irritation included: {len(audio_block_list)}")

Number of audio blocks with irritation included: 33949


In [19]:
# Compare both block counts and report the relative increase caused by adding irritation.
print(
    f"The number of audio blocks without irritation is {num_spectograms_without_irritation} "
    f"and the number of audio blocks with irritation is {num_spectograms_with_irritation}, "
    f"so with the addition of irritation, the number of audio blocks increased with a percent of "
    f"{100*(num_spectograms_with_irritation - num_spectograms_without_irritation)/num_spectograms_without_irritation:.2f}%"
)

The number of audio blocks without irritation is 15054 and the number of audio blocks with irritation is 33949, so with the addition of irritation, the number of audio blocks increased with a percent of 125.51%


In [20]:
# Number of files for every (emotion, gender) pair, now with irritation included.
# NOTE: rebinds `count_by_emotion_gender` and `sorted_counts` from the earlier irritation-free cell.
count_by_emotion_gender = ser_df.groupby(['emotion', 'gender']).size()
sorted_counts = count_by_emotion_gender.sort_index()
sorted_counts

emotion     gender
anger       female    43
            male      33
boredom     female    43
            male      28
disgust     female    35
            male      28
fear        female    43
            male      37
hapiness    female    42
            male      37
irritation  female    17
            male      23
neutral     female    42
            male      37
sadness     female    43
            male      31
dtype: int64