## Build Speech data files

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

%matplotlib inline

In [2]:
# Load the pre-extracted audio features and keep only rows whose label id
# is one of 0..7.
df = pd.read_csv('data/pre-processed/audio_featuresall.csv')
df = df.loc[df['label'].isin(range(8))]
print(df.shape)
display(df.head())

# Collapse the eight raw label ids into six classes (0..5):
# raw 1 and 2 share id 1, raw 3 and 4 share id 2, and raw 5, 6, 7 are
# renumbered to 3, 4, 5. Raw id 0 is unchanged.
label_remap = {
    0: 0,
    1: 1, 2: 1,
    3: 2, 4: 2,
    5: 3,
    6: 4,
    7: 5,
}
df['label'] = df['label'].map(label_remap)
df.head()

(16826, 10)


Unnamed: 0,wav_file,label,sig_mean,sig_std,rmse_mean,rmse_std,silence,harmonic,auto_corr_max,auto_corr_std
0,Ses01F_impro03_F000,1,0.015638,0.024199,0.019682,0.014036,0.303644,-0.005139,0.404377,0.714805
1,Ses01F_impro03_F001,1,0.011377,0.020261,0.014726,0.013883,0.47138,-0.008229,0.314151,0.828564
2,Ses01F_impro03_F002,1,0.035397,0.058327,0.044082,0.038644,0.342342,-0.006294,2.469553,5.852177
4,Ses01F_impro03_F004,1,0.053091,0.081314,0.066765,0.046364,0.196923,-0.004571,4.549671,8.876014
5,Ses01F_impro03_F005,1,0.030576,0.047235,0.038321,0.027501,0.161435,-0.001675,1.547291,3.472386


Unnamed: 0,wav_file,label,sig_mean,sig_std,rmse_mean,rmse_std,silence,harmonic,auto_corr_max,auto_corr_std
0,Ses01F_impro03_F000,1,0.015638,0.024199,0.019682,0.014036,0.303644,-0.005139,0.404377,0.714805
1,Ses01F_impro03_F001,1,0.011377,0.020261,0.014726,0.013883,0.47138,-0.008229,0.314151,0.828564
2,Ses01F_impro03_F002,1,0.035397,0.058327,0.044082,0.038644,0.342342,-0.006294,2.469553,5.852177
4,Ses01F_impro03_F004,1,0.053091,0.081314,0.066765,0.046364,0.196923,-0.004571,4.549671,8.876014
5,Ses01F_impro03_F005,1,0.030576,0.047235,0.038321,0.027501,0.161435,-0.001675,1.547291,3.472386


In [3]:
# Snapshot the frame as it stands before any oversampling is applied.
df.to_csv('data/no_sample_dfall.csv')

# Oversample fear (label 3): add 30 extra copies of its rows.
# pd.concat is used instead of DataFrame.append, which was removed in
# pandas 2.0; concatenating once also avoids re-copying the whole frame on
# every loop iteration. Row order and index values match the old
# append-in-a-loop result (original rows first, then the repeated copies).
fear_df = df[df['label']==3]
df = pd.concat([df] + [fear_df] * 30)

# Oversample label 4 the same way with 10 extra copies. The selection runs on
# the already-extended frame, but the rows added above all carry label 3, so
# it picks up only the original label-4 rows.
sur_df = df[df['label']==4]
df = pd.concat([df] + [sur_df] * 10)

# NOTE(review): this cell reassigns `df`, so running it twice in one kernel
# session oversamples twice and overwrites both CSVs with the inflated frame.
df.to_csv('data/modified_dfall.csv')

In [4]:
# Name -> id table for a four-class setup.
# NOTE(review): the label column produced by the remap in the loading cell
# holds six ids (0..5), which does not line up with this table -- confirm
# which coding downstream consumers expect.
emotion_dict = dict(ang=0, hap=1, sad=2, neu=3)

# Earlier, wider coding kept for reference (presumably the raw label ids
# found in the features CSV -- TODO confirm):
#   ang 0, hap 1, exc 2, sad 3, fru 4, fea 5, sur 6, neu 7, xxx 8, oth 8

# Rescale every column from the third one onward with MinMaxScaler;
# the first two columns (wav_file, label) are left untouched.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split below, so test rows contribute to the fitted min/max.
scalar = MinMaxScaler()
feature_cols = df.columns[2:]
df[feature_cols] = scalar.fit_transform(df[feature_cols])
df.head()

Unnamed: 0,wav_file,label,sig_mean,sig_std,rmse_mean,rmse_std,silence,harmonic,auto_corr_max,auto_corr_std
0,Ses01F_impro03_F000,1,0.052863,0.062417,0.054806,0.07643,0.394965,0.169213,0.00489,0.002524
1,Ses01F_impro03_F001,1,0.037904,0.051938,0.040475,0.075592,0.613149,0.168608,0.003798,0.002926
2,Ses01F_impro03_F002,1,0.122242,0.153241,0.12536,0.211455,0.445302,0.168986,0.02988,0.020684
4,Ses01F_impro03_F004,1,0.18437,0.214418,0.19095,0.253817,0.256148,0.169324,0.055051,0.031373
5,Ses01F_impro03_F005,1,0.105315,0.123722,0.108703,0.150315,0.209987,0.169891,0.01872,0.012272


In [5]:
# Hold out 20% of the rows for testing. A fixed seed makes the split -- and
# therefore the CSVs written below and the text files derived from
# x_train / x_test later on -- reproducible across kernel restarts.
# NOTE(review): the frame was oversampled by row duplication before this
# split, so copies of the same utterance can land in both partitions.
SPLIT_SEED = 42
x_train, x_test = train_test_split(df, test_size=0.20, random_state=SPLIT_SEED)

x_train.to_csv('data/s2e/audio_train.csv', index=False)
x_test.to_csv('data/s2e/audio_test.csv', index=False)

print(x_train.shape, x_test.shape)

(17052, 10) (4264, 10)


## Define preprocessing functions for text

In [6]:
import unicodedata

def unicodeToAscii(s):
    """Strip combining marks (accents) from ``s``.

    The string is decomposed with Unicode NFD normalization and every
    character in category ``Mn`` (nonspacing mark) is dropped. Characters
    that have no such decomposition are returned unchanged, so the result is
    not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Normalize a transcription string for text modelling.

    Steps, in order: lowercase and trim surrounding whitespace, strip
    accents via ``unicodeToAscii``, insert a space before each ``.``, ``!``
    or ``?``, then replace every run of characters other than letters and
    those three punctuation marks with a single space. The result is not
    trimmed again, so it may start or end with a space.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

## Build Text data files

In [7]:
import re
import os
import pickle

# Matches the leading run of word characters on a transcript line; that token
# is used as the key for the line's transcription.
useful_regex = re.compile(r'^(\w+)', re.IGNORECASE)

file2transcriptions = {}

for sess in range(1, 6):
    transcripts_path = 'data/IEMOCAP_full_release/Session{}/dialog/transcriptions/'.format(sess)
    # sorted() makes the traversal order (and so which entry wins if a key
    # repeats) independent of the filesystem's listing order.
    for transcript_name in sorted(os.listdir(transcripts_path)):
        with open('{}{}'.format(transcripts_path, transcript_name), 'r') as transcript_file:
            all_lines = transcript_file.readlines()

        for line in all_lines:
            code_match = useful_regex.match(line)
            if code_match is None:
                # No leading word token (e.g. a blank line): nothing to key on.
                continue
            audio_code = code_match.group()
            # Split on the FIRST colon only, so a transcription that itself
            # contains ':' is kept whole instead of being cut down to the text
            # after its last colon. Lines without any colon still yield the
            # whole stripped line, as before.
            transcription = line.split(':', 1)[-1].strip()
            # Keys are assumed to be unique; a repeated key overwrites the
            # earlier entry.
            file2transcriptions[audio_code] = transcription

# save dict
os.makedirs('data/t2e', exist_ok=True)
with open('data/t2e/audiocode2text.pkl', 'wb') as file:
    pickle.dump(file2transcriptions, file)
len(file2transcriptions)

10087

In [8]:
# Reload the code -> transcription mapping. The context manager closes the
# file handle, which the bare pickle.load(open(...)) form left open.
# Only unpickle files you trust: unpickling can execute arbitrary code.
with open('data/t2e/audiocode2text.pkl', 'rb') as pkl_file:
    audiocode2text = pickle.load(pkl_file)

In [9]:
# Prepare text data
def _text_frame(audio_split):
    """Return a wav_file / label / transcription frame for one audio split.

    ``wav_file`` and ``label`` are copied from ``audio_split`` (keeping its
    index); ``transcription`` is the normalized text looked up in
    ``audiocode2text`` for each ``wav_file`` code, in row order. A code
    missing from ``audiocode2text`` raises ``KeyError``.
    """
    frame = audio_split[['wav_file', 'label']].copy()
    frame['transcription'] = [
        normalizeString(audiocode2text[code]) for code in audio_split['wav_file']
    ]
    return frame


text_train = _text_frame(x_train)
text_test = _text_frame(x_test)

text_train.to_csv('data/t2e/text_train.csv', index=False)
text_test.to_csv('data/t2e/text_test.csv', index=False)

print(text_train.shape, text_test.shape)

(17052, 3) (4264, 3)
