In [1]:
import re
import glob
import gensim

import numpy as np
import pandas as pd
import tensorflow as tf

from random import shuffle

This notebook parses the data from the IEMOCAP data set. Three different data types are used: 
- audio features (precomputed with pyAudioAnalysis module) of sentenses
- aligned transcripts
- emotional ratings

Corrensponding values are identified by a file name of an original sentence, e.g. 'Ses01F_script03_1_F009'. In the whole preprocessing pipeline these names are used as dictionary keys in all dicts. The complete list of files can be seen by calling .keys() method on a dictionary.

There are 10039 numpy files and emotional assignments, but 10037 transcripts, so the further number of files is used.

### FILES
Just reading lists of all files of interest

In [2]:
DATA_PATH = '/home/karolina/Documents/GSOC/data/'
# file names are *.waw_st.npy
numpy_features = glob.glob(DATA_PATH+'IEMOCAP_full_release/Session*/sentences/wav/*/*st.npy')
# file names are *.wdseg
forced_alignments = glob.glob(DATA_PATH+'IEMOCAP_full_release/Session*/sentences/ForcedAlignment/Ses*/*.wdseg')
# *.txt files have assignments for the whole session not sentences
emo_evaluations = glob.glob(DATA_PATH+'IEMOCAP_full_release/Session*/dialog/EmoEvaluation/*.txt')

### DICTS 
Creating dictionaries for each data type, always using the filename as a key.

In [3]:
emotions_names = {'ang': np.int32(0), 'dis':np.int32(1), 'fea':np.int32(2), 'fru':np.int32(3), 
                  'hap':np.int32(4), 'neu':np.int32(5), 'sad':np.int32(6)}

In [4]:
# audio features dict - assigning paths to file names *(without extension)
features_dict = {}
for p in numpy_features:
    features_dict[p.split('/')[-1].split('.')[0]] = np.load(p)

# transcript files dict - assigning paths to file names *(without extension)    
transcripts_paths_dict = {}
for p in forced_alignments:
    transcripts_paths_dict[p.split('/')[-1].split('.')[0]] = p
    
# messy evaluation dict - assigning evaluations (grand average) - from summary files to file names
file = emo_evaluations[1]
mean_emo_eval_dict = {}
emotional_eval_dict = {}

DATA_FLAG = False
values = []     # this is an empty value needed fot the loop
key = ''        # this as well
for file in emo_evaluations:
    with open(file) as f:
        for line in f:
            split = line.split('\t')
            #print(split)
            if len(split)>1:
                split2 = split[1].split(';')
                if len(split2)==1:
                    if len(values)!=0:
                        #print('saving',key,values)
                        real_values = np.array([float(re.sub("[^0-9.]"," ", v)) for v in values.split(' ')], dtype=np.float32)
                        mean_emo_eval_dict[key]=real_values
                        if emo_tag in emotions_names:
                            emotional_eval_dict[key] = emotions_names[emo_tag]
                    key = split2[0]
                    emo_tag = split[2]
                    values = split[3]
#                     # this part is for extracting the more detailed info, but that's omitted at the moment
#                     values = []
#                 if len(split2)>1:
#                     values.append(split2)
real_values = np.array([float(re.sub("[^0-9.]"," ", v)) for v in values.split(' ')], dtype=np.float32)
mean_emo_eval_dict[key]=real_values #appending the last evaluation
if emo_tag in emotions_names:
    emotional_eval_dict[key] = emotions_names[emo_tag]

### WORD2VEC embedding

Binary from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing


In [5]:
model = gensim.models.Word2Vec.load_word2vec_format(DATA_PATH+'GoogleNews-vectors-negative300.bin', binary=True)  

### READING THE TRANSCRIPT & CORREESPONDING EMBEDDING

The majority of sentences is not longer than 18 words, so the rest of the sentences is rejected to avoid extra padding.

In [6]:
def read_transcript(path):
    """
    Reads and cleans the transcript from the forced-alignment files.
    Cleans the 'non-words'
    """
    df = pd.read_csv(path,delim_whitespace=True, usecols=['SFrm','EFrm','Word']).dropna()
    df = df[~df.Word.str.contains('<')]
    df = df[~df.Word.str.contains('"')]
    df = df[~df.Word.str.contains('LAUGHTER')]
    df = df[~df.Word.str.contains('BREATHING')]
    df = df[~df.Word.str.contains('GARBAGE')]
    n_rows = df.shape[0]
    return df, n_rows

def clean_word(word):
    return re.sub("[^a-zA-Z']"," ", word).lower()

def pad_feature(feature,audio_start,audio_end):
    empty_feature = np.zeros((34,240), dtype=np.float32) #all features have this size
    empty_feature[:,:audio_end-audio_start] = feature[:,audio_start:audio_end]
    audio_feature = empty_feature[:,:,None]
    return audio_feature

def pad_feature_sequence(feature_sequence):
    sequence = np.zeros((34,240,18),dtype=np.float32)
    sequence[:,:,:feature_sequence.shape[2]] = feature_sequence
    return sequence

def pad_word_sequence(word_sequence):
    sequence = np.zeros((300,18),dtype=np.float32)
    sequence[:,:word_sequence.shape[1]] = word_sequence
    return sequence

In [7]:
ready_audio_dict = {}
features_len_dict = {}
ready_word_embed_dict = {}
words_dict = {}

ALL_KEYS = emotional_eval_dict.keys() #transcripts_paths_dict.keys()
for key in list(ALL_KEYS):
    features = features_dict[key]
    transcript, n_rows = read_transcript(transcripts_paths_dict[key])
    OUTPUT_FLAG = False
    for n in range(n_rows):
        w = transcript.Word.iloc[n]
        lower = clean_word(w) #cleaning up the transcript
        
        if lower in model.vocab:
            audio_start = int(transcript.SFrm.iloc[n])
            audio_end = int(transcript.EFrm.iloc[n])
            feature = features_dict[key]
            audio_feature = pad_feature(feature,audio_start,audio_end)

            word_embed = model[lower][:,None]
            if OUTPUT_FLAG:
                audio = np.concatenate((audio,audio_feature),axis=2)
                word = np.concatenate((word,word_embed),axis=1)
                all_words.append(lower)
            else:
                audio = audio_feature
                word = word_embed
                all_words = [lower]
                OUTPUT_FLAG = True
    if word.shape[1] < 19:          
        ready_audio_dict[key] = pad_feature_sequence(audio)
        ready_word_embed_dict[key] = pad_word_sequence(word)
        features_len_dict[key] = np.int32(audio_end-audio_start)
        words_dict[key] = all_words
del model
GOOD_KEYS = ready_audio_dict.keys()

In [10]:
list(GOOD_KEYS)[8]

'Ses03M_script02_1_M017'

In [11]:
words_dict[list(GOOD_KEYS)[8]]

['you', 'my']

### DIVIDING DATA INTO SPLITS WITH EVENLY DISTRIBUTED SPEAKERS AND SCENES

The audio files (keys) are separatred depending on the condition they belong to. The number of splits is determined by the minimal numer of files from one condition, here 8.
These splits can be used e.g. for the crossvalidation.
In most of the cases there is only one realisation of a particular label for a given setup, so this is excluded from the mixing procedure.


In [8]:
hash_dictionary = {}
for sess_name in list(GOOD_KEYS):
    #label = str(ready_audio_dict[sess_name])
    split = sess_name.split('_')
    ses_num = split[0][3:5]
    ses_g = split[0][-1]
    scenario = re.sub("[^a-z]"," ", split[1]).strip(' \t\n\r')
    scenario_num = re.sub("[^0-9]"," ", split[1]).strip(' \t\n\r')
    if len(split)==4:
        division_num = split[2]
        g = split[3][0]
    else:
        division_num = '0'
        g = split[2][0]
    hash_key = ses_num+ses_g+scenario+scenario_num+division_num+g#+label
    if hash_key in hash_dictionary:
        hash_dictionary[hash_key].append(sess_name)
    else:
        hash_dictionary[hash_key] = [sess_name]

In [9]:
n_splits = 8
splits = np.empty(n_splits,dtype=object) #numpy array with splitted data names

In [10]:
EMPTY_FLAG = True
for k, v in hash_dictionary.items():
    a = np.arange(len(v))
    shuffle(v)
    if EMPTY_FLAG:
        for i in range(n_splits):
            splits[i] = list(np.array(v)[a%n_splits==i])
        EMPTY_FLAG = False
    else:
        for i in range(n_splits):
            splits[i].extend(list(np.array(v)[a%n_splits==i]))      

In [11]:
np.sum([len(s) for s in splits])

6031

### WRITING TO BINARIES

In [12]:
BINARIES_PATH = DATA_PATH+'audio_features/IEMOCUP/'

In [13]:
for j,split in enumerate(list(splits)):
    for i,key in enumerate(split):

        # dividing into files
        if i%100==0:
            train_filename = BINARIES_PATH+'split_'+str(j)+'_'+str(i//100)+'.tfrecords'

        writer = tf.python_io.TFRecordWriter(train_filename)
        
        audio = ready_audio_dict[key].tobytes()
        audio_len = features_len_dict[key].tobytes()
        word = ready_word_embed_dict[key].tobytes()
        emo = mean_emo_eval_dict[key].tobytes()
        label = emotional_eval_dict[key].tobytes()
        example = tf.train.Example(features=tf.train.Features(feature={
            'audio_features'    : tf.train.Feature(bytes_list=tf.train.BytesList(value=[audio])),
            'audio_len'         : tf.train.Feature(bytes_list=tf.train.BytesList(value=[audio_len])),
            'word_embeddings'   : tf.train.Feature(bytes_list=tf.train.BytesList(value=[word])),
            'y'                 : tf.train.Feature(bytes_list=tf.train.BytesList(value=[emo])),
            'label'             : tf.train.Feature(bytes_list=tf.train.BytesList(value=[label])),
            }))

        writer.write(example.SerializeToString())
        writer.close()