# Process IEMOCAP

# 1 - Extract Labels and Transcripts

In [None]:
import os
import csv
import sys

import numpy as np
import pandas as pd

import pickle

from util import *

In [None]:
# Prepare the directory that holds the processed IEMOCAP files
# (create_folder comes from util; presumably it creates the directory if missing).
# NOTE(review): this path starts with '../FY Project/My Project/', while the output
# files defined in the following cells live under '../My Project/' -- confirm that
# both resolve to the same directory from the notebook's working directory.
create_folder('../FY Project/My Project/data/processed/IEMOCAP')


In [None]:
# Destination of the (name, transcript) CSV and of the plain sentence list.
out_file = '../My Project/data/processed/IEMOCAP/processed_tran.csv'
out_file_trans = '../My Project/data/processed/IEMOCAP/sentence_only.txt'

# extract_transcript() opens its outputs in append mode, so files left over from
# a previous run have to be deleted first.  os.remove() is used instead of
# shelling out to `rm`: the paths contain a space ('My Project'), so the unquoted
# shell command was split into two bogus arguments and removed nothing.
# The '_TESTDATA' variants written with test_data=True are cleared as well,
# otherwise every re-run of the notebook appends duplicate rows to them.
for stale_path in (out_file, out_file_trans):
    stem, ext = os.path.splitext(stale_path)
    for candidate in (stale_path, stem + '_TESTDATA' + ext):
        if os.path.exists(candidate):
            os.remove(candidate)


In [None]:
def extract_transcript(list_files, out_file, out_file_trans, test_data=False):
    '''
    Extracts the transcript of every utterance found in the given files.

    Each input line is expected to look like
    "Ses01F_impro01_F000 [006.2901-008.2357]: Excuse me.".  For every kept
    line a [name, transcript] row is appended to out_file and the bare
    transcript is appended to out_file_trans.  Lines whose name does not
    start with 'Ses', or whose name has 'XX' right before its last character,
    are skipped.

        Parameters:
            list_files (list): A list of files (with fullnames) to process transcript
            out_file (string): CSV file the [name, transcript] rows are appended to
            out_file_trans (string): Text file the transcripts are appended to, one per line
            test_data (bool): If True, '_TESTDATA' is inserted before the extension
                of both output file names

        Returns:
            None
    '''
    if test_data:
        csv_stem, csv_ext = os.path.splitext(out_file)
        out_file = csv_stem + '_TESTDATA' + csv_ext
        trans_stem, trans_ext = os.path.splitext(out_file_trans)
        out_file_trans = trans_stem + '_TESTDATA' + trans_ext

    for file in list_files:  # Processes each file in file list

        with open(file, 'r') as in_file:
            file_lines = sorted(in_file.readlines())

        # Both outputs are opened once per input file instead of once per line.
        # newline='' is what the csv module requires; without it every row is
        # followed by an empty one on platforms that translate '\n'.
        with open(out_file, 'a', newline='') as outfile, \
                open(out_file_trans, 'a') as outfile_trans:
            csv_writer = csv.writer(outfile)

            for line in file_lines:
                # Split on the FIRST colon only, so that a colon inside the
                # spoken text no longer truncates the transcript.  A line
                # without any colon yields an empty transcript.
                header, _, text = line.partition(':')

                # Select session name i.e. (Ses01F_impro01_F000)
                name = header.split(' ')[0].strip()

                # Unwanted case
                if name[:3] != 'Ses' or name[-3:-1] == 'XX':
                    continue

                transcript = text.strip()

                csv_writer.writerow([name, transcript])
                outfile_trans.write(transcript + '\n')


In [None]:
# Gather the transcription files of Session1..Session4 and extract their transcripts.
list_files = []

for x in range(1, 5):
    sess_title = f'Session{x}'
    path = f'./data/raw/IEMOCAP_full_release/{sess_title}/dialog/transcriptions'

    # file_search comes from util; its return value is not used, so it presumably
    # adds the files it finds to list_files in place.
    file_search(path, list_files)
    list_files = sorted(list_files)

    # The printed count is cumulative over the sessions visited so far.
    print(f"{sess_title}, # Num of files: {len(list_files)}")

extract_transcript(list_files, out_file, out_file_trans)


## 1.1 - Extract Transcript (Test Data)

In [None]:
# Gather the transcription files of Session5 and extract them as test data.
list_files = []

for x in range(5, 6):
    sess_title = f'Session{x}'
    path = f'./data/raw/IEMOCAP_full_release/{sess_title}/dialog/transcriptions'

    # file_search comes from util; its return value is not used, so it presumably
    # adds the files it finds to list_files in place.
    file_search(path, list_files)
    list_files = sorted(list_files)

    print(f"{sess_title}, # Num of files: {len(list_files)}")

# test_data=True redirects the output to the '_TESTDATA' file variants.
extract_transcript(list_files, out_file, out_file_trans, test_data=True)


## 2 - Extract Label

In [None]:
# Destination of the (name, label) CSV.
out_file = '../My Project/data/processed/IEMOCAP/label.csv'

# extract_labels() appends to its output, so files left over from a previous run
# have to be deleted first.  os.remove() replaces the unquoted `rm` shell call,
# which broke on the space in 'My Project' and removed nothing.  The '_TESTDATA'
# variant written with test_data=True is cleared too, otherwise re-running the
# notebook appends duplicate rows to it.
label_stem, label_ext = os.path.splitext(out_file)
for stale_path in (out_file, label_stem + '_TESTDATA' + label_ext):
    if os.path.exists(stale_path):
        os.remove(stale_path)


In [None]:
# Emotion labels that may appear in the evaluation files.
category_list = ['ang', 'hap', 'sad', 'neu', 'fru', 'exc', 'fea', 'sur', 'dis', 'oth', 'xxx']

# Give every distinct label the index at which it was first seen; a label that
# is already present keeps its existing index.
category = {}
for cat_type in category_list:
    category.setdefault(cat_type, len(category))


In [None]:
def find_category(lines, categories=None):
    '''
    Find ground truth category for each session recording in txt file.

    The lines are read as blocks separated by blank lines.  The first line of
    a block is the target line, in the tab-separated format
    "[START_TIME - END_TIME] TURN_NAME EMOTION [V, A, D]"; the remaining lines
    of the block (i.e. "C-E2:	Neutral;	()") are ignored.

        Parameters:
            lines (list): Lines extracted from each sessions Emoevaluation txt file
            categories (container): Valid emotion labels. Defaults to the
                module-level `category` dictionary.

        Returns:
            cat_emo_list (list): List contains each Session name with ground-truth emotion \
                i.e. [['Ses01F_impro01_F000', 'neu']]

        Raises:
            SystemExit: If a target line has fewer than three tab-separated
                fields or carries a label that is not in `categories`.
    '''
    if categories is None:
        categories = category

    cat_emo_list = []
    is_target_line = True

    for line in lines:
        is_blank = not line.strip()

        if not is_target_line:
            # Inside a block: only a blank line ends it.
            if is_blank:
                is_target_line = True
            continue

        if is_blank:
            # Extra blank line between blocks (or at the end of the file);
            # nothing to parse, keep waiting for the next target line.
            continue

        line_split = line.split('\t')

        # The check is explicit rather than a bare `except`, which used to
        # swallow the SystemExit of the invalid-label branch as well and
        # print a second, misleading ERROR message.
        if len(line_split) < 3:
            print(f'ERROR --> {line}')  # Error encountered on line
            sys.exit(1)

        session_id = line_split[1].strip()
        cat_label = line_split[2].strip()

        if cat_label not in categories:  # Confirm cat_label is a known label
            print(f'Invalid key --> {cat_label}')
            sys.exit(1)  # Exit script with a failure status

        cat_emo_list.append([session_id, cat_label])
        is_target_line = False  # Subsequent lines are not target line

    return cat_emo_list


In [None]:
def extract_labels(list_files, out_file, test_data=False):
    '''
    Extracts the ground-truth label of every utterance in the given files.

    For each file the first two lines are dropped, the remaining lines are
    handed to find_category(), and the resulting [name, label] rows are
    sorted and appended to out_file.

        Parameters:
            list_files (list): A list of EmoEvaluation files (with fullnames) to process
            out_file (string): CSV file the [name, label] rows are appended to
            test_data (bool): If True, '_TESTDATA' is inserted before the
                extension of out_file

        Returns:
            None
    '''
    if test_data:
        stem, ext = os.path.splitext(out_file)
        out_file = stem + '_TESTDATA' + ext

    for file in list_files:

        with open(file, 'r') as in_file:
            # Remove header --> '% [START_TIME - END_TIME] TURN_NAME EMOTION [V, A, D]'
            # (and the line after it, presumably a blank separator)
            lines = in_file.readlines()[2:]

        cat_emo_list = find_category(lines)

        # newline='' is what the csv module requires; without it every row is
        # followed by an empty one on platforms that translate '\n'.
        with open(out_file, 'a', newline='') as outfile:
            csv.writer(outfile).writerows(sorted(cat_emo_list))


In [None]:
# Gather the EmoEvaluation files of Session1..Session4 and extract their labels.
list_files = []
skip_dir = ['Attribute', 'Categorical', 'Self-evaluation']

for x in range(1, 5):
    sess_title = f'Session{x}'
    path = f'./data/raw/IEMOCAP_full_release/{sess_title}/dialog/EmoEvaluation/'

    # file_search comes from util; skip_dir presumably names sub-directories
    # that it must not descend into.
    file_search(path, list_files, skip_dir)
    list_files = sorted(list_files)

    # The printed count is cumulative over the sessions visited so far.
    print(f"{sess_title}, # Num of files: {len(list_files)}")

extract_labels(list_files, out_file)


## 2.1 - Extract Label (Test Data)

In [None]:
# Gather the EmoEvaluation files of Session5 and extract them as test data.
list_files = []
skip_dir = ['Attribute', 'Categorical', 'Self-evaluation']

for x in range(5, 6):
    sess_title = f'Session{x}'
    path = f'./data/raw/IEMOCAP_full_release/{sess_title}/dialog/EmoEvaluation/'

    # file_search comes from util; skip_dir presumably names sub-directories
    # that it must not descend into.
    file_search(path, list_files, skip_dir)
    list_files = sorted(list_files)

    print(f"{sess_title}, # Num of files: {len(list_files)}")

# test_data=True redirects the output to the '_TESTDATA' file variant.
extract_labels(list_files, out_file, test_data=True)

# 2 - Process Extracted data

## Extracted Data

| Category 	| Session 1 - 4 	| Session 5 	| Total 	|
|----------	|---------------	|-----------	|-------	|
| Angry    	| 933           	| 170       	| 1103  	|
| Happy    	| 1194          	| 442       	| 1636  	|
| Sad      	| 839           	| 245       	| 1084  	|
| Neutral  	| 1324          	| 384       	| 1708  	|
| Total    	| 4290          	| 1241      	| 5531  	|

In [None]:
# Load the extracted (name, label) rows, dropping empty CSV rows.
with open('./data/processed/IEMOCAP/label.csv', 'r') as f:
    csv_reader = csv.reader(f)
    lines = [row for row in csv_reader if row]

print(len(lines))


## 2.1 - Process Train Data

In [None]:
# Raw label -> label written to the output; 'exc' is folded into 'hap'.
kept_labels = {'ang': 'ang', 'hap': 'hap', 'exc': 'hap', 'sad': 'sad', 'neu': 'neu'}

with open('./data/processed/IEMOCAP/processed_label.txt', 'w') as f, \
        open('./data/processed/IEMOCAP/processed_ids.txt', 'w') as f2:

    for line in lines:
        mapped = kept_labels.get(line[1])

        if mapped is None:
            # Any other label is marked with '-1'; its id is not written.
            f.write('-1\n')
            continue

        f.write(mapped + '\n')
        f2.write(line[0] + '\n')


In [None]:
# Read the processed labels back (one per line) and print the size of each class.
with open('./data/processed/IEMOCAP/processed_label.txt', 'r') as f:
    lines = [raw.strip() for raw in f.readlines()]

print('Angry (0)\t-->', lines.count('ang'))
print('Happy (1)\t-->', lines.count('hap'))
print('Sad (2)\t\t-->', lines.count('sad'))
print('Neutral (3)\t-->', lines.count('neu'))


## 2.1.1 - Convert labels to four categories ['ang', 'hap', 'sad', 'neu']

In [None]:
# Keep only the four target categories: entries marked '-1' are left out.
with open('./data/processed/IEMOCAP/final/text/train/FC_label.txt', 'w') as f:
    for label in lines:
        if label == '-1':
            continue
        f.write(label + '\n')


## 2.1.2 - Get sentences for four categories

In [None]:
# Load every extracted transcript, one per line, without surrounding whitespace.
with open('./data/processed/IEMOCAP/sentence_only.txt') as f:
    full_sentences = f.readlines()

sentences = list(map(str.strip, full_sentences))

In [None]:
# Write the sentence at each position whose label is one of the four kept
# categories; labels and sentences are paired by position.
with open('./data/processed/IEMOCAP/final/text/train/FC_sentence.txt', 'w') as f:
    for index, label in enumerate(lines):
        if label == '-1':
            continue
        f.write(sentences[index] + '\n')


In [None]:
# Load the filtered training sentences and labels and pair them in a DataFrame.
with open('./data/processed/IEMOCAP/final/text/train/FC_sentence.txt') as f, \
        open('./data/processed/IEMOCAP/final/text/train/FC_label.txt') as f2:
    full_sentences = f.readlines()
    # Read into a dedicated name: the label lines used to be stored in
    # `category`, which replaced the module-level label dictionary that
    # find_category() relies on and broke any later re-run of that cell.
    full_labels = f2.readlines()

sentences = [x.strip() for x in full_sentences]
label = [y.strip() for y in full_labels]

# sentences, label = shuffle_dataset(sentences, label)

print(f'Sentence length --> {len(sentences)}, Label length --> {len(label)}')

data_dict = {'sentences': sentences, 'label': label}
train_dataset = pd.DataFrame.from_dict(data_dict)

## 2.2 - Process Test Data

In [None]:
# Load the extracted test (name, label) rows, dropping empty CSV rows.
with open('./data/processed/IEMOCAP/label_TESTDATA.csv', 'r') as f:
    csv_reader = csv.reader(f)
    lines = [row for row in csv_reader if row]

print(len(lines))


In [None]:
# Raw label -> label written to the output; 'exc' is folded into 'hap'.
kept_labels = {'ang': 'ang', 'hap': 'hap', 'exc': 'hap', 'sad': 'sad', 'neu': 'neu'}

with open('./data/processed/IEMOCAP/processed_label_TESTDATA.txt', 'w') as f, \
        open('./data/processed/IEMOCAP/processed_ids_TESTDATA.txt', 'w') as f2:

    for line in lines:
        mapped = kept_labels.get(line[1])

        if mapped is None:
            # Any other label is marked with '-1'; its id is not written.
            f.write('-1\n')
            continue

        f.write(mapped + '\n')
        f2.write(line[0] + '\n')


In [None]:
# Read the processed test labels back (one per line) and print the size of each class.
with open('./data/processed/IEMOCAP/processed_label_TESTDATA.txt', 'r') as f:
    lines = [raw.strip() for raw in f.readlines()]

print('Angry (0)\t-->', lines.count('ang'))
print('Happy (1)\t-->', lines.count('hap'))
print('Sad (2)\t\t-->', lines.count('sad'))
print('Neutral (3)\t-->', lines.count('neu'))


## 2.2.1 - Convert labels to four categories ['ang', 'hap', 'sad', 'neu'] (Testdata)

In [None]:
# Keep only the four target categories: entries marked '-1' are left out.
with open('./data/processed/IEMOCAP/final/text/test/FC_label_TESTDATA.txt', 'w') as f:
    for label in lines:
        if label == '-1':
            continue
        f.write(label + '\n')


## 2.2.2 - Get sentences for four categories (Testdata)

In [None]:
# Load the test transcripts, one per line, without surrounding whitespace.
# extract_transcript(..., test_data=True) writes them to the '_TESTDATA' variant
# of the sentence file; reading 'sentence_only.txt' here paired the test labels
# with sentences from the training sessions.
with open('./data/processed/IEMOCAP/sentence_only_TESTDATA.txt') as f:
    full_sentences = f.readlines()

sentences = [x.strip() for x in full_sentences]

In [None]:
# Write the sentence at each position whose label is one of the four kept
# categories; labels and sentences are paired by position.
with open('./data/processed/IEMOCAP/final/text/test/FC_sentence_TESTDATA.txt', 'w') as f:
    for index, label in enumerate(lines):
        if label == '-1':
            continue
        f.write(sentences[index] + '\n')


In [None]:
# Load the filtered test sentences and labels and pair them in a DataFrame.
# The sentences come from FC_sentence_TESTDATA.txt; both handles used to open
# the label file, which filled the 'sentences' column with labels.
with open('./data/processed/IEMOCAP/final/text/test/FC_sentence_TESTDATA.txt') as f, \
        open('./data/processed/IEMOCAP/final/text/test/FC_label_TESTDATA.txt') as f2:
    full_sentences = f.readlines()
    # Read into a dedicated name: the label lines used to be stored in
    # `category`, which replaced the module-level label dictionary that
    # find_category() relies on and broke any later re-run of that cell.
    full_labels = f2.readlines()

sentences = [x.strip() for x in full_sentences]
label = [y.strip() for y in full_labels]

# sentences, label = shuffle_dataset(sentences, label)

print(f'Sentence length --> {len(sentences)}, Label length --> {len(label)}')

data_dict = {'sentences': sentences, 'label': label}
test_dataset = pd.DataFrame.from_dict(data_dict)


# 3 - Save Dataframes as pickle

In [None]:
# Persist both splits as pickled DataFrames, train first, then test.
train_pickle_path = './data/processed/IEMOCAP/final/text/train/train_dataset.pkl'
test_pickle_path = './data/processed/IEMOCAP/final/text/test/test_dataset.pkl'

train_dataset.to_pickle(train_pickle_path)
test_dataset.to_pickle(test_pickle_path)
