## Package load

In [None]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys
import copy
import pickle

## Path setting (KEMDy20)

In [39]:
# Root of the KEMDy20 v1.1 dataset, relative to the working directory.
_kemdy20_root = "../../KEMDy20_v1_1/"

# Folder holding the wav files and folder holding the annotation files.
wav_folder_path = _kemdy20_root + "wav/"
an_folder_path = _kemdy20_root + "annotation/"

## wav file path load

In [40]:
# Recursively collect every file under wav_folder_path whose name ends in
# ".wav"; each entry is the walked folder joined with the file name.
wav_file_paths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(wav_folder_path)
    for name in names
    if name.endswith(".wav")  # keep only the .wav extension
]

## annotation file path load

In [41]:
# Full path of every entry in the annotation folder.
# os.listdir returns entries in arbitrary order, so the listing is sorted:
# the seeded train/test split that consumes this list is only reproducible
# when its input order is the same on every machine.
an_file_paths = [
    os.path.join(an_folder_path, entry)
    for entry in sorted(os.listdir(an_folder_path))
]

## Train, Test split

In [42]:
# Split at annotation-file level: 20% of the files are held out for testing.
# The fixed random_state makes the shuffle repeatable for a given input order.
train_an_file_paths, test_an_file_paths = train_test_split(
    an_file_paths,
    test_size=0.2,
    random_state=42,
)

## Add merge key to wav_path_df

In [43]:
# One row per discovered wav file; "Segment ID" is the key later used to merge
# these paths with the annotation tables.
wav_path_df = pd.DataFrame(wav_file_paths, columns=["Path"])

# Key = file name (directories removed) up to its first ".".
# os.path.basename is used instead of splitting on "\\" so the key is also
# correct when the paths are "/"-separated; splitting such a path on "\\"
# leaves the whole string, and its first "."-delimited piece is "" because the
# path starts with "..".
wav_path_df["Segment ID"] = wav_path_df["Path"].apply(
    lambda path: os.path.basename(path).split(".")[0]
)

## Merge train annotation and wav file path

In [47]:
# Build the training table: one row per annotated segment with its emotion
# label, valence, arousal and wav path.
train_seg_eval_df_list = []

for train_csv in train_an_file_paths:
    # Skip the first data row of each annotation file.
    # NOTE(review): presumably a second header line - confirm against the CSV layout.
    rows = pd.read_csv(train_csv).iloc[1:]
    # Columns whose name starts with "Segment" or "Total" ...
    id_and_total = rows.filter(regex="^(Segment|Total)")
    # ... plus the columns at positions 5 and 6 (renamed to Valence / Arousal below).
    valence_arousal = rows.iloc[:, 5:7]
    table = pd.concat([id_and_total, valence_arousal], axis=1)
    table.columns = ["Segment ID", "Total Evaluation", "Valence", "Arousal"]
    train_seg_eval_df_list.append(table)

train_seg_eval_df = pd.concat(train_seg_eval_df_list)

# Drop rows whose "Total Evaluation" contains ";".
# NOTE(review): presumably these are segments with more than one label - confirm.
train_has_semicolon = train_seg_eval_df["Total Evaluation"].str.contains(";")
train_seg_eval_df = train_seg_eval_df[~train_has_semicolon]

# Left merge: every annotated segment is kept; Path is NaN when no wav file matched.
train_seg_eval_df = train_seg_eval_df.merge(wav_path_df, on='Segment ID', how='left')

# Rename "Total Evaluation" to "Emotion" and drop the merge key.
train_seg_eval_df = train_seg_eval_df.rename(columns={"Total Evaluation": "Emotion"})
train_seg_eval_df = train_seg_eval_df[["Emotion", "Valence", "Arousal", "Path"]]

## Merge test annotation and wav file path

In [48]:
# Build the test table: one row per annotated segment with its emotion label,
# valence, arousal and wav path.
test_seg_eval_df_list = []

for test_csv in test_an_file_paths:
    # Skip the first data row of each annotation file.
    # NOTE(review): presumably a second header line - confirm against the CSV layout.
    rows = pd.read_csv(test_csv).iloc[1:]
    # Columns whose name starts with "Segment" or "Total" ...
    id_and_total = rows.filter(regex="^(Segment|Total)")
    # ... plus the columns at positions 5 and 6 (renamed to Valence / Arousal below).
    valence_arousal = rows.iloc[:, 5:7]
    table = pd.concat([id_and_total, valence_arousal], axis=1)
    table.columns = ["Segment ID", "Total Evaluation", "Valence", "Arousal"]
    test_seg_eval_df_list.append(table)

test_seg_eval_df = pd.concat(test_seg_eval_df_list)

# Drop rows whose "Total Evaluation" contains ";".
# NOTE(review): presumably these are segments with more than one label - confirm.
test_has_semicolon = test_seg_eval_df["Total Evaluation"].str.contains(";")
test_seg_eval_df = test_seg_eval_df[~test_has_semicolon]

# Left merge: every annotated segment is kept; Path is NaN when no wav file matched.
test_seg_eval_df = test_seg_eval_df.merge(wav_path_df, on='Segment ID', how='left')

# Rename "Total Evaluation" to "Emotion" and drop the merge key.
test_seg_eval_df = test_seg_eval_df.rename(columns={"Total Evaluation": "Emotion"})
test_seg_eval_df = test_seg_eval_df[["Emotion", "Valence", "Arousal", "Path"]]

## Save Train, Test file (Only KEMDy20)

In [49]:
# Save the KEMDy20-only train/test tables as CSV.
save_path = "../../KEMDy20_v1_1/Splitting/"
# makedirs(exist_ok=True) also creates missing parent folders and does nothing
# when the folder already exists, replacing the separate exists-check + mkdir.
os.makedirs(save_path, exist_ok=True)

train_seg_eval_df.to_csv(f"{save_path}Train_Origin.csv", encoding="utf-8", index=False)
test_seg_eval_df.to_csv(f"{save_path}Test_Origin.csv", encoding="utf-8", index=False)

## Path setting (AI Hub)

In [15]:
# Root of the AI Hub dataset, relative to the working directory.
_ai_hub_root = "../../AI_hub/"

# Folder holding the AI Hub wav files and the single annotation CSV.
aug_wav_folder_path = _ai_hub_root + "wav/"
aug_an_file_path = _ai_hub_root + "annotation/5차년도_2차.csv"

## wav file path load

In [16]:
# Recursively collect every file under aug_wav_folder_path whose name ends in
# ".wav"; each entry is the walked folder joined with the file name.
aug_wav_file_paths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(aug_wav_folder_path)
    for name in names
    if name.endswith(".wav")  # keep only the .wav extension
]

## Add merge key to aug_wav_path_df

In [17]:
# One row per discovered AI Hub wav file; "wav_id" is the key later used to
# merge these paths with the annotation table.
aug_wav_path_df = pd.DataFrame(aug_wav_file_paths, columns=["Path"])

# Key = file name (directories removed) up to its first ".".
# os.path.basename replaces the fixed split("/")[4] lookup, which only hit the
# file name when the wav file sat exactly four "/"-separated components deep.
aug_wav_path_df["wav_id"] = aug_wav_path_df["Path"].apply(
    lambda path: os.path.basename(path).split(".")[0]
)

## Merge annotation and wav file path & annotation mapping

In [18]:
# Load the AI Hub annotation CSV (cp949-encoded), keep the id and the "상황"
# column, and attach the wav path for each wav_id. The left merge keeps every
# annotation row; Path is NaN when no wav file matched.
aug_seg_eval_df = (
    pd.read_csv(aug_an_file_path, encoding="cp949")[["wav_id", "상황"]]
    .merge(aug_wav_path_df, on="wav_id", how="left")
)

# Rename "상황" to "Emotion", then drop the merge key.
aug_seg_eval_df.columns = ["wav_id", "Emotion", "Path"]
aug_seg_eval_df = aug_seg_eval_df[["Emotion", "Path"]]

# Label vocabularies, paired by position.
# NOTE(review): despite the names, the mapping below translates FROM the
# train_label_names spelling TO the aug_label_names spelling - confirm which
# dataset uses which spelling.
# NOTE(review): 'disqust' is kept exactly as written; presumably it matches the
# spelling used by the other dataset's labels - confirm before "fixing" it.
train_label_names = np.array(['happiness', 'neutral', 'sadness', 'angry', 'surprise', 'disgust', 'fear'])
aug_label_names = np.array(['happy', 'neutral', 'sad', 'angry', 'surprise', 'disqust', 'fear'])

# Build the lookup: train_label_names[i] -> aug_label_names[i].
mapping = {source: target for source, target in zip(train_label_names, aug_label_names)}

# Replace each Emotion value via the lookup; values that are not a key of
# `mapping` become NaN.
aug_seg_eval_df['Emotion'] = aug_seg_eval_df['Emotion'].map(mapping)

## Train, Test split

In [20]:
# Shuffle the rows of the data frame (fixed seed, so the shuffle is repeatable).
aug_seg_eval_df = aug_seg_eval_df.sample(frac=1, random_state=42)

# Split the shuffled frame by position: the first 80% of the rows (rounded
# down) become the train set, the remaining rows the test set.
train_size = int(len(aug_seg_eval_df) * 0.8)
train_aug_seg_eval_df = aug_seg_eval_df.iloc[:train_size]
test_aug_seg_eval_df = aug_seg_eval_df.iloc[train_size:]

## Merge KEMDy20 and AI Hub

In [23]:
# Stack the KEMDy20 rows on top of the AI Hub rows for each split (row-wise
# concat; a column present in only one of the inputs is filled with NaN).
train_df = pd.concat([train_seg_eval_df, train_aug_seg_eval_df])
test_df = pd.concat([test_seg_eval_df, test_aug_seg_eval_df])

## Save Train, Test file (KEMDy20 + AI Hub)

In [32]:
# Save the merged (KEMDy20 + AI Hub) train/test tables as CSV.
save_path = "../../KEMDy20_v1_1/Splitting/"
# makedirs(exist_ok=True) also creates missing parent folders and does nothing
# when the folder already exists, replacing the separate exists-check + mkdir.
os.makedirs(save_path, exist_ok=True)

train_df.to_csv(f"{save_path}Train.csv", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}Test.csv", encoding="utf-8", index=False)