In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys
import copy
import pickle

In [2]:
# Locations of the KEMDy20 audio clips and annotation sheets, both resolved
# relative to the notebook's working directory.
kemdy_root_path = "../../KEMDy20_v1_1/"
wav_folder_path = kemdy_root_path + "wav/"
an_folder_path = kemdy_root_path + "annotation/"

In [3]:
# Recursively walk the wav folder and keep the joined path of every entry
# whose name ends with ".wav" (case-sensitive).
wav_file_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(wav_folder_path)
    for name in names
    if name.endswith(".wav")
]

In [4]:
# Build the list of annotation CSV paths.
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted to keep the seeded train/test split identical across machines.
# Entries that are not .csv files (e.g. hidden checkpoint folders) are skipped
# because every path in this list is later handed to pd.read_csv.
an_file_paths = [
    os.path.join(an_folder_path, entry)
    for entry in sorted(os.listdir(an_folder_path))
    if entry.lower().endswith(".csv")
]

In [5]:
# Split at the annotation-file level: 20% of the files are held out for testing.
# The fixed random_state keeps the shuffle deterministic for a given input order.
train_an_file_paths, test_an_file_paths = train_test_split(
    an_file_paths,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Lookup table: one row per wav file, keyed by the segment id taken from the
# file name (the text before the first "." of the base name).
# os.path.basename is used instead of splitting on "\\" so the id is extracted
# correctly whichever path separator the operating system produced; splitting
# on a backslash yields an empty id on POSIX paths and silently breaks the
# later merge on "Segment ID".
wav_path_df = pd.DataFrame(wav_file_paths, columns=["Path"])
wav_path_df["Segment ID"] = wav_path_df["Path"].apply(
    lambda wav_path: os.path.basename(wav_path).split(".")[0]
)

In [7]:
# Read every training annotation sheet. iloc[1:] discards the first data row
# (presumably a second header line - TODO confirm against the CSV layout) and
# only the columns whose name starts with "Segment" or "Total" are kept.
train_seg_eval_df_list = [
    pd.read_csv(an_file_path).iloc[1:].filter(regex="^(Segment|Total)")
    for an_file_path in train_an_file_paths
]
train_seg_eval_df = pd.concat(train_seg_eval_df_list)

# Drop rows whose "Total Evaluation" contains ";" (presumably several labels
# joined together - verify).
has_semicolon = train_seg_eval_df["Total Evaluation"].str.contains(";")
train_seg_eval_df = train_seg_eval_df[~has_semicolon]

# Attach the wav path of each segment; a left join keeps segments without a
# matching wav file (their Path is NaN).
train_seg_eval_df = train_seg_eval_df.merge(wav_path_df, on="Segment ID", how="left")

# Rename positionally, then keep only the label and the audio path.
train_seg_eval_df.columns = ["Segment ID", "Emotion", "Path"]
train_seg_eval_df = train_seg_eval_df[["Emotion", "Path"]]

In [8]:
# Read every held-out annotation sheet. iloc[1:] discards the first data row
# (presumably a second header line - TODO confirm against the CSV layout) and
# only the columns whose name starts with "Segment" or "Total" are kept.
test_seg_eval_df_list = [
    pd.read_csv(an_file_path).iloc[1:].filter(regex="^(Segment|Total)")
    for an_file_path in test_an_file_paths
]
test_seg_eval_df = pd.concat(test_seg_eval_df_list)

# Drop rows whose "Total Evaluation" contains ";" (presumably several labels
# joined together - verify).
has_semicolon = test_seg_eval_df["Total Evaluation"].str.contains(";")
test_seg_eval_df = test_seg_eval_df[~has_semicolon]

# Attach the wav path of each segment; a left join keeps segments without a
# matching wav file (their Path is NaN).
test_seg_eval_df = test_seg_eval_df.merge(wav_path_df, on="Segment ID", how="left")

# Rename positionally, then keep only the label and the audio path.
test_seg_eval_df.columns = ["Segment ID", "Emotion", "Path"]
test_seg_eval_df = test_seg_eval_df[["Emotion", "Path"]]

In [9]:
# Locations of the AI Hub audio clips and their annotation sheet (the data used
# to augment the training split), relative to the notebook's working directory.
aug_root_path = "../../AI_hub/"
aug_wav_folder_path = aug_root_path + "wav/"
aug_an_file_path = aug_root_path + "annotation/5차년도_2차.csv"

In [10]:
# Recursively walk the AI Hub wav folder and keep the joined path of every
# entry whose name ends with ".wav" (case-sensitive).
aug_wav_file_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(aug_wav_folder_path)
    for name in names
    if name.endswith(".wav")
]

In [11]:
# Lookup table: one row per AI Hub wav file, keyed by the clip id taken from the
# file name (the text before the first "." of the base name).
# os.path.basename replaces the hard-coded split("/")[4], which only worked when
# the file sat exactly four "/"-separated components deep: it returned a
# directory name for files in sub-folders and depended on the exact value of
# the root folder path.
aug_wav_path_df = pd.DataFrame(aug_wav_file_paths, columns=["Path"])
aug_wav_path_df["wav_id"] = aug_wav_path_df["Path"].apply(
    lambda wav_path: os.path.basename(wav_path).split(".")[0]
)

In [12]:
# Load the cp949-encoded AI Hub annotation sheet, keeping only the clip id and
# the "상황" column (used as the emotion label below).
aug_seg_eval_df = pd.read_csv(aug_an_file_path, encoding="cp949")[["wav_id", "상황"]]

# Attach the wav path of each clip; a left join keeps clips without a matching
# wav file (their Path is NaN).
aug_seg_eval_df = aug_seg_eval_df.merge(aug_wav_path_df, on="wav_id", how="left")

# Rename positionally, then keep only the label and the audio path.
aug_seg_eval_df.columns = ["wav_id", "Emotion", "Path"]
aug_seg_eval_df = aug_seg_eval_df[["Emotion", "Path"]]

# Two parallel label vocabularies; position i in one corresponds to position i
# in the other.
# NOTE(review): the mapping below goes FROM train_label_names TO aug_label_names
# and is applied to the augmentation frame, so the two array names look swapped
# relative to the data they describe - confirm against the label sets.
train_label_names = np.array(['happiness', 'neutral', 'sadness', 'angry', 'surprise', 'disgust', 'fear'])
aug_label_names = np.array(['happy', 'neutral', 'sad', 'angry', 'surprise', 'disqust', 'fear'])

# Pair the vocabularies element-wise into a lookup table.
mapping = {
    source_label: target_label
    for source_label, target_label in zip(train_label_names, aug_label_names)
}

# Rewrite the Emotion column through the lookup table. Values that are not a
# key of the table become NaN, so running this step twice on the same frame
# blanks the labels that were already converted.
converted_emotions = aug_seg_eval_df['Emotion'].map(mapping)
aug_seg_eval_df['Emotion'] = converted_emotions

In [14]:
# Stack the augmentation rows underneath the training rows. The original row
# indices of both frames are kept as-is (no re-indexing).
frames_to_stack = [train_seg_eval_df, aug_seg_eval_df]
train_aug_seg_eval_df = pd.concat(frames_to_stack, axis=0)

In [15]:
# Save the splits as UTF-8 CSV files without the index column.
save_path = "../../KEMDy20_v1_1/Splitting/"
# makedirs(exist_ok=True) replaces the "check, then mkdir" pair: it creates any
# missing parent directories and does not fail when the folder already exists.
os.makedirs(save_path, exist_ok=True)

train_seg_eval_df.to_csv(f"{save_path}Train.csv", encoding="utf-8", index=False)
test_seg_eval_df.to_csv(f"{save_path}Test.csv", encoding="utf-8", index=False)
train_aug_seg_eval_df.to_csv(f"{save_path}Train_aug.csv", encoding="utf-8", index=False)