In [10]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys
import copy
import pickle

In [11]:
wav_folder_path = "../../KEMDy20_v1_1/wav/"
an_folder_path = "../../KEMDy20_v1_1/annotation/"

In [12]:
wav_file_paths = []
for root, dirs, files in os.walk(wav_folder_path):
    for file in files:
        file_path = os.path.join(root, file)
        if file_path.endswith(".wav"): # .wav 확장자만 저장
            wav_file_paths.append(file_path)

In [13]:
an_file_paths = [an_folder_path+i for i in os.listdir(an_folder_path)]

In [14]:
train_an_file_paths, test_an_file_paths = train_test_split(an_file_paths, test_size=0.2, random_state=42)

In [15]:
def preprocessing(an_file_paths, data_type):
    seg_eval_df_list = []
    
    for an_file_path in an_file_paths:
        file = pd.read_csv(an_file_path)
        seg_eval = file.iloc[1:].filter(regex="^(Segment|Eval)")
        seg_eval_df_list.append(seg_eval)
        
    seg_eval_df = pd.concat(seg_eval_df_list)
    
    # 라벨 열(`Eval01F`, `Eval02M`, ...)과 값을 하나의 열로 변환
    melted_df = seg_eval_df.melt(id_vars='Segment ID', var_name='Label', value_name='Emotion')

    # 각 Segment ID별 라벨 발생 횟수 계산
    em_count_df = melted_df.groupby(['Segment ID', 'Emotion'])['Emotion'].count().reset_index(name='Count')
    
    # pivot_table 함수를 사용하여 각 Segment ID당 happy와 neutral 라벨이 발생한 횟수 계산
    label_df = pd.pivot_table(em_count_df, index='Segment ID', columns='Emotion', values='Count', aggfunc=sum, fill_value=0).reset_index()
    
    # wav에 대한 path를 추가
    for wav_file_path in wav_file_paths:
        segment_id = wav_file_path.split("\\")[-1].split(".")[0]
        row = label_df.loc[label_df["Segment ID"].str.endswith(segment_id)].index
        if len(row) > 0:
            label_df.loc[row, "path"] = wav_file_path
    
    # 라벨을 확률 값으로 정의
    prob_label_df = copy.copy(label_df)
    prob_label_df.iloc[:,1:-1] /= 10
    prob_label_df["emotion"] = prob_label_df.iloc[:,1:-1].apply(list, axis=1)
    prob_label_df = prob_label_df[["Segment ID", "path", "emotion"]]
    
    # 라벨을 여러개로 정의
    multi_label_df = copy.copy(label_df)
    multi_label_df.iloc[:,1:-1] = (multi_label_df.iloc[:,1:-1] > 0).astype(int)
    multi_label_df.iloc[:,1:-1] = multi_label_df.iloc[:,1:-1].div(multi_label_df.iloc[:,1:-1].sum(axis=1), axis=0)
    multi_label_df.iloc[:,1:-1] = multi_label_df.iloc[:,1:-1].round(5)
    multi_label_df["emotion"] = multi_label_df.iloc[:,1:-1].apply(list, axis=1)
    multi_label_df = multi_label_df[["Segment ID", "path", "emotion"]]
    
    # 데이터 저장
    save_path = "../../KEMDy20_v1_1/Splitting/"
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    prob_label_df.to_csv(f"{save_path}{data_type}_Prob.csv", encoding="utf-8", index=False)
    multi_label_df.to_csv(f"{save_path}{data_type}_Multi.csv", encoding="utf-8", index=False)
    if not os.path.exists(save_path+"Label.txt"):
        with open(save_path+"Label.txt", "wb") as f:
            pickle.dump(label_df.columns[1:-1], f)

In [16]:
preprocessing(train_an_file_paths, "Train")

  multi_label_df.iloc[:,1:-1] = (multi_label_df.iloc[:,1:-1] > 0).astype(int)


In [17]:
preprocessing(test_an_file_paths, "Test")

  multi_label_df.iloc[:,1:-1] = (multi_label_df.iloc[:,1:-1] > 0).astype(int)


In [18]:
with open("../../KEMDy20_v1_1/Splitting/Label.txt", "rb") as f:
    ttt = pickle.load(f)

In [19]:
ttt

Index(['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise'], dtype='object', name='Emotion')