### Path

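Path to the dataset directory, e.g. './EATD-Corpus'. Note that the code below reads the sample folders from 'raw-data', relative to the working directory.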

In [2]:
import os
import pandas as pd
import re
import numpy as np
from tqdm import tqdm

In [3]:
# Sort-key helper: first run of digits in a folder name
def extract_number(folder_name):
    """Return the first run of digits found in ``folder_name`` as an int.

    Names that contain no digits yield 0.
    """
    digits = re.search(r'\d+', folder_name)
    return int(digits.group()) if digits else 0

# Retrieve the entries of 'raw-data' and sort them by their numeric part.
# os.listdir returns names in arbitrary order, and several names can share the
# same number (or contain no digits and map to 0), so the name itself is used
# as a tie-breaker to make the resulting order reproducible across runs and
# platforms.
folders = os.listdir('raw-data')
sorted_folders = sorted(folders, key=lambda name: (extract_number(name), name))

In [4]:
# Build one record (dict) per sample folder and create the DataFrame in a single
# call at the end. Growing a DataFrame with pd.concat inside the loop re-copies
# all previously collected rows on every iteration.
txt_files = ['negative.txt', 'neutral.txt', 'positive.txt', 'new_label.txt']
wav_files = ['negative.wav', 'neutral.wav', 'positive.wav']

# Fixed column layout: folder name, text file contents, then wav file paths.
# Passing it explicitly keeps the layout identical even if no folder is found.
columns = (
    ['folder']
    + [txt_file[:-4].lower() for txt_file in txt_files]
    + [wav_file[:-4] + '_Wav' for wav_file in wav_files]
)

records = []

# Traverse through all entries of the 'raw-data' directory with tqdm progress bar
for folder in tqdm(sorted_folders, desc="Processing folders"):
    folder_path = os.path.join('raw-data', folder)

    # Process only if it is a folder (stray files in 'raw-data' are skipped)
    if not os.path.isdir(folder_path):
        continue

    # Dictionary holding everything collected for this folder
    data = {'folder': folder}

    # Read required text files; the key is the file name without extension, lowercased
    for txt_file in txt_files:
        txt_path = os.path.join(folder_path, txt_file)
        key = txt_file[:-4].lower()

        if os.path.exists(txt_path):
            with open(txt_path, 'r', encoding='utf-8') as f:
                data[key] = f.read().strip()
        else:
            data[key] = None  # Set to None if the file does not exist

    # Collect absolute paths to the wav files (None if a file does not exist)
    for wav_file in wav_files:
        wav_path = os.path.join(folder_path, wav_file)
        key = wav_file[:-4] + '_Wav'
        data[key] = os.path.abspath(wav_path) if os.path.exists(wav_path) else None

    records.append(data)

# One row per processed folder, in the order of sorted_folders
df = pd.DataFrame(records, columns=columns)

Processing folders: 100%|████████████████████████████████████████████████████████████| 162/162 [10:29<00:00,  3.89s/it]


In [5]:
# List the columns of the assembled DataFrame (text columns plus *_Wav path columns)
df.columns

Index(['folder', 'negative', 'neutral', 'positive', 'new_label',
       'negative_Wav', 'neutral_Wav', 'positive_Wav'],
      dtype='object')

In [6]:
# The scores were read from text files as strings; convert the column to float
score_values = df['new_label'].astype(float)
df['new_label'] = score_values

In [7]:
# Summarize dtypes and non-null counts; a count below the row total means a file was missing (stored as None)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   folder        162 non-null    object 
 1   negative      162 non-null    object 
 2   neutral       162 non-null    object 
 3   positive      162 non-null    object 
 4   new_label     162 non-null    float64
 5   negative_Wav  162 non-null    object 
 6   neutral_Wav   162 non-null    object 
 7   positive_Wav  162 non-null    object 
dtypes: float64(1), object(7)
memory usage: 10.3+ KB


In [8]:
# Scores at or above this cut-off are labelled 1, everything below is labelled 0.
# NOTE(review): 53 is presumably the score cut-off defined for this corpus — confirm.
LABEL_THRESHOLD = 53.0

# A missing score compares False against the threshold and would silently be
# labelled 0, so stop here if any score is missing.
missing_scores = df['new_label'].isna()
if missing_scores.any():
    raise ValueError(
        f"{int(missing_scores.sum())} row(s) have no 'new_label' score; cannot assign labels"
    )

df['labels'] = np.where(df['new_label'] >= LABEL_THRESHOLD, 1, 0)

In [9]:
# Class balance: number of rows per binary label
df['labels'].value_counts()

labels
0    132
1     30
Name: count, dtype: int64

In [None]:
# Write the assembled table to CSV, leaving out the row index
output_csv = 'all_data.csv'
df.to_csv(output_csv, index=False)