# Preprocessing Data

## Import Libraries

In [None]:
import os
import librosa
import json
import math

## Define Functions

In [None]:
# Function to load dataset and extract MFCCs with segments
def extract_mfcc_seg(name, path, n_mfcc=13, n_fft=2048, hop_length=512, num_segments=5, sr=4000):
    """Walk ``path`` and extract per-segment MFCCs from every labelled ``.wav`` file.

    Each audio file is loaded with librosa, cut into ``num_segments``
    consecutive, equal-length chunks, and one MFCC matrix is computed per
    chunk. Files for which ``get_label`` returns ``None`` are skipped.

    Args:
        name: Dataset identifier forwarded to ``get_label``.
        path: Root directory; searched recursively with ``os.walk``.
        n_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window size passed to ``librosa.feature.mfcc``.
        hop_length: Hop between frames, in samples.
        num_segments: Number of chunks each file is split into.
        sr: Sample rate passed to ``librosa.load`` (4000 was previously
            hard-coded inside the function).

    Returns:
        A dict with two index-aligned lists: ``'mfcc'`` (one
        frames-by-``n_mfcc`` nested list per stored segment) and ``'label'``
        (the value returned by ``get_label`` for the segment's file).

    Raises:
        ValueError: If ``num_segments`` or ``hop_length`` is smaller than 1.
    """
    if num_segments < 1:
        raise ValueError("num_segments must be >= 1, got {}".format(num_segments))
    if hop_length < 1:
        raise ValueError("hop_length must be >= 1, got {}".format(hop_length))

    # Make a dictionary to store the data
    data = {
        'mfcc': [],
        'label': []
    }

    # Loop through the folders
    for dirpath, _dirnames, filenames in os.walk(path):

        # Process audio files
        for file in filenames:

            # Only .wav files are considered
            if not file.endswith('.wav'):
                continue

            # Numeric label, or None when the file's class is not kept
            label = get_label(name, file)
            if label is None:
                continue

            # Load audio file; keep the returned rate in its own name so the
            # ``sr`` argument is never overwritten between files
            file_path = os.path.join(dirpath, file)
            signal, file_sr = librosa.load(file_path, sr=sr)

            # The track's sample count is simply len(signal); split it evenly.
            # (The previous ``4000 / duration`` was not a sample count.)
            samples_per_segment = len(signal) // num_segments
            if samples_per_segment == 0:
                # File is too short to be split into num_segments chunks
                continue

            # librosa frames the signal with centred windows by default, which
            # yields 1 + n // hop_length frames for n samples. Using
            # ceil(n / hop_length) here would reject every segment whenever n
            # is an exact multiple of hop_length.
            expected_mfcc_amount = 1 + samples_per_segment // hop_length

            # Process segments and extract MFCC
            for s in range(num_segments):
                start_sample = samples_per_segment * s
                finish_sample = start_sample + samples_per_segment

                # Extract MFCC; transpose so rows are frames
                mfcc = librosa.feature.mfcc(
                    y=signal[start_sample:finish_sample],
                    sr=file_sr,
                    n_mfcc=n_mfcc,
                    n_fft=n_fft,
                    hop_length=hop_length,
                ).T

                # Store the MFCC only when it has the expected frame count
                if len(mfcc) == expected_mfcc_amount:
                    data['mfcc'].append(mfcc.tolist())
                    data['label'].append(label)
                    print("{}\n{}, segment:{}".format(label, mfcc, s))

    return data

In [None]:
# Function to load dataset and extract MFCCs without segments
def extract_mfcc(name, path, n_mfcc=13, n_fft=2048, sr=44100, hop_length=512):
    """Walk ``path`` and extract one MFCC matrix per labelled ``.wav`` file.

    Files for which ``get_label`` returns ``None`` are skipped. The MFCCs are
    computed over the whole signal, so the number of frames stored for a file
    depends on that file's length.

    Args:
        name: Dataset identifier forwarded to ``get_label``.
        path: Root directory; searched recursively with ``os.walk``.
        n_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window size passed to ``librosa.feature.mfcc``.
        sr: Sample rate passed to ``librosa.load`` for every file.
        hop_length: Hop between frames, in samples. 512 is librosa's own
            default, so omitting it matches the previous behaviour.

    Returns:
        A dict with two index-aligned lists: ``'mfcc'`` (one
        frames-by-``n_mfcc`` nested list per file) and ``'label'`` (the value
        returned by ``get_label`` for that file).
    """
    # Make a dictionary to store the data
    data = {
        'mfcc': [],
        'label': []
    }

    # Loop through the folders
    for dirpath, _dirnames, filenames in os.walk(path):

        # Process audio files
        for file in filenames:

            # Only .wav files are considered
            if not file.endswith('.wav'):
                continue

            # Numeric label, or None when the file's class is not kept
            label = get_label(name, file)
            if label is None:
                continue

            # Load audio file. The returned rate goes into its own name:
            # rebinding ``sr`` would silently change the rate requested for
            # every following file when the caller passes sr=None.
            file_path = os.path.join(dirpath, file)
            signal, file_sr = librosa.load(file_path, sr=sr)

            # Extract MFCC for the entire audio file; transpose so rows are frames
            mfcc = librosa.feature.mfcc(
                y=signal,
                sr=file_sr,
                n_mfcc=n_mfcc,
                n_fft=n_fft,
                hop_length=hop_length,
            ).T

            # Store the MFCC
            data['mfcc'].append(mfcc.tolist())
            data['label'].append(label)
            print("{}\n{}".format(label, mfcc))

    return data

In [None]:
# Function to label data
def get_label(name, data_label):
    """Map an audio file name to the numeric emotion label.

    The emotion code is read from a dataset-specific position in the file
    name, translated to a common emotion name, and then encoded as an integer.
    Only six emotions are encoded (neutral, happy, sad, angry, fearful,
    disgust); 'calm' and 'surprised' are deliberately left out and yield
    ``None``.

    Args:
        name: Dataset identifier: 'ravdess', 'crema', 'savee' or 'tess'.
        data_label: File name (not the full path) of the audio file.

    Returns:
        An int in 0..5, or ``None`` when the emotion is not one of the six
        encoded classes or the file name does not carry a recognisable code.

    Raises:
        ValueError: If ``name`` is not one of the supported datasets.
    """
    # Check the dataset of the label
    if name == 'ravdess':
        # The emotion digit is the character at index 7, presumably from names
        # like '03-01-06-01-02-01-12.wav'. Slicing gives '' for short names
        # instead of raising IndexError.
        label = data_label[7:8]
        label_name = {
            '1': 'neutral',
            '2': 'calm',
            '3': 'happy',
            '4': 'sad',
            '5': 'angry',
            '6': 'fearful',
            '7': 'disgust',
            '8': 'surprised'
        }

    elif name == 'crema':
        # Third underscore-separated field holds the emotion code
        fields = data_label.split('_')
        label = fields[2] if len(fields) > 2 else None
        label_name = {
            'ANG': 'angry',
            'DIS': 'disgust',
            'FEA': 'fearful',
            'HAP': 'happy',
            'NEU': 'neutral',
            'SAD': 'sad'
        }

    elif name == 'savee':
        # One-letter code, except codes starting with 's', which need two
        # letters to tell 'sa' (sad) from 'su' (surprised)
        label = data_label[:2] if data_label.startswith('s') else data_label[:1]
        label_name = {
            'a': 'angry',
            'd': 'disgust',
            'f': 'fearful',
            'h': 'happy',
            'n': 'neutral',
            'sa': 'sad',
            'su': 'surprised'
        }

    elif name == 'tess':
        # Third underscore-separated field, including the '.wav' extension
        fields = data_label.split('_')
        label = fields[2] if len(fields) > 2 else None
        label_name = {
            'angry.wav': 'angry',
            'disgust.wav': 'disgust',
            'fear.wav': 'fearful',
            'happy.wav': 'happy',
            'neutral.wav': 'neutral',
            'ps.wav': 'surprised',
            'sad.wav': 'sad'
        }

    else:
        # Fail loudly: a misspelled dataset name would otherwise surface as an
        # UnboundLocalError further down
        raise ValueError("Unknown dataset name: {!r}".format(name))

    # Encode only 6 labels; emotions missing here map to None
    label_encoding = {
        'neutral': 0,
        'happy': 1,
        'sad': 2,
        'angry': 3,
        'fearful': 4,
        'disgust': 5,
    }

    semantic_label = label_name.get(label)
    num_label = label_encoding.get(semantic_label)

    return num_label

## RAVDESS Dataset

In [None]:
# RAVDESS folder, expected inside the current working directory
ravdess_path = "{}/ravdess".format(os.getcwd())

# Extract whole-file MFCCs and numeric labels
ravdess_data = extract_mfcc(name='ravdess', path=ravdess_path)

In [None]:
# Show the MFCC matrix and label stored at index 18, then the sample count
print(*(ravdess_data[field][18] for field in ('mfcc', 'label')))
len(ravdess_data['mfcc'])

## CREMA_D Dataset

In [None]:
# CREMA-D folder, expected inside the current working directory
crema_path = "{}/crema_d".format(os.getcwd())

# Extract whole-file MFCCs and numeric labels
crema_data = extract_mfcc(name='crema', path=crema_path)

In [None]:
# Show the MFCC matrix and label stored at index 7, then the sample count
print(*(crema_data[field][7] for field in ('mfcc', 'label')))
len(crema_data['mfcc'])

## SAVEE Dataset

In [None]:
# SAVEE folder, expected inside the current working directory
savee_path = "{}/savee".format(os.getcwd())

# Extract whole-file MFCCs and numeric labels
savee_data = extract_mfcc(name='savee', path=savee_path)

In [None]:
# Show the MFCC matrix and label stored at index 7, then the sample count
print(*(savee_data[field][7] for field in ('mfcc', 'label')))
len(savee_data['mfcc'])

## TESS Dataset

In [None]:
# TESS folder, expected inside the current working directory
tess_path = "{}/tess".format(os.getcwd())

# Extract whole-file MFCCs and numeric labels
tess_data = extract_mfcc(name='tess', path=tess_path)

In [None]:
# Show the MFCC matrix and label stored at index 7, then the sample count
print(*(tess_data[field][7] for field in ('mfcc', 'label')))
len(tess_data['mfcc'])

## Augmented RAVDESS Dataset

In [None]:
# Augmented-data folder, expected inside the current working directory
aug_path = "{}/augmented_data".format(os.getcwd())

# Extract whole-file MFCCs; file names are parsed with the 'ravdess' rules
aug_data = extract_mfcc(name='ravdess', path=aug_path)

In [None]:
# Show the MFCC matrix and label stored at index 7, then the sample count
print(*(aug_data[field][7] for field in ('mfcc', 'label')))
len(aug_data['mfcc'])

## Combine Data

In [None]:
# Combined dataset: starts empty, with the same two keys the extractors return
data = dict(mfcc=[], label=[])

# Both counts are 0 at this point
print("{} {}".format(len(data['mfcc']), len(data['label'])))

In [None]:
# Append the RAVDESS samples and their labels to the combined dataset
data['mfcc'] += ravdess_data['mfcc']
data['label'] += ravdess_data['label']

# Running totals; the two numbers should stay equal
print("{} {}".format(len(data['mfcc']), len(data['label'])))

In [None]:
# Append the CREMA-D samples and their labels to the combined dataset
data['mfcc'] += crema_data['mfcc']
data['label'] += crema_data['label']

# Running totals; the two numbers should stay equal
print("{} {}".format(len(data['mfcc']), len(data['label'])))

In [None]:
# Append the SAVEE samples and their labels to the combined dataset
data['mfcc'] += savee_data['mfcc']
data['label'] += savee_data['label']

# Running totals; the two numbers should stay equal
print("{} {}".format(len(data['mfcc']), len(data['label'])))

In [None]:
# Append the TESS samples and their labels to the combined dataset
data['mfcc'] += tess_data['mfcc']
data['label'] += tess_data['label']

# Running totals; the two numbers should stay equal
print("{} {}".format(len(data['mfcc']), len(data['label'])))

In [None]:
# Append the augmented samples and their labels to the combined dataset
data['mfcc'] += aug_data['mfcc']
data['label'] += aug_data['label']

# Final totals; the two numbers should stay equal
print("{} {}".format(len(data['mfcc']), len(data['label'])))

## Output

In [None]:
def save_json(json_path, data):
    """Write ``data`` to ``json_path`` as JSON indented by four spaces.

    The file is opened in write mode, so an existing file is overwritten.
    """
    with open(json_path, mode='w') as out_file:
        json.dump(obj=data, fp=out_file, indent=4)

In [None]:
# Destination file, relative to the current working directory
json_path = 'combined-data - 6 label.json'

# Write the combined dataset to the JSON file
save_json(json_path=json_path, data=data)