In [2]:
import pandas as pd
import numpy as np
import librosa
import os

In [3]:
def extract_emotion(name):
    """Return the first known emotion code that occurs in ``name``.

    The codes are tried in a fixed order (HAP, SAD, FEA, ANG, NEU, DIS) and
    matched as plain substrings, so when several codes are present the one
    that comes first in that order wins. ``None`` is returned when no code
    occurs in ``name``.
    """
    known_codes = ("HAP", "SAD", "FEA", "ANG", "NEU", "DIS")
    return next((code for code in known_codes if code in name), None)

In [4]:
def noise(data):
    """Return ``data`` with additive Gaussian noise.

    The noise amplitude is 0.035 times a uniform random draw from [0, 1)
    times the largest sample value in ``data``. One standard-normal value is
    drawn per entry along the first axis.

    Note: a later cell in this notebook defines ``noise`` again with a
    smaller factor (0.005); when the cells run top to bottom that later
    definition replaces this one.
    """
    random_fraction = np.random.uniform()
    amplitude = 0.035 * random_fraction * np.amax(data)
    perturbation = np.random.normal(size=data.shape[0])
    return data + amplitude * perturbation

In [5]:


def extract_features(data, sample_rate):
    """Build a flat feature vector describing one audio signal.

    The returned vector concatenates, in this order:

    1. zero-crossing rate (mean over frames)
    2. chroma computed from the STFT magnitude (per-bin mean, then variance)
    3. 20 MFCCs (mean, variance, mean first-order delta, mean second-order delta)
    4. RMS energy (mean over frames)
    5. mel spectrogram (per-band mean)
    6. pitch estimated with pYIN (mean and variance over the frames that
       received an estimate; ``0.0, 0.0`` when there is none)
    7. intensity, approximated by RMS energy (mean and variance over frames)

    Parameters
    ----------
    data : np.ndarray
        Audio samples (presumably mono, as returned by ``librosa.load`` --
        confirm against callers).
    sample_rate : int
        Sampling rate of ``data`` in Hz.

    Returns
    -------
    np.ndarray
        1-D feature vector.
    """
    # Zero Crossing Rate
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)

    # STFT-based chroma features (mean and variance)
    stft = np.abs(librosa.stft(data))
    chroma = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
    chroma_mean = np.mean(chroma.T, axis=0)
    chroma_var = np.var(chroma.T, axis=0)

    # MFCCs (mean, variance, delta1, delta2)
    mfcc = librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=20)
    mfcc_mean = np.mean(mfcc.T, axis=0)
    mfcc_var = np.var(mfcc.T, axis=0)
    mfcc_delta1 = np.mean(librosa.feature.delta(mfcc).T, axis=0)
    mfcc_delta2 = np.mean(librosa.feature.delta(mfcc, order=2).T, axis=0)

    # Root Mean Square Energy (RMS). The frames are computed once and reused
    # for the intensity statistics below instead of calling rms() twice.
    rms_frames = librosa.feature.rms(y=data)
    rms = np.mean(rms_frames.T, axis=0)

    # MelSpectrogram (mean only)
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)

    # Pitch estimation using pyin. The sampling rate must be passed
    # explicitly: pyin otherwise assumes its own default rate, which gives
    # wrong frequencies for audio loaded at any other rate.
    f0, _voiced_flag, _voiced_probs = librosa.pyin(
        data,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
        sr=sample_rate,
    )
    f0 = f0[~np.isnan(f0)]  # keep only frames that received a pitch estimate
    if len(f0) > 0:
        pitch_mean = np.mean(f0)
        pitch_var = np.var(f0)
    else:
        # No pitched frame at all: fall back to zeros so the vector length
        # stays fixed.
        pitch_mean, pitch_var = 0.0, 0.0

    # Intensity (approximated as RMS energy)
    intensity = rms_frames[0]
    intensity_mean = np.mean(intensity)
    intensity_var = np.var(intensity)

    # The concatenation order defines the column layout of the exported
    # feature tables, so it must not change.
    return np.hstack((
        np.array([]),
        zcr,
        chroma_mean, chroma_var,
        mfcc_mean, mfcc_var, mfcc_delta1, mfcc_delta2,
        rms,
        mel,
        pitch_mean, pitch_var,
        intensity_mean, intensity_var,
    ))


def noise(data):
    """Return a copy of ``data`` with low-level additive Gaussian noise.

    The noise amplitude is 0.005 times a uniform random draw from [0, 1)
    times the largest sample value in ``data``. An independent
    standard-normal value is drawn for every element, so the function works
    for arrays of any dimensionality (for 1-D input the random draws are the
    same as drawing one value per sample).

    Parameters
    ----------
    data : array-like
        Audio samples.

    Returns
    -------
    np.ndarray
        Array of the same shape as ``data``. Empty input is returned as an
        empty copy, since there is no peak value to scale the noise by.
    """
    data = np.asarray(data)
    if data.size == 0:
        # np.amax raises on zero-size arrays; nothing to perturb anyway.
        return data.copy()
    noise_amp = 0.005 * np.random.uniform() * np.amax(data)
    # size=data.shape (not data.shape[0]) so every element of a
    # multi-dimensional signal receives its own noise value.
    data_noisy = data + noise_amp * np.random.normal(size=data.shape)
    return data_noisy


def get_features(path):
    """Extract training features for one audio file, with noise augmentation.

    At most the first 3 seconds of the file at ``path`` are loaded. The
    result is a two-row array: row 0 holds the features of the loaded
    signal, row 1 the features of the same signal after ``noise`` is applied.
    """
    signal, rate = librosa.load(path, duration=3)

    # Features of the untouched signal come first ...
    clean_row = np.array(extract_features(signal, rate))

    # ... followed by the features of the noise-augmented copy.
    augmented_row = extract_features(noise(signal), rate)

    return np.vstack((clean_row, augmented_row))


def get_features_test(path):
    """Extract features for one test audio file, without augmentation.

    At most the first 3 seconds of the file at ``path`` are loaded, and the
    feature vector of that signal is returned as a NumPy array.
    """
    signal, rate = librosa.load(path, duration=3)
    return np.array(extract_features(signal, rate))


# train feature extraction

In [6]:
# Directory holding the labelled training clips.
TRAIN_DIR = "/kaggle/input/speech-emotion-traning-data/data"

# Seed NumPy's global RNG so the noise augmentation applied inside
# get_features() gives the same features on every run.
np.random.seed(42)

# os.listdir() returns entries in arbitrary order; sorting keeps the row
# order of the exported CSV stable between runs.
train_files = sorted(os.listdir(TRAIN_DIR))
total_files = len(train_files)

X, Y = [], []
count = 0
for file in train_files:
    path = os.path.join(TRAIN_DIR, file)
    # NOTE(review): extract_emotion() returns None when the file name holds
    # no known emotion code; such files are still added with a None label --
    # confirm this is intended.
    emotion = extract_emotion(file)
    count += 1
    feature = get_features(path)
    # get_features() returns one row per variant of the clip (original and
    # noise-augmented); every row shares the label of the file.
    for temp_row in feature:
        X.append(temp_row)
        Y.append(emotion)
    # Report progress against the real number of files, not a fixed total.
    print('\r'+f' Processed {count}/{total_files} audio samples ',end='')

print()  # terminate the carriage-return progress line
print("features extracted!")
X = np.array(X)
print(X.shape)
Y = np.array(Y)
print(Y.shape)


# One row per (clip, variant); feature columns first, label column last.
Features = pd.DataFrame(X)
Features['labels'] = Y
Features.to_csv('audio_features_238.csv', index=False)
Features.head(10)

 Processed 4/9054 audio samples features extracted!
(8, 238)
(8,)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,229,230,231,232,233,234,235,236,237,labels
0,0.040639,0.641578,0.731294,0.639526,0.574088,0.610525,0.589961,0.598165,0.721411,0.696141,...,5.294059e-10,5.212416e-10,5.153924e-10,5.118213e-10,5.095446e-10,132.84749,24.624941,0.010524,4.8e-05,SAD
1,0.043145,0.648762,0.737962,0.662575,0.585341,0.615408,0.602909,0.588862,0.715457,0.70145,...,3.370751e-06,3.436093e-06,3.353532e-06,3.235501e-06,3.261162e-06,132.868496,24.849041,0.010527,4.8e-05,SAD
2,0.034633,0.684663,0.754851,0.684856,0.600552,0.553416,0.568573,0.571734,0.612078,0.724511,...,5.690764e-11,5.663896e-11,5.651583e-11,5.651367e-11,5.65062e-11,104.217598,158.112587,0.009626,3.9e-05,SAD
3,0.03465,0.687432,0.757951,0.687873,0.603931,0.556571,0.570537,0.571732,0.611784,0.723946,...,1.0119e-07,9.992394e-08,1.011074e-07,1.10397e-07,1.06079e-07,104.217598,158.112587,0.009626,3.9e-05,SAD
4,0.077496,0.452142,0.5764,0.352777,0.296484,0.337495,0.537997,0.820265,0.499239,0.314776,...,2.811517e-05,3.997327e-05,5.109184e-05,3.520153e-05,2.263126e-06,183.471554,50.007235,0.009982,3.4e-05,NEU
5,0.077448,0.451526,0.576658,0.353099,0.296884,0.337914,0.537796,0.819052,0.498671,0.314717,...,2.814095e-05,3.995869e-05,5.11316e-05,3.519004e-05,2.264653e-06,183.471554,50.007235,0.009982,3.4e-05,NEU
6,0.051739,0.599006,0.711401,0.682961,0.614834,0.611833,0.608982,0.562066,0.632959,0.722276,...,5.122367e-08,4.994143e-08,4.901038e-08,4.841056e-08,4.803817e-08,0.0,0.0,0.011002,3.3e-05,DIS
7,0.053531,0.715706,0.699372,0.634654,0.623517,0.625513,0.567149,0.633407,0.725271,0.766311,...,3.577816e-06,3.762425e-06,3.821753e-06,3.549745e-06,3.619812e-06,0.0,0.0,0.011006,3.3e-05,DIS


## test feature extraction

In [9]:
# Directory holding the unlabelled test clips.
TEST_DIR = "/kaggle/input/speech-emotion-test-data/test"

# os.listdir() returns entries in arbitrary order; sorting keeps the row
# order of the exported CSV stable between runs.
test_files = sorted(os.listdir(TEST_DIR))
total_test_files = len(test_files)

data = []
file_count = 0
for file in test_files:
    path = os.path.join(TEST_DIR, file)
    file_count += 1
    feature = get_features_test(path)
    # First cell of each row is the file name, followed by its features.
    row = [file] + list(feature)
    data.append(row)
    # Report progress against the real number of files, not a fixed total.
    print('\r'+f' Processed {file_count}/{total_test_files} audio samples ',end='')

print()  # terminate the carriage-return progress line

# The 'labels' column contains the test file name (there are no emotion
# labels for the test set). The number of feature columns is taken from the
# collected rows rather than from the leftover loop variable, so an empty
# directory cannot pick up a stale or undefined `feature`.
n_features = len(data[0]) - 1 if data else 0
columns = ['labels'] + [str(i) for i in range(n_features)]

test_features_df = pd.DataFrame(data, columns=columns)

# Save to CSV
test_features_df.to_csv('audio_features_238_test.csv', index=False)

# Preview goes last so the notebook actually displays it.
test_features_df.head()

 Processed 4/2764 audio samples 