<a href="https://colab.research.google.com/github/anmolag190153/BCS_summer_project_SER/blob/main/Speech_emotion_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SPEECH EMOTION RECOGNITION PROJECT
## BCS-IITK


# Emotion feature extraction for the CNN and fuzzy models

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa
import librosa.display
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [3]:
def extract_cnn_feature(file, sr=22050):
    """Summarise one audio signal as a fixed-length feature vector.

    Every frame-level descriptor is averaged over time, so the length of the
    result does not depend on the duration of the clip.

    Parameters
    ----------
    file : np.ndarray
        Audio samples (despite the name this is the waveform, not a path).
        The notebook passes one 1-D row of ``x_data`` per call.
    sr : int, default 22050
        Sampling rate of ``file`` in Hz. It is forwarded to every estimator
        that accepts it, including the pitch tracker.

    Returns
    -------
    np.ndarray
        1-D float64 vector laid out as: 20 MFCC means, spectral centroid,
        spectral bandwidth, spectral-contrast bands, spectral flatness,
        mean pitch, mean RMS energy.
    """
    # --- spectral ("cnn") descriptors --------------------------------------
    # Each librosa call returns (n_rows, n_frames); transposing and averaging
    # over axis 0 leaves one value per row.
    mfcc = np.mean(librosa.feature.mfcc(y=file, sr=sr, n_mfcc=20).T, axis=0)
    spec_centroid = np.mean(librosa.feature.spectral_centroid(y=file, sr=sr).T, axis=0)
    spec_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=file, sr=sr).T, axis=0)
    spec_contrast = np.mean(librosa.feature.spectral_contrast(y=file, sr=sr).T, axis=0)
    spec_flat = np.mean(librosa.feature.spectral_flatness(y=file).T, axis=0)

    # --- prosodic ("fuzzy") descriptors ------------------------------------
    # pyin must be told the sampling rate; otherwise it silently assumes
    # 22050 Hz and reports wrong frequencies for any other `sr`.
    f0, _voiced_flag, _voiced_probs = librosa.pyin(file, fmin=50, fmax=300, sr=sr)
    # Unvoiced frames come back as NaN. If *no* frame is voiced, a plain
    # nanmean would warn ("Mean of empty slice") and yield NaN, which then
    # leaks into the saved features; use 0.0 as the "no pitch" value instead.
    voiced_f0 = f0[~np.isnan(f0)]
    pitch = voiced_f0.mean() if voiced_f0.size else 0.0

    # Energy as the mean Root Mean Square (RMS) value over all frames.
    energy = np.mean(librosa.feature.rms(y=file))

    # Assemble once instead of growing the array with repeated hstack calls.
    result = np.hstack((
        mfcc,
        spec_centroid,
        spec_bandwidth,
        spec_contrast,
        spec_flat,
        pitch,
        energy,
    ))
    return result.astype(np.float64)

# Loading Data
We are using the RAVDESS dataset for this project.

In [4]:
path='./Audio Datasets'
# Fixed layout the later cells rely on: 24*60 clips, each stored in a row of
# 120000 samples (shorter clips are zero-padded on the right).
n_clips, n_samples = 24*60, 120000
# np.zeros rather than np.ndarray: the latter returns uninitialised memory,
# so any row or tail that is never written would contain garbage.
x_data=np.zeros(shape=(n_clips,n_samples))
y_data=np.zeros(shape=(n_clips,1))
k=0  # index of the next free row in x_data / y_data
# os.listdir returns entries in arbitrary order; sorting makes the row order,
# and therefore the seeded train/val/test splits, reproducible across machines.
for actor in sorted(os.listdir(path)):
    path1=os.path.join(path,actor)
    if not os.path.isdir(path1):
        # Stray files next to the actor folders (e.g. OS metadata) are ignored.
        continue
    for voice in sorted(os.listdir(path1)):
        if voice=='test':
            continue
        # Characters 6-7 of the file name hold a two-digit code; it is shifted
        # to start at 0. (In RAVDESS naming this field is the emotion id.)
        y_data[k,0]=int(voice[6:8])-1
        temp_path=os.path.join(path1,voice)
        # librosa.load with default arguments resamples to 22050 Hz mono.
        var,_=librosa.load(temp_path)
        # Truncate over-long clips instead of failing on a shape mismatch;
        # the remainder of the row is already zero.
        n=min(var.shape[0],n_samples)
        x_data[k,:n]=var[:n]
        k=k+1

# Extracting Features
We use MFCCs and spectral features (centroid, bandwidth, contrast and flatness), plus the mean pitch and RMS energy of each clip.

In [5]:
# One feature vector per stored clip, in row order of x_data.
x=[extract_cnn_feature(x_data[row,:]) for row in range(24*60)]

  pitch = np.nanmean(f0)  # Mean pitch value (ignoring NaNs)


**Splitting** the data into **training**, **validation** and **test** sets, then **scaling** each set with statistics fitted on the training set only

In [7]:
def _three_way_split(features, labels, seed=10):
    """Split into train/val/test and standardise with train-only statistics.

    80% of the rows become the training set; the remaining 20% are split
    again into 90% validation and 10% test. The scaler is fitted on the
    training rows and then applied unchanged to the other two sets.

    Returns ((x_train, x_val, x_test), (y_train, y_val, y_test), scaler).
    """
    train_x, holdout_x, train_y, holdout_y = train_test_split(
        np.array(features), labels, test_size=0.2, random_state=seed)
    val_x, test_x, val_y, test_y = train_test_split(
        np.array(holdout_x), holdout_y, test_size=0.1, random_state=seed)

    fitted_scaler = StandardScaler()
    train_x = fitted_scaler.fit_transform(train_x)
    test_x = fitted_scaler.transform(test_x)
    val_x = fitted_scaler.transform(val_x)
    return (train_x, val_x, test_x), (train_y, val_y, test_y), fitted_scaler


# 60% of the clips feed the CNN pipeline, the other 40% the fuzzy pipeline.
x_cnn, x_fuzzy, y_cnn ,y_fuzzy = train_test_split(np.array(x), y_data, test_size=0.4, random_state=10)

(x_cnn_train, x_cnn_val, x_cnn_test), \
    (y_cnn_train, y_cnn_val, y_cnn_test), scaler = _three_way_split(x_cnn, y_cnn)

# `scaler` ends up bound to the scaler fitted on the fuzzy training set.
(x_fuzzy_train, x_fuzzy_val, x_fuzzy_test), \
    (y_fuzzy_train, y_fuzzy_val, y_fuzzy_test), scaler = _three_way_split(x_fuzzy, y_fuzzy)


# Saving features 

In [8]:
# Every feature vector is written to its own CSV named "<row>_<label>.csv".
# The label is a length-1 array, so str() renders it with brackets
# (e.g. "[3.]"); the naming is left untouched so that anything already
# reading these files keeps working.
feature_sets = {
    'features/cnn/train/': (x_cnn_train, y_cnn_train),
    'features/cnn/test/': (x_cnn_test, y_cnn_test),
    'features/cnn/val/': (x_cnn_val, y_cnn_val),
    'features/fuzzy/train/': (x_fuzzy_train, y_fuzzy_train),
    'features/fuzzy/test/': (x_fuzzy_test, y_fuzzy_test),
    'features/fuzzy/val/': (x_fuzzy_val, y_fuzzy_val),
}
for out_dir,(feats,labels) in feature_sets.items():
    # np.savetxt does not create missing folders; without this the cell
    # fails with FileNotFoundError on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)
    for i,feat in enumerate(feats):
        np.savetxt(out_dir+str(i)+'_'+str(labels[i])+'.csv',feat,delimiter=',')


In [9]:
# Sanity check: (train features, test features, train labels) shapes per pipeline.
for shape_triple in (
    (x_cnn_train.shape, x_cnn_test.shape, y_cnn_train.shape),
    (x_fuzzy_train.shape, x_fuzzy_test.shape, y_fuzzy_train.shape),
):
    print(shape_triple)

((691, 32), (18, 32), (691, 1))
((460, 32), (12, 32), (460, 1))
