In [1]:
import pandas as pd 
import numpy as np 
import pickle

import librosa
import librosa.display

from IPython.display import Audio

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score , classification_report , confusion_matrix



In [2]:
data_path = pd.read_csv('/kaggle/input/happysad/dataset.csv')
data_path.head()

Unnamed: 0,Emotions,Path
0,1,/kaggle/input/ravdess-emotional-speech-audio/a...
1,0,/kaggle/input/ravdess-emotional-speech-audio/a...
2,0,/kaggle/input/ravdess-emotional-speech-audio/a...
3,1,/kaggle/input/ravdess-emotional-speech-audio/a...
4,0,/kaggle/input/ravdess-emotional-speech-audio/a...


In [3]:
class AudioFeatureExtractor:
    def __init__(self):
        pass
    
    def noise(self, data):
        noise_amp = 0.035*np.random.uniform()*np.amax(data)
        data = data + noise_amp*np.random.normal(size=data.shape[0])
        return data

    def stretch(self, data, rate=0.8):
        return librosa.effects.time_stretch(data, rate=rate)

    def shift(self, data):
        shift_range = int(np.random.uniform(low=-5, high=5)*1000)
        return np.roll(data, shift_range)

    def pitch(self, data, sampling_rate, pitch_factor=0.7):
        return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

    def extract_features(self, data, sample_rate):
        result = np.array([])

        zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
        result = np.hstack((result, zcr))

        stft = np.abs(librosa.stft(data))
        chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_stft))

        mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mfcc))

        rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
        result = np.hstack((result, rms))

        mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel))

        return result

    def get_features(self, path):
        data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
        res1 = self.extract_features(data, sample_rate)

        noise_data = self.noise(data)
        res2 = self.extract_features(noise_data, sample_rate)

        new_data = self.stretch(data)
        data_stretch_pitch = self.pitch(new_data, sample_rate)
        res3 = self.extract_features(data_stretch_pitch, sample_rate)

        return np.vstack((res1, res2, res3))

In [4]:
# Example Usage
extractor = AudioFeatureExtractor()

X, Y = [], []
for path, emotion in zip(data_path.Path, data_path.Emotions):
    features = extractor.get_features(path)
    X.extend(features)
    Y.extend([emotion]*len(features))
    
len(X) , len(Y)

(1152, 1152)

In [5]:
Features = pd.DataFrame(X)
Features['labels'] = Y
Features.to_csv('features.csv', index=False)
Features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.208293,0.506897,0.422292,0.393062,0.495781,0.44363,0.374513,0.397753,0.549603,0.739099,...,8.2e-05,0.000154,7.6e-05,0.000132,9.5e-05,6.6e-05,7.9e-05,2.9e-05,3e-06,1
1,0.278985,0.640704,0.580154,0.6013,0.680189,0.619322,0.555964,0.541476,0.690339,0.787897,...,0.000192,0.00027,0.000183,0.000243,0.000206,0.000179,0.000196,0.00014,0.000113,1
2,0.166753,0.541065,0.442284,0.418231,0.551849,0.446742,0.385239,0.432699,0.554014,0.748904,...,1.3e-05,1.3e-05,2.7e-05,2.3e-05,2.3e-05,3.9e-05,2e-05,6e-06,1e-06,1
3,0.252356,0.619311,0.588566,0.547787,0.569659,0.564923,0.583524,0.65439,0.730218,0.68871,...,6.9e-05,6.4e-05,5.9e-05,5.2e-05,3.3e-05,4.7e-05,5.7e-05,2.6e-05,3e-06,0
4,0.313915,0.727826,0.705622,0.738513,0.750247,0.718389,0.675212,0.733604,0.715832,0.698479,...,0.000714,0.000737,0.000697,0.000676,0.000668,0.000689,0.000699,0.000673,0.000661,0


In [6]:
X = Features.iloc[: ,:-1].values
Y = Features['labels'].values

x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=42, shuffle=True)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((864, 162), (864,), (288, 162), (288,))

In [7]:
model = XGBClassifier()
model.fit(x_train , y_train)

with open('model.pkl' , 'wb') as model_file : 
    pickle.dump(model , model_file)
    print('Dumping Done')

Dumping Done


In [8]:
with open('model.pkl' , 'rb') as model_file :
    loaded_model = pickle.load(model_file)
    
y_pred = loaded_model.predict(x_test)

In [9]:
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85       146
           1       0.83      0.88      0.85       142

    accuracy                           0.85       288
   macro avg       0.85      0.85      0.85       288
weighted avg       0.85      0.85      0.85       288



In [10]:
def user(audio_file_path) :
    extractor = AudioFeatureExtractor()

    x = []
    for path in (audio_file_path):
        features = extractor.get_features(path)
        X.extend(features)
    
    with open('model.pkl' , 'rb') as model_file :
        loaded_model = pickle.load(model_file)
    
    y_pred = loaded_model.predict(X)
    
    return y_pred 