In [5]:
#!pip install tensorflow tensorflow-hub librosa numpy pandas scikit-learn keras

In [6]:
!unzip "/content/ravdess.zip" -d ravdess

Archive:  /content/ravdess.zip
   creating: ravdess/Actor_01/
  inflating: ravdess/Actor_01/03-01-01-01-01-01-01.wav  
  inflating: ravdess/Actor_01/03-01-01-01-01-02-01.wav  
  inflating: ravdess/Actor_01/03-01-01-01-02-01-01.wav  
  inflating: ravdess/Actor_01/03-01-01-01-02-02-01.wav  
  inflating: ravdess/Actor_01/03-01-02-01-01-01-01.wav  
  inflating: ravdess/Actor_01/03-01-02-01-01-02-01.wav  
  inflating: ravdess/Actor_01/03-01-02-01-02-01-01.wav  
  inflating: ravdess/Actor_01/03-01-02-01-02-02-01.wav  
  inflating: ravdess/Actor_01/03-01-02-02-01-01-01.wav  
  inflating: ravdess/Actor_01/03-01-02-02-01-02-01.wav  
  inflating: ravdess/Actor_01/03-01-02-02-02-01-01.wav  
  inflating: ravdess/Actor_01/03-01-02-02-02-02-01.wav  
  inflating: ravdess/Actor_01/03-01-03-01-01-01-01.wav  
  inflating: ravdess/Actor_01/03-01-03-01-01-02-01.wav  
  inflating: ravdess/Actor_01/03-01-03-01-02-01-01.wav  
  inflating: ravdess/Actor_01/03-01-03-01-02-02-01.wav  
  inflating: ravdess/Actor

In [None]:
import os

import librosa
import numpy as np
import tensorflow_hub as hub
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

In [None]:
# Function to parse the filename to get the emotion label
def parse_filename(filename):
    """Return the integer emotion code encoded in a dash-separated file name.

    Only the final path component is inspected, so both bare file names and
    full paths (including directories whose names contain '-') give the same
    result.

    Args:
        filename: File name or path whose basename is dash-separated, with the
            emotion code as the third field
            (e.g. ``03-01-05-01-01-01-01.wav`` -> ``5``).

    Returns:
        int: The third dash-separated field of the basename.

    Raises:
        IndexError: If the basename has fewer than three dash-separated fields.
        ValueError: If the third field cannot be converted to an integer.
    """
    # Strip any directory prefix first: a dash in a parent directory name would
    # otherwise shift the field positions.
    parts = os.path.basename(filename).split('-')
    return int(parts[2])  # Emotion label is the third part

In [None]:
#YAMNet model
# Loaded once at cell level from TensorFlow Hub; extract_combined_features
# calls this object to obtain embeddings.
yamnet_url = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_url)

#features (YAMNet + MFCCs + Chroma + Mel + Spectral Contrast + Tonnetz + ZCR)
def extract_combined_features(file_name):
    """Build one fixed-length feature vector for an audio file.

    The audio is loaded at 16 kHz, passed through the module-level
    ``yamnet_model``, and the resulting embeddings are averaged over axis 0.
    Six librosa descriptors (MFCC with 40 coefficients, chroma, mel
    spectrogram, spectral contrast, tonnetz, zero-crossing rate) are each
    transposed and averaged over axis 0, then everything is concatenated with
    ``np.hstack`` in that order.

    Args:
        file_name: Path of the audio file to load.

    Returns:
        The concatenated feature vector, or ``None`` if any step raised an
        exception (the error is printed, not re-raised).
    """
    yamnet_rate = 16000  # YAMNet's expected sample rate

    def time_average(frames):
        # Transpose, then average over axis 0 to get one value per feature row.
        return np.mean(frames.T, axis=0)

    try:
        #resample to 16 kHz while loading
        waveform, _ = librosa.load(file_name, sr=yamnet_rate)

        #YAMNet returns (scores, embeddings, spectrogram); only embeddings are used
        _, frame_embeddings, _ = yamnet_model(waveform)
        pieces = [np.mean(frame_embeddings, axis=0)]

        #additional features, appended in the order they are concatenated
        pieces.append(time_average(librosa.feature.mfcc(y=waveform, sr=yamnet_rate, n_mfcc=40)))
        pieces.append(time_average(librosa.feature.chroma_stft(y=waveform, sr=yamnet_rate)))
        pieces.append(time_average(librosa.feature.melspectrogram(y=waveform, sr=yamnet_rate)))
        pieces.append(time_average(librosa.feature.spectral_contrast(y=waveform, sr=yamnet_rate)))
        pieces.append(time_average(librosa.feature.tonnetz(y=waveform, sr=yamnet_rate)))
        pieces.append(time_average(librosa.feature.zero_crossing_rate(y=waveform)))

        #Combine features
        return np.hstack(pieces)
    except Exception as err:
        # Best effort: report the failure and let the caller skip this file.
        print(f"Error processing file {file_name}: {err}")
        return None

In [None]:
# Load dataset
dataset_path = '/content/ravdess'

features = []
labels = []

# os.walk yields directories and files in arbitrary, filesystem-dependent
# order. Sorting both makes the sample order -- and therefore the seeded
# train/test split below -- reproducible across runs and machines.
for root, dirs, files in os.walk(dataset_path):
    dirs.sort()  # in-place, so os.walk descends into subdirectories in sorted order
    for file in sorted(files):
        if not file.endswith('.wav'):
            continue
        file_path = os.path.join(root, file)
        emotion = parse_filename(file)
        feature = extract_combined_features(file_path)
        # extract_combined_features returns None on failure; skip those files.
        if feature is not None:
            features.append(feature)
            labels.append(emotion)

# Convert to numpy arrays
X = np.array(features)
y = np.array(labels)

# Check if X and y are non-empty
if X.size == 0 or y.size == 0:
    raise ValueError("No valid data found. Check dataset path and feature extraction.")

# Encode labels to one-hot vectors
# NOTE(review): to_categorical sizes the one-hot width as max(label) + 1. The
# file names visible in the archive use emotion codes starting at 01, so
# column 0 is presumably never set. Left unchanged so the model's output index
# still equals the raw emotion code -- confirm before shifting labels.
y = to_categorical(y)

# Split (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape to (samples, features, 1): Conv1D expects a trailing channel axis
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [None]:
#CNN model
# Dimensions are taken from the prepared arrays: one input channel per feature
# position, and one output unit per one-hot column.
feature_len = X_train.shape[1]
n_classes = y_train.shape[1]

# Two Conv1D/MaxPooling1D stages, then two dropout-regularised dense layers
# and a softmax output.
model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(feature_len, 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(n_classes, activation='softmax'),
])

# Compile
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [21]:
#train
# The recorded run shows val_loss climbing (about 2.80 -> 3.56 over the first
# epochs) while training accuracy stays near 0.95, i.e. the network overfits
# long before epoch 100. Stop once val_loss has not improved for 10 epochs and
# roll back to the best weights, so the model saved afterwards is not the most
# overfit one.
# NOTE(review): the held-out test split doubles as the validation set here, so
# the monitored metric is not an unbiased test estimate.
# NOTE(review): fit() continues from the model's current weights; re-run the
# model-definition cell first if a fresh training run is intended.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=12,
    validation_data=(X_test, y_test),
    callbacks=[early_stop],
)

Epoch 1/100
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9503 - loss: 0.1642 - val_accuracy: 0.6042 - val_loss: 2.7969
Epoch 2/100
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9547 - loss: 0.1655 - val_accuracy: 0.5972 - val_loss: 3.4380
Epoch 3/100
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9513 - loss: 0.2129 - val_accuracy: 0.6285 - val_loss: 2.9658
Epoch 4/100
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9560 - loss: 0.1538 - val_accuracy: 0.5729 - val_loss: 3.5506
Epoch 5/100
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9440 - loss: 0.1939 - val_accuracy: 0.5799 - val_loss: 3.3572
Epoch 6/100
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9555 - loss: 0.1755 - val_accuracy: 0.5729 - val_loss: 3.5561
Epoch 7/100
[1m96/96[0m [32m━━━

In [None]:
# Save
# Write the trained model to a single HDF5 file.
model_path = '/content/VOICE_model.h5'
model.save(model_path)