In [1]:
import os
import librosa
import pandas as pd
import numpy as np
import pyaudio
import wave
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [3]:
# Emotions mapping
emotions = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}
observed_emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

In [5]:
# Function to list all .wav files in a directory and subdirectories
def list_wav_files(directory):
    wav_files = []
    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith(".wav"):
                wav_files.append(os.path.join(root, filename))
    return wav_files

## PRE-PROCESSING & FEATURE EXTRACTION

In [8]:
# Function to extract features from audio file
def extract_features(file_path, mfcc=True, chroma=True, mel=True, pitch=True, energy=True):
    audio, sr = librosa.load(file_path, sr=None)
    features = []
    # Acoustic Features
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40).T, axis=0)
        features.extend(mfccs)
    if chroma:
        stft = np.abs(librosa.stft(audio))
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
        features.extend(chroma)
    if mel:
        mel = np.mean(librosa.feature.melspectrogram(y=audio, sr=sr).T, axis=0)
        features.extend(mel)
        
    # Temporal Features    
    if pitch:
        pitches, magnitudes = librosa.core.piptrack(y=audio, sr=sr)
        pitch = np.mean(pitches[pitches > 0])
        features.append(pitch)
    if energy:
        energy = np.mean(librosa.feature.rms(y=audio).T, axis=0)
        features.append(energy)
    return features



In [10]:
# Function to load data and extract features for each audio file
def load_data_and_extract_features(file_paths):
    data = []
    labels = []
    for file_path in file_paths:
        parts = os.path.basename(file_path).split("-")
        if len(parts) < 3:
            continue
        emotion = emotions.get(parts[2], None)
        if emotion not in observed_emotions:
            continue
        features = extract_features(file_path)
        data.append(features)
        labels.append(emotion)
    return data, labels

In [12]:
audio_path = '/Users/noshitha/Downloads/AUDIO DATA'
wav_files = list_wav_files(audio_path)
data, labels = load_data_and_extract_features(wav_files)

# Print the number of samples and labels
print(f"Number of samples: {len(data)}")
print(f"Number of labels: {len(labels)}")

Number of samples: 2880
Number of labels: 2880


## RANDOM FOREST CLASSIFIER

In [13]:
# Function to pad features to ensure consistent feature lengths
def pad_features(features, max_len=180):
    padded_features = np.zeros((len(features), max_len))
    for i, feature in enumerate(features):
        if len(feature) > max_len:
            padded_features[i, :max_len] = feature[:max_len]
        else:
            padded_features[i, :len(feature)] = feature
    return padded_features

In [14]:
# Pad features to ensure consistent lengths
padded_data = pad_features(data)

# Convert labels to numerical format
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_data, encoded_labels, test_size=0.2, random_state=9)

# Train a Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_clf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save the trained model
with open('rf_emotion_classifier.pkl', 'wb') as file:
    pickle.dump(rf_clf, file)

Random Forest Accuracy: 0.9201388888888888
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        83
           1       0.92      1.00      0.96        72
           2       0.92      0.92      0.92        77
           3       0.91      0.89      0.90        72
           4       0.88      0.84      0.86        87
           5       1.00      0.85      0.92        41
           6       0.93      0.93      0.93        82
           7       0.89      1.00      0.94        62

    accuracy                           0.92       576
   macro avg       0.93      0.92      0.92       576
weighted avg       0.92      0.92      0.92       576



## CNN

## REAL TIME PROCESSING

In [22]:
# Function to pad features to ensure consistent feature lengths
def pad_features_real_time(features, max_len=180):
    padded_features = np.zeros((1, max_len))
    if len(features) > max_len:
        padded_features[0, :max_len] = features[:max_len]
    else:
        padded_features[0, :len(features)] = features
    return padded_features

# Function to load the trained model and make predictions
def predict_emotion(file_path, model_path='rf_emotion_classifier.pkl'):
    features = extract_features(file_path)
    padded_features = pad_features_real_time(features)
    
    # Load the trained model
    with open(model_path, 'rb') as file:
        model = pickle.load(file)
    
    # Predict the emotion
    prediction = model.predict(padded_features)
    
    # Convert prediction to emotion label
    label_encoder = LabelEncoder()
    label_encoder.fit(observed_emotions)
    emotion = label_encoder.inverse_transform(prediction)
    
    return emotion[0]

# Real-time audio processing
def process_real_time_audio():
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100
    RECORD_SECONDS = 5
    WAVE_OUTPUT_FILENAME = "output.wav"

    p = pyaudio.PyAudio()

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("* recording")

    frames = []

    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)

    print("* done recording")

    stream.stop_stream()
    stream.close()
    p.terminate()

    wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
    
    return WAVE_OUTPUT_FILENAME

# Capture and process audio in real-time
audio_file = process_real_time_audio()
rf_emotion = predict_emotion(audio_file)
print(f"Predicted Emotion (Random Forest): {rf_emotion}")


* recording
* done recording
Predicted Emotion (Random Forest): happy
