<a href="https://colab.research.google.com/github/Rishab741/Emotion-recognition/blob/main/Voice_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
# Third-party packages imported by the cells below: TensorFlow/Keras, librosa,
# transformers, sounddevice, NumPy, Streamlit and pydub. Versions are not pinned.
!pip install tensorflow librosa transformers sounddevice numpy streamlit pydub



In [None]:
import librosa
import numpy as np

def extract_audio_features(file_path):
    """Summarise an audio file as one flat feature vector.

    The file is loaded at its native sampling rate (``sr=None``). Five librosa
    feature matrices are computed, each is averaged along axis 1, and the
    resulting vectors are concatenated in this order: MFCC (13 coefficients),
    chroma, mel spectrogram, spectral contrast, tonnetz.

    Args:
        file_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        A 1-D NumPy array holding the concatenated per-feature means.
    """
    signal, rate = librosa.load(file_path, sr=None)

    mfcc_mean = np.mean(librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13), axis=1)
    chroma_mean = np.mean(librosa.feature.chroma_stft(y=signal, sr=rate), axis=1)
    mel_mean = np.mean(librosa.feature.melspectrogram(y=signal, sr=rate), axis=1)
    contrast_mean = np.mean(librosa.feature.spectral_contrast(y=signal, sr=rate), axis=1)

    # Tonnetz is computed on the harmonic component of the signal only.
    harmonic_part = librosa.effects.harmonic(signal)
    tonnetz_mean = np.mean(librosa.feature.tonnetz(y=harmonic_part, sr=rate), axis=1)

    return np.concatenate([mfcc_mean, chroma_mean, mel_mean, contrast_mean, tonnetz_mean])

In [None]:
from transformers import pipeline

# Hugging Face text-classification pipeline. `return_all_scores=True` requests a
# score for every label instead of only the top-scoring one.
# NOTE(review): newer transformers releases deprecate `return_all_scores` in
# favour of `top_k=None` — confirm against the installed version before changing.
text_classifier = pipeline(
    "text-classification",
    model="finiteautomata/bertweet-base-emotion-analysis",
    return_all_scores=True
)

def analyze_text_emotion(text):
    """Score a single text with the module-level ``text_classifier``.

    Args:
        text: The string to classify.

    Returns:
        A dict mapping each label in the classifier's first result to its score.
    """
    scores_by_label = {}
    for entry in text_classifier(text)[0]:
        scores_by_label[entry['label']] = entry['score']
    return scores_by_label

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Device set to use cuda:0


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout

# Audio model
# The input width must equal the length of the vector returned by
# extract_audio_features: 13 MFCC + 12 chroma + 128 mel + 7 spectral-contrast
# + 6 tonnetz = 166 values with librosa's defaults. The previous value of 180
# would make model.predict() reject the extracted features.
audio_input = Input(shape=(166,))
audio_dense = Dense(128, activation='relu')(audio_input)
audio_dropout = Dropout(0.3)(audio_dense)

# Text model (using pre-extracted features)
# NOTE(review): this width must equal the number of labels returned by
# text_classifier — verify against the model's id2label before training.
text_input = Input(shape=(6,))  # 6 emotion scores from text model
text_dense = Dense(64, activation='relu')(text_input)

# Combine: concatenate both branches and classify into the four target emotions.
combined = Concatenate()([audio_dropout, text_dense])
dense = Dense(64, activation='relu')(combined)
output = Dense(4, activation='softmax')(dense)  # 4 emotions

model = Model(inputs=[audio_input, text_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Install the libportaudio2 system package and the `sounddevice` Python package
# (imported as `sd` by the recording cell below).
# presumably sounddevice needs the PortAudio shared library at import time — confirm
!apt-get install -y libportaudio2
!pip install sounddevice

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libportaudio2 is already the newest version (19.6.0-1.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
import sounddevice as sd
import numpy as np
from pydub import AudioSegment

def record_audio(duration=5, sample_rate=44100):
    """Record one channel of audio from the default input device.

    Args:
        duration: Recording length in seconds.
        sample_rate: Sampling rate in Hz passed to ``sd.rec``.

    Returns:
        The recorded samples with singleton dimensions squeezed out.
    """
    print(f"Recording for {duration} seconds...")
    frame_count = int(duration * sample_rate)
    captured = sd.rec(frame_count, samplerate=sample_rate, channels=1)
    # Block until the recording started by sd.rec has finished.
    sd.wait()
    return np.squeeze(captured)

def save_and_process(audio, filename="temp.wav", sample_rate=44100):
    """Write recorded samples to a WAV file and extract audio features from it.

    Floating-point input is converted to 16-bit PCM before being handed to
    pydub. AudioSegment treats its raw bytes as integer PCM of ``sample_width``
    bytes, so passing float32 bytes directly (as before) produced a corrupt
    recording.

    Args:
        audio: 1-D array of mono samples. Float samples are assumed to lie in
            [-1.0, 1.0] and are clipped to that range; integer samples are
            written unchanged.
        filename: Destination WAV path.
        sample_rate: Sampling rate of ``audio`` in Hz.

    Returns:
        The feature vector returned by ``extract_audio_features(filename)``.
    """
    samples = np.asarray(audio)
    if np.issubdtype(samples.dtype, np.floating):
        # Scale [-1, 1] floats onto the signed 16-bit range.
        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

    # Save as WAV
    audio_segment = AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=1
    )
    audio_segment.export(filename, format="wav")

    # Process
    return extract_audio_features(filename)

In [None]:
import streamlit as st
import time

st.title("🎙️ Real-Time Emotion Detector")

# Streamlit re-executes this script on every widget interaction, and
# `st.button` is True only for the single run triggered by the click. The text
# input therefore cannot live inside the button branch: typing into it causes a
# rerun in which the button is False and the prediction code is never reached.
# The extracted features are kept in session_state so they survive reruns.
if st.button("Start Recording"):
    with st.spinner("Recording for 5 seconds..."):
        audio = record_audio()
        st.session_state["audio_features"] = save_and_process(audio)

audio_features = st.session_state.get("audio_features")
if audio_features is not None:
    st.audio("temp.wav")

    text = st.text_input("What did you say? (For text analysis)")
    if text:
        text_emotion = analyze_text_emotion(text)
        text_features = np.array(list(text_emotion.values()))

        # Predict (using our trained model); each input is reshaped to a batch of one.
        prediction = model.predict([audio_features.reshape(1, -1),
                                    text_features.reshape(1, -1)])
        emotions = ["happy", "sad", "angry", "neutral"]
        predicted_emotion = emotions[np.argmax(prediction)]

        st.success(f"Predicted emotion: {predicted_emotion}")

        # Show probabilities
        st.subheader("Emotion Probabilities")
        for e, p in zip(emotions, prediction[0]):
            st.write(f"{e}: {p:.2f}")

2025-08-06 04:55:01.894 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


In [None]:
# Install the Kaggle package; later cells invoke its CLI via `!kaggle datasets ...`.
!pip install kaggle



In [None]:
from google.colab import files
# Prompts for the kaggle.json file. The result is bound to a name on purpose:
# a bare `files.upload()` as the cell's last expression makes the notebook echo
# the returned {filename: bytes} dict, which writes the API key into the saved
# cell output.
uploaded_files = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<redacted>","key":"<redacted>"}'}

In [None]:
import os
import stat

# Target location for the credentials file: ~/.kaggle/kaggle.json
kaggle_dir = os.path.expanduser('~/.kaggle')
kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')
os.makedirs(kaggle_dir, exist_ok=True)

# Move the uploaded file out of the current working directory, if it is there.
if not os.path.exists('kaggle.json'):
    print("kaggle.json not found in the current directory. Please upload it again.")
else:
    os.rename('kaggle.json', kaggle_json_path)
    print(f"Moved kaggle.json to {kaggle_json_path}")

# Restrict the file to owner read/write (mode 600).
if not os.path.exists(kaggle_json_path):
    print("Cannot set permissions, kaggle.json not found.")
else:
    os.chmod(kaggle_json_path, stat.S_IRUSR | stat.S_IWUSR)
    print(f"Set permissions for {kaggle_json_path} to 600")

Moved kaggle.json to /root/.kaggle/kaggle.json
Set permissions for /root/.kaggle/kaggle.json to 600


In [None]:
import os
import zipfile
import librosa
import numpy as np
import pandas as pd
from datasets import Dataset, ClassLabel, Features

# 1. Kaggle dataset slug and the local directory it is downloaded/extracted into
KAGGLE_DATASET_ID = "ejlok1/cremad"
DOWNLOAD_DIR = "crema_d_raw" # Directory to download the zip file and extract to

# 2. Authentication: the CLI used below reads ~/.kaggle/kaggle.json, so no
# explicit call is made here. The Python-API equivalent is kept for reference:
# from kaggle.api.kaggle_api_extended import KaggleApi
# api = KaggleApi()
# api.authenticate()

# 3. Create download directory if it doesn't exist
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# 4. Download the dataset
print(f"Downloading {KAGGLE_DATASET_ID} from Kaggle...")
# `{NAME}` inside a `!` line is expanded by IPython from the Python variables above.
!kaggle datasets download -d {KAGGLE_DATASET_ID} -p {DOWNLOAD_DIR} --unzip

# NOTE(review): this message is printed unconditionally; the exit status of the
# CLI command above is not checked.
print(f"Dataset downloaded and unzipped to {DOWNLOAD_DIR}/AudioWAV")

# 5. Define the base path to the audio files after unzipping
# assumes the archive extracts into an 'AudioWAV' folder inside DOWNLOAD_DIR — TODO confirm
AUDIO_BASE_PATH = os.path.join(DOWNLOAD_DIR, "AudioWAV")

# 6. Collect (path, label) records for the Hugging Face Dataset
# CREMA-D encodes one of six emotion codes in each file name.
EMOTION_MAP = {
    'ANG': 'angry',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
    'SAD': 'sad',
}
all_emotion_names = sorted(EMOTION_MAP.values())

data_list = []
print("Processing audio files and extracting labels...")
for filename in os.listdir(AUDIO_BASE_PATH):
    if not filename.endswith('.wav'):
        continue

    # File names look like 1001_DFA_ANG_XX.wav; the third field is the emotion code.
    name_fields = filename.split('_')
    if len(name_fields) < 3:
        continue

    label_name = EMOTION_MAP.get(name_fields[2].upper())
    if not label_name:
        continue

    # Only the path is stored here; the waveform is loaded in a later step.
    data_list.append({
        'audio_path': os.path.join(AUDIO_BASE_PATH, filename),
        'label': label_name,
    })

df_crema_d = pd.DataFrame(data_list)
print(f"Found {len(df_crema_d)} audio files in CREMA-D.")
print("Emotion distribution:\n", df_crema_d['label'].value_counts())

# 7. Create the Hugging Face Dataset from the DataFrame with paths
crema_d_dataset = Dataset.from_pandas(df_crema_d)

def load_audio_and_sr(batch):
    """Add an ``audio`` column to a batch by loading each path with librosa.

    Args:
        batch: Mapping with an ``audio_path`` sequence of file paths.

    Returns:
        The same ``batch`` object with ``batch['audio']`` set to a list of
        ``{'array': ..., 'sampling_rate': ...}`` dicts, one per path. Files
        that fail to load are reported with ``print`` and get ``None`` in
        both fields.
    """
    loaded = []
    for path in batch['audio_path']:
        try:
            # sr=None keeps the file's original sampling rate.
            waveform, rate = librosa.load(path, sr=None)
        except Exception as e:
            print(f"Error loading audio file {path}: {e}")
            waveform, rate = None, None
        loaded.append({'array': waveform, 'sampling_rate': rate})
    batch['audio'] = loaded
    return batch

print("\nLoading audio data using librosa...")
# Use batched map for efficiency
crema_d_dataset = crema_d_dataset.map(load_audio_and_sr, batched=True, remove_columns=['audio_path'])

# Filter out samples that failed to load. load_audio_and_sr stores a failed load
# as {'array': None, 'sampling_rate': None}, so the dict itself is never None;
# the waveform field has to be checked as well.
crema_d_dataset = crema_d_dataset.filter(
    lambda x: x['audio'] is not None and x['audio'].get('array') is not None
)

# Cast the label column to ClassLabel
crema_d_dataset = crema_d_dataset.cast_column("label", ClassLabel(names=all_emotion_names))


print("\nSuccessfully created Hugging Face Dataset for CREMA-D with loaded audio:")
print(crema_d_dataset)
print(crema_d_dataset.features)

# Access a sample to verify
print("\nFirst sample from CREMA-D dataset:")
# Access the first sample using integer indexing, as there are no predefined splits
if len(crema_d_dataset) > 0:
    print(crema_d_dataset[0])
else:
    print("Dataset is empty after processing.")

Downloading ejlok1/cremad from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/ejlok1/cremad
License(s): ODC Attribution License (ODC-By)
Downloading cremad.zip to crema_d_raw
 95% 427M/451M [00:02<00:00, 197MB/s]
100% 451M/451M [00:02<00:00, 173MB/s]
Dataset downloaded and unzipped to crema_d_raw/AudioWAV
Processing audio files and extracting labels...
Found 7442 audio files in CREMA-D.
Emotion distribution:
 label
angry      1271
disgust    1271
sad        1271
fear       1271
happy      1271
neutral    1087
Name: count, dtype: int64

Loading audio data using librosa...


Map:   0%|          | 0/7442 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7442 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7442 [00:00<?, ? examples/s]


Successfully created Hugging Face Dataset for CREMA-D with loaded audio:
Dataset({
    features: ['label', 'audio'],
    num_rows: 7442
})
{'label': ClassLabel(names=['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad']), 'audio': {'array': List(Value('float32')), 'sampling_rate': Value('int64')}}

First sample from CREMA-D dataset:
{'label': 0, 'audio': {'array': [0.00042724609375, -0.000274658203125, -0.0013427734375, -0.002349853515625, -0.002288818359375, -0.00238037109375, -0.002777099609375, -0.002716064453125, -0.001922607421875, -0.00140380859375, -0.001312255859375, -0.001617431640625, -0.001617431640625, -0.0010986328125, -0.000457763671875, 0.000335693359375, 0.0006103515625, 0.000244140625, 0.001312255859375, 0.001190185546875, 0.001434326171875, 0.00262451171875, 0.003265380859375, 0.00372314453125, 0.00469970703125, 0.00579833984375, 0.005859375, 0.006622314453125, 0.0064697265625, 0.00628662109375, 0.005462646484375, 0.005859375, 0.005828857421875, 0.005584716796875, 

Let's set up your Kaggle API key for the command line interface.

In [None]:
# List datasets through the Kaggle CLI.
# presumably used as a smoke test that ~/.kaggle/kaggle.json is accepted — confirm
!kaggle datasets list

ref                                                      title                                                    size  lastUpdated                 downloadCount  voteCount  usabilityRating  
-------------------------------------------------------  -------------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
rohitgrewal/airlines-flights-data                        Airlines Flights Data                                 2440299  2025-07-29 09:16:00.463000           6941        127  1.0              
wasiqaliyasir/breast-cancer-dataset                      Breast cancer dataset                                   49830  2025-07-30 12:52:44.057000           4234        151  1.0              
abdulmalik1518/cars-datasets-2025                        Cars Datasets (2025)                                    25987  2025-07-17 21:43:28.493000          10157        230  1.0              
kunshbhatia/delhi-air-quality-dataset   

In [None]:
import os
import shutil
# Default location of the Hugging Face datasets cache
hf_cache_dir = os.path.expanduser("~/.cache/huggingface/datasets")

# Remove only the cached copy of the 'go_emotions' dataset, if one exists.
go_emotions_cache_path = os.path.join(hf_cache_dir, "go_emotions")
if not os.path.exists(go_emotions_cache_path):
    print(f"GoEmotions cache directory not found at: {go_emotions_cache_path}")
else:
    print(f"Clearing cache for go_emotions at: {go_emotions_cache_path}")
    shutil.rmtree(go_emotions_cache_path)

# Also drop the general fsspec cache when present (nothing is printed otherwise).
fsspec_cache_dir = os.path.expanduser("~/.cache/fsspec")
fsspec_cache_present = os.path.exists(fsspec_cache_dir)
if fsspec_cache_present:
    print(f"Clearing fsspec cache at: {fsspec_cache_dir}")
    shutil.rmtree(fsspec_cache_dir)

GoEmotions cache directory not found at: /root/.cache/huggingface/datasets/go_emotions


In [None]:
# Upgrade datasets, fsspec and huggingface_hub to their latest releases (unpinned).
!pip install --upgrade datasets fsspec huggingface_hub

Collecting fsspec
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Downloading huggingface_hub-0.34.3-py3-none-any.whl (558 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.34.1
    Uninstalling huggingface-hub-0.34.1:
      Successfully uninstalled huggingface-hub-0.34.1
Successfully installed huggingface_hub-0.34.3


In [None]:
# Install `emoji`; the text pipeline logged "emoji is not installed" when it was loaded.
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [None]:
import warnings
# Ignore UserWarning messages whose originating module name matches 'librosa'.
warnings.filterwarnings("ignore", category=UserWarning, module='librosa')

In [None]:
# setup_and_config.py
# --- 1. SETUP AND CONFIGURATION ---
import os
import zipfile
import librosa
import numpy as np
import pandas as pd
from datasets import Dataset, Audio, ClassLabel, Features, load_dataset
from transformers import pipeline
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.utils import to_categorical
import random
from collections import defaultdict
import sounddevice as sd
from scipy.io.wavfile import write as write_wav
import time

# --- Core Configuration ---
# Target emotions predicted by the final model; list position is the class id.
TARGET_EMOTIONS = ["happy", "sad", "angry", "neutral"]
NUM_OUTPUT_EMOTIONS = len(TARGET_EMOTIONS)
emotion_to_int = {emotion: i for i, emotion in enumerate(TARGET_EMOTIONS)}

# Labels read from the text model's output, in the order they are placed in the
# text feature vector. Labels the model does not return are scored 0.0.
# NOTE(review): verify this list against the label names actually emitted by
# "finiteautomata/bertweet-base-emotion-analysis" (text_classifier.model.config.id2label);
# a mismatch would silently yield all-zero features for the missing labels.
BERTWEET_EMOTION_LABELS = ['anger', 'joy', 'optimism', 'sadness']
TEXT_FEATURE_SIZE = len(BERTWEET_EMOTION_LABELS)

# Audio feature settings
DUMMY_SR = 16000
# Length of the vector from extract_audio_features:
# 13 MFCC + 12 chroma + 128 mel + 7 spectral contrast + 6 tonnetz.
AUDIO_FEATURE_SIZE = 166 # Hardcoded as determined earlier

# Training Configuration
TRAIN_RATIO = 0.8
BUFFER_SIZE = 1024
BATCH_SIZE = 32
EPOCHS = 20 # must be a plain int (a trailing comma would make it a tuple)

print("Setup and Configuration Loaded.")

Setup and Configuration Loaded.


In [None]:
# feature_extraction_and_model.py
# --- 2. FEATURE EXTRACTION AND MODEL DEFINITION ---

# --- Audio Feature Extraction ---
def extract_audio_features(y, sr):
    """Extract a fixed-length feature vector from an audio waveform.

    The waveform is coerced to a float32 NumPy array with ``np.asarray``, which
    accepts lists, tuples and arrays of any numeric dtype. (The earlier
    list/ndarray branching called ``.astype`` on other sequence types and
    raised AttributeError.)

    Five librosa feature matrices are averaged along axis 1 and concatenated
    in the order MFCC (13), chroma, mel spectrogram, spectral contrast,
    tonnetz (computed on the harmonic component).

    Args:
        y: Mono waveform as an array-like of numbers.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        A 1-D array of exactly ``AUDIO_FEATURE_SIZE`` values; shorter vectors
        are zero-padded at the end and longer ones are truncated.
    """
    y = np.asarray(y, dtype=np.float32)

    features = {
        'mfcc': np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1),
        'chroma': np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1),
        'mel': np.mean(librosa.feature.melspectrogram(y=y, sr=sr), axis=1),
        'contrast': np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1),
        'tonnetz': np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr), axis=1)
    }
    concatenated_features = np.concatenate(list(features.values()))

    # Ensure consistent feature size (padding/truncating)
    shortfall = AUDIO_FEATURE_SIZE - len(concatenated_features)
    if shortfall > 0:
        return np.pad(concatenated_features, (0, shortfall), 'constant')
    return concatenated_features[:AUDIO_FEATURE_SIZE]


# --- Text Emotion Analysis Pipeline ---
# Initialize the pipeline here. We will use it on a dataset later.
# `return_all_scores=True` requests a score for every label, not just the top one.
text_classifier = pipeline(
    "text-classification",
    model="finiteautomata/bertweet-base-emotion-analysis",
    return_all_scores=True,
    device= -1 # -1 runs on CPU (the cell output reports "Device set to use cpu"); pass a CUDA index such as 0 for GPU
)

def analyze_text_emotion_batch(text_batch):
    """Score a batch of texts and return one fixed-order score row per text.

    Args:
        text_batch: Sequence of strings passed to ``text_classifier`` in one call.

    Returns:
        A float32 array of shape ``(len(text_batch), TEXT_FEATURE_SIZE)``.
        Columns follow ``BERTWEET_EMOTION_LABELS``; a label missing from the
        classifier output is scored 0.0 and labels outside that list are
        ignored. An empty batch yields an array with zero rows.
    """
    if not text_batch:
        return np.zeros((0, TEXT_FEATURE_SIZE), dtype=np.float32)

    score_rows = []
    for predictions in text_classifier(text_batch):
        returned_scores = {entry['label']: entry['score'] for entry in predictions}
        score_rows.append([returned_scores.get(label, 0.0) for label in BERTWEET_EMOTION_LABELS])
    return np.array(score_rows, dtype=np.float32)

# --- Multimodal Model Architecture ---
def create_multimodal_model():
    """Build and compile the two-branch (audio + text) Keras classifier.

    The audio branch takes ``AUDIO_FEATURE_SIZE`` features, the text branch
    takes ``TEXT_FEATURE_SIZE`` scores; both are concatenated and classified
    into ``NUM_OUTPUT_EMOTIONS`` classes with a softmax layer.

    Returns:
        A compiled ``Model`` with inputs ``[audio_input, text_input]``, using
        the Adam optimizer, categorical cross-entropy loss and accuracy metric.
    """
    # Audio branch: dense projection followed by dropout.
    audio_in = Input(shape=(AUDIO_FEATURE_SIZE,), name='audio_input')
    audio_hidden = Dense(128, activation='relu')(audio_in)
    audio_branch = Dropout(0.3)(audio_hidden)

    # Text branch: a single dense projection of the text scores.
    text_in = Input(shape=(TEXT_FEATURE_SIZE,), name='text_input')
    text_branch = Dense(64, activation='relu')(text_in)

    # Fusion head.
    fused = Concatenate()([audio_branch, text_branch])
    fused_hidden = Dense(64, activation='relu')(fused)
    probabilities = Dense(NUM_OUTPUT_EMOTIONS, activation='softmax', name='emotion_output')(fused_hidden)

    multimodal = Model(inputs=[audio_in, text_in], outputs=probabilities)
    multimodal.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return multimodal

# Create the model and print its summary.
# NOTE: this rebinds the module-level name `model`, which the earlier prototype
# cell also assigns.
model = create_multimodal_model()
model.summary()
print("\nFeature Extraction functions and Model Architecture defined.")

Device set to use cpu



Feature Extraction functions and Model Architecture defined.


In [None]:
# unified_dataset_and_training.py
# --- 3. DATASET LOADING AND PREPROCESSING ---

# --- Emotion Mapping Dictionaries ---
# Each table maps a source-dataset label to one of the target emotions.

# CREMA-D file-name codes. Disgust is folded into 'angry' and fear into 'neutral'.
CREMA_D_TO_TARGET = dict(
    HAP='happy',
    SAD='sad',
    ANG='angry',
    NEU='neutral',
    DIS='angry',
    FEA='neutral',
)

# TESS emotion names (lower-case).
TESS_TO_TARGET = {
    'happy': 'happy',
    'sad': 'sad',
    'angry': 'angry',
    'neutral': 'neutral',
    'disgust': 'angry',
    'fear': 'neutral',
    'surprise': 'happy',
}

# GoEmotions label names, grouped by the target emotion they collapse into.
# NOTE(review): confirm these names against the dataset's label list; names
# missing from this table are dropped by the loader.
_GOEMOTIONS_BY_TARGET = {
    'happy': ('joy', 'amusement', 'excitement', 'optimism', 'love'),
    'sad': ('sadness', 'grief', 'disappointment', 'remorse', 'shame'),
    'angry': ('anger', 'annoyance', 'disapproval', 'frustration', 'disgust'),
    'neutral': (
        'neutral', 'realization', 'relief', 'surprise', 'admiration',
        'caring', 'desire', 'embarrassment', 'gratitude',
        'nervousness', 'pride', 'curiosity', 'confusion', 'indifference', 'fear',
    ),
}
GOEMOTIONS_TO_TARGET = {
    source: target
    for target, sources in _GOEMOTIONS_BY_TARGET.items()
    for source in sources
}

# --- Helper function to load audio robustly ---
def load_audio_data(file_paths):
    """Load every path with librosa, keeping one result slot per input path.

    Args:
        file_paths: Iterable of audio file paths.

    Returns:
        A list aligned with ``file_paths``. Each entry is
        ``{'array': waveform, 'sampling_rate': sr}`` (loaded at the file's
        original rate), or ``None`` when loading raised; failures are
        reported with ``print``.
    """
    def _load_one(path):
        try:
            waveform, rate = librosa.load(path, sr=None)
            return {'array': waveform, 'sampling_rate': rate}
        except Exception as e:
            print(f"Error loading audio file {path}: {e}")
            return None

    return [_load_one(path) for path in file_paths]

# --- Load and Process CREMA-D ---
print("--- Processing CREMA-D ---")
crema_d_audio_dir = os.path.join("crema_d_raw", "AudioWAV")
if not os.path.exists(crema_d_audio_dir) or not os.listdir(crema_d_audio_dir):
    print("CREMA-D raw data not found or directory is empty. Attempting download...")
    try:
        !kaggle datasets download -d ejlok1/cremad -p crema_d_raw --unzip
        if not os.path.exists(crema_d_audio_dir) or not os.listdir(crema_d_audio_dir):
            print("ERROR: CREMA-D download/unzip failed or directory still empty. Check Kaggle setup and internet connection.")
            df_crema_d = pd.DataFrame({'audio': [], 'label': []})
            print("No CREMA-D samples loaded due to download failure.\n")
        else:
            print("CREMA-D downloaded and unzipped successfully.")
            crema_d_files = [f for f in os.listdir(crema_d_audio_dir) if f.endswith('.wav')]
            crema_d_paths = [os.path.join(crema_d_audio_dir, f) for f in crema_d_files]
            crema_d_labels_raw = [f.split('_')[2] for f in crema_d_files]
            crema_d_mapped_labels = [CREMA_D_TO_TARGET.get(label_code.upper(), 'unknown') for label_code in crema_d_labels_raw] # Ensure uppercase

            df_crema_d = pd.DataFrame({'audio_path': crema_d_paths, 'label': crema_d_mapped_labels})
            print(f"CREMA-D: Initial DataFrame size: {len(df_crema_d)}")

    except Exception as e:
        print(f"ERROR: An exception occurred during CREMA-D processing: {e}")
        df_crema_d = pd.DataFrame({'audio_path': [], 'label': []})
        print("No CREMA-D samples loaded due to error.\n")
else:
    print("CREMA-D raw data already present. Loading...")
    crema_d_files = [f for f in os.listdir(crema_d_audio_dir) if f.endswith('.wav')]
    crema_d_paths = [os.path.join(crema_d_audio_dir, f) for f in crema_d_files]
    crema_d_labels_raw = [f.split('_')[2] for f in crema_d_files]
    crema_d_mapped_labels = [CREMA_D_TO_TARGET.get(label_code.upper(), 'unknown') for label_code in crema_d_labels_raw] # Ensure uppercase

    df_crema_d = pd.DataFrame({'audio_path': crema_d_paths, 'label': crema_d_mapped_labels})
    print(f"CREMA-D: Initial DataFrame size: {len(df_crema_d)}")


# Filter CREMA-D for target emotions and load audio data
df_crema_d = df_crema_d[df_crema_d['label'].isin(TARGET_EMOTIONS)]
if not df_crema_d.empty:
    print("CREMA-D: Loading audio data...")
    df_crema_d['audio'] = load_audio_data(df_crema_d['audio_path'].tolist())
    df_crema_d = df_crema_d.dropna(subset=['audio']) # Remove rows where audio loading failed
    df_crema_d['label'] = df_crema_d['label'].map(emotion_to_int)
    crema_d_dataset = Dataset.from_pandas(df_crema_d[['audio', 'label']])
    crema_d_dataset = crema_d_dataset.shuffle(seed=42)
    print(f"Loaded {len(crema_d_dataset)} CREMA-D samples after filtering and loading.\n")
else:
    crema_d_dataset = Dataset.from_dict({'audio': [], 'label': []})
    print("No CREMA-D samples loaded after filtering (resulting DataFrame was empty).\n")


# --- Load and Process TESS ---
print("--- Processing TESS ---")
tess_base_path = "tess_raw/TESS Toronto emotional speech set data"
if not os.path.exists(tess_base_path) or not os.listdir(tess_base_path):
    print("TESS raw data not found or directory is empty. Attempting download...")
    try:
        !kaggle datasets download -d ejlok1/toronto-emotional-speech-set-tess -p tess_raw --unzip
        if not os.path.exists(tess_base_path) or not os.listdir(tess_base_path):
            print("ERROR: TESS download/unzip failed or directory still empty. Check Kaggle setup and internet connection.")
            df_tess = pd.DataFrame({'audio': [], 'label': []})
            print("No TESS samples loaded due to download failure.\n")
        else:
            print("TESS downloaded and unzipped successfully.")
            tess_data = []
            for folder in os.listdir(tess_base_path):
                emotion = folder.split('_')[-1].lower()
                if emotion == 'ps': emotion = 'surprise'
                mapped_emotion = TESS_TO_TARGET.get(emotion)
                if mapped_emotion and mapped_emotion in TARGET_EMOTIONS:
                    for file in os.listdir(os.path.join(tess_base_path, folder)):
                        if file.endswith('.wav'):
                            tess_data.append({
                                'audio_path': os.path.join(tess_base_path, folder, file),
                                'label': mapped_emotion
                            })
            df_tess = pd.DataFrame(tess_data)
            print(f"TESS: Initial DataFrame size: {len(df_tess)}")
    except Exception as e:
        print(f"ERROR: An exception occurred during TESS processing: {e}")
        df_tess = pd.DataFrame({'audio_path': [], 'label': []})
        print("No TESS samples loaded due to error.\n")
else:
    print("TESS raw data already present. Loading...")
    tess_data = []
    for folder in os.listdir(tess_base_path):
        emotion = folder.split('_')[-1].lower()
        if emotion == 'ps': emotion = 'surprise'
        mapped_emotion = TESS_TO_TARGET.get(emotion)
        if mapped_emotion and mapped_emotion in TARGET_EMOTIONS:
            for file in os.listdir(os.path.join(tess_base_path, folder)):
                if file.endswith('.wav'):
                    tess_data.append({
                        'audio_path': os.path.join(tess_base_path, folder, file),
                        'label': mapped_emotion
                    })
    df_tess = pd.DataFrame(tess_data)
    print(f"TESS: Initial DataFrame size: {len(df_tess)}")


# Filter TESS for target emotions and load audio data
df_tess = df_tess[df_tess['label'].isin(TARGET_EMOTIONS)]
if not df_tess.empty:
    print("TESS: Loading audio data...")
    df_tess['audio'] = load_audio_data(df_tess['audio_path'].tolist())
    df_tess = df_tess.dropna(subset=['audio']) # Remove rows where audio loading failed
    df_tess['label'] = df_tess['label'].map(emotion_to_int)
    tess_dataset = Dataset.from_pandas(df_tess[['audio', 'label']])
    tess_dataset = tess_dataset.shuffle(seed=42)
    print(f"Loaded {len(tess_dataset)} TESS samples after filtering and loading.\n")
else:
    tess_dataset = Dataset.from_dict({'audio': [], 'label': []})
    print("No TESS samples loaded after filtering (resulting DataFrame was empty).\n")


# --- Load and Process GoEmotions ---
print("--- Processing GoEmotions ---")
try:
    goemotions_ds = load_dataset("go_emotions", "simplified")
    # ClassLabel feature used to turn integer label ids back into names.
    go_emotion_labels = goemotions_ds['train'].features['labels'].feature

    goemotions_data = []
    for split in ('train', 'validation', 'test'):
        for item in goemotions_ds[split]:
            label_ids = item['labels']
            if not label_ids:
                continue
            # Only the first label of each example is used.
            first_label_name = go_emotion_labels.int2str(label_ids[0])
            mapped_emotion = GOEMOTIONS_TO_TARGET.get(first_label_name)
            if mapped_emotion and mapped_emotion in TARGET_EMOTIONS:
                goemotions_data.append({'text': item['text'], 'label': mapped_emotion})

    df_goemotions = pd.DataFrame(goemotions_data)
    print(f"GoEmotions: Initial DataFrame size: {len(df_goemotions)}")
    if df_goemotions.empty:
        goemotions_dataset = Dataset.from_dict({'text': [], 'label': []})
        print("No GoEmotions samples loaded after filtering.\n")
    else:
        # Replace label names with their integer class ids, then shuffle.
        df_goemotions['label'] = df_goemotions['label'].map(emotion_to_int)
        goemotions_dataset = Dataset.from_pandas(df_goemotions)
        goemotions_dataset = goemotions_dataset.shuffle(seed=42)
        print(f"Loaded {len(goemotions_dataset)} GoEmotions samples.\n")
except Exception as error:
    print(f"ERROR: An exception occurred during GoEmotions processing: {error}")
    goemotions_dataset = Dataset.from_dict({'text': [], 'label': []})
    print("No GoEmotions samples loaded due to error.\n")


print("Dataset Loading and Preprocessing Complete.")

--- Processing CREMA-D ---
CREMA-D raw data already present. Loading...
CREMA-D: Initial DataFrame size: 7442
CREMA-D: Loading audio data...
Loaded 7442 CREMA-D samples after filtering and loading.

--- Processing TESS ---
TESS raw data not found or directory is empty. Attempting download...
Dataset URL: https://www.kaggle.com/datasets/ejlok1/toronto-emotional-speech-set-tess
License(s): Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)
Downloading toronto-emotional-speech-set-tess.zip to tess_raw
 94% 400M/428M [00:01<00:00, 262MB/s]
100% 428M/428M [00:01<00:00, 320MB/s]
TESS downloaded and unzipped successfully.
TESS: Initial DataFrame size: 2600
TESS: Loading audio data...
Loaded 2600 TESS samples after filtering and loading.

--- Processing GoEmotions ---


README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

GoEmotions: Initial DataFrame size: 50994
Loaded 50994 GoEmotions samples.

Dataset Loading and Preprocessing Complete.


In [None]:
# unified_dataset_and_training.py
# --- 4. UNIFIED DATASET CREATION AND TRAINING ---

# --- Combine Datasets ---
# --- PASTE THIS OPTIMIZED BLOCK IN ITS PLACE ---

from datasets import concatenate_datasets
from collections import Counter

# --- 2. PREPARE AND COMBINE AUDIO DATASETS ---
# CREMA-D and TESS are stacked into one table, string labels are translated
# through `emotion_to_int` (unknown labels become the sentinel -1), and rows
# carrying the sentinel are dropped.
print("--- Starting Optimized Multimodal Dataset Creation ---")
all_audio_ds = (
    concatenate_datasets([crema_d_dataset, tess_dataset])
    .map(lambda row: {"label": emotion_to_int.get(row["label"], -1)})
    .filter(lambda row: row['label'] != -1)
)


# --- 3. PREPARE TEXT DATASET ---
# Keep only text rows whose label equals one of the integer ids in `emotion_to_int`.
# NOTE(review): this assumes GoEmotions labels are already integer ids from the
# same mapping — confirm against the loading cell.
text_ds = goemotions_dataset.filter(lambda row: row['label'] in emotion_to_int.values())


# --- 4. CALCULATE BALANCING COUNT EFFICIENTLY ---
# Count rows per label in each modality, then take the smallest class size
# among the labels that occur in BOTH modalities. `min_samples` stays 0 when
# either dataset is empty or when the two label sets do not overlap.
audio_counts = Counter(all_audio_ds['label'])
text_counts = Counter(text_ds['label'])

valid_labels = set(audio_counts) & set(text_counts)
min_samples = min(
    (min(audio_counts[label], text_counts[label]) for label in valid_labels),
    default=0,
)

if min_samples > 0:
    print(f"Found valid shared labels. Balancing dataset to {min_samples} samples per emotion.")
else:
    # Previously the success message was printed even in this case, hiding the
    # reason the unified dataset ended up empty. Show the raw counts instead so
    # a label-mapping mismatch between the modalities is visible immediately.
    print("WARNING: No labels are shared between the audio and text datasets; nothing to balance.")
    print(f"  Audio label counts: {dict(audio_counts)}")
    print(f"  Text label counts:  {dict(text_counts)}")


# --- 5. CREATE UNIFIED DATASET BY PAIRING ---
# For each target emotion, draw `min_samples` rows from the audio table and
# `min_samples` rows from the text table and append them column-wise. The i-th
# audio row and the i-th text row of an emotion end up paired in the result.

multimodal_data = {'audio': [], 'text': [], 'label': []}

if min_samples > 0:
    for label_int, emotion_str in enumerate(TARGET_EMOTIONS):
        # Per-emotion views of each source; the filter cache is bypassed so a
        # stale cached result can never be reused.
        audio_subset = all_audio_ds.filter(
            lambda x: x['label'] == label_int,
            load_from_cache_file=False,
        )
        text_subset = text_ds.filter(
            lambda x: x['label'] == label_int,
            load_from_cache_file=False,
        )

        # An emotion is dropped entirely when either modality is short on rows.
        if len(audio_subset) < min_samples or len(text_subset) < min_samples:
            print(f"Skipping emotion '{emotion_str}' due to insufficient samples.")
            continue

        # Deterministic shuffle, then keep the first `min_samples` rows.
        audio_samples = audio_subset.shuffle(seed=42).select(range(min_samples))
        text_samples = text_subset.shuffle(seed=42).select(range(min_samples))

        # Labels are taken from the audio side; both sides share the same label here.
        for column, source in (
            ('audio', audio_samples),
            ('text', text_samples),
            ('label', audio_samples),
        ):
            multimodal_data[column].extend(source[column])

# Materialise the collected columns and shuffle so emotions are interleaved.
unified_dataset = Dataset.from_dict(multimodal_data).shuffle(seed=42)
print(f"Created unified dataset with {len(unified_dataset)} samples.")

# --- END OF THE REPLACEMENT BLOCK ---

# --- BATCHED FEATURE EXTRACTION ---
def process_in_batches(batch):
    """Add model-ready feature columns to a batch and return that same batch.

    Reads ``batch['audio']`` (each item indexed by ``'array'`` and
    ``'sampling_rate'``), ``batch['text']`` and ``batch['label']``, and writes
    three new keys in place:

    * ``'audio_features'`` - one ``extract_audio_features`` result per clip
    * ``'text_features'``  - result of ``analyze_text_emotion_batch`` on all texts
    * ``'label_one_hot'``  - ``to_categorical`` of the labels with
      ``NUM_OUTPUT_EMOTIONS`` classes
    """
    audio_vectors = []
    for clip in batch['audio']:
        audio_vectors.append(
            extract_audio_features(clip['array'], clip['sampling_rate'])
        )
    batch['audio_features'] = audio_vectors

    texts = batch['text']
    batch['text_features'] = analyze_text_emotion_batch(texts)

    labels = batch['label']
    batch['label_one_hot'] = to_categorical(labels, num_classes=NUM_OUTPUT_EMOTIONS)
    return batch

# Apply the batched processing, then train and save the model.
# Empty inputs are reported and skipped at each stage instead of failing.
if len(unified_dataset) == 0:
    print("Unified dataset is empty. Skipping feature extraction, TensorFlow dataset creation, and model training.")
else:
    # A small mapping batch is used deliberately (kept low to potentially avoid CUDA errors).
    processed_dataset = unified_dataset.map(
        process_in_batches,
        batched=True,
        batch_size=16,
        remove_columns=['audio', 'text', 'label'],
    )

    if len(processed_dataset) == 0:
        print("Processed dataset is empty after feature extraction. Cannot create TensorFlow datasets or train model.")
    else:
        # --- Create TensorFlow Datasets ---
        feature_columns = ['audio_features', 'text_features', 'label_one_hot']
        processed_dataset.set_format(type='tensorflow', columns=feature_columns)

        # Leading TRAIN_RATIO share of rows is used for training, the rest for validation.
        total_rows = len(processed_dataset)
        train_size = int(total_rows * TRAIN_RATIO)
        tf_train_dataset = processed_dataset.select(range(train_size))
        tf_val_dataset = processed_dataset.select(range(train_size, total_rows))

        def as_tf_dataset(ds):
            """Build a tf.data.Dataset of ({'audio_input', 'text_input'}, one-hot label) slices."""
            model_inputs = {
                'audio_input': ds['audio_features'],
                'text_input': ds['text_features'],
            }
            return tf.data.Dataset.from_tensor_slices((model_inputs, ds['label_one_hot']))

        train_ds = (
            as_tf_dataset(tf_train_dataset)
            .batch(BATCH_SIZE)
            .prefetch(tf.data.AUTOTUNE)
        )
        val_ds = (
            as_tf_dataset(tf_val_dataset)
            .batch(BATCH_SIZE)
            .prefetch(tf.data.AUTOTUNE)
        )
        print("TensorFlow datasets are ready for training.")

        # --- Train the Model ---
        print("\n--- Starting Model Training ---")
        history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)
        print("\nModel training complete.")

        # --- Save the Model ---
        # The native .keras format is used for the saved artifact.
        model.save("multimodal_emotion_model.keras")
        print("Model saved to multimodal_emotion_model.keras")

--- Starting Optimized Multimodal Dataset Creation ---


Map:   0%|          | 0/10042 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10042 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50994 [00:00<?, ? examples/s]

Found valid shared labels. Balancing dataset to 0 samples per emotion.
Created unified dataset with 0 samples.
Unified dataset is empty. Skipping feature extraction, TensorFlow dataset creation, and model training.


In [None]:
import os

from google.colab import files

# Download the trained model to the local machine. The training cell only
# writes this file when training actually ran, so check for it first instead
# of letting files.download() raise FileNotFoundError.
model_path = 'multimodal_emotion_model.keras'
if os.path.exists(model_path):
    files.download(model_path)
else:
    print(
        f"Cannot download '{model_path}': file not found. "
        "Run the training cell successfully before downloading the model."
    )

FileNotFoundError: Cannot find file: multimodal_emotion_model.keras

In [None]:
# app.py
import streamlit as st
import numpy as np
import tensorflow as tf
import librosa
import sounddevice as sd
from scipy.io.wavfile import write as write_wav
from transformers import pipeline
import os
import pandas as pd

# --- Global Configuration ---
# NOTE: These variables must match your training script's configuration.
# Output class names; the argmax index of the model prediction is looked up in this list.
TARGET_EMOTIONS = ["happy", "sad", "angry", "neutral"]
# Length that extract_audio_features() pads or truncates its output vector to.
# Presumably 13 MFCC + 12 chroma + 128 mel + 7 contrast + 6 tonnetz under
# librosa's default settings — TODO confirm.
AUDIO_FEATURE_SIZE = 166
TEXT_FEATURE_SIZE = 4 # Based on ['anger', 'joy', 'optimism', 'sadness']
# Order in which classifier scores are laid out in the text feature vector.
# NOTE(review): analyze_text_emotion() silently substitutes 0.0 for any label
# listed here that the classifier does not emit — confirm these names against
# the label set of the model loaded in load_resources().
BERTWEET_EMOTION_LABELS = ['anger', 'joy', 'optimism', 'sadness']
# Scratch WAV file the recording is written to before librosa re-loads it.
TEMP_AUDIO_FILENAME = "temp_recording.wav"
SAMPLING_RATE_MODEL = 16000 # The rate your model expects
SAMPLING_RATE_RECORD = 44100 # Standard for recording hardware


# --- Core Functions ---

def extract_audio_features(y, sr):
    """
    Turn an audio waveform into a fixed-length feature vector.

    The waveform is coerced to a float32 NumPy array if it is not one already.
    Five librosa descriptors (MFCC with 13 coefficients, chroma, mel
    spectrogram, spectral contrast, and tonnetz of the harmonic component)
    are each averaged along axis 1 and concatenated in that order. The result
    is zero-padded on the right, or truncated, to exactly AUDIO_FEATURE_SIZE
    entries.

    Args:
        y: Audio samples (NumPy array or array-like).
        sr: Sampling rate passed through to librosa.

    Returns:
        1-D NumPy array of length AUDIO_FEATURE_SIZE.
    """
    already_float32 = isinstance(y, np.ndarray) and y.dtype == np.float32
    if not already_float32:
        y = np.array(y, dtype=np.float32)

    # Order matters: it defines the layout of the final vector.
    feature_blocks = [
        librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13),
        librosa.feature.chroma_stft(y=y, sr=sr),
        librosa.feature.melspectrogram(y=y, sr=sr),
        librosa.feature.spectral_contrast(y=y, sr=sr),
        librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr),
    ]
    vector = np.concatenate([np.mean(block, axis=1) for block in feature_blocks])

    # Right-pad with zeros when short; otherwise cut down to the target length.
    shortfall = AUDIO_FEATURE_SIZE - len(vector)
    if shortfall > 0:
        return np.pad(vector, (0, shortfall), 'constant')
    return vector[:AUDIO_FEATURE_SIZE]

@st.cache_resource
def load_resources():
    """
    Load the Keras model and the Hugging Face text classifier.

    Decorated with ``st.cache_resource`` so the loaded objects are reused.
    If either load fails, an error and a hint are shown in the app and
    ``st.stop()`` is called.

    Returns:
        Tuple ``(model, text_classifier)``.
    """
    # Both slots start as None so the return statement is always well-defined.
    keras_model, emotion_pipeline = None, None

    # --- Keras model ---
    try:
        keras_model = tf.keras.models.load_model("multimodal_emotion_model.keras")
    except Exception as e:
        st.error(f"Fatal Error: Could not load Keras model 'multimodal_emotion_model.keras'.\nReason: {e}")
        st.info("Please ensure the model file is in the same directory as app.py and that TensorFlow versions are compatible.")
        st.stop()

    # --- Hugging Face text-classification pipeline ---
    try:
        emotion_pipeline = pipeline(
            "text-classification",
            model="finiteautomata/bertweet-base-emotion-analysis",
            return_all_scores=True,
        )
    except Exception as e:
        st.error(f"Fatal Error: Could not load the text analysis pipeline from Hugging Face.\nReason: {e}")
        st.info("This may be a network issue or a problem with the transformers library.")
        st.stop()

    return keras_model, emotion_pipeline

def analyze_text_emotion(text, classifier):
    """
    Convert one text string into a float32 vector of emotion scores.

    Args:
        text: The text to score. Empty, whitespace-only or falsy input yields
            a zero vector of length TEXT_FEATURE_SIZE without calling the
            classifier.
        classifier: Callable whose result for ``text`` is indexed with ``[0]``
            to obtain an iterable of ``{'label': ..., 'score': ...}`` items.

    Returns:
        NumPy float32 array with one score per entry of
        BERTWEET_EMOTION_LABELS, in that order; labels the classifier did not
        report are scored 0.0.
    """
    is_blank = not text or not text.strip()
    if is_blank:
        return np.zeros(TEXT_FEATURE_SIZE, dtype=np.float32)

    # Index the classifier output by label for direct lookup.
    scores_by_label = {}
    for entry in classifier(text)[0]:
        label = entry['label']
        scores_by_label[label] = entry['score']

    ordered_scores = [scores_by_label.get(name, 0.0) for name in BERTWEET_EMOTION_LABELS]
    return np.array(ordered_scores, dtype=np.float32)


# --- Streamlit App UI ---
st.set_page_config(page_title="Emotion Detector", layout="wide")
st.title("🎙️ Real-Time Multimodal Emotion Detector")

# Load resources with caching and clear error messages
try:
    trained_model, text_pipe = load_resources()
    st.success("✅ Models loaded successfully!")
except Exception as e:
    st.error(f"A critical error occurred during model loading: {e}")
    st.stop()


# --- Main App Logic ---
st.header("1. Record Audio")
recording_duration = st.slider("Select recording duration (seconds):", 1, 10, 5)

if st.button(f"🎤 Start {recording_duration}-Second Recording"):
    with st.spinner(f"Recording for {recording_duration} seconds... Please speak clearly."):
        # Record mono float32 audio from the microphone at the hardware rate.
        audio_array = sd.rec(
            int(recording_duration * SAMPLING_RATE_RECORD),
            samplerate=SAMPLING_RATE_RECORD,
            channels=1,
            dtype='float32'
        )
        sd.wait() # Wait for the recording to complete

        # Keep the recording in session state so it survives the rerun that
        # the "Analyze Emotion" click triggers.
        st.session_state.audio_data = np.squeeze(audio_array)
        st.success("Recording finished!")

        # Display the recorded audio
        st.audio(st.session_state.audio_data, format="audio/wav", sample_rate=SAMPLING_RATE_RECORD)

# Only show the next step if a recording exists in the session state
if 'audio_data' in st.session_state and st.session_state.audio_data is not None:
    st.header("2. Enter Spoken Text & Analyze")
    user_text = st.text_input("Enter the text you spoke during the recording:", "")

    # The button must be created exactly once per script run. Creating a second
    # st.button with the same label (as an `elif st.button(...)` would) produces
    # two identical keyless widgets and Streamlit raises a duplicate-widget error.
    analyze_clicked = st.button("Analyze Emotion")

    if analyze_clicked and not user_text:
        st.warning("Please enter the text you spoke before analyzing.")
    elif analyze_clicked:
        with st.spinner("Analyzing audio and text..."):
            # --- PREPARATION ---
            # Round-trip through a temporary WAV file so librosa can resample the
            # recording to the model's rate. The file is removed even if writing
            # or loading fails, so no stale recording is left on disk.
            try:
                write_wav(TEMP_AUDIO_FILENAME, SAMPLING_RATE_RECORD, st.session_state.audio_data)
                y_resampled, sr_resampled = librosa.load(TEMP_AUDIO_FILENAME, sr=SAMPLING_RATE_MODEL)
            finally:
                if os.path.exists(TEMP_AUDIO_FILENAME):
                    os.remove(TEMP_AUDIO_FILENAME)

            # --- FEATURE EXTRACTION ---
            # 1. Process Audio
            audio_feats = extract_audio_features(y_resampled, sr_resampled).reshape(1, AUDIO_FEATURE_SIZE)

            # 2. Process Text
            text_feats = analyze_text_emotion(user_text, text_pipe).reshape(1, TEXT_FEATURE_SIZE)

            # --- PREDICTION ---
            # 3. Get model prediction (first row of the single-item batch)
            prediction = trained_model.predict([audio_feats, text_feats])[0]
            predicted_emotion_index = np.argmax(prediction)
            predicted_emotion = TARGET_EMOTIONS[predicted_emotion_index]

            # --- DISPLAY RESULTS ---
            st.subheader(f"Predicted Emotion: **{predicted_emotion.upper()}**")

            # Create and display a DataFrame for the probabilities
            df_probs = pd.DataFrame({
                'Emotion': TARGET_EMOTIONS,
                'Probability': prediction
            })
            st.bar_chart(df_probs.set_index('Emotion'))

In [None]:
# Install the localtunnel npm package; the app-launch cell below runs `npx localtunnel --port 8501`.
!npm install localtunnel

In [None]:
# Create the app.py file with the Streamlit code
# This code is copied from cell dhRb1nEXyN-8
streamlit_code = '''
import streamlit as st
import numpy as np
import tensorflow as tf
import librosa
import sounddevice as sd
from scipy.io.wavfile import write as write_wav
from transformers import pipeline
import os

# --- Load Model and Helper Functions ---

# NOTE: You would need to redefine or import the following from the training script:
# - TARGET_EMOTIONS, AUDIO_FEATURE_SIZE, TEXT_FEATURE_SIZE
# - extract_audio_features()
# - analyze_text_emotion_batch() or a single-instance version for prediction

# For simplicity, we redefine them here.
TARGET_EMOTIONS = ["happy", "sad", "angry", "neutral"]
AUDIO_FEATURE_SIZE = 166
TEXT_FEATURE_SIZE = 4 # Based on ['anger', 'joy', 'optimism', 'sadness']
BERTWEET_EMOTION_LABELS = ['anger', 'joy', 'optimism', 'sadness']

def extract_audio_features(y, sr):
    y = y.astype(np.float32)
    features = {
        'mfcc': np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1),
        'chroma': np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1),
        'mel': np.mean(librosa.feature.melspectrogram(y=y, sr=sr), axis=1),
        'contrast': np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1),
        'tonnetz': np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr), axis=1)
    }
    return np.concatenate(list(features.values()))

@st.cache_resource
def load_resources():
    """Load models and pipelines once.""" # Changed back to triple double quotes as the outer is triple single
    # Ensure the model file exists
    if not os.path.exists("multimodal_emotion_model.h5"):
         st.error("Model file 'multimodal_emotion_model.h5' not found. Please run the training steps first.")
         return None, None

    model = tf.keras.models.load_model("multimodal_emotion_model.h5")
    text_classifier = pipeline(
        "text-classification",
        model="finiteautomata/bertweet-base-emotion-analysis",
        return_all_scores=True
    )
    return model, text_classifier

def analyze_text_emotion(text, classifier):
    """Analyzes a single text string for prediction.""" # Changed back to triple double quotes
    if not text.strip():
        return np.zeros(TEXT_FEATURE_SIZE, dtype=np.float32)
    results = classifier(text)[0]
    emotion_scores = {label: 0.0 for label in BERTWEET_EMOTION_LABELS}
    for item in results:
        if item['label'] in emotion_scores:
            emotion_scores[item['label']] = item['score']
    return np.array([emotion_scores[label] for label in BERTWEET_EMOTION_LABELS], dtype=np.float32)

# --- Streamlit App UI ---
st.title("🎙️ Real-Time Multimodal Emotion Detector")

trained_model, text_pipe = load_resources()

if trained_model is not None and text_pipe is not None:
    st.success("Model and text pipeline loaded successfully!")

    if 'recording' not in st.session_state:
        st.session_state.recording = None
    if 'sample_rate' not in st.session_state:
        st.session_state.sample_rate = None

    if st.button("🎤 Start 5-Second Recording"):
        with st.spinner("Recording..."):
            st.session_state.recording = sd.rec(int(5 * 44100), samplerate=44100, channels=1, dtype='float32')
            sd.wait()
            st.session_state.sample_rate = 44100
        st.success("Recording finished!")
        # Save the recording to a temporary file for playback and feature extraction
        temp_audio_file = "temp_recording.wav"
        write_wav(temp_audio_file, st.session_state.sample_rate, st.session_state.recording)
        st.audio(temp_audio_file, format="audio/wav", sample_rate=st.session_state.sample_rate)
        os.remove(temp_audio_file) # Clean up the temporary file


    if st.session_state.recording is not None:
        user_text = st.text_input("Enter the text you spoke:", "")

        if st.button("Analyze Emotion") and user_text:
            with st.spinner("Analyzing..."):
                # 1. Process Audio
                audio_data = np.squeeze(st.session_state.recording)
                # Need to save to a file temporarily for librosa to load
                temp_audio_file_for_librosa = "temp_for_librosa.wav"
                write_wav(temp_audio_file_for_librosa, st.session_state.sample_rate, audio_data)

                y, sr = librosa.load(temp_audio_file_for_librosa, sr=16000)
                audio_feats = extract_audio_features(y, sr).reshape(1, AUDIO_FEATURE_SIZE)

                os.remove(temp_audio_file_for_librosa) # Clean up the temporary file

                # 2. Process Text
                text_feats = analyze_text_emotion(user_text, text_pipe).reshape(1, TEXT_FEATURE_SIZE)

                # 3. Predict
                prediction = trained_model.predict([audio_feats, text_feats])[0]
                predicted_emotion_index = np.argmax(prediction)
                predicted_emotion = TARGET_EMOTIONS[predicted_emotion_index]

                st.subheader(f"Predicted Emotion: **{predicted_emotion.upper()}**")

                # Display probabilities
                df_probs = pd.DataFrame({'Emotion': TARGET_EMOTIONS, 'Probability': prediction})
                st.bar_chart(df_probs.set_index('Emotion'))
else:
    st.error("Failed to load model or text pipeline. Please check previous steps.")

''' # Changed to triple single quotes

with open("app.py", "w") as f:
    f.write(streamlit_code)

print("app.py created successfully.")

# Run the Streamlit app in the background
get_ipython().system_raw('streamlit run app.py &>/content/logs.txt &')

# Get your public IP (needed for localtunnel password)
!curl ipv4.icanhazip.com

# Start localtunnel
!npx localtunnel --port 8501

In [None]:
# Inspect the resources of the current runtime: memory usage, then GPU status.
!free -h    # RAM
!nvidia-smi # GPU
