In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
import numpy as np
from pydub import AudioSegment
import speech_recognition as sr
import os

# Speech-emotion checkpoint and the wav2vec2 feature extractor paired with it.
# NOTE(review): the load report captured in this cell's output lists
# classifier.* / projector.* as MISSING (newly initialized) and the checkpoint's
# classifier.dense / classifier.output tensors as UNEXPECTED, so the
# classification head may be untrained here -- confirm before trusting predictions.
model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
model = AutoModelForAudioClassification.from_pretrained(model_name)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")

# Class index (stored as a string key) -> emotion name.
id2label = {
    str(class_index): emotion_name
    for class_index, emotion_name in enumerate(
        ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]
    )
}

# Output filename, kept as a plain string.
OUTPUT_FILE = "audio.wav"



Loading weights:   0%|          | 0/422 [00:00<?, ?it/s]

[1mWav2Vec2ForSequenceClassification LOAD REPORT[0m from: ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition
Key                      | Status     | 
-------------------------+------------+-
classifier.output.bias   | UNEXPECTED | 
classifier.output.weight | UNEXPECTED | 
classifier.dense.weight  | UNEXPECTED | 
classifier.dense.bias    | UNEXPECTED | 
projector.bias           | MISSING    | 
classifier.bias          | MISSING    | 
classifier.weight        | MISSING    | 
projector.weight         | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


In [2]:
# Two-digit code -> emotion name.
# NOTE(review): presumably the RAVDESS filename emotion codes -- confirm.
emotions = dict(
    zip(
        ("01", "02", "03", "04", "05", "06", "07", "08"),
        ("neutral", "calm", "happy", "sad", "angry", "fearful", "disgust", "surprised"),
    )
)


In [3]:
def predict_mood_result(audio_file):
    """Predict the most likely emotion for an audio file.

    Args:
        audio_file: Path (or file-like object) readable by
            ``AudioSegment.from_file``.

    Returns:
        The emotion name from ``id2label`` for the highest-scoring class.

    Raises:
        ValueError: If the decoded audio contains no samples.
    """
    # The model expects 16 kHz input. Also down-mix to mono: pydub returns
    # interleaved samples for multi-channel audio, which would otherwise be
    # fed to the model as if they were one continuous waveform.
    sound = AudioSegment.from_file(audio_file)
    sound = sound.set_frame_rate(16000).set_channels(1)
    sound_array = np.array(sound.get_array_of_samples(), dtype=np.float32)

    if sound_array.size == 0:
        raise ValueError("Audio file contains no samples.")

    input_values = feature_extractor(
        raw_speech=sound_array,
        sampling_rate=16000,
        padding=True,
        return_tensors="pt"
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(input_values.input_values.float()).logits

    predicted_id = torch.argmax(logits, dim=-1).item()
    return id2label[str(predicted_id)]  # highest-scoring emotion
    
    

In [4]:
# # mel spectrogram converts audio to visual form
# # uses librosa library

# def extract_features(audio_data):
  
#        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast', duration=5, sr=22050)
        
#        mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40).T, axis=0)
#         # Mel-frequency cepstral coefficients - power spectrum of sound
#        chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate).T, axis=0)
#         # chromagram -> vector representation of audio, representing pitch on chromatic scale
#        mel = np.mean(librosa.feature.melspectrogram(y=audio, sr=sample_rate).T, axis=0)
#         # compute mel-scaled spectrogram
        
#        features = np.hstack([mfccs, chroma, mel])
#        return features  



 
 
 
    
 

In [5]:
# import joblib

# MODEL_PATH = 'ravdess-pretrained.ipynb' 
# try:
#     model = joblib.load(MODEL_PATH)
#     print("Model loaded successfully!")
# except Exception as e:
#     print(f"Error loading model: {e}. Check the filename!")
# emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

In [10]:
import sounddevice as sd
from scipy.io.wavfile import write
import speech_recognition as sr
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
from pydub import AudioSegment

# Song catalogue used for recommendations. get_song_for_mood reads its
# "labels", "artist", "title" and "spotify_url" columns.
songs_df = pd.read_csv("list.csv")

def canonical_mood(label):
    """Normalize an emotion label to the notebook's canonical mood name.

    The label is stringified, stripped and lower-cased. Known synonyms
    (e.g. "anger", "fear", "happiness") collapse to one canonical name;
    anything unrecognized is returned in its normalized form.
    """
    normalized = str(label).strip().lower()

    # canonical name -> every spelling that should map onto it
    synonym_groups = {
        "angry": ("anger", "angry"),
        "calm": ("calm",),
        "disgust": ("disgust",),
        "fearful": ("fear", "fearful"),
        "happy": ("happy", "happiness"),
        "neutral": ("neutral",),
        "sad": ("sad", "sadness"),
        "surprised": ("surprise", "surprised"),
    }

    for canonical_name, spellings in synonym_groups.items():
        if normalized in spellings:
            return canonical_name
    return normalized

def ensure_mood_model_loaded():
    """Load the emotion model once and bind the global ``predict_mood_result``.

    On first use this loads the classifier and feature extractor into the
    module-level ``model`` / ``feature_extractor`` names, defines a fresh
    global ``predict_mood_result`` and records ``_mood_model_version = 2``.
    Later calls return immediately once that marker is present.

    NOTE(review): the load report captured earlier in this notebook for the
    same checkpoint lists the classifier/projector weights as MISSING (newly
    initialized); if that also applies here the predictions may be unreliable
    -- confirm.
    """
    global model, feature_extractor, predict_mood_result

    # Already initialized by this version of the loader: nothing to do.
    if globals().get("_mood_model_version") == 2 and "predict_mood_result" in globals():
        return

    model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
    model = AutoModelForAudioClassification.from_pretrained(model_name)
    model.eval()
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")

    id2label_cfg = model.config.id2label

    # Because of the ``global`` declaration above, this ``def`` binds the
    # module-level name directly; no extra globals() assignment is needed.
    def predict_mood_result(audio_file):
        """Return the canonical mood predicted for ``audio_file``.

        Raises:
            ValueError: If the decoded audio contains no samples.
        """
        # Resample to 16 kHz and convert to mono
        sound = AudioSegment.from_file(audio_file)
        sound = sound.set_frame_rate(16000).set_channels(1)
        samples = np.array(sound.get_array_of_samples(), dtype=np.float32)

        # np.max on an empty array raises an opaque reduction error; fail
        # with a clear message instead.
        if samples.size == 0:
            raise ValueError("Audio file contains no samples.")

        # Peak-normalize the waveform to [-1, 1] (skipped for pure silence)
        max_abs = np.max(np.abs(samples))
        if max_abs > 0:
            samples = samples / max_abs

        inputs = feature_extractor(
            raw_speech=samples,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True
        )

        with torch.no_grad():
            logits = model(**inputs).logits

        predicted_id = int(torch.argmax(logits, dim=-1).item())
        # Accept both int and str keys before falling back to the bare index.
        raw_label = id2label_cfg.get(predicted_id)
        if raw_label is None:
            raw_label = id2label_cfg.get(str(predicted_id), str(predicted_id))
        return canonical_mood(raw_label)

    globals()["_mood_model_version"] = 2

def get_song_for_mood(mood):
    """Pick a song for ``mood`` from the module-level ``songs_df``.

    A random song whose "labels" text contains ``mood`` as a whole word
    (case-insensitive) is preferred. If no song matches, a random song from
    the whole table is returned instead.

    Args:
        mood: Mood name; surrounding whitespace and case are ignored.

    Returns:
        dict with "artist", "title", "spotify_url" and "source", where
        "source" is "mood_match" or "random_from_songs_csv".

    Raises:
        ValueError: If ``songs_df`` has no rows to choose from.
    """
    import re  # local import: only needed to build the label pattern

    def _as_result(row, source):
        # Shared shape for both the matched and the fallback result.
        return {
            "artist": row["artist"],
            "title": row["title"],
            "spotify_url": row["spotify_url"],
            "source": source,
        }

    mood = mood.strip().lower()

    # Try a whole-word mood match first. In a raw string the word boundary is
    # a single backslash + b; a doubled backslash would match a literal
    # backslash and never hit. The mood text is escaped so regex
    # metacharacters in it are matched literally.
    if mood:
        pattern = rf"\b{re.escape(mood)}\b"
        labels = songs_df["labels"].fillna("").str.lower()
        mood_matches = songs_df[labels.str.contains(pattern, regex=True)]
        if not mood_matches.empty:
            return _as_result(mood_matches.sample(1).iloc[0], "mood_match")

    # No mood match: return a random song from the loaded song table
    # (there is no separate fallback list).
    if songs_df.empty:
        raise ValueError("Song list is empty; cannot recommend a song.")
    return _as_result(songs_df.sample(1).iloc[0], "random_from_songs_csv")

def record_audio_until_enter(output_file="current_voice.wav", fs=44100):
    """Record mono microphone audio until Enter is pressed and save it as WAV.

    Args:
        output_file: Path of the WAV file to write.
        fs: Sample rate in Hz used for both recording and the written file.

    Returns:
        ``output_file``, for convenient chaining.

    Raises:
        RuntimeError: If no audio chunks were captured.
    """
    print("Recording... Press Enter to stop.")
    chunks = []

    def callback(indata, frames, time_info, status):
        # Invoked by the input stream for each captured buffer.
        if status:
            print(status)
        # Copy the buffer before storing it.
        chunks.append(indata.copy())

    with sd.InputStream(samplerate=fs, channels=1, dtype="float32", callback=callback):
        input()  # stop recording when user presses Enter

    if not chunks:
        raise RuntimeError("No audio captured. Please try again.")

    recording = np.concatenate(chunks, axis=0)
    # Clip before scaling: float samples outside [-1, 1] would wrap around
    # when cast to int16.
    recording_int16 = (np.clip(recording, -1.0, 1.0) * 32767).astype(np.int16)
    write(output_file, fs, recording_int16)
    return output_file

def record_and_get_mood():
    """Record a voice clip, then print its transcript, mood and a song pick.

    Both stages are best effort: a failure in transcription or in mood
    analysis / recommendation is printed rather than raised, and a
    transcription failure does not stop the mood stage from running.
    """
    ensure_mood_model_loaded()

    wav_path = record_audio_until_enter()
    print("Recording saved. Analyzing...")

    # Stage 1: speech-to-text
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(wav_path) as audio_source:
            captured_audio = recognizer.record(audio_source)
            transcript = recognizer.recognize_google(captured_audio)
            print(f"You said: \"{transcript}\"")
    except Exception as transcription_error:
        print(f"Could not transcribe speech: {transcription_error}")

    # Stage 2: emotion prediction and song recommendation
    try:
        detected_mood = predict_mood_result(wav_path)
        print(f"Mood: {detected_mood.upper()} ---")

        recommendation = get_song_for_mood(detected_mood)
        print(f"Recommended song ({recommendation['source']}): {recommendation['title']} - {recommendation['artist']}")
        print(f"Spotify: {recommendation['spotify_url']}")
    except Exception as analysis_error:
        print(f"Emotion analysis or recommendation failed: {analysis_error}")

# Run the full pipeline: record from the microphone (blocks until Enter is
# pressed), transcribe the clip, predict the mood and print a recommended song.
record_and_get_mood()

Recording... Press Enter to stop.
Recording saved. Analyzing...
You said: "okay"
Mood: CALM ---
Recommended song (random_from_songs_csv): I Love Kanye - Kanye West
Spotify: https://open.spotify.com/track/4S8d14HvHb70ImctNgVzQQ
