In [1]:
import torch
import librosa
from transformers import (
    AutoFeatureExtractor, 
    AutoModelForAudioClassification, 
    TrainingArguments, 
    Trainer
)

  from .autonotebook import tqdm as notebook_tqdm





In [None]:
import pandas as pd

data_path = pd.read_csv("data_path.csv")  # Adjust path if needed

# Sample 30% of data stratified by emotion
data_subset = data_path.groupby("Emotions", group_keys=False).apply(lambda x: x.sample(frac=0.3, random_state=42)).reset_index(drop=True)


  Emotions                                               Path
0    angry      datasets/CREMA-D/AudioWAV/1071_IWL_ANG_XX.wav
1    angry  datasets/RAVDESS/Actor_22/03-01-05-02-02-02-22...
2    angry      datasets/CREMA-D/AudioWAV/1006_WSI_ANG_XX.wav
3    angry      datasets/CREMA-D/AudioWAV/1026_MTI_ANG_XX.wav
4    angry      datasets/CREMA-D/AudioWAV/1049_ITS_ANG_XX.wav


  data_subset = data_path.groupby("Emotions", group_keys=False).apply(lambda x: x.sample(frac=0.3, random_state=42)).reset_index(drop=True)


In [35]:
print(data_subset.tail())

      Emotions                                               Path
2661  surprise  datasets/RAVDESS/Actor_07/03-01-08-01-02-02-07...
2662  surprise  datasets/RAVDESS/Actor_21/03-01-08-01-02-02-21...
2663  surprise  datasets/RAVDESS/Actor_05/03-01-08-02-02-01-05...
2664  surprise  datasets/RAVDESS/Actor_04/03-01-08-02-02-02-04...
2665  surprise  datasets/RAVDESS/Actor_20/03-01-08-01-02-02-20...


In [13]:
# Label encoding
label2id = {label: i for i, label in enumerate(sorted(data_subset.Emotions.unique()))}
id2label = {v: k for k, v in label2id.items()}

In [4]:
# Load model and feature extractor
wav2vec2_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
extractor = AutoFeatureExtractor.from_pretrained(wav2vec2_model_name)
wav2vec2_model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
    problem_type="single_label_classification"
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from torch.utils.data import Dataset, DataLoader

# Dataset class
class EmotionDataset(Dataset):
    def __init__(self, dataframe, extractor, max_duration=4):
        self.data = dataframe.reset_index(drop=True)
        self.extractor = extractor
        self.max_duration = max_duration

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        path = row['Path']
        label = label2id[row['Emotions']]
        audio, sr = librosa.load(path, sr=16000)
        audio = audio[:int(self.max_duration * sr)]  # Truncate to max duration
        inputs = self.extractor(audio, sampling_rate=16000, return_tensors="pt")
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs["labels"] = torch.tensor(label, dtype=torch.long)
        return inputs

In [14]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(data_subset, test_size=0.1, stratify=data_subset['Emotions'], random_state=42)

In [15]:
# 2. Convert to Hugging Face-compatible Dataset
train_dataset = EmotionDataset(train_df, extractor)
val_dataset = EmotionDataset(val_df, extractor)

In [8]:
import transformers
print(transformers.__version__)
print(transformers.TrainingArguments.__init__.__code__.co_varnames)


4.51.3
('self', 'output_dir', 'overwrite_output_dir', 'do_train', 'do_eval', 'do_predict', 'eval_strategy', 'prediction_loss_only', 'per_device_train_batch_size', 'per_device_eval_batch_size', 'per_gpu_train_batch_size', 'per_gpu_eval_batch_size', 'gradient_accumulation_steps', 'eval_accumulation_steps', 'eval_delay', 'torch_empty_cache_steps', 'learning_rate', 'weight_decay', 'adam_beta1', 'adam_beta2', 'adam_epsilon', 'max_grad_norm', 'num_train_epochs', 'max_steps', 'lr_scheduler_type', 'lr_scheduler_kwargs', 'warmup_ratio', 'warmup_steps', 'log_level', 'log_level_replica', 'log_on_each_node', 'logging_dir', 'logging_strategy', 'logging_first_step', 'logging_steps', 'logging_nan_inf_filter', 'save_strategy', 'save_steps', 'save_total_limit', 'save_safetensors', 'save_on_each_node', 'save_only_model', 'restore_callback_states_from_checkpoint', 'no_cuda', 'use_cpu', 'use_mps_device', 'seed', 'data_seed', 'jit_mode_eval', 'use_ipex', 'bf16', 'fp16', 'fp16_opt_level', 'half_precision_ba

In [17]:
from transformers import TrainingArguments

# TrainingArguments with faster feedback
training_args = TrainingArguments(
    output_dir="./fine_tuned_emotion_model",
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)


In [18]:
# Trainer setup
trainer = Trainer(
    model=wav2vec2_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=extractor
)

  trainer = Trainer(


In [19]:
# Resume from latest checkpoint (optional)
latest_checkpoint = "fine_tuned_emotion_model/checkpoint-1000"

In [20]:
# Start training
trainer.train(resume_from_checkpoint=latest_checkpoint)

# Save the model
wav2vec2_model.save_pretrained("fine_tuned_emotion_model")
extractor.save_pretrained("fine_tuned_emotion_model")

	logging_steps: 10 (from args) != 50 (from trainer_state.json)
	eval_steps: 50 (from args) != 100 (from trainer_state.json)
	save_steps: 100 (from args) != 200 (from trainer_state.json)


Step,Training Loss,Validation Loss
1100,1.7267,1.661416
1200,1.7969,1.653152
1300,1.667,1.633661
1400,1.6337,1.65151
1500,1.6614,1.618276
1600,1.6201,1.62488
1700,1.6874,1.6205
1800,1.6567,1.608483


['fine_tuned_emotion_model\\preprocessor_config.json']

In [38]:
trainer.evaluate()

{'eval_loss': 1.608483076095581,
 'eval_runtime': 113.9105,
 'eval_samples_per_second': 2.344,
 'eval_steps_per_second': 0.588,
 'epoch': 3.0}

## HELLO START RUNNING EVERYTHING FROM HERE IF YOU JUST WANT THE MAIN LOOP AND NO TRAINING/TUNING

In [82]:
import torch
import librosa
from transformers import (
    AutoFeatureExtractor, 
    AutoModelForAudioClassification, 
    TrainingArguments, 
    Trainer
)

wav2vec2_model = AutoModelForAudioClassification.from_pretrained("fine_tuned_emotion_model")
extractor = AutoFeatureExtractor.from_pretrained("fine_tuned_emotion_model")

In [24]:
import sounddevice as sd
from scipy.io.wavfile import write

def record_audio(filename="recorded.wav", duration=3, fs=16000):
    print("🎙️ Recording...")
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    sd.wait()
    write(filename, fs, recording)
    print(f"✅ Saved: {filename}")
    return filename


In [21]:
def classify_emotion(path):
    audio, sr = librosa.load(path, sr=16000)  # model expects 16kHz
    inputs = extractor(audio, sampling_rate=16000, return_tensors="pt")
    
    with torch.no_grad():
        logits = wav2vec2_model(**inputs).logits

    pred_id = torch.argmax(logits, dim=1).item()
    label = wav2vec2_model.config.id2label[pred_id]
    return label

In [50]:
import torchaudio
from transformers import pipeline

# Load the pipeline once
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

def transcribe_wav_clip(path: str) -> str:
    """
    Transcribe a .wav file using Hugging Face Wav2Vec2 (offline).
    
    Args:
        path (str): Path to the .wav file
    
    Returns:
        str: Transcribed text
    """
    try:
        # Load audio file
        waveform, sr = torchaudio.load(path)

        # Convert stereo to mono
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0).unsqueeze(0)

        # Resample to 16kHz if necessary
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
            waveform = resampler(waveform)
            sr = 16000

        # Prepare input
        input_data = {
            "array": waveform.squeeze().numpy(),
            "sampling_rate": sr
        }

        # Run transcription
        result = asr(input_data)
        return result["text"]
    
    except Exception as e:
        print(f"❌ Error transcribing '{path}': {e}")
        return ""





Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


In [53]:
import google.generativeai as genai
import json
import time

In [67]:
genai_model = genai.GenerativeModel(model_name="gemini-1.5-pro")


In [80]:
# One-time Gemini setup
from logging import config

from sklearn import config_context


genai_model = genai.GenerativeModel("gemini-1.5-flash")
chat = genai_model.start_chat(history=[
        {
            "role": "user",
            "parts": ["You are an empathetic assistant who chats naturally with users. They are speaking out loud, so their messages might contain typos, filler words, or incomplete thoughts. Please interpret their intent and respond clearly, gently, and naturally."]
        }
    ]
)

def generate_reply_with_context(transcript, emotion):
    user_message = f"[Emotion: {emotion}] {transcript}"
    try:
        response = chat.send_message(f"The user said (may contain errors): {user_message}")
        return response.text.strip()
    except Exception as e:
        print(f"❌ Gemini error: {e}")
        return "[Could not generate response.]"



In [76]:
# === MAIN LOOP ===
import time
import os
from datetime import datetime
import sounddevice as sd
from scipy.io.wavfile import write
from tqdm import tqdm

fs = 16000

os.makedirs("recordings", exist_ok=True)

print("🎤 Voice interaction loop started. Press Ctrl+C to stop.")

try:
    while True:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        clip_path = f"recordings/clip_{timestamp}.wav"

        # Record audio
        duration = 4  # seconds
        print("🔴 Recording...")
        audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='int16')

        for _ in tqdm(range(duration), desc="⏳ Recording", unit="s"):
            time.sleep(1)

        sd.wait()
        write(clip_path, 16000, audio)
        print("✅ Recording saved.")

        # Transcribe and classify
        transcript = transcribe_wav_clip(clip_path)
        emotion = classify_emotion(clip_path)
        print(f"🗣️ Transcript: {transcript}")
        print(f"😊 Detected Emotion: {emotion}")

        if transcript.strip():
            reply = generate_reply_with_context(transcript, emotion)
            print(f"🤖 Gemini: {reply}\n")
        else:
            print("⚠️ No speech detected.\n")

        time.sleep(1)

except KeyboardInterrupt:
    print("👋 Exiting voice loop.")

🎤 Voice interaction loop started. Press Ctrl+C to stop.
🔴 Recording...


⏳ Recording: 100%|██████████| 4/4 [00:04<00:00,  1.00s/s]


✅ Recording saved.
🗣️ Transcript: HELLOA GERMAN I
😊 Detected Emotion: fear
🤖 Gemini: Oh my goodness, it sounds like you're feeling really scared right now.  Is something wrong?  "Hallo" is German for hello, so I'm guessing you might be in a situation involving German speakers or Germany itself?  Can you tell me more about what's happening so I can try to help?  Even if it's just a little bit, anything you can share would be useful.

🔴 Recording...


⏳ Recording: 100%|██████████| 4/4 [00:04<00:00,  1.00s/s]


✅ Recording saved.
🗣️ Transcript: NO I AM NOT SCARE GREAT NEW
😊 Detected Emotion: disgust
🤖 Gemini: Okay, I understand.  You said you're *not* scared, and that's great news!  I apologize if I misinterpreted your initial message.  It sounded like you were distressed, so I wanted to check in.  My mistake!  Is there something else you'd like to talk about, or was there something else that caused that feeling of disgust?  I'm happy to listen if you'd like to share.

🔴 Recording...


⏳ Recording: 100%|██████████| 4/4 [00:04<00:00,  1.00s/s]


✅ Recording saved.
🗣️ Transcript: I AM SAYING YOUR NAME
😊 Detected Emotion: disgust
🤖 Gemini: Oh, I see.  You're expressing disgust because of my name?  I understand.  Is there something about my name that you find unpleasant?  Or is it something else entirely? I'm programmed to be helpful and harmless, and I want to make sure I'm not causing any offense.  Perhaps we could try to talk about it, if you are comfortable doing so. I’m here to listen.

🔴 Recording...


⏳ Recording: 100%|██████████| 4/4 [00:04<00:00,  1.00s/s]


✅ Recording saved.
🗣️ Transcript: NO I WAS JUST SAYING HALLO
😊 Detected Emotion: disgust
🤖 Gemini: Ah, I see!  You were just saying "Hallo" –  like a test, perhaps?  My apologies for the misinterpretations.  I'm still under development and learning to understand nuances in language and emotion.  I appreciate you clarifying. Is there anything else I can help you with today?

🔴 Recording...


⏳ Recording: 100%|██████████| 4/4 [00:04<00:00,  1.00s/s]


✅ Recording saved.
🗣️ Transcript: ITSOUK
😊 Detected Emotion: sad
🤖 Gemini: It sounds like you're feeling sad.  "Itsouk" isn't a word I recognize, so I'm not entirely sure what you mean. Could you tell me a little more about what's making you feel sad?  There's no pressure, but if you'd like to talk about it, I'm here to listen.

🔴 Recording...


⏳ Recording: 100%|██████████| 4/4 [00:04<00:00,  1.00s/s]


✅ Recording saved.
🗣️ Transcript: I MEAN MAYBE IT JOST SOME DEPRESSED
😊 Detected Emotion: fear
🤖 Gemini: I understand. You're saying it might just be depression, and that's making you feel afraid.  It's completely understandable to be scared when dealing with feelings of depression.  Depression can be a very difficult thing to manage. Would you like to talk about it?  Perhaps we can explore some resources or ideas that might help. Or, if you prefer, I can simply listen without offering suggestions.  Whatever feels most comfortable for you is fine.

🔴 Recording...


⏳ Recording: 100%|██████████| 4/4 [00:04<00:00,  1.00s/s]


✅ Recording saved.
🗣️ Transcript: 
😊 Detected Emotion: sad
⚠️ No speech detected.

🔴 Recording...


⏳ Recording: 100%|██████████| 4/4 [00:04<00:00,  1.00s/s]


✅ Recording saved.
🗣️ Transcript: 
😊 Detected Emotion: sad
⚠️ No speech detected.

🔴 Recording...


⏳ Recording: 100%|██████████| 4/4 [00:04<00:00,  1.00s/s]


✅ Recording saved.
🗣️ Transcript: I DOREALLY KNOW WHAT ELSE TO SAY IN THIS CONVERSATION
😊 Detected Emotion: disgust
🤖 Gemini: It sounds like you're feeling a bit frustrated and unsure of how to continue the conversation. That's perfectly okay. Sometimes conversations just reach a natural end.  Is there anything else on your mind, or are you ready to finish our chat for now?  I'm happy to continue if you'd like, or I completely understand if you don't have anything further to discuss.

🔴 Recording...


⏳ Recording: 100%|██████████| 4/4 [00:04<00:00,  1.00s/s]


✅ Recording saved.
🗣️ Transcript: 
😊 Detected Emotion: sad
⚠️ No speech detected.

👋 Exiting voice loop.
