In [2]:
pip install openai-whisper librosa scikit-learn

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/800.5 kB[0m [31m32.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch-

In [5]:
import whisper
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from moviepy.editor import AudioFileClip

# Step 1: Load Whisper Model
model = whisper.load_model("base")

# Step 2: Transcribe Audio using Whisper
def transcribe_audio(audio_path):
    result = model.transcribe(audio_path)
    print("Transcription:", result["text"])
    return result["text"], result["language"]

# Step 3: Extract Audio Features (MFCC + Pitch)
def extract_audio_features(audio_path):
    y, sr = librosa.load(audio_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    pitch, _ = librosa.core.piptrack(y=y, sr=sr)
    pitch = np.mean(pitch, axis=1)
    mfcc_features = np.mean(mfcc, axis=1)
    pitch_features = pitch[:13]  # Only first 13 values
    audio_features = np.concatenate([mfcc_features, pitch_features])
    return audio_features

# Step 4: Train Emotion Classifier (SVM)
def train_emotion_classifier():
    X = np.random.rand(100, 26)  # Simulated feature data
    y = np.random.choice(['happy', 'sad', 'angry', 'neutral'], size=100)
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    classifier = SVC(kernel='linear')
    classifier.fit(X, y_encoded)
    return classifier, le

# Step 5: Predict Emotion
def classify_emotion(features, classifier, le):
    emotion_idx = classifier.predict([features])[0]
    emotion = le.inverse_transform([emotion_idx])[0]
    return emotion

# Step 6: Convert MP4 to WAV
def convert_mp4_to_wav(mp4_path, wav_path):
    try:
        audio_clip = AudioFileClip(mp4_path)
        audio_clip.write_audiofile(wav_path, codec='pcm_s16le')
    except Exception as e:
        print(f"Error converting MP4 to WAV: {e}")
        print(f"Check if the MP4 file exists and has a valid audio stream.")
        # You can add further debugging or error handling here

# Step 7: Main Process
def emotion_aware_speech_recognition(mp4_path):
    wav_path = "angry_alex.wav"
    convert_mp4_to_wav(mp4_path, wav_path)
    transcription, language = transcribe_audio(wav_path)
    audio_features = extract_audio_features(wav_path)
    emotion = classify_emotion(audio_features, emotion_classifier, label_encoder)

    print(f"Detected Emotion: {emotion}")
    print(f"Transcription: {transcription}")
    print(f"Language Detected: {language}")

# Step 8: Train Classifier Once
emotion_classifier, label_encoder = train_emotion_classifier()

# Example Usage
audio_path = "/content/sample.mp4"
emotion_aware_speech_recognition(audio_path)


chunk:   0%|          | 1/221 [05:26<19:57:17, 326.53s/it, now=None]
chunk:   0%|          | 1/221 [04:40<17:08:10, 280.41s/it, now=None][A

MoviePy - Writing audio in angry_alex.wav




index -100001 is out of bounds for axis 0 with size 0






Error converting MP4 to WAV: index -100001 is out of bounds for axis 0 with size 0
Check if the MP4 file exists and has a valid audio stream.
Transcription: 





Detected Emotion: sad
Transcription: 
Language Detected: en
