In [1]:
# ================================
#  Step 1: Install Libraries
# ================================
# Quietly install the audio / NLP dependencies imported by the notebook below.
!pip install -q transformers torchaudio librosa soundfile ffmpeg-python gTTS langdetect
# Remove any distribution literally named `whisper` before installing OpenAI's package.
# NOTE(review): presumably done so `import whisper` resolves to openai-whisper and not
# an unrelated package of the same import name — confirm.
!pip uninstall whisper -y
# Install (or upgrade to) the latest openai-whisper; versions are not pinned here.
!pip install -U openai-whisper


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m778.2/981.5 kB[0m [31m23.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1

In [4]:

# ================================
#  Step 2: Import Libraries
# ================================
import os
import subprocess

import IPython.display as ipd
import librosa
import numpy as np
import torch
from google.colab import files
from gtts import gTTS
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from transformers import MarianMTModel, MarianTokenizer
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

# ================================
#  Step 3: Load Emotion Model
# ================================
# Checkpoint providing both the classifier weights and the feature-extractor config.
emotion_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained(emotion_model_name)
emotion_extractor = Wav2Vec2FeatureExtractor.from_pretrained(emotion_model_name)

# NOTE(review): the load warning recorded in this notebook's output lists the
# checkpoint's `classifier.dense.*` / `classifier.output.*` weights as unused and
# `classifier.*` as newly initialized. If that still happens in your environment
# the classification head is untrained and predictions are unreliable — confirm
# before trusting the reported emotion / confidence.

# Map class index -> emotion name using the mapping shipped with the checkpoint's
# config instead of a hand-maintained table, so the labels always follow the
# class order the model was trained with.
emotion_labels = {
    int(class_idx): label
    for class_idx, label in emotion_model.config.id2label.items()
}

# ================================
#  Step 4: Translation Models
# ================================
# MarianMT models (one per direction): English -> Arabic and Arabic -> English.
en_to_ar_model, ar_to_en_model = (
    MarianMTModel.from_pretrained(checkpoint)
    for checkpoint in ("Helsinki-NLP/opus-mt-en-ar", "Helsinki-NLP/opus-mt-ar-en")
)

# Matching tokenizers, loaded from the same checkpoints as the models above.
en_to_ar_tokenizer, ar_to_en_tokenizer = (
    MarianTokenizer.from_pretrained(checkpoint)
    for checkpoint in ("Helsinki-NLP/opus-mt-en-ar", "Helsinki-NLP/opus-mt-ar-en")
)

# ================================
#  Step 5: Whisper for transcription
# ================================
# Imported here rather than with the other imports at the top of the cell.
import whisper

# Sanity check: print the path of the module that `import whisper` resolved to,
# so it is visible which installed package is actually being used.
print(whisper.__file__)

# Load the "small" Whisper checkpoint; used below via `asr_model.transcribe(...)`.
asr_model = whisper.load_model("small")

# ================================
#  Utility Functions
# ================================
def convert_to_wav(uploaded_file, output_path="converted.wav"):
    """Return a path to a WAV version of ``uploaded_file``.

    Files whose extension is already ``.wav`` (case-insensitive) are returned
    unchanged. Anything else is converted with the ``ffmpeg`` command-line tool.

    Args:
        uploaded_file: Path of the input audio file.
        output_path: Where to write the converted WAV file. An existing file at
            this path is overwritten (``ffmpeg -y``).

    Returns:
        ``uploaded_file`` if it is already a WAV file, otherwise ``output_path``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable is not available.
    """
    ext = os.path.splitext(uploaded_file)[1].lower()
    if ext == '.wav':
        print(f" Already in WAV format: {uploaded_file}")
        return uploaded_file

    # Arguments are passed as a list (no shell), so spaces or '#' in the file
    # name need no quoting. stdin is detached so ffmpeg never waits for input.
    proc = subprocess.run(
        ["ffmpeg", "-y", "-i", uploaded_file, output_path],
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding="utf-8",
        errors="replace",
    )
    if proc.returncode != 0:
        # Surface the failure instead of returning a path to a file that was
        # never (or only partially) written; keep just the tail of ffmpeg's log.
        log_tail = "\n".join((proc.stderr or "").strip().splitlines()[-5:])
        raise RuntimeError(
            f"ffmpeg failed (exit code {proc.returncode}) while converting "
            f"{uploaded_file!r}:\n{log_tail}"
        )

    print(f" Converted {uploaded_file} → {output_path}")
    return output_path

def predict_emotion(filepath):
    """Classify the emotion of the speech in an audio file.

    The audio is loaded at 16 kHz, peak-normalized, passed through
    ``emotion_extractor`` and ``emotion_model``, and the highest-scoring class
    is looked up in ``emotion_labels``.

    Args:
        filepath: Path of the audio file to analyse.

    Returns:
        A ``(label, confidence)`` tuple, where ``confidence`` is the softmax
        probability of the predicted class.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    wav, sr = librosa.load(filepath, sr=16000)
    if wav.size == 0:
        raise ValueError(f"No audio samples could be read from {filepath!r}")

    # Peak-normalize, but leave an all-zero (silent) signal as is: dividing by
    # a zero peak would fill the waveform with NaNs and corrupt the prediction.
    peak = np.max(np.abs(wav))
    if peak > 0:
        wav = wav / peak

    inputs = emotion_extractor(wav, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        logits = emotion_model(**inputs).logits

    predicted = torch.argmax(logits, dim=-1).item()
    probabilities = torch.softmax(logits, dim=-1).squeeze()
    confidence = probabilities[predicted].item()
    return emotion_labels[predicted], confidence

def translate_text(text, src_lang):
    """Translate ``text`` between English and Arabic with the MarianMT models.

    Args:
        text: Source text to translate.
        src_lang: ``"en"`` selects the English -> Arabic model; any other value
            selects the Arabic -> English model.

    Returns:
        The decoded translation with special tokens removed. Input longer than
        the tokenizer's maximum length is truncated before translation, so the
        tail of a very long text is not translated.
    """
    if src_lang == "en":
        tokenizer, model = en_to_ar_tokenizer, en_to_ar_model
    else:
        tokenizer, model = ar_to_en_tokenizer, ar_to_en_model

    # truncation=True keeps the encoded input within the model's maximum length;
    # without it a long transcript can exceed what the model accepts.
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated_tokens = model.generate(**tokens)
    # A single string was encoded, so the batch holds exactly one sequence.
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

def play_tts(text, lang):
    """Synthesize ``text`` as speech and return a playable audio widget.

    The speech is generated with gTTS in language ``lang`` and written to
    ``<lang>_tts.mp3`` in the working directory; the returned
    ``IPython.display.Audio`` object points at that file.
    """
    speech = gTTS(text=text, lang=lang)
    mp3_path = "{}_tts.mp3".format(lang)
    speech.save(mp3_path)
    player = ipd.Audio(mp3_path)
    return player

# ================================
#  Step 6: Upload File
# ================================
# Opens Colab's upload widget; the result is iterated below for the file name.
uploaded = files.upload()
if not uploaded:
    # Cancelling the dialog yields nothing to iterate; fail with a clear message
    # instead of a bare StopIteration from next(iter(...)).
    raise RuntimeError("No file was uploaded. Re-run this cell and choose an audio file.")

# Only the first uploaded file is processed.
uploaded_file = next(iter(uploaded))
audio_path = convert_to_wav(uploaded_file)

#  Play original audio
print(" Original audio:")
ipd.display(ipd.Audio(audio_path))

# ================================
# Transcribe with Whisper
# ================================
result = asr_model.transcribe(audio_path)
transcript = result["text"]
print("\nTranscribed text:\n", transcript)

# ================================
#  Detect Language & Translate
# ================================
# Source language -> (target language for TTS, heading printed above the translation).
translation_routes = {
    "en": ("ar", "\n Arabic translation:\n"),
    "ar": ("en", "\n🇬🇧 English translation:\n"),
}

if transcript.strip():
    try:
        lang = detect(transcript)
    except LangDetectException:
        # langdetect cannot classify text without usable features (e.g. only
        # digits or punctuation); fall back to the language Whisper reported.
        lang = result.get("language")
else:
    # Nothing was transcribed, so there is nothing to detect or translate.
    lang = None
print(f"\n Detected language: {lang}")

route = translation_routes.get(lang)
if route is None:
    print(" Unsupported language for translation.")
    translated = None
    translated_audio = None
else:
    target_lang, heading = route
    translated = translate_text(transcript, lang)
    print(heading, translated)
    translated_audio = play_tts(translated, lang=target_lang)

#  Play translations
if translated_audio:
    print("\n Translated speech:")
    ipd.display(translated_audio)

# ================================
#  Emotion Detection
# ================================
# Classify the (converted) uploaded audio and report the label with its score.
emotion, confidence = predict_emotion(audio_path)
print("\n Emotion: {}\n Confidence: {:.2%}".format(emotion, confidence))

Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', '

/usr/local/lib/python3.11/dist-packages/whisper/__init__.py


Saving Everything In Your Life. Denzel Washington Motivational Speech. #motivation.m4a to Everything In Your Life. Denzel Washington Motivational Speech. #motivation.m4a
 Converted Everything In Your Life. Denzel Washington Motivational Speech. #motivation.m4a → converted.wav
 Original audio:



Transcribed text:
  Everything in your life is a reflection of a choice you have made. If you want a different result, make a different choice.

 Detected language: en

 Arabic translation:
 كل شيء في حياتك هو انعكاس للخيار الذي قمت به. إذا كنت تريد نتيجة مختلفة، فاختر اختياراً مختلفاً.

 Translated speech:



 Emotion: calm
 Confidence: 14.00%
