<a href="https://colab.research.google.com/github/Sri-1660/AI-CHATBOT/blob/main/MAIN_speech_to_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q git+https://github.com/openai/whisper.git gradio nltk transformers torchaudio sentencepiece

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.2 MB/s[0m eta [36m0:0

In [None]:

import whisper
import gradio as gr
import torch
import torchaudio
import nltk
import pandas as pd
import numpy as np
import time
from transformers import pipeline

# Fetch the NLTK "punkt" tokenizer data (downloaded on first run).
nltk.download("punkt")

# ✅ Load Models
whisper_model = whisper.load_model("small")  # For speed

# Both Hugging Face pipelines share one device choice:
# index 0 when CUDA is available, otherwise -1.
_pipeline_device = 0 if torch.cuda.is_available() else -1

translator = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-mul-en",
    device=_pipeline_device,
)
emotion_analyzer = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    return_all_scores=False,
    device=_pipeline_device,
)

# ✅ Add-on: Explanation and Rating
# Static lookup table, built once at import time instead of on every call.
# Keys are lower-case emotion labels; each value carries the display fields
# consumed by the analysis function.
_EMOTION_INFO = {
    'joy': {
        'explanation': 'The speaker expresses positive emotion like happiness or contentment.',
        'rating': '⭐️⭐️⭐️⭐️⭐️',
        'mood': '😊 Joyful',
        'urgency': '🟢 Low',
        'action': 'Reinforce positivity or celebrate with them.'
    },
    'neutral': {
        'explanation': 'The speaker is calm, factual, or non-emotional.',
        'rating': '⭐️⭐️⭐️',
        'mood': '😐 Neutral',
        'urgency': '🟢 Low',
        'action': 'No immediate action needed; maintain engagement.'
    },
    'sadness': {
        'explanation': 'The speaker shows signs of disappointment, loneliness, or emotional hurt.',
        'rating': '⭐️⭐️',
        'mood': '😢 Sad',
        'urgency': '🟠 Medium',
        'action': 'Reach out with comfort or empathy.'
    },
    'anger': {
        'explanation': 'The speaker is frustrated, upset, or aggressive.',
        'rating': '⭐️',
        'mood': '😠 Angry',
        'urgency': '🔴 High',
        'action': 'Calmly address their concerns or avoid conflict.'
    },
    'fear': {
        'explanation': 'The speaker expresses worry, stress, or fear of something.',
        'rating': '⭐️⭐️',
        'mood': '😟 Anxious',
        'urgency': '🟠 Medium',
        'action': 'Provide reassurance or clarity.'
    },
    'disgust': {
        'explanation': 'The speaker expresses aversion, disapproval, or dislike.',
        'rating': '⭐️',
        'mood': '🤢 Disgusted',
        'urgency': '🔴 High',
        'action': 'Avoid triggers and investigate the cause.'
    },
    'surprise': {
        'explanation': 'The speaker reacts to unexpected events or information.',
        'rating': '⭐️⭐️⭐️⭐️',
        'mood': '😲 Surprised',
        'urgency': '🟡 Medium',
        'action': 'Clarify or elaborate on surprises.'
    }
}

# Fallback returned for any label that is not a key of _EMOTION_INFO.
_UNKNOWN_EMOTION_INFO = {
    'explanation': 'Unrecognized emotion.',
    'rating': '⭐️',
    'mood': '🤖 Unknown',
    'urgency': '🟡 Medium',
    'action': 'Analyze further.'
}


def explain_emotion(label):
    """Return the human-readable description for an emotion label.

    Args:
        label: Emotion name such as ``'joy'`` or ``'anger'``. String labels
            are matched case-insensitively and with surrounding whitespace
            ignored. Any other value (including ``None``) that is not a
            known key yields the "unknown" entry.

    Returns:
        A new ``dict`` with the keys ``'explanation'``, ``'rating'``,
        ``'mood'``, ``'urgency'`` and ``'action'``. A copy is returned so the
        caller can modify it without affecting the shared lookup table.
    """
    key = label.strip().lower() if isinstance(label, str) else label
    return dict(_EMOTION_INFO.get(key, _UNKNOWN_EMOTION_INFO))

# ✅ Full Analysis Function
def fast_transcribe_translate_emotion(audio_path):
    """Transcribe an audio file, translate it to English and classify its emotion.

    Pipeline: load and resample the audio to 16 kHz mono, transcribe it with
    ``whisper_model``, translate non-English transcriptions with
    ``translator``, classify the first 512 characters of the English text
    with ``emotion_analyzer``, and build a summary line plus an analytics
    table from ``explain_emotion``.

    Args:
        audio_path: Path to the audio file. May be ``None`` or empty when the
            UI callback fires before any audio has been provided.

    Returns:
        A 4-tuple ``(transcription, translation, summary, analytics_df)``.
        When no audio is given, or no speech is transcribed, the two text
        fields are empty strings, ``summary`` holds a warning message and
        ``analytics_df`` is an empty table with the usual columns.
    """
    start = time.time()

    # Nothing uploaded yet: answer with placeholders instead of crashing
    # inside torchaudio.load(None).
    if not audio_path:
        return (
            "",
            "",
            "⚠️ No audio provided. Please upload an audio file first.",
            _empty_analytics_table(),
        )

    waveform = _load_mono_16k(audio_path)

    # Hand the waveform to Whisper directly. Writing it to a fixed "fast.wav"
    # first would let concurrent requests overwrite each other's audio.
    result = whisper_model.transcribe(waveform.float().numpy(), fp16=False)
    transcription = result["text"].strip()
    language = result["language"]

    # Silence / unintelligible audio: there is no text to classify.
    if not transcription:
        return (
            "",
            "",
            "⚠️ No speech detected in the audio.",
            _empty_analytics_table(),
        )

    # Translation
    if language != "en":
        translation = _translate_to_english(transcription)
    else:
        translation = transcription

    # Emotion detection (only the first 512 characters are classified)
    emotion_result = emotion_analyzer(translation[:512])[0]
    label = emotion_result['label'].lower()
    confidence = emotion_result['score']

    # Add-on explanation
    info = explain_emotion(label)
    summary = (
        f"{info['rating']} {label.upper()} — {info['explanation']} "
        f"(Confidence: {confidence:.2f})"
    )

    analytics_df = _build_analytics_table(label, confidence, info)

    print(f"✅ Done in {time.time() - start:.2f}s")
    return transcription, translation, summary, analytics_df


def _load_mono_16k(audio_path):
    """Load ``audio_path`` and return a 1-D waveform tensor resampled to 16 kHz."""
    audio, sr = torchaudio.load(audio_path)
    # Average over the first dimension to collapse multi-channel audio.
    audio = audio.mean(dim=0) if audio.ndim > 1 else audio
    return torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)


def _translate_to_english(text, max_chars=400):
    """Translate ``text`` in chunks of at most ``max_chars`` characters.

    Chunks are cut at whitespace so words are not split in half; a single
    run longer than ``max_chars`` (e.g. text without spaces) is still broken
    at the limit.
    """
    import textwrap

    chunks = textwrap.wrap(
        text,
        width=max_chars,
        break_long_words=True,
        break_on_hyphens=False,
    )
    return " ".join(translator(chunk)[0]['translation_text'] for chunk in chunks)


def _empty_analytics_table():
    """Return an empty analytics table with the standard column layout."""
    return pd.DataFrame(columns=["Metric", "Value", "Interpretation"])


def _build_analytics_table(label, confidence, info):
    """Assemble the Metric / Value / Interpretation table for one prediction."""
    return pd.DataFrame({
        "Metric": [
            "Detected Emotion",
            "Confidence Score",
            "Overall Mood",
            "Urgency Level",
            "Recommended Action"
        ],
        "Value": [
            label.upper(),
            f"{confidence:.2f}",
            info['mood'],
            info['urgency'],
            info['action']
        ],
        "Interpretation": [
            "Primary emotion expressed",
            "Model confidence in prediction",
            "General tone of the speaker",
            "How urgent is the emotional tone?",
            "How you should respond to the speaker"
        ]
    })

# ✅ Gradio UI
with gr.Blocks() as app:
    # Page title and a one-line usage hint.
    gr.Markdown("# ⚡️ Fast AI Audio Emotion Analyzer + Star Rating & Summary")
    gr.Markdown("Upload an audio file. We'll transcribe, translate, and analyze emotion with detailed explanation & star rating.")

    # Input widgets: the audio component is configured with type="filepath".
    audio_input = gr.Audio(label="🎵 Upload Audio", type="filepath")
    run_btn = gr.Button(value="🚀 Analyze Fast")

    # Three text results side by side.
    with gr.Row():
        transcription_box = gr.Textbox(label="📜 Transcription")
        translation_box = gr.Textbox(label="🌐 Translation")
        sentiment_box = gr.Textbox(label="🧠 Emotion Summary")

    # Tabular breakdown shown below the text row.
    analytics_table = gr.Dataframe(label="📊 Emotion Insights Table")

    # The four outputs are listed in the same order as the callback's return tuple.
    run_btn.click(
        fast_transcribe_translate_emotion,
        inputs=audio_input,
        outputs=[
            transcription_box,
            translation_box,
            sentiment_box,
            analytics_table,
        ],
    )

app.launch(share=True, debug=True)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
100%|███████████████████████████████████████| 461M/461M [00:12<00:00, 38.7MiB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/310M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/791k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cpu


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://f5836ebc3cc79b2a9e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
