**INSTALL REQUIRED LIBRARIES**

In [None]:
!pip install torch torchvision torchaudio transformers librosa gTTS gradio soundfile jiwer --quiet

import tempfile

import gradio as gr
import librosa
import torch
from gtts import gTTS
from jiwer import wer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/98.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m2.2/3.2 MB[0m [31m66.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m62.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25h

**LOAD SPEECH-TO-TEXT MODEL**

In [None]:
# Identifier of the pretrained checkpoint that both objects below are loaded from.
MODEL_NAME = "facebook/wav2vec2-base-960h"
# Processor: turns a raw waveform into model inputs and decodes predicted ids
# back into text (both uses appear in speech_to_text).
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
# CTC model whose logits speech_to_text arg-maxes into token ids.
# NOTE(review): the load log reports 'wav2vec2.masked_spec_embed' as newly
# initialized; this looks like a training-time masking parameter -- confirm it
# has no effect on inference.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**FUNCTION 1: TEXT TO SPEECH**

In [None]:
def text_to_speech(text: str) -> str:
    """Synthesize English speech for ``text`` with gTTS and save it as an MP3.

    Every call writes to its own freshly created temporary file rather than a
    single shared ``generated_speech.mp3``, so overlapping requests (for
    example two people using the shared Gradio link at once) cannot overwrite
    each other's audio.

    Args:
        text: The text to speak. Must contain at least one non-whitespace
            character.

    Returns:
        Path of the generated ``.mp3`` file.

    Raises:
        ValueError: If ``text`` is ``None``, empty, or whitespace only.
    """
    # Validate up front so blank input produces a clear message and never
    # reaches the TTS backend or leaves a stray file behind.
    if text is None or not text.strip():
        raise ValueError("Please enter some text to convert to speech.")

    tts = gTTS(text=text, lang="en")

    # delete=False: the file must outlive this function so the caller (the
    # Gradio audio component) can read it. Only the unique name is needed
    # here; the handle is closed immediately and gTTS reopens the path.
    with tempfile.NamedTemporaryFile(
        prefix="generated_speech_", suffix=".mp3", delete=False
    ) as tmp_file:
        out_path = tmp_file.name

    tts.save(out_path)
    return out_path

**FUNCTION 2: SPEECH TO TEXT**

In [None]:
def speech_to_text(audio_path: str) -> str:
    """Transcribe a speech recording to lowercase text with the Wav2Vec2 CTC model.

    The audio is loaded and resampled to 16 kHz, passed through the
    module-level ``processor`` and ``model``, and greedily decoded (arg-max
    over the logits at every frame).

    Args:
        audio_path: Path to an audio file. ``None`` or an empty string means
            nothing was uploaded or recorded.

    Returns:
        The lowercase transcription, or a short user-facing message when no
        usable audio was provided.
    """
    if not audio_path:
        return "Please upload or record an audio file."

    # Single source of truth: the rate librosa resamples to must be the same
    # rate that is declared to the processor.
    sample_rate = 16000
    # The convolutional feature encoder needs about 25 ms of audio (400
    # samples at 16 kHz) to emit a single frame; anything shorter makes the
    # forward pass raise instead of returning an empty transcription.
    # NOTE(review): 400 is derived from the base Wav2Vec2 conv kernels and
    # strides -- re-check if MODEL_NAME is changed to another architecture.
    min_samples = 400

    speech, _ = librosa.load(audio_path, sr=sample_rate)
    if len(speech) < min_samples:
        return "The recording is too short to transcribe. Please record a longer clip."

    input_values = processor(
        speech, return_tensors="pt", sampling_rate=sample_rate
    ).input_values

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    # A single clip was passed in, so only the first row is decoded.
    transcription = processor.decode(predicted_ids[0])
    return transcription.lower()

**TEXT ACCURACY CHECK**

In [None]:
def _normalize_for_wer(text: str) -> str:
    """Lowercase ``text``, replace punctuation with spaces, collapse whitespace.

    Letters, digits and apostrophes are kept so contractions such as
    "don't" stay a single word.
    """
    kept = (
        ch if ch.isalnum() or ch.isspace() or ch == "'" else " "
        for ch in text.lower()
    )
    return " ".join("".join(kept).split())


def check_accuracy(original_text: str, transcribed_text: str) -> str:
    """Score a transcription against the original text using Word Error Rate.

    Both texts are normalized first (lowercased, punctuation removed,
    whitespace collapsed). The speech-to-text step in this notebook emits
    lowercase text, so without this normalization a reference such as
    "Hello, world." would be counted as entirely wrong against "hello world".

    Args:
        original_text: The reference text.
        transcribed_text: The recognized text to evaluate.

    Returns:
        A message with the accuracy percentage (``(1 - WER) * 100``, floored
        at 0 because WER can exceed 1) and the raw WER, or a prompt asking
        for both texts when either is missing or has no words after
        normalization.
    """
    missing_input_message = "Please enter both original and transcribed texts."
    if not original_text or not transcribed_text:
        return missing_input_message

    reference = _normalize_for_wer(original_text)
    hypothesis = _normalize_for_wer(transcribed_text)
    # Whitespace- or punctuation-only input has no words to compare; stop
    # here instead of handing an empty reference to wer().
    if not reference or not hypothesis:
        return missing_input_message

    error_rate = wer(reference, hypothesis)
    accuracy = max(0.0, (1 - error_rate) * 100)
    return f"✅ Accuracy: {accuracy:.2f}%\n(WER: {error_rate:.2f})"

**GRADIO APP**

In [None]:
# Build the three-tab Gradio UI. Components are declared in the order they
# should appear, and each button is wired to one of the functions defined above.
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Deep Learning Project: Text ↔ Speech Converter + Accuracy Checker")

    # Tab 1: typed text -> text_to_speech -> audio component.
    with gr.Tab("Text ➝ Speech"):
        text_input = gr.Textbox(label="Enter Text", placeholder="Type something...")
        tts_button = gr.Button("Convert to Speech")
        # type="filepath" matches text_to_speech, which returns the path of the saved MP3.
        audio_output = gr.Audio(label="Generated Speech", type="filepath")
        tts_button.click(fn=text_to_speech, inputs=text_input, outputs=audio_output)

    # Tab 2: uploaded/recorded audio -> speech_to_text -> textbox.
    with gr.Tab("Speech ➝ Text"):
        # type="filepath" matches speech_to_text, which takes an audio path (or None).
        audio_input = gr.Audio(label="Upload or Record Speech", type="filepath")
        stt_button = gr.Button("Convert to Text")
        text_output = gr.Textbox(label="Recognized Text")
        stt_button.click(fn=speech_to_text, inputs=audio_input, outputs=text_output)

    # Tab 3: two manually entered texts -> check_accuracy -> result textbox.
    # The recognized text from tab 2 is not passed in automatically; the user
    # enters both texts here.
    with gr.Tab("Accuracy Checker"):
        gr.Markdown("### Compare Original Text and Transcribed Text")
        orig_text = gr.Textbox(label="Original Text", placeholder="Enter the original text...")
        trans_text = gr.Textbox(label="Transcribed Text", placeholder="Enter the recognized text...")
        acc_button = gr.Button("Check Accuracy")
        # interactive=False: the result box is display-only.
        acc_output = gr.Textbox(label="Result", interactive=False)
        acc_button.click(fn=check_accuracy, inputs=[orig_text, trans_text], outputs=acc_output)


**RUN APP**

In [None]:
# Start the Gradio app only when this code runs as the main program (as it does
# in a notebook cell), not when it is imported as a module.
# Per the launch log below, hosted notebooks get share=True applied automatically,
# and debug=True can be passed to launch() to surface errors in Colab.
if __name__ == "__main__":
    demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://46069113385e4717ee.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
