## **Voice-to-text Translator Backend**



**Loading the dataset**

In [None]:
!wget -O TED2020.en-ur.zip https://object.pouta.csc.fi/OPUS-TED2020/v1/moses/en-ur.txt.zip
!unzip TED2020.en-ur.zip


In [None]:
import pandas as pd

# Load and preprocess the dataset
def load_ted_dataset(en_file, ur_file):
    """Read two line-aligned text files into a two-column DataFrame.

    Args:
        en_file: Path to the UTF-8 file with one English sentence per line.
        ur_file: Path to the UTF-8 file with the matching Urdu sentences.

    Returns:
        A pandas DataFrame with columns "english" and "urdu"; row i holds
        line i of each file, line terminators included.

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    with open(en_file, 'r', encoding='utf-8') as en_handle:
        english_sentences = list(en_handle)
    with open(ur_file, 'r', encoding='utf-8') as ur_handle:
        urdu_sentences = list(ur_handle)

    line_counts_match = len(english_sentences) == len(urdu_sentences)
    assert line_counts_match, "Files line counts don't match."

    return pd.DataFrame({"english": english_sentences, "urdu": urdu_sentences})

# Paths of the extracted corpus files (one sentence per line, line-aligned).
en_file_path = "TED2020.en-ur.en"
ur_file_path = "TED2020.en-ur.ur"

# Load the dataset
df = load_ted_dataset(en_file_path, ur_file_path)

# Trim surrounding whitespace, including the newline the loader keeps on each line.
df['english'] = df['english'].str.strip()
df['urdu'] = df['urdu'].str.strip()

# Drop pairs where either side is missing or blank. After stripping, blank lines are
# empty strings rather than NaN, so dropna() alone would leave them in the data.
has_both_sides = df['english'].fillna('').ne('') & df['urdu'].fillna('').ne('')
df = df[has_both_sides].reset_index(drop=True)

# Save to a CSV file
df.to_csv("ted_talks_english_urdu.csv", index=False)

print("Dataset preprocessing complete. Saved as 'ted_talks_english_urdu.csv'.")

**Installing relevant dependencies**

In [None]:
!pip install fastapi uvicorn pyngrok whisper

In [None]:
# Hugging Face transformers: supplies the MBart tokenizer/model and Seq2SeqTrainer classes
# imported by the training, evaluation and API cells.
!pip install transformers

In [None]:
# pydub supplies `AudioSegment`, imported in the API cell.
# NOTE(review): the visible code converts audio by calling ffmpeg through subprocess and
# never uses AudioSegment - confirm this dependency is still needed.
!pip install pydub

In [None]:
# Install/upgrade OpenAI Whisper, the package behind `import whisper` in the API cell.
!pip install -U openai-whisper

In [None]:
# NOTE(review): presumably installed for FastAPI form/file uploads; the visible
# /upload-audio endpoint reads the raw request body instead - confirm it is needed.
!pip install python-multipart

In [None]:
!ngrok config add-authtoken 2opx3iwAs2jYTIWTw7r0pEyXevX_2YtqaJv3nMDqLk7Nq2i3K


In [None]:
# NOTE(review): socat is not invoked anywhere in the visible notebook - confirm it is still needed.
!apt-get install -y socat

In [None]:
# NOTE(review): fastapi is already installed by the first install cell of this section;
# this cell is redundant.
!pip install fastapi

In [None]:
# NOTE(review): the `openai` client is not imported anywhere in the visible notebook;
# this pin looks unused - confirm before removing.
!pip install openai==0.28.0

In [None]:
# Installs Whisper straight from its GitHub repository.
# NOTE(review): overlaps with the `openai-whisper` install in an earlier cell - one of
# the two is probably enough.
!pip install git+https://github.com/openai/whisper.git

**Model training and data preprocessing**

In [None]:
import torch
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
import os

# Disable WANDB for now
os.environ["WANDB_DISABLED"] = "true"

# Check if CUDA (GPU) is available and use it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
dataset = load_dataset('csv', data_files={"train": "ted_talks_english_urdu.csv"})
# Fixed seed: without it every run draws a different 80/20 split, so the held-out
# rows could not be reproduced later (e.g. for evaluation).
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

# Ensure no None or empty values
dataset = dataset.filter(lambda example: example['english'] and example['urdu'])

# Load the pretrained mBART checkpoint and its tokenizer, configured for Urdu -> English
model_name = "abdulwaheed1/english-to-urdu-translation-mbart"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="ur_PK", tgt_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)  # Move model to GPU if available

def preprocess_function(examples):
    """Tokenize a batch of Urdu -> English sentence pairs for seq2seq fine-tuning.

    Args:
        examples: A batch from the dataset with parallel lists under the
            'urdu' (source) and 'english' (target) keys.

    Returns:
        The tokenizer output for the Urdu inputs, with a "labels" entry holding
        the English token ids; padding positions in the labels are set to -100.
    """
    inputs = list(examples['urdu'])  # Urdu as source
    targets = list(examples['english'])  # English as target

    # `text_target` makes the tokenizer encode the labels in target-language mode
    # (tgt_lang code); a plain second call would tag them with the source language.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=150,
        truncation=True,
        padding="max_length",
    )

    # Replace padding token ids in the labels with -100 so the loss ignores them.
    model_inputs["labels"] = [
        [(token_id if token_id != tokenizer.pad_token_id else -100) for token_id in label]
        for label in model_inputs["labels"]
    ]

    return model_inputs

# Tokenize datasets
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_total_limit=2,
    generation_max_length=150,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=500,
    # Mixed precision speeds up training but needs a GPU; enable it only when CUDA
    # is present so the CPU fallback selected above still works.
    fp16=torch.cuda.is_available(),
)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
)

# Train model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./urdu_to_english_finetuned_model")
tokenizer.save_pretrained("./urdu_to_english_finetuned_model")


In [None]:
import torch
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
import os

# Disable WANDB for now
os.environ["WANDB_DISABLED"] = "true"

# Check if CUDA (GPU) is available and use it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
dataset = load_dataset('csv', data_files={"train": "ted_talks_english_urdu.csv"})
# Fixed seed: without it every run draws a different 80/20 split, so the held-out
# rows could not be reproduced later (e.g. for evaluation).
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

# Ensure no None or empty values
dataset = dataset.filter(lambda example: example['english'] and example['urdu'])

# Load the pretrained mBART checkpoint and its tokenizer, configured for English -> Urdu
model_name = "abdulwaheed1/english-to-urdu-translation-mbart"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="en_XX", tgt_lang="ur_PK")
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)  # Move model to GPU if available

def preprocess_function(examples):
    """Tokenize a batch of English -> Urdu sentence pairs for seq2seq fine-tuning.

    Args:
        examples: A batch from the dataset with parallel lists under the
            'english' (source) and 'urdu' (target) keys.

    Returns:
        The tokenizer output for the English inputs, with a "labels" entry holding
        the Urdu token ids; padding positions in the labels are set to -100.
    """
    inputs = list(examples['english'])
    targets = list(examples['urdu'])

    # `text_target` makes the tokenizer encode the labels in target-language mode
    # (tgt_lang code); a plain second call would tag them with the source language.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # Replace padding token ID for labels with -100 (ignored in loss calculation)
    model_inputs["labels"] = [
        [(token_id if token_id != tokenizer.pad_token_id else -100) for token_id in label]
        for label in model_inputs["labels"]
    ]

    return model_inputs

# Tokenize datasets
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_total_limit=2,
    generation_max_length=128,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=500,
    # Mixed precision speeds up training but needs a GPU; enable it only when CUDA
    # is present so the CPU fallback selected above still works.
    fp16=torch.cuda.is_available(),
)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
)

# Train model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./ted_talks_finetuned_model")
tokenizer.save_pretrained("./ted_talks_finetuned_model")

**Loading the models from Google Drive**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the zipped fine-tuned models stored there can be copied in.
drive.mount('/content/drive')

In [None]:
# List the top level of the mounted Drive (a quick check that the mount succeeded).
!ls /content/drive/MyDrive

In [None]:
# Copy the zipped English -> Urdu model from Drive into /content.
# NOTE(review): `NLP_Translator_modle` is presumably the folder's actual (misspelled) name
# on Drive - do not correct it here without renaming the folder.
!cp /content/drive/MyDrive/NLP_Translator_modle/ted_talks_finetuned_model.zip /content/

In [None]:
!unzip ted_talks_finetuned_model.zip

In [None]:
# Copy the zipped Urdu -> English model from Drive into /content.
!cp /content/drive/MyDrive/NLP_Translator_modle/urdu_to_english_finetuned_model.zip /content/

In [None]:
!unzip urdu_to_english_finetuned_model.zip

**Model Evaluation**

In [None]:
# NOTE(review): `datasets` is already imported by the training cells earlier in the notebook,
# so on a fresh kernel this install runs too late - consider moving it to the dependency section.
!pip install datasets

In [None]:
# `evaluate` supplies the `load("bleu")` metric used by the evaluation cell.
!pip install evaluate

In [None]:
import torch
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
from evaluate import load
from sklearn.metrics import f1_score
from datasets import load_dataset

# Load TED Talks dataset
dataset = load_dataset('csv', data_files={"train": "ted_talks_english_urdu.csv"})
# Seeded split so the evaluation rows are reproducible from run to run.
# NOTE(review): these rows are only truly held out if the training run split the same
# CSV with the same test_size and seed - confirm against the training cell.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
test_dataset = dataset['test']  # Use the test split for evaluation

# Load the trained model and tokenizer
model_name = "./urdu_to_english_finetuned_model"  # Path to your model
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="ur_PK", tgt_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize the BLEU metric
bleu_metric = load("bleu")

# Function to translate Urdu to English
def translate_urdu_to_english(text):
    """Translate one Urdu string to English with the loaded model.

    The text is tokenized (truncated and padded to 150 tokens), decoded with a
    4-beam search, and the first generated sequence is returned as plain text.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=150,
        truncation=True,
        padding="max_length",
    )

    # Put every input tensor on the same device as the model.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    generated = model.generate(**on_device, max_length=150, num_beams=4, early_stopping=True)
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Limit the number of samples to evaluate (never more than the test split holds)
sample_size = 1
df_sampled = test_dataset.select(range(min(sample_size, len(test_dataset))))

predictions = []
references = []

# Iterate over the sampled test data
for row in df_sampled:
    urdu_text = row['urdu']
    reference_translation = row['english']

    # Skip rows with missing or empty Urdu text
    if not isinstance(urdu_text, str) or not urdu_text.strip():
        continue

    # Skip rows without a usable reference too: a missing or blank reference
    # cannot be scored and would break the metric computation below.
    if not isinstance(reference_translation, str) or not reference_translation.strip():
        continue

    # Translate and store predictions and references
    prediction = translate_urdu_to_english(urdu_text)
    predictions.append(prediction)
    references.append([reference_translation])  # BLEU metric expects a list of references


# Ensure predictions and references are of the same length
assert len(predictions) == len(references), f"Length mismatch: {len(predictions)} != {len(references)}"

# Calculate BLEU score
bleu_score = bleu_metric.compute(predictions=predictions, references=references)
print(f"BLEU Score: {bleu_score['bleu']}")

from collections import Counter

# Convert predictions and references to tokens for F1 score calculation
def tokenize_sentences(sentences):
    """Split each sentence into whitespace-separated tokens."""
    return [sentence.split() for sentence in sentences]

tokenized_predictions = tokenize_sentences(predictions)
tokenized_references = tokenize_sentences([ref[0] for ref in references])

# Flatten the tokenized lists (kept so later cells can still inspect them)
flat_predictions = [item for sublist in tokenized_predictions for item in sublist]
flat_references = [item for sublist in tokenized_references for item in sublist]

# Calculate F1 score (micro average) from token overlap.
# A classification-style F1 over the flattened lists compares tokens position by
# position and requires both lists to have the same length, which a translation and
# its reference rarely have. Instead, count the tokens each prediction shares with
# its reference (as a multiset) and pool the counts over all sentences.
overlap_count = 0
for prediction_tokens, reference_tokens in zip(tokenized_predictions, tokenized_references):
    shared = Counter(prediction_tokens) & Counter(reference_tokens)
    overlap_count += sum(shared.values())

precision = overlap_count / len(flat_predictions) if flat_predictions else 0.0
recall = overlap_count / len(flat_references) if flat_references else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print(f"F1 Score: {f1}")

**API**

In [None]:
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from fastapi.responses import JSONResponse
from pyngrok import ngrok
import uvicorn
import nest_asyncio
from pydub import AudioSegment
import whisper
import os
import subprocess
import torch
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

# Patch asyncio via nest_asyncio before the server is started from inside the notebook.
nest_asyncio.apply()

app = FastAPI()

# Speech-to-text: the "medium" Whisper checkpoint.
whisper_model = whisper.load_model("medium")

# Fine-tuned translation models, one tokenizer/model pair per direction.
# English -> Urdu
model_path_eng_to_urd = "./ted_talks_finetuned_model"
tokenizer_eng_to_urd = MBart50TokenizerFast.from_pretrained(
    model_path_eng_to_urd,
    src_lang="en_XX",
    tgt_lang="ur_PK",
)
model_eng_to_urd = MBartForConditionalGeneration.from_pretrained(model_path_eng_to_urd)

# Urdu -> English
model_path_urd_to_eng = "./urdu_to_english_finetuned_model"
tokenizer_urd_to_eng = MBart50TokenizerFast.from_pretrained(
    model_path_urd_to_eng,
    src_lang="ur_PK",
    tgt_lang="en_XX",
)
model_urd_to_eng = MBartForConditionalGeneration.from_pretrained(model_path_urd_to_eng)

# Run both translation models on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for translation_model in (model_eng_to_urd, model_urd_to_eng):
    translation_model.to(device)

def translate_text(text, direction):
    """Translate `text` in the requested direction with the fine-tuned models.

    Args:
        text: The sentence(s) to translate.
        direction: "eng-to-urd" or "urd-to-eng".

    Returns:
        The translated text, or an empty string when `text` is empty or only
        whitespace (nothing is sent to the model in that case).

    Raises:
        ValueError: If `direction` is not one of the two supported values.
    """
    if direction not in ("eng-to-urd", "urd-to-eng"):
        raise ValueError("Invalid translation direction!")

    # An empty transcript (e.g. silent audio) has nothing to translate; running the
    # model on it would only produce made-up output.
    if not text or not text.strip():
        return ""

    if direction == "eng-to-urd":
        tokenizer, model = tokenizer_eng_to_urd, model_eng_to_urd
    else:
        tokenizer, model = tokenizer_urd_to_eng, model_urd_to_eng

    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", max_length=150, truncation=True, padding="max_length")
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move tensors to the correct device

    # Generate translation
    outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# CORS: accept requests from any origin, with any method and any header.
# NOTE(review): wildcard origins combined with allow_credentials=True is very permissive;
# restrict allow_origins to the frontend URL if the API ever relies on cookies or auth.
cors_options = dict(
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
app.add_middleware(CORSMiddleware, **cors_options)

# Open an ngrok tunnel to port 5000 and show its public address.
public_url = ngrok.connect(5000)
print("FastAPI is accessible at:", public_url)

# Define a Pydantic model to handle the message data
class Message(BaseModel):
    """Request body accepted by the POST /send-message endpoint."""
    # Text sent by the client; the endpoint returns it unchanged.
    message: str

# Root endpoint (for testing)
@app.get("/")
async def root():
    """Return a fixed welcome payload (test endpoint)."""
    welcome = {"message": "Welcome to the FastAPI app!"}
    return welcome

# Hello World endpoint (for testing)
@app.get("/hello")
async def hello_world():
    """Return a fixed "Hello World" payload (test endpoint)."""
    greeting = {"message": "Hello World"}
    return greeting

# Favicon endpoint (to avoid 404 error for GET /favicon.ico)
@app.get("/favicon.ico")
async def favicon():
    """Answer favicon requests with an empty JSON object instead of a 404."""
    empty_body = {}
    return JSONResponse(content=empty_body)

# Endpoint to accept the message from the frontend via POST
@app.post("/send-message")
async def send_message(data: Message):
    """Echo the posted message back under the "received_message" key."""
    echoed = data.message
    return {"received_message": echoed}

# Endpoint to upload audio data and transcribe with Whisper
@app.post("/upload-audio")
async def upload_audio(request: Request):
    """Transcribe an uploaded audio clip with Whisper and translate the transcript.

    The request body is the raw audio bytes; they are written to an .opus file
    and converted to WAV with ffmpeg. The `Translation-Direction` header must be
    "eng-to-urd" or "urd-to-eng".

    Returns:
        On success, a dict with "message", "transcription" and "translation".
        A JSONResponse with status 400 for a bad header or an empty body, or 500
        when conversion, transcription or anything unexpected fails. If only the
        translation step fails, the request still succeeds and "translation" is
        set to "Translation unavailable".
    """
    import tempfile  # function-scope import keeps this block self-contained

    try:
        audio_data = await request.body()  # Read binary data
        direction = request.headers.get("Translation-Direction", None)

        if not direction or direction not in ["eng-to-urd", "urd-to-eng"]:
            return JSONResponse(
                content={"error": "Invalid or missing Translation-Direction header"},
                status_code=400
            )

        if not audio_data:
            return JSONResponse(
                content={"error": "Empty audio payload"},
                status_code=400
            )

        # Per-request scratch directory: requests can no longer overwrite each other's
        # files, and everything is deleted on every exit path (including the early
        # error returns below), not only after a successful run.
        with tempfile.TemporaryDirectory() as work_dir:
            opus_path = os.path.join(work_dir, "temp_audio.opus")
            wav_path = os.path.join(work_dir, "temp_audio.wav")

            # Save the audio data temporarily as an Opus file
            with open(opus_path, "wb") as f:
                f.write(audio_data)

            # Convert Opus to WAV using ffmpeg command line
            try:
                subprocess.run(
                    ["ffmpeg", "-y", "-i", opus_path, wav_path],
                    check=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE
                )
            except subprocess.CalledProcessError as e:
                # errors="replace": stderr is not guaranteed to be valid UTF-8.
                error_message = e.stderr.decode(errors="replace")
                print(f"FFmpeg error: {error_message}")
                return JSONResponse(
                    content={"error": "Audio conversion failed", "details": error_message},
                    status_code=500
                )

            # Transcribe the audio using the local Whisper model
            try:
                if direction == "eng-to-urd":
                    result = whisper_model.transcribe(wav_path)
                else:
                    result = whisper_model.transcribe(wav_path, language="Urdu")
                transcription = result["text"]
            except Exception as e:
                print(f"Whisper transcription error: {str(e)}")
                return JSONResponse(
                    content={"error": "Transcription failed", "details": str(e)},
                    status_code=500
                )

            # Translation is best-effort: a failure here still returns the transcription.
            try:
                translation = translate_text(transcription, direction)
            except Exception as e:
                print(f"Translation error: {str(e)}")
                translation = "Translation unavailable"

            # Return the transcription result
            return {"message": "Audio received", "transcription": transcription, "translation" : translation}

    except Exception as e:
        print(f"General error: {str(e)}")
        return JSONResponse(
            content={"error": "Unexpected error occurred", "details": str(e)},
            status_code=500
        )

# To run the server with FastAPI in Jupyter/Colab
if __name__ == "__main__":
    # Serve on port 5000, the same port the ngrok tunnel is connected to.
    uvicorn.run(app, host="0.0.0.0", port=5000)
