In [None]:
!pip install -U openai-whisper

Collecting openai-whisper
  Downloading openai-whisper-20231117.tar.gz (798 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.6/798.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->openai-whisper)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->openai-whisper)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12

In [None]:
import whisper

# Load the pre-trained Whisper model
model = whisper.load_model("small")

# Function to transcribe audio
def transcribe_audio(audio_path):
    result = model.transcribe(audio_path)
    return result["text"]

# Example usage
audio_path = "/content/WhatsApp Audio 2024-07-14 at 8.16.30 PM.aac"
transcription = transcribe_audio(audio_path)
print("Transcription:", transcription)




Transcription:  The stale smell of old beer lingers. It takes heat to bring out the odor. A cold dip restores health and zest. A salt pickle tastes fine with ham. Tacos al pastor are my favorite. A zestful food is the hot cross bun.


In [15]:
from transformers import MarianMTModel, MarianTokenizer, T5Tokenizer, T5ForConditionalGeneration, DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import torch

# Function to load the translation model based on source and target languages
def load_translation_model(src_lang, tgt_lang):
    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model

# Function to translate text
def translate_text(text, src_lang="en", tgt_lang="en"):
    tokenizer, model = load_translation_model(src_lang, tgt_lang)
    translated = model.generate(**tokenizer(text, return_tensors="pt", padding=True))
    translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    return translated_text

# Load the T5 model
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Load the DPR models
dpr_context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
dpr_context_model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

dpr_question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
dpr_question_model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# Dummy RAG Document
documents = [
    {
        "id": 1,
        "title": "Sample Document",
        "content": "This is a sample RAG document. It contains some text that will be used for querying the RAG model."
    }
]

# Encode the documents
document_embeddings = []
for doc in documents:
    inputs = dpr_context_tokenizer(doc["content"], return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        doc_embedding = dpr_context_model(**inputs).pooler_output
    document_embeddings.append(doc_embedding)

# Function to query using DPR and T5
def query_dpr_t5(question):
    # Encode the question
    question_inputs = dpr_question_tokenizer(question, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        question_embedding = dpr_question_model(**question_inputs).pooler_output

    # Compute similarities
    similarities = torch.matmul(question_embedding, torch.cat(document_embeddings, dim=0).T)
    most_similar_doc_idx = torch.argmax(similarities, dim=1).item()

    # Retrieve the most similar document
    retrieved_doc = documents[most_similar_doc_idx]["content"]

    # Generate a response using T5
    input_text = f"question: {question} context: {retrieved_doc}"
    input_ids = t5_tokenizer(input_text, return_tensors="pt").input_ids
    outputs = t5_model.generate(input_ids)
    response = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Function to query the model using speech
def query_with_speech(audio_path, src_lang, tgt_lang):
    # Transcribe the audio
    transcription = transcribe_audio(audio_path)

    # Translate the transcription
    translated_text = translate_text(transcription, src_lang=src_lang, tgt_lang=tgt_lang)

    # Query using DPR and T5
    response = query_dpr_t5(translated_text)
    return response

# Example usage
audio_path = "/content/WhatsApp Audio 2024-07-14 at 8.16.30 PM.aac"
response1 = query_with_speech(audio_path, src_lang="en", tgt_lang="hi")
response2 = query_with_speech(audio_path, src_lang="en", tgt_lang="de")
response3 = query_with_speech(audio_path, src_lang="en", tgt_lang="fr")
print("Response in hi :", response1)
print("Response in de :", response2)
print("Response in fr :", response3)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expec

Response in hi : 
Response in de : Ein kaltes Dip stellt Gesundheit und Lust wieder her.
Response in fr : une cornichon de sel goûte bien avec le jambon. Tacos al
