In [5]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import torchaudio  # For loading .wav files

# Speech-to-text: transcribe a local WAV file with Whisper large-v3.
# Produces `transcription` (str), which the following cells consume.

WHISPER_MODEL_ID = "openai/whisper-large-v3"
WHISPER_SAMPLE_RATE = 16000  # Whisper's feature extractor expects 16 kHz input

# 1. Load model and processor, and move the model to the GPU when one exists
processor = AutoProcessor.from_pretrained(WHISPER_MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(WHISPER_MODEL_ID)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# 2. Load the audio file. torchaudio returns a float tensor shaped
#    (channels, frames) together with the file's sample rate.
audio, sample_rate = torchaudio.load("speech.wav")

# Down-mix multi-channel recordings to mono. Without this a stereo file would
# survive squeeze() as a 2-D array and be handled as two separate utterances.
if audio.shape[0] > 1:
    audio = audio.mean(dim=0, keepdim=True)

# Resample to the rate Whisper expects, if the file uses a different one.
if sample_rate != WHISPER_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(
        orig_freq=sample_rate, new_freq=WHISPER_SAMPLE_RATE
    )
    audio = resampler(audio)
    sample_rate = WHISPER_SAMPLE_RATE

# 3. The processor takes a 1-D numpy array. Only the channel axis is dropped
#    (squeeze(0)) so a very short clip can never collapse to a 0-d scalar.
audio = audio.squeeze(0).numpy()

# 4. Convert the waveform into model input features on the model's device.
# NOTE(review): the Whisper feature extractor appears to pad/truncate to a
# 30-second window by default, so longer recordings would be cut off - confirm
# before feeding long audio.
inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

# 5. Generate token ids (inference only, so gradients are disabled)
with torch.no_grad():
    generated_ids = model.generate(**inputs)

# 6. Decode the first (and only) sequence back to text
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("Transcription:", transcription)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Transcription:  Who is Bill Gates?


In [8]:
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI

# Ask Gemini the transcribed question and request a single-sentence answer.
# Reads `transcription` from the speech-to-text cell; produces `response`.

load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")

# Whisper output carries a leading space and normally its own punctuation
# (e.g. " Who is Bill Gates?"). Strip it, and only add a question mark when the
# text has no terminal punctuation, so the prompt no longer ends in "??".
question = transcription.strip()
if not question:
    raise ValueError("Transcription is empty; there is no question to send to the model.")
if not question.endswith(("?", ".", "!")):
    question += "?"

# NOTE(review): `model` is rebound here from the Whisper model to the Gemini
# client. Re-running only the generate step of the transcription cell after
# this cell will fail; re-run that whole cell instead.
model = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=api_key)

# The instruction text below is kept verbatim so model behaviour is unchanged;
# only the trailing question line differs from the original prompt.
response = model.invoke(f"""

System Prompt and instruction:
    IS the asnwer is a large paragraph then convert the paragraph into a single line asnwer.

    Example:

        Question: 
            Who is Elon Musk?

        output:    
            line 1: Elon Musk is a South African-born American entrepreneur and businessman who founded SpaceX, co-founded Tesla, Neuralink, and OpenAI, and is known for his ambitious ventures in space exploration, electric vehicles, artificial intelligence, and sustainable energy.
            line 2: He is also the owner of X (formerly Twitter). His companies aim to revolutionize transportation both on Earth and in space, reduce global warming through sustainable energy production and consumption, and develop safe artificial intelligence for the benefit of humanity.
            line 3: Musk is a highly influential and often controversial figure, known for his innovative ideas, large personal wealth, and active presence on social media.

        Final Output:
            Elon Musk, a South African-born entrepreneur, leads companies like SpaceX, Tesla, Neuralink, and X (formerly Twitter), driving innovations in space exploration, electric vehicles, sustainable energy, and artificial intelligence to shape the future of humanity.
        
Question: 
    {question}

""")

print(response.content)

Bill Gates, an American business magnate, software developer, and philanthropist, co-founded Microsoft, revolutionizing personal computing, and now focuses on global health, education, and climate change through the Bill & Melinda Gates Foundation.


In [9]:
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
import torch
# Text-to-speech: read the model's answer aloud with Kokoro and save each
# synthesised segment to "<index>.wav" in the working directory.

# Single source of truth for the playback and file sample rate; both must agree
# or the audio plays at the wrong speed/pitch.
# NOTE(review): 24 kHz is the rate used in Kokoro's published examples - confirm
# if the model or voice is changed.
KOKORO_SAMPLE_RATE = 24000

pipeline = KPipeline(lang_code='a')

text = response.content

generator = pipeline(text, voice='af_heart')
# Each item is (grapheme text, phoneme string, waveform). The waveform is bound
# to `speech` rather than `audio` so it does not overwrite the Whisper input
# array created by the transcription cell.
for i, (graphemes, phonemes, speech) in enumerate(generator):
    print(i, graphemes, phonemes)
    # Only the first segment auto-plays, so segments do not talk over each other.
    display(Audio(data=speech, rate=KOKORO_SAMPLE_RATE, autoplay=i == 0))
    sf.write(f'{i}.wav', speech, KOKORO_SAMPLE_RATE)

  _torch_pytree._register_pytree_node(




  WeightNorm.apply(module, name, dim)


0 Bill Gates, an American business magnate, software developer, and philanthropist, co-founded Microsoft, revolutionizing personal computing, and now focuses on global health, education, and climate change through the Bill & Melinda Gates Foundation. bˈɪl ɡˈAts, ɐn əmˈɛɹəkᵊn bˈɪznəs mˈæɡnˌAt, sˈɔftwˌɛɹ dəvˈɛləpəɹ, ænd fəlˈænθɹəpɪst, kˌOfˈWndᵻd mˈIkɹəsˌɑft, ɹˌɛvəlˈuʃənˌIzɪŋ pˈɜɹsᵊnəl kəmpjˈuTɪŋ, ænd nˈW fˈOkəsᵻz ˌɔn ɡlˈObᵊl hˈɛlθ, ˌɛʤəkˈAʃən, ænd klˈImət ʧˈAnʤ θɹu ðə bˈɪl ænd məlˈɪndə ɡˈAts fWndˈAʃən.
