In [None]:
!pip install --q git+https://github.com/m-bain/whisperx.git

  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import whisperx
import gc

  torchaudio.set_audio_backend("soundfile")


In [None]:
# Inference settings shared by the model-loading, transcription, alignment and diarization cells.
device = "cpu"
batch_size = 1 # already the smallest batch; raise it for speed if memory allows
compute_type = "int8" # "int8" is the low-memory choice (may reduce accuracy)

In [None]:
# Install necessary packages
!pip install pydub simpleaudio ffmpeg-python
!pip install ipywidgets



In [None]:

import io
import ffmpeg
import numpy as np
import wave
from pydub import AudioSegment
from pydub.playback import play
import simpleaudio as sa
import IPython.display as ipd
import ipywidgets as widgets
from google.colab import output
from base64 import b64decode
from IPython.display import Javascript, display
from google.colab.output import eval_js

# Path of the most recent recording; set by stop_button_clicked.
audio_file = None

# JavaScript recorder that is injected into the notebook front end.
#   startRecording(): asks for microphone access and starts a MediaRecorder.
#   stopRecording():  stops the recorder, releases the microphone and resolves
#                     with the captured audio as a base64 data URL. It rejects
#                     instead of hanging when no recording is in progress.
RECORD_JS = """
var my_recorder;
var audio_chunks;
var stream;

async function startRecording() {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    my_recorder = new MediaRecorder(stream);
    audio_chunks = [];
    my_recorder.ondataavailable = e => audio_chunks.push(e.data);
    my_recorder.start();
}

function stopRecording() {
    return new Promise((resolve, reject) => {
        if (!my_recorder || my_recorder.state === 'inactive') {
            reject(new Error('No recording in progress'));
            return;
        }
        my_recorder.onstop = () => {
            // Release the microphone; otherwise the browser keeps capturing.
            stream.getTracks().forEach(track => track.stop());
            const blob = new Blob(audio_chunks);
            const reader = new FileReader();
            reader.onloadend = () => resolve(reader.result);
            reader.onerror = () => reject(reader.error);
            reader.readAsDataURL(blob);
        };
        my_recorder.stop();
    });
}
"""

# Function to start recording
def start_recording_js():
    """Inject the recorder script into the output area and start capturing.

    Displays RECORD_JS as a Javascript output, then evaluates
    ``startRecording()`` in the browser through ``eval_js``.
    """
    recorder_script = Javascript(RECORD_JS)
    display(recorder_script)
    eval_js('startRecording()')

# Function to stop recording
def stop_recording_js(filename='recorded_audio.wav'):
    """Stop the browser recorder and save the captured audio to ``filename``.

    Args:
        filename: Destination path. When it ends in ``.wav`` the captured audio
            is transcoded so the file really contains WAV data; for any other
            extension the recorded bytes are written unchanged.

    Returns:
        ``filename``, after the file has been written.

    Raises:
        ValueError: If ``stopRecording()`` did not return a base64 data URL.
    """
    audio_data = eval_js('stopRecording()')
    if not audio_data or ',' not in audio_data:
        raise ValueError("stopRecording() did not return a base64 data URL")
    # A data URL looks like "data:<mime>;base64,<payload>"; keep the payload only.
    audio_bytes = b64decode(audio_data.split(',', 1)[1])

    if filename.lower().endswith('.wav'):
        # MediaRecorder output is a browser-chosen compressed container
        # (typically WebM/Ogg), not PCM WAV, so transcode instead of merely
        # naming the file ".wav".
        AudioSegment.from_file(io.BytesIO(audio_bytes)).export(filename, format='wav')
    else:
        with open(filename, 'wb') as f:
            f.write(audio_bytes)

    return filename

# Button handlers
def start_button_clicked(b):
    """Click handler for the start button: begin recording, then report it.

    ``b`` is the button instance passed by ipywidgets; it is not used.
    """
    start_recording_js()
    status = "Recording started..."
    with output_box:
        print(status)

def stop_button_clicked(b):
    """Click handler for the stop button: save the recording and report its path.

    Stores the saved path in the module-level ``audio_file``. ``b`` is the
    button instance passed by ipywidgets; it is not used.
    """
    global audio_file
    saved_path = stop_recording_js()
    audio_file = saved_path
    with output_box:
        print(f"Recording stopped. Audio saved to {audio_file}")

# Status area that the click handlers print into
output_box = widgets.Output()

# Start/stop controls, each wired to its handler right after creation
start_button = widgets.Button(description="Start Recording")
start_button.on_click(start_button_clicked)

stop_button = widgets.Button(description="Stop Recording")
stop_button.on_click(stop_button_clicked)

# Render both controls followed by the status area
display(start_button, stop_button, output_box)


Button(description='Start Recording', style=ButtonStyle())

Button(description='Stop Recording', style=ButtonStyle())

Output()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Recording started...
Recording stopped. Audio saved to recorded_audio.wav


In [None]:
# Select the input for the transcription cells.
# Presumably the file written by stop_recording_js (same default file name) - confirm the working directory is /content.
audio_file = "/content/recorded_audio.wav"

In [None]:
# Alternative input: a fixed clip under /content. Running this cell rebinds audio_file,
# replacing whatever the previous cell assigned.
audio_file = "/content/The genius of Satya Nadella Sam Altman and Lex Fridman [TubeRipper.com].wav"

In [None]:
# Load the file at audio_file with whisperx.load_audio; `audio` is reused by the
# transcription, alignment and diarization cells below.
audio = whisperx.load_audio(audio_file)

In [None]:
# Load the "large-v2" WhisperX model using the device and compute_type set in the configuration cell.
model = whisperx.load_model("large-v2", device, compute_type=compute_type)


vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.80k [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

No language specified, language will be first be detected for each audio file (increases inference time).


100%|█████████████████████████████████████| 16.9M/16.9M [00:00<00:00, 19.2MiB/s]
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.0+cu121. Bad things might happen unless you revert torch to 1.x.


In [None]:
# 1. Transcribe the loaded audio with the WhisperX model
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
# The alignment model is chosen from the language detected during transcription.
# `result` is rebound below: from here on it holds the aligned output, not the raw transcription.
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)


Detected language: en (1.00) in first 30s of audio...
[{'text': " Well, let me just ask you as an aside about Satya Nadella, the CEO of Microsoft. He seems to have successfully transformed Microsoft into this fresh, innovative, developer-friendly company. I agree. I mean, it's really hard to do for a very large company. What have you learned from him? Why do you think he was able to do this kind of thing?", 'start': 3.029, 'end': 28.439}, {'text': ' What insights do you have about why this one human being is able to contribute to the pivot of a large company into something very new? I think most CEOs are either great leaders or great managers. And from what I have observed with Satya, he is both.', 'start': 28.439, 'end': 51.391}, {'text': " Super visionary, really like gets people excited, really makes long duration and correct calls. And also he is just a super effective hands-on executive and I assume manager too. And I think that's pretty rare.", 'start': 52.807, 'end': 72.858}, {'

In [None]:
# Inspect the aligned result: each segment carries per-word start/end/score entries.
result

{'segments': [{'start': 3.029,
   'end': 8.211,
   'text': ' Well, let me just ask you as an aside about Satya Nadella, the CEO of Microsoft.',
   'words': [{'word': 'Well,', 'start': 3.029, 'end': 3.189, 'score': 0.762},
    {'word': 'let', 'start': 3.209, 'end': 3.269, 'score': 0.043},
    {'word': 'me', 'start': 3.289, 'end': 3.349, 'score': 0.941},
    {'word': 'just', 'start': 3.389, 'end': 3.529, 'score': 0.768},
    {'word': 'ask', 'start': 3.569, 'end': 3.749, 'score': 0.911},
    {'word': 'you', 'start': 3.769, 'end': 3.929, 'score': 0.906},
    {'word': 'as', 'start': 4.71, 'end': 4.81, 'score': 0.863},
    {'word': 'an', 'start': 4.89, 'end': 4.99, 'score': 0.534},
    {'word': 'aside', 'start': 5.03, 'end': 5.49, 'score': 0.878},
    {'word': 'about', 'start': 5.53, 'end': 5.77, 'score': 0.908},
    {'word': 'Satya', 'start': 6.11, 'end': 6.45, 'score': 0.583},
    {'word': 'Nadella,', 'start': 6.47, 'end': 6.931, 'score': 0.852},
    {'word': 'the', 'start': 7.151, 'end': 

In [None]:
# Read the Hugging Face access token from Colab secrets rather than embedding it
# in the notebook. The token that used to be hardcoded here was published with
# the notebook and should be revoked.
from google.colab import userdata

hf_token_secret_name = 'HF_TOKEN'
diarize_model = whisperx.DiarizationPipeline(use_auth_token=userdata.get(hf_token_secret_name),
                                             device=device)

In [None]:
# Run speaker diarization on the loaded audio. min_speakers and max_speakers are both 2,
# which constrains the pipeline to two speakers; change both for other recordings.
diarize_segments = diarize_model(audio, min_speakers=2, max_speakers=2)

In [None]:
# Preview the diarization table (columns: segment, label, speaker, start, end).
diarize_segments

Unnamed: 0,segment,label,speaker,start,end
0,[ 00:00:03.013 --> 00:00:04.134],A,SPEAKER_00,3.013582,4.134126
1,[ 00:00:04.269 --> 00:00:08.446],B,SPEAKER_00,4.269949,8.44652
2,[ 00:00:08.565 --> 00:00:11.502],C,SPEAKER_00,8.565365,11.502547
3,[ 00:00:11.808 --> 00:00:13.370],D,SPEAKER_00,11.808149,13.370119
4,[ 00:00:13.726 --> 00:00:36.358],E,SPEAKER_00,13.726655,36.358234
5,[ 00:00:36.511 --> 00:00:37.478],F,SPEAKER_00,36.511036,37.478778
6,[ 00:00:38.769 --> 00:00:39.719],G,SPEAKER_01,38.7691,39.719864
7,[ 00:00:42.147 --> 00:00:45.339],H,SPEAKER_01,42.147708,45.339559
8,[ 00:00:46.612 --> 00:00:51.349],I,SPEAKER_01,46.612903,51.349745
9,[ 00:00:52.809 --> 00:01:02.266],J,SPEAKER_01,52.809847,62.266553


In [None]:
# List the distinct speaker labels found by diarization.
diarize_segments.speaker.unique()

array(['SPEAKER_00', 'SPEAKER_01'], dtype=object)

In [None]:
# Attach the speaker labels from diarize_segments to the aligned transcript.
# `result` is both the input and the output, so this cell expects the aligned `result`
# produced by the alignment cell.
result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

                              segment label     speaker       start  \
0   [ 00:00:03.013 -->  00:00:04.134]     A  SPEAKER_00    3.013582   
1   [ 00:00:04.269 -->  00:00:08.446]     B  SPEAKER_00    4.269949   
2   [ 00:00:08.565 -->  00:00:11.502]     C  SPEAKER_00    8.565365   
3   [ 00:00:11.808 -->  00:00:13.370]     D  SPEAKER_00   11.808149   
4   [ 00:00:13.726 -->  00:00:36.358]     E  SPEAKER_00   13.726655   
5   [ 00:00:36.511 -->  00:00:37.478]     F  SPEAKER_00   36.511036   
6   [ 00:00:38.769 -->  00:00:39.719]     G  SPEAKER_01   38.769100   
7   [ 00:00:42.147 -->  00:00:45.339]     H  SPEAKER_01   42.147708   
8   [ 00:00:46.612 -->  00:00:51.349]     I  SPEAKER_01   46.612903   
9   [ 00:00:52.809 -->  00:01:02.266]     J  SPEAKER_01   52.809847   
10  [ 00:01:04.083 -->  00:01:10.772]     K  SPEAKER_01   64.083192   
11  [ 00:01:11.570 -->  00:01:12.860]     L  SPEAKER_01   71.570458   
12  [ 00:01:15.254 -->  00:01:19.753]     M  SPEAKER_00   75.254669   
13  [ 

In [None]:
# Inspect the result again: the word entries now carry a 'speaker' field.
result

{'segments': [{'start': 3.029,
   'end': 8.211,
   'text': ' Well, let me just ask you as an aside about Satya Nadella, the CEO of Microsoft.',
   'words': [{'word': 'Well,',
     'start': 3.029,
     'end': 3.189,
     'score': 0.762,
     'speaker': 'SPEAKER_00'},
    {'word': 'let',
     'start': 3.209,
     'end': 3.269,
     'score': 0.043,
     'speaker': 'SPEAKER_00'},
    {'word': 'me',
     'start': 3.289,
     'end': 3.349,
     'score': 0.941,
     'speaker': 'SPEAKER_00'},
    {'word': 'just',
     'start': 3.389,
     'end': 3.529,
     'score': 0.768,
     'speaker': 'SPEAKER_00'},
    {'word': 'ask',
     'start': 3.569,
     'end': 3.749,
     'score': 0.911,
     'speaker': 'SPEAKER_00'},
    {'word': 'you',
     'start': 3.769,
     'end': 3.929,
     'score': 0.906,
     'speaker': 'SPEAKER_00'},
    {'word': 'as',
     'start': 4.71,
     'end': 4.81,
     'score': 0.863,
     'speaker': 'SPEAKER_00'},
    {'word': 'an',
     'start': 4.89,
     'end': 4.99,
     's

In [None]:
# Build a "SPEAKER: text" transcript with one line per segment.
# WhisperX can leave 'speaker' unset on a segment that no diarization turn
# overlaps, so fall back to a placeholder instead of raising KeyError.
output_string = "".join(
    f"{segment.get('speaker', 'UNKNOWN')}: {segment['text'].strip()}\n"
    for segment in result['segments']
)

print(output_string)

SPEAKER_00: Well, let me just ask you as an aside about Satya Nadella, the CEO of Microsoft.
SPEAKER_00: He seems to have successfully transformed Microsoft into this fresh, innovative, developer-friendly company.
SPEAKER_00: I agree.
SPEAKER_00: I mean, it's really hard to do for a very large company.
SPEAKER_00: What have you learned from him?
SPEAKER_00: Why do you think he was able to do this kind of thing?
SPEAKER_00: What insights do you have about why this one human being is able to contribute to the pivot of a large company into something very new?
SPEAKER_01: I think most CEOs are either great leaders or great managers.
SPEAKER_01: And from what I have observed with Satya, he is both.
SPEAKER_01: Super visionary, really like gets people excited, really makes long duration and correct calls.
SPEAKER_01: And also he is just a super effective hands-on executive and I assume manager too.
SPEAKER_01: And I think that's pretty rare.
SPEAKER_00: I mean, Microsoft, I'm guessing, like 

In [None]:
#@title Configure Gemini API key

import google.generativeai as genai
from google.colab import userdata

gemini_api_secret_name = 'GOOGLE_API_KEY'  # @param {type: "string"}

# Look the key up in Colab secrets and pass it to the Gemini client. Every
# failure path prints a hint for the user and then re-raises the same exception.
try:
    GOOGLE_API_KEY = userdata.get(gemini_api_secret_name)
    genai.configure(api_key=GOOGLE_API_KEY)
except userdata.SecretNotFoundError as err:
    # No secret with this name is stored in Colab.
    print(f'Secret not found\n\nThis expects you to create a secret named {gemini_api_secret_name} in Colab\n\nVisit https://makersuite.google.com/app/apikey to create an API key\n\nStore that in the secrets section on the left side of the notebook (key icon)\n\nName the secret {gemini_api_secret_name}')
    raise err
except userdata.NotebookAccessError as err:
    # The secret exists but this notebook has not been granted access to it.
    print(f'You need to grant this notebook access to the {gemini_api_secret_name} secret in order for the notebook to access Gemini on your behalf.')
    raise err
except Exception as err:
    # unknown error
    print(f"There was an unknown error. Ensure you have a secret {gemini_api_secret_name} stored in Colab and it's a valid key from https://makersuite.google.com/app/apikey")
    raise err

In [None]:
# Connect to the API and ask Gemini to summarize the diarized transcript.

# Blank line after the instruction so it does not run into the first speaker label.
text = 'Summarize the following, keep important details:\n\n' + output_string

# Use a dedicated name: `model` is the WhisperX model loaded earlier and must
# stay intact so the transcription cell can be re-run.
gemini_model = genai.GenerativeModel('gemini-pro')
chat = gemini_model.start_chat(history=[])

response = chat.send_message(text)
response.text

'Satya Nadella, CEO of Microsoft, has successfully transformed the company into an innovative and developer-friendly organization. He is both a visionary leader and an effective manager. Nadella has been able to pivot Microsoft by providing clear and firm leadership while also showing compassion and patience towards his employees. He has not ruled by fear, but rather by love.'

In [1]:
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
!pip install pyaudio

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libasound2-dev is already the newest version (1.2.6.1-1ubuntu1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
Suggested packages:
  portaudio19-doc
The following NEW packages will be installed:
  libportaudio2 libportaudiocpp0 portaudio19-dev
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 188 kB of archives.
After this operation, 927 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libportaudio2 amd64 19.6.0-1.1 [65.3 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libportaudiocpp0 amd64 19.6.0-1.1 [16.1 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 portaudio19-dev amd64 19.6.0-1.1 [106 kB]
Fetched 188 kB in 2s (124 kB/s)
Selecting previously unselected package libportaudio2:amd64.
(Reading database ... 121913 files and directories currently installed.)
Pre

In [2]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [4]:
!pip install --q git+https://github.com/m-bain/whisperx.git

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.7/208.7 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.3/192.3 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m98.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m111.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m100.4 MB/s[0m eta 

In [10]:
!pip install pyaudio whisper transformers torch

Collecting whisper
  Downloading whisper-1.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: whisper
  Building wheel for whisper (setup.py) ... [?25l[?25hdone
  Created wheel for whisper: filename=whisper-1.1.10-py3-none-any.whl size=41120 sha256=2b94b9acb417f02d5c30ad24210d0db073cea65f087cb5c0acaddc1a18a1f2db
  Stored in directory: /root/.cache/pip/wheels/aa/7c/1d/015619716e2facae6631312503baf3c3220e6a9a3508cb14b6
Successfully built whisper
Installing collected packages: whisper
Successfully installed whisper-1.1.10


In [5]:
import whisperx
import gc

  torchaudio.set_audio_backend("soundfile")


In [17]:
import pyaudio
from pydub import AudioSegment
from transformers import WhisperModel

import base64
import google.colab
import IPython.display

# Define record in js
# record(time): captures `time` milliseconds of microphone audio, releases the
# microphone, and resolves with the capture as a base64 data URL.
script = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.target.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream)
  const chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    // Release the microphone once the capture is finished.
    stream.getTracks().forEach(track => track.stop())
    const blob = new Blob(chunks)
    const text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(duration=3, file_path=None):
    """Record audio from the browser microphone and return the raw bytes.

    Args:
        duration: Requested recording length in seconds. One extra second is
            added to the capture window that is passed to the browser.
        file_path: Optional path; when given, the recorded bytes are also
            written to this file.

    Returns:
        The recorded audio as bytes, decoded from the base64 data URL returned
        by the JavaScript ``record`` helper.
    """
    print(f"{duration}s recording in progress")
    # Use module-qualified names: this cell imports IPython.display itself,
    # whereas bare display/Javascript exist only if an earlier cell imported them.
    IPython.display.display(IPython.display.Javascript(script))
    capture_ms = (duration + 1) * 1000
    audio_data = google.colab.output.eval_js('record(%d)' % capture_ms)
    print("Audio recording complete")

    # A data URL looks like "data:<mime>;base64,<payload>"; keep the payload only.
    audio_binary = base64.b64decode(audio_data.split(',')[1])

    if file_path:
        with open(file_path, 'wb') as f:
            f.write(audio_binary)
        print(f"Audio saved to {file_path}")

    return audio_binary

def record_chunk(p, stream, file_path, chunk_length=1, rate=16000,
                 frames_per_buffer=1024, channels=1, sample_width=2):
    """Read ``chunk_length`` seconds from an open input stream into a WAV file.

    Args:
        p: The PyAudio instance that owns ``stream``. Not used; kept so the
            existing call signature stays valid.
        stream: Open input stream exposing ``read(num_frames, exception_on_overflow=...)``.
        file_path: Destination path of the WAV file.
        chunk_length: Length of audio to capture, in seconds.
        rate: Sample rate of ``stream`` in Hz. The default matches the stream
            opened in ``main2``.
        frames_per_buffer: Maximum number of frames requested per read.
        channels: Number of channels in ``stream``.
        sample_width: Bytes per sample; 2 corresponds to 16-bit samples.
    """
    import wave  # local import so the block does not depend on another cell

    frames = []
    remaining = int(rate * chunk_length)
    while remaining > 0:
        num_frames = min(frames_per_buffer, remaining)
        # Do not raise if the input buffer overflowed while the caller was
        # busy between two chunks.
        frames.append(stream.read(num_frames, exception_on_overflow=False))
        remaining -= num_frames

    with wave.open(file_path, 'wb') as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(sample_width)
        wav_file.setframerate(rate)
        wav_file.writeframes(b''.join(frames))

def main2(device="cuda", compute_type="float16", chunk_file="temp_chunk.wav"):
    """Record microphone chunks in a loop and transcribe each one with WhisperX.

    The loop runs until it is interrupted (KeyboardInterrupt).

    Args:
        device: Device passed to ``whisperx.load_model``.
        compute_type: Compute type passed to ``whisperx.load_model``.
        chunk_file: Temporary WAV file that each chunk is written to.

    Returns:
        The concatenated text of all chunks transcribed before the interrupt.
    """
    import os  # local import: this cell does not import os at the top

    model = whisperx.load_model("large-v2", device=device, compute_type=compute_type)
    p = pyaudio.PyAudio()
    stream = None
    accumulated_transcription = ""
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
        while True:
            record_chunk(p, stream, chunk_file)
            audio = whisperx.load_audio(chunk_file)
            transcription = model.transcribe(audio, batch_size=16)
            print(transcription)
            os.remove(chunk_file)
            # transcribe() returns a dict; the text lives in its "segments" entries.
            accumulated_transcription += "".join(
                segment["text"] for segment in transcription["segments"]
            )
    except KeyboardInterrupt:
        pass
    finally:
        # Release the audio device and PortAudio, and drop a leftover chunk file.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()
        if os.path.exists(chunk_file):
            os.remove(chunk_file)
    return accumulated_transcription

# NOTE(review): inside a notebook __name__ is "__main__", so main2() runs here first;
# it loops with `while True` until interrupted, and the recording demo below only runs afterwards.
if __name__ == "__main__":
    main2()

# Call record() with no arguments and render the returned audio as an inline player.
test = IPython.display.Audio(record())
IPython.display.display(test)

RuntimeError: CUDA failed with error out of memory