In [5]:

%load_ext autoreload
%autoreload 2

In [6]:
import asyncio
import io
import logging
from datetime import timedelta,datetime,timezone

from faster_whisper import WhisperModel
from app.services.audio.redis import Transcript

from app.database_redis.connection import get_redis_client
from app.services.audio.audio import AudioSlicer
from app.services.audio.redis import Meeting, Transcriber, best_covering_connection
from app.settings import settings


  from .autonotebook import tqdm as notebook_tqdm


check_and_process_connections_interval_sec: 5.0


In [7]:
overlap = 2

In [8]:
logger = logging.getLogger(__name__)

In [9]:
redis_client = await get_redis_client(settings.redis_host, settings.redis_port, settings.redis_password)

In [31]:
# Queue helper bound to the shared Redis client.
transcriber = Transcriber(redis_client)
# presumably takes the next meeting awaiting transcription and marks it in progress —
# TODO confirm against Transcriber.pop_inprogress. The recorded output shows the id is a meeting URL.
meeting_id = await transcriber.pop_inprogress()

In [32]:
meeting_id

'https://meet.google.com/zxn-bxis-tdw'

In [33]:
# Meeting state object for the id popped above; its timestamps are read in the following cells.
meeting = Meeting(redis_client, meeting_id)
# presumably fills start_timestamp / *_seek_timestamp from Redis — confirm against Meeting.load_from_redis
await meeting.load_from_redis()

In [34]:
meeting.start_timestamp

datetime.datetime(2024, 5, 22, 15, 54, 11, 870038, tzinfo=tzutc())

In [35]:
meeting.transcriber_seek_timestamp

datetime.datetime(2024, 5, 22, 15, 58, 9, 918038, tzinfo=tzutc())

In [36]:
seek = meeting.transcriber_seek_timestamp - meeting.start_timestamp

In [37]:
seek

datetime.timedelta(seconds=238, microseconds=48000)

In [38]:
current_time = datetime.now(timezone.utc)

In [39]:
connections = await meeting.get_connections()

In [40]:
connection = best_covering_connection(meeting.diarizer_seek_timestamp, current_time, connections)

In [41]:
max_length = 240

In [42]:
# Choose the connection whose recording should be sliced (same call as the previous cell).
# NOTE(review): the window starts at meeting.diarizer_seek_timestamp, while the slice offset
# `seek` below was derived from meeting.transcriber_seek_timestamp — confirm this is intended.
connection = best_covering_connection(meeting.diarizer_seek_timestamp, current_time, connections)
# Cut a slice of that connection's recording starting at offset `seek`.
# The third argument looks like a length in seconds (max_length = 240 produced a
# 240.048 s slice in this run) — TODO confirm against AudioSlicer.from_ffmpeg_slice.
audio_slicer = await AudioSlicer.from_ffmpeg_slice(f"/audio/{connection.id}.webm", seek, max_length)
# Duration of the slice that was actually produced; used later to advance the seek.
slice_duration = audio_slicer.audio.duration_seconds
# Exported slice data; it is wrapped in io.BytesIO and handed to the model in a later cell.
audio_data = await audio_slicer.export_data()

2024-05-22 16:23:30,275 - INFO - app.services.audio.audio - None


In [43]:
# Load the "large-v3" Whisper model on the CUDA device with float16 compute.
model_size = "large-v3"
model = WhisperModel(model_size, device="cuda", compute_type="float16")

In [44]:
segments, _ = model.transcribe(io.BytesIO(audio_data), beam_size=5, vad_filter=True, word_timestamps=True)
segments = [s for s in list(segments)]
logger.info("done")
result = [[w._asdict() for w in s.words] for s in segments]
transcription = Transcript(meeting_id, redis_client, result)
await transcription.lpush()

In [57]:
''.join([w['word'] for w in result[-1]])

' Не чеши батон, открошит он, и раскрошит он твой, чьи-то дон, дон, дон, дон, дон, дон, дон, дон, дон, дон, дон, дон, дон, потому что пчёл не он, гусь к нему чёрный.'

In [45]:
segments

[Segment(id=1, seek=2332, start=28.4, end=111.29, text=' Ну, теперь-то я вам что-нибудь скажу, вам что-нибудь скажу, вам, вот теперь-то я тоже вам что-нибудь скажу, а вот прямо сейчас я вам прямо что-нибудь скажу.', tokens=[50365, 7571, 11, 16983, 12, 860, 2552, 10448, 2143, 12, 23561, 21938, 585, 11, 10448, 2143, 12, 23561, 21938, 585, 11, 10448, 11, 5505, 16983, 12, 860, 2552, 12251, 10448, 2143, 12, 23561, 21938, 585, 11, 2559, 5505, 28547, 10241, 2552, 10448, 28547, 2143, 12, 23561, 21938, 585, 13, 50975], temperature=0.0, avg_logprob=-0.39853897032799657, compression_ratio=2.3035714285714284, no_speech_prob=0.11944580078125, words=[Word(start=28.4, end=28.86, word=' Ну,', probability=0.470947265625), Word(start=28.88, end=29.1, word=' теперь', probability=0.93310546875), Word(start=29.1, end=29.28, word='-то', probability=0.810791015625), Word(start=29.28, end=29.48, word=' я', probability=0.955078125), Word(start=29.48, end=29.68, word=' вам', probability=0.79931640625), Word(sta

In [46]:
seek

datetime.timedelta(seconds=238, microseconds=48000)

In [47]:
# meeting.start_timestamp + seek is the absolute start of the slice just transcribed.
# Advance the seek by the slice duration minus `overlap` seconds, so the next slice
# re-covers the last `overlap` seconds of this one.
meeting.transcriber_seek_timestamp = meeting.start_timestamp+seek +timedelta(seconds = slice_duration-overlap)
# presumably takes the meeting out of the transcriber's in-progress set — confirm against Transcriber.remove
await transcriber.remove(meeting.meeting_id)
# presumably writes the updated seek timestamp back to Redis — confirm against Meeting.update_redis
await meeting.update_redis()

In [48]:
meeting.transcriber_seek_timestamp

datetime.datetime(2024, 5, 22, 16, 2, 7, 966038, tzinfo=tzutc())

In [49]:
await transcriber.remove(meeting.meeting_id)

In [50]:
await meeting.update_redis()

In [51]:
slice_duration

240.048

In [52]:
segments

[Segment(id=1, seek=2332, start=28.4, end=111.29, text=' Ну, теперь-то я вам что-нибудь скажу, вам что-нибудь скажу, вам, вот теперь-то я тоже вам что-нибудь скажу, а вот прямо сейчас я вам прямо что-нибудь скажу.', tokens=[50365, 7571, 11, 16983, 12, 860, 2552, 10448, 2143, 12, 23561, 21938, 585, 11, 10448, 2143, 12, 23561, 21938, 585, 11, 10448, 11, 5505, 16983, 12, 860, 2552, 12251, 10448, 2143, 12, 23561, 21938, 585, 11, 2559, 5505, 28547, 10241, 2552, 10448, 28547, 2143, 12, 23561, 21938, 585, 13, 50975], temperature=0.0, avg_logprob=-0.39853897032799657, compression_ratio=2.3035714285714284, no_speech_prob=0.11944580078125, words=[Word(start=28.4, end=28.86, word=' Ну,', probability=0.470947265625), Word(start=28.88, end=29.1, word=' теперь', probability=0.93310546875), Word(start=29.1, end=29.28, word='-то', probability=0.810791015625), Word(start=29.28, end=29.48, word=' я', probability=0.955078125), Word(start=29.48, end=29.68, word=' вам', probability=0.79931640625), Word(sta

In [53]:

meeting.transcribe_seek_timestamp = end_of_last_speech + meeting.transcribe_seek_timestamp
await transcriber.remove(meeting.id)
await meeting.update_redis()

NameError: name 'end_of_last_speech' is not defined

In [54]:
logger = logging.getLogger(__name__)

# TODO: implement recurrent prompt from last item


async def process(redis_client, model, max_length=240) -> None:
    """Transcribe the next audio slice of one in-progress meeting.

    Pops a meeting id from the transcriber queue, slices up to ``max_length``
    seconds of audio starting at the meeting's transcriber seek position, runs
    the Whisper model over it, stores the recognised words and advances the
    seek position.

    Args:
        redis_client: Redis client handed to ``Transcriber``, ``Meeting`` and
            ``Transcript``.
        model: Object exposing ``transcribe(file, beam_size=..., vad_filter=...,
            word_timestamps=...)`` that returns ``(segments, info)``.
        max_length: Length of the requested slice, in seconds.

    Returns:
        None. Returns immediately when no meeting is waiting.
    """
    transcriber = Transcriber(redis_client)
    meeting_id = await transcriber.pop_inprogress()

    if not meeting_id:
        return

    meeting = Meeting(redis_client, meeting_id)
    await meeting.load_from_redis()

    # Absolute position where the previous run stopped, and the same position
    # expressed as an offset from the start of the meeting.
    seek_timestamp = meeting.transcriber_seek_timestamp
    seek = seek_timestamp - meeting.start_timestamp

    # get_connections() is a coroutine: it must be awaited before its result is used.
    connections = await meeting.get_connections()
    # NOTE(review): called with (window start, window end, connections) as two
    # datetimes plus a list — confirm against best_covering_connection's signature.
    connection = best_covering_connection(seek_timestamp, datetime.now(timezone.utc), connections)

    # The third argument is the slice length in seconds, not an end position.
    audio_slicer = await AudioSlicer.from_ffmpeg_slice(f"/audio/{connection.id}.webm", seek, max_length)
    slice_duration = audio_slicer.audio.duration_seconds
    audio_data = await audio_slicer.export_data()

    # NOTE(review): transcribe() is a synchronous call and blocks the event loop while it runs.
    segments, _ = model.transcribe(io.BytesIO(audio_data), beam_size=5, vad_filter=True, word_timestamps=True)
    segments = list(segments)
    logger.info("done")
    result = [[w._asdict() for w in (s.words or [])] for s in segments]

    spoken_segments = [words for words in result if words]
    if spoken_segments:
        transcription = Transcript(meeting_id, redis_client, result)
        await transcription.lpush()
        # Word timings are offsets from the start of the slice.
        advance = timedelta(seconds=spoken_segments[-1][-1]["end"])
    else:
        # Nothing recognised (e.g. silence removed by the VAD filter): skip the
        # whole slice instead of failing on an empty result.
        advance = timedelta(seconds=slice_duration)

    meeting.transcriber_seek_timestamp = seek_timestamp + advance
    await transcriber.remove(meeting_id)
    await meeting.update_redis()

In [None]:


async def main():
    """Poll for in-progress meetings forever.

    Opens one Redis client from the application settings, then repeats:
    wait 0.1 s, then call ``process`` with that client and the module-level
    ``model``. The loop has no exit condition; it ends only if an awaited
    call raises.
    """
    poll_interval_sec = 0.1
    client = await get_redis_client(
        settings.redis_host,
        settings.redis_port,
        settings.redis_password,
    )

    while True:
        # Sleep first, then process, so every iteration yields to the event loop.
        await asyncio.sleep(poll_interval_sec)
        await process(client, model)


# Script entry point: load the Whisper model once, then run the polling loop.
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running in the thread. The earlier cells use top-level `await`, which suggests that
# is the case in this kernel; if so, use `await main()` here — confirm how this cell
# is meant to be run.
if __name__ == "__main__":
    model_size = "large-v3"
    # Bound at module level because main() reads `model` as a global.
    model = WhisperModel(model_size, device="cuda", compute_type="float16")
    logger.info("Model loaded")
    asyncio.run(main())
