In [1]:

%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0,'../library')

In [2]:
import pandas as pd
from audio.audio import AudioSlicer
import uuid
from vexa.tools import log
from dotenv import load_dotenv
import os
load_dotenv()
STREAM_API_PORT = os.getenv('STREAM_API_PORT')
DATA_PATH = os.getenv('DATA_PATH')
SERVICE_TOKEN = os.getenv('SERVICE_TOKEN')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
STREAM_API_PORT

'8000'

In [4]:
from vexa.redis import get_redis

In [5]:
redis_client = await get_redis(host='redis',port=6379)

In [6]:
import httpx

In [7]:
async def get_connections():

    url = f"http://host.docker.internal:{STREAM_API_PORT}/list_connections"
    params = {
            "service_token": SERVICE_TOKEN
        }
    headers = {
        "Content-Type": "application/json"
    }


    async with httpx.AsyncClient() as client:
            response = await client.get(url, params=params, headers=headers, timeout=2)
    
    return response.json()['connections']



In [8]:
async def get_next_audio_chunk(connection_id,num_chunks=100):
    #TODO: add error handlingdepending in case 
    url = f"http://host.docker.internal:{STREAM_API_PORT}/get_next_chunk/{connection_id}"
    headers = {
            "Content-Type": "application/json"
        }
    params = {
                "service_token": SERVICE_TOKEN,
                "num_chunks": num_chunks
            }


    async with httpx.AsyncClient() as client:
        response = await client.get(url, params=params, headers=headers, timeout=2)
        if response.json() == {'message': 'No more chunks available for this connection'}:
            return 
    return response.json()
    

In [9]:
import httpx
import asyncio
import os

async def fetch_chunks(connection_id: str, num_chunks: int):
    # Define the URL to your API endpoint
    url = f"http://host.docker.internal:{STREAM_API_PORT}/get_next_chunks/{connection_id}"
    
    # Define the parameters
    params = {
        "service_token": SERVICE_TOKEN,
        "num_chunks": num_chunks
    }

    # Use an asynchronous client from httpx
    async with httpx.AsyncClient() as client:
        try:
            # Perform the GET request
            response = await client.get(url, params=params)

            # Check if the request was successful
            if response.status_code == 200:
                if response.json() == {'message': 'No more chunks available for this connection'}:
                    print(response.json())
                    return
                else:
                    return response.json()  # Return the JSON response from the API
            else:
                return {"error": "Failed to fetch chunks", "status_code": response.status_code, "details": response.text}
        except httpx.RequestError as e:
            return {"error": "An error occurred while requesting chunks", "exception": str(e)}


In [10]:
async def writestream2file(connection_id):
    path = f'/audio/{connection_id}.webm'
    first_timestamp = None
    items = await  fetch_chunks(connection_id,100)

    if items:
        for item in items['chunks']:
            chunk = bytes.fromhex(item['chunk'])
            first_timestamp = item['timestamp'] if not first_timestamp else first_timestamp
            # Open the file in append mode
            with open(path, 'ab') as file:
                # Write data to the file
                file.write(chunk)
            last_timestamp = item['timestamp']
            meeting_id = item['meeting_id']
            client_id = item['client_id']
        return meeting_id, first_timestamp,last_timestamp,client_id

In [11]:
async def get_meeting_start(timestamp):
    meeting_start = await redis_client.hget(f'Meeting:{meeting_id}','meeting_start')
    meeting_start = meeting_start if meeting_start else timestamp
    await redis_client.hset(f'Meeting:{meeting_id}','meeting_start',timestamp)
    return pd.Timestamp(meeting_start)

In [12]:
from audio.redis import Audio,Transcript,Diarisation

In [13]:
connections = await get_connections()

In [14]:
connections

[['3329123e-c417-423e-b2c6-5b3570b27ee3--0', 13],
 ['97313f28-d515-4720-8116-daf2ef1ece0f--0', 18]]

In [15]:
connections = [c[0] for c in connections]

In [16]:
connection_id = connections[0]

In [17]:
path = f'/audio/{connection_id}.webm'
start = 0
max_length = 200

In [18]:
connection_id

'3329123e-c417-423e-b2c6-5b3570b27ee3--0'

In [19]:
async def transcribe(audio_name, redis_client,client_id):
    await redis_client.lpush('Audio2TranscribeQueue', f'{audio_name}:{client_id}')
    _,done = await redis_client.brpop(f'TranscribeReady:{audio_name}',timeout=60)
    transcription =  Transcript(audio_name,redis_client)
    await transcription.get()
    return transcription.data

In [20]:

async def diarize(audio_name, redis_client,client_id):
    await redis_client.lpush('Audio2DiarizeQueue', f'{audio_name}:{client_id}')
    _,done = await redis_client.brpop(f'DiarizeReady:{audio_name}',timeout=60)
    diarization = Diarisation(audio_name, redis_client)
    await diarization.get()
    return diarization.data

In [21]:
connection_output = await writestream2file(connection_id)
if connection_output:
    meeting_id, start_timestamp,finish_timestamp, client_id = connection_output  
    meeting_start = await get_meeting_start(start_timestamp)

In [22]:
audio_slicer = await AudioSlicer.from_ffmpeg_slice(path,start,start+max_length)
slice_duration = audio_slicer.audio.duration_seconds
audio_data = await audio_slicer.export_data()

None


In [23]:
audio_name = f'Chunk_{str(uuid.uuid4())}'
audio = Audio(chunk_name=audio_name, redis_client=redis_client, data=audio_data)
await audio.save()

In [59]:
await diarize(audio_name, redis_client,client_id)

In [24]:
await transcribe(audio_name, redis_client,client_id)

[[{'start': 16.54, 'end': 17.14, 'word': ' Mm', 'probability': 0.348388671875},
  {'start': 17.14,
   'end': 17.42,
   'word': '-hmm.',
   'probability': 0.82763671875}]]

In [None]:
await audio.delete()

True

In [41]:
diarization_result, transcription_result = await asyncio.gather(
        asyncio.wait_for(diarize   (audio_name, redis_client,client_id), timeout=60),
        asyncio.wait_for(transcribe(audio_name, redis_client,client_id), timeout=60) 
    )

TypeError: cannot unpack non-iterable NoneType object

In [None]:
await audio.delete()

In [86]:
transcription_result

[[{'start': 0.88,
   'end': 1.62,
   'word': ' Бусик',
   'probability': 0.6811930338541666},
  {'start': 1.62, 'end': 1.78, 'word': ' по', 'probability': 0.9287109375},
  {'start': 1.78, 'end': 2.1, 'word': ' имени', 'probability': 0.986083984375},
  {'start': 2.1,
   'end': 2.52,
   'word': ' Муром.',
   'probability': 0.6548258463541666}],
 [{'start': 2.84, 'end': 3.44, 'word': ' Ты', 'probability': 0.412353515625},
  {'start': 7.25, 'end': 7.41, 'word': ' всё', 'probability': 0.404541015625},
  {'start': 7.41,
   'end': 7.75,
   'word': ' устал?',
   'probability': 0.64697265625},
  {'start': 7.77, 'end': 7.93, 'word': ' Давай', 'probability': 0.947265625},
  {'start': 7.93, 'end': 8.03, 'word': ' в', 'probability': 0.9833984375},
  {'start': 8.03,
   'end': 8.57,
   'word': ' кроватку.',
   'probability': 0.99267578125}],
 [{'start': 11.82, 'end': 12.12, 'word': ' Как', 'probability': 0.83984375},
  {'start': 12.12,
   'end': 12.32,
   'word': ' тебе',
   'probability': 0.97314453

In [106]:
meeting_id, start_timestamp,finish_timestamp, client_id

('a609851a-e429-44af-8590-f264b4d6c8cf--0',
 '2024-05-05T18:23:02.610603Z',
 '2024-05-05T18:23:53.971434Z',
 '1')

In [None]:
meeting_start(timestamp)

In [None]:
async def last_start(timestamp):
    last_start = await redis_client.hget(f'Meeting:{meeting_id}','last_start')
    last_start = last_start if last_start else timestamp
    await redis_client.hset(f'Meeting:{meeting_id}','last_start',timestamp)
    return pd.Timestamp(meeting_start)

In [177]:
await meeting_start(timestamp)

Timestamp('2024-05-03 20:32:12.414516+0000', tz='UTC')

In [103]:
start = await redis_client.rpop(f'Start:{meeting_id}')
start = float(start) if start else pd.Timestamp(timestamp)

In [132]:
step = pd.Timedelta('1 min')

In [133]:
(pd.Timestamp.utcnow()-start)>step

True

In [135]:
from audio.audio import *

In [139]:
max_length = 200

In [140]:
path = f'/audio/{connection_id}.webm'

In [181]:
meeting_start = await meeting_start(timestamp)

In [141]:
audio_slicer = await AudioSlicer.from_ffmpeg_slice(path,start,start+max_length)

TypeError: Addition/subtraction of integers and integer-arrays with Timestamp is no longer supported.  Instead of adding/subtracting `n`, use `n * obj.freq`