Make sure both processes get the same audio:


- choose a file
- process it via the diarizer

In [1]:
import pandas as pd
import numpy as np
import json
from datetime import timedelta
from app.settings import settings
from app.database_redis.connection import get_redis_client
from app.services.audio.redis import Diarisation,Transcript
from app.services.audio.redis import Connection, Diarizer, Meeting, Transcriber

# Substrings that are strong glitch indicators; any speech containing one of them should be removed from the output.
glitches = ['DimaTorzok',' Tchau.']


class DataPreparation:
    """Convert raw transcription / diarization chunks into time-stamped DataFrames."""

    @staticmethod
    def prep_transcripts(transcriptions):
        """Build one DataFrame of transcript segments from all chunks.

        Args:
            transcriptions: iterable of ``(segments, seek, connection_id)``
                triples. Each row of ``segments`` is indexed positionally;
                positions 2, 3 and 4 are read as start offset (seconds),
                end offset (seconds) and speech text. ``seek`` is anything
                ``pd.Timestamp`` accepts and is the origin the offsets are
                added to. ``connection_id`` is not used here.

        Returns:
            DataFrame with columns ``start``, ``end``, ``speech`` and
            ``chunk`` (the position of the source chunk in ``transcriptions``),
            with a fresh 0..n-1 index. Empty (but with those columns) when
            there is nothing to convert.
        """
        frames = []
        for n, (t, seek, _connection_id) in enumerate(transcriptions):
            # A chunk without segments has no columns 2/3/4 to select.
            if not t:
                continue
            df = pd.DataFrame(t)[[2, 3, 4]].copy()
            df.columns = ['start', 'end', 'speech']
            origin = pd.Timestamp(seek)
            df['start'] = pd.to_timedelta(df['start'], unit='s') + origin
            df['end'] = pd.to_timedelta(df['end'], unit='s') + origin
            df['chunk'] = n
            frames.append(df)
        # pd.concat raises on an empty list, so hand back an empty frame instead.
        if not frames:
            return pd.DataFrame(columns=['start', 'end', 'speech', 'chunk'])
        return pd.concat(frames).reset_index(drop=True)

    @staticmethod
    def prep_diarizations(diarizations):
        """Build one DataFrame of diarization segments from all chunks.

        Args:
            diarizations: iterable of ``(segments, seek, connection_id)``
                triples. ``segments`` must be convertible by ``pd.DataFrame``
                into a frame with ``start`` and ``end`` offsets in seconds;
                any other fields are carried through unchanged. ``seek`` is
                the origin the offsets are added to. ``connection_id`` is
                not used here.

        Returns:
            DataFrame with ``start`` / ``end`` converted to absolute
            timestamps and a fresh 0..n-1 index. When there is nothing to
            convert, an empty frame with columns ``start``, ``end``,
            ``speaker``.
        """
        frames = []
        for d, seek, _connection_id in diarizations:
            # An empty chunk would produce a frame without 'start'/'end'.
            if not d:
                continue
            df = pd.DataFrame(d)
            origin = pd.Timestamp(seek)
            df['start'] = pd.to_timedelta(df['start'], unit='s') + origin
            df['end'] = pd.to_timedelta(df['end'], unit='s') + origin
            frames.append(df)
        if not frames:
            return pd.DataFrame(columns=['start', 'end', 'speaker'])
        return pd.concat(frames).reset_index(drop=True)

class DiarizationProcessor:
    """Attach speaker labels from diarization segments to transcript segments."""

    @staticmethod
    def apply_diarization(trans_df, diar_df):
        """Label each transcript segment with its best-overlapping speaker.

        For every row of ``trans_df`` the time overlap with every row of
        ``diar_df`` is computed; the speaker of the row with the largest
        strictly positive overlap is assigned (first row wins on ties).
        Segments that overlap nothing get no ``speaker`` value.

        Args:
            trans_df: DataFrame with ``start`` and ``end`` columns.
            diar_df: DataFrame with ``start``, ``end`` and ``speaker``
                columns. It is only read, never modified.

        Returns:
            A new DataFrame rebuilt from the rows of ``trans_df`` (index
            reset). A ``speaker`` column is present only if at least one
            segment was matched.
        """
        segments = trans_df.to_dict('records')
        # Nothing to match against: return the segments unlabeled.
        if diar_df.empty:
            return pd.DataFrame(segments)

        starts = diar_df['start']
        ends = diar_df['end']
        speakers = diar_df['speaker']
        no_overlap = pd.Timedelta(0)

        for seg in segments:
            # Overlap = min(end, seg end) - max(start, seg start); kept in a
            # local Series so the caller's diar_df is left untouched.
            overlap = ends.clip(upper=seg['end']) - starts.clip(lower=seg['start'])
            best = speakers[(overlap == overlap.max()) & (overlap > no_overlap)]
            if len(best) > 0:
                seg['speaker'] = best.iloc[0]
        return pd.DataFrame(segments)

class RedisManager:
    """Collect a meeting's diarization and transcription chunks via Redis.

    Chunks are drained from the live ``Diarisation`` / ``Transcript`` queues
    and kept in two per-meeting Redis lists,
    ``"<meeting_id>:diarizations"`` and ``"<meeting_id>:transcriptions"``,
    so that they survive between runs.

    NOTE: every fetch pops the whole stored list, appends the new items and
    pushes everything back; the cycle is not atomic.
    """

    def __init__(self, redis_host, redis_port, redis_password):
        self.redis_client = None
        self.diarization = None
        self.transcript = None
        self.meeting = None
        # Set by initialize(); declared here so the attribute always exists.
        self.meeting_id = None
        self.redis_host = redis_host
        self.redis_port = redis_port
        self.redis_password = redis_password
        self.diarizations = []
        self.transcriptions = []

    async def initialize(self, meeting_id):
        """Connect to Redis and bind this manager to ``meeting_id``."""
        self.meeting_id = meeting_id
        self.redis_client = await get_redis_client(self.redis_host, self.redis_port, self.redis_password)
        self.diarization = Diarisation(meeting_id, redis_client=self.redis_client)
        self.transcript = Transcript(meeting_id, redis_client=self.redis_client)
        self.meeting = Meeting(self.redis_client, meeting_id)

    @staticmethod
    async def _drain(pop):
        """Await ``pop()`` until it returns a falsy value; return the JSON-decoded items in pop order."""
        items = []
        while True:
            raw = await pop()
            if not raw:
                break
            items.append(json.loads(raw))
        return items

    # TODO: the two fetch_* methods are supposed to be replaced with the
    # corresponding audio service endpoints
    # (e.g. await self.__audio_service_api.get_transcriber_segments()).
    async def fetch_diarizations(self):
        """Return stored diarizations plus everything newly queued, and re-store them all."""
        await self.load_diarizations()
        self.diarizations.extend(await self._drain(self.diarization.rpop))
        await self.store_diarizations()
        return self.diarizations

    async def fetch_transcriptions(self):
        """Return stored transcriptions plus everything newly queued, and re-store them all."""
        await self.load_transcriptions()
        self.transcriptions.extend(await self._drain(self.transcript.rpop))
        await self.store_transcriptions()
        return self.transcriptions

    async def store_diarizations(self):
        """Push every item of ``self.diarizations`` onto the meeting's diarization list."""
        for d in self.diarizations:
            await self.redis_client.lpush(f"{self.meeting_id}:diarizations", json.dumps(d))

    async def store_transcriptions(self):
        """Push every item of ``self.transcriptions`` onto the meeting's transcription list."""
        for t in self.transcriptions:
            await self.redis_client.lpush(f"{self.meeting_id}:transcriptions", json.dumps(t))

    async def load_diarizations(self):
        """Pop the whole stored diarization list into ``self.diarizations`` and return it."""
        key = f"{self.meeting_id}:diarizations"
        self.diarizations = await self._drain(lambda: self.redis_client.rpop(key))
        return self.diarizations

    async def load_transcriptions(self):
        """Pop the whole stored transcription list into ``self.transcriptions`` and return it."""
        key = f"{self.meeting_id}:transcriptions"
        self.transcriptions = await self._drain(lambda: self.redis_client.rpop(key))
        return self.transcriptions

    async def delete(self):
        """Delete both stored lists for this meeting.

        Both keys are always deleted. The result is truthy only when both
        keys existed, matching the previous ``a and b`` return value.
        """
        # Run both deletes before combining: a bare `await a and await b`
        # short-circuits and skips the second key when the first is missing.
        transcriptions_removed = await self.redis_client.delete(f"{self.meeting_id}:transcriptions")
        diarizations_removed = await self.redis_client.delete(f"{self.meeting_id}:diarizations")
        return transcriptions_removed and diarizations_removed


check_and_process_connections_interval_sec: 5.0


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Do not truncate column contents when displaying DataFrames, so long speech strings stay readable.
pd.set_option('display.max_colwidth', None)

In [3]:

# Build the manager from the app settings, then connect it and bind it to one meeting id.
redis_manager = RedisManager(settings.redis_host, settings.redis_port, settings.redis_password)
await redis_manager.initialize('Uc9NnN3hmWI&')

In [4]:
# Uncomment to delete the stored transcriptions and diarizations lists for this meeting:
# await redis_manager.delete()

In [5]:

diarizations = await redis_manager.fetch_diarizations()
transcriptions = await redis_manager.fetch_transcriptions()

trans_df = pd.DataFrame()
diar_df = pd.DataFrame(columns = ['start','end','speaker'])

if transcriptions:
    trans_df = DataPreparation.prep_transcripts(transcriptions)
    trans_df = trans_df[~trans_df['speech'].str.contains('|'.join(glitches))] #cleaning

if diarizations:
    diar_df = DataPreparation.prep_diarizations(diarizations)

if not trans_df.empty:
    df = DiarizationProcessor.apply_diarization(trans_df, diar_df)


if not df.empty:
    if 'speaker' in  df.columns: 
        df['speaker'] = df['speaker'].fillna('TBD')   #to be determined
        rename_dict = {s: n for n, s in enumerate(df['speaker'].dropna().unique().tolist())}
        df = df.replace(rename_dict).sort_values('start')
        df = df.drop_duplicates('start')
        df['change'] = df['speaker'] != df['speaker'].shift()
        df['change'] = df['change'].cumsum()
        df = df.groupby('change').agg({'speech': 'sum', 'speaker': 'first', 'start': 'first', 'end': 'last'}).reset_index(drop=True)
df
    

NameError: name 'df' is not defined

In [7]:
# Inspect the raw transcription chunks returned by fetch_transcriptions().
transcriptions

[]