In [1]:

%load_ext autoreload
%autoreload 2
import logging
from datetime import datetime, timezone

from app.database_redis.connection import get_redis_client
from app.services.apis.streamqueue_service.client import StreamQueueServiceAPI
from app.services.audio.redis import Connection, Diarizer, Meeting, Transcriber
from app.settings import settings
from app.services.audio.redis import Diarisation,Transcript
import json
import pandas as pd
import numpy as np

logger = logging.getLogger(__name__)

check_and_process_connections_interval_sec: 1.0


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
redis_client = await get_redis_client(settings.redis_host, settings.redis_port, settings.redis_password)

In [3]:
# meeting_id = 'https://meet.google.com/iqi-jawb-pqq#wefwef'
# diarization = Diarisation(meeting_id,redis_client=redis_client)
# transcript = Transcript(meeting_id,redis_client=redis_client)

In [4]:
def expand_over_timeindex(df,seek,columns):
    seek = pd.Timestamp(seek)
    expanded_df = pd.DataFrame(columns=columns+['time'])
    for index, row in df.iterrows():
        time_range = np.arange(row['start'] * 100, row['end'] * 100, 1).astype(int)


        dict_ = {c: row[c] for c in columns}
        dict_['time'] = time_range

        temp_df = pd.DataFrame(dict_)
        expanded_df = pd.concat([expanded_df, temp_df], ignore_index=True)
    expanded_df['time'] = pd.to_timedelta(expanded_df['time']/100,unit='s')
    expanded_df['time'] =expanded_df['time']+seek

    return expanded_df.sort_values("time")

def clean_diarization_result(diarizations,min_len=0.75):
    if len (diarizations)>0:
        df = pd.DataFrame(diarizations)
        df['speaker'] = np.where(df['end']-df['start']<min_len,np.nan,df['speaker'])
        return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
    else: return []
    
def transcrtiption2df(transcription,seek):
    df =pd.DataFrame(transcription[0])
    df['seek'] = pd.to_datetime(seek)
    df['start'] = pd.to_timedelta(df['start'],unit='s')+df['seek']
    df['end'] = pd.to_timedelta(df['end'],unit='s')+df['seek']
    return df.reset_index(drop=True)[['word','probability','start','end']]


def group_by_speaker(df):

    df = df.copy()
    df['speaker_switch'] = df['speaker']!=df['speaker'].shift()
    df['speaker_switch'] = df['speaker_switch'].cumsum()
    df = df.reset_index(drop=True)

    speakers = df.drop_duplicates('speaker_switch').set_index('speaker_switch')['speaker'].to_frame('speaker')
    df =  (df.groupby('speaker_switch')['word'].apply(lambda x:''.join(x)).to_frame('text')
            .join(speakers)
            .join(df.groupby('speaker_switch')['time'].min().to_frame('min'))
            .join(df.groupby('speaker_switch')['time'].max().to_frame('max'))
    )

    df['span'] = df['max']-df['min']
    return df

In [5]:
import pandas as pd
import pickle

# Set options to display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Now, when you display a DataFrame, it will show all rows and columns


In [6]:
with open('mocks/diarizations.pkl', 'rb') as file:
    diarizations = pickle.load(file)
with open('mocks/transcriptions.pkl', 'rb') as file:
    transcriptions = pickle.load(file)

In [8]:
# new_diarizations = []
# while True:
#     d = await diarization.rpop()
#     if not d:
#         break
#     new_diarizations.append(json.loads(d))
# new_transcriptions = [] 
# while True:
#     d = await transcript.rpop()
#     if not d:
#         break
#     new_transcriptions.append(json.loads(d))


In [9]:
first_diarization = diarizations[:5]
second_diarization = diarizations[5:]

first_transcriptions = transcriptions[:10]
second_transcriptions = transcriptions[10:]

In [10]:
mutable_df = pd.DataFrame(columns = ['time','word','speaker'])

for transcripation,diarization in [(first_transcriptions,first_diarization),(second_transcriptions,second_diarization)]:
    diar_df = pd.DataFrame(columns = ['speaker','time'])
    for d,t in diarization:
        d = clean_diarization_result(d)
        df = expand_over_timeindex(pd.DataFrame(d),t,['speaker'])
        diar_df = diar_df
        diar_df = pd.concat([diar_df[diar_df['time']<df['time'].min()],df]) if len(diar_df)>0 else df
        
    transcript_dfs = [transcrtiption2df(result,seek) for result, seek in transcripation]
    transcript_df = pd.concat(transcript_dfs).sort_values(by=['start', 'end']).reset_index(drop=True)
    words = []

    # Iterate over the concatenated dataframe and select words with the highest probability
    for i in range(len(transcript_df)):
        if i == 0:
            words.append(transcript_df.loc[i])
        else:
            if transcript_df.loc[i]['start'] >= words[-1]['end']:
                words.append(transcript_df.loc[i])
            else:
                if transcript_df.loc[i]['probability'] > words[-1]['probability']:
                    words[-1] = transcript_df.loc[i]
    transcript_df = pd.DataFrame(words)
    merged_df = transcript_df[['start','word']].set_index('start').join(diar_df[['time',"speaker"]].set_index('time')).reset_index().dropna(subset='word').rename(columns = {'start':'time','index':'time'})
    merged_df.loc[merged_df['time']>diar_df.time.max(),'speaker']='not diarized'
    merged_df= merged_df.dropna(subset='speaker')
    
    immutable_df = mutable_df[mutable_df['time']<merged_df['time'].min()]
    mutable_df = merged_df[merged_df['time']>mutable_df['time'].max()] if mutable_df['time'].max()== mutable_df['time'].max() else merged_df

  return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
  return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
  return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
  return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
  return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
  return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
  return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
  return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
  return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
  return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
  return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
  return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='re

In [12]:
mutable_df.head()

Unnamed: 0,time,word,speaker
0,2024-05-22 19:31:12.161356+00:00,So,6a3902ac-d4c3-4409-b3f9-d6a6d26439fd
1,2024-05-22 19:31:12.641356+00:00,why,6a3902ac-d4c3-4409-b3f9-d6a6d26439fd
2,2024-05-22 19:31:13.201356+00:00,are,6a3902ac-d4c3-4409-b3f9-d6a6d26439fd
3,2024-05-22 19:31:13.561356+00:00,we,6a3902ac-d4c3-4409-b3f9-d6a6d26439fd
4,2024-05-22 19:31:13.641356+00:00,building,6a3902ac-d4c3-4409-b3f9-d6a6d26439fd


In [13]:
immutable_df.head()

Unnamed: 0,time,word,speaker
0,2024-05-22 19:29:24.017384+00:00,Теперь,c8c60272-a566-43ab-be73-eef8adf9fad1
1,2024-05-22 19:29:24.657384+00:00,"да,",c8c60272-a566-43ab-be73-eef8adf9fad1
2,2024-05-22 19:29:25.057384+00:00,теперь,c8c60272-a566-43ab-be73-eef8adf9fad1
3,2024-05-22 19:29:25.457384+00:00,у,c8c60272-a566-43ab-be73-eef8adf9fad1
5,2024-05-22 19:29:26.077384+00:00,все,c8c60272-a566-43ab-be73-eef8adf9fad1
