In [1]:

%load_ext autoreload
%autoreload 2
import logging
from datetime import datetime, timezone

from app.database_redis.connection import get_redis_client
from app.services.apis.streamqueue_service.client import StreamQueueServiceAPI
from app.services.audio.redis import Connection, Diarizer, Meeting, Transcriber
from app.settings import settings
from app.services.audio.redis import Diarisation,Transcript
import json
import pandas as pd
import numpy as np

logger = logging.getLogger(__name__)

check_and_process_connections_interval_sec: 1.0


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
redis_client = await get_redis_client(settings.redis_host, settings.redis_port, settings.redis_password)

In [3]:
meeting_id = 'gzg-mzys-qwi'
diarization = Diarisation(meeting_id,redis_client=redis_client)
transcript = Transcript(meeting_id,redis_client=redis_client)

In [4]:
def expand_over_timeindex(df,seek,columns):
    seek = pd.Timestamp(seek)
    expanded_df = pd.DataFrame(columns=columns+['time'])
    for index, row in df.iterrows():
        time_range = np.arange(row['start'] * 100, row['end'] * 100, 1).astype(int)


        dict_ = {c: row[c] for c in columns}
        dict_['time'] = time_range

        temp_df = pd.DataFrame(dict_)
        expanded_df = pd.concat([expanded_df, temp_df], ignore_index=True)
    expanded_df['time'] = pd.to_timedelta(expanded_df['time']/100,unit='s')
    expanded_df['time'] =expanded_df['time']+seek

    return expanded_df.sort_values("time")

def clean_diarization_result(diarizations,min_len=0.75):
    if len (diarizations)>0:
        df = pd.DataFrame(diarizations)
        df['speaker'] = np.where(df['end']-df['start']<min_len,np.nan,df['speaker'])
        return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
    else: return []
    
def transcrtiption2df(transcription,seek):
    df =pd.DataFrame(transcription[0])
    df['seek'] = pd.to_datetime(seek)
    df['start'] = pd.to_timedelta(df['start'],unit='s')+df['seek']
    df['end'] = pd.to_timedelta(df['end'],unit='s')+df['seek']
    return df.reset_index(drop=True)[['word','probability','start','end']]


def group_by_speaker(df):

    df = df.copy()
    df['speaker_switch'] = df['speaker']!=df['speaker'].shift()
    df['speaker_switch'] = df['speaker_switch'].cumsum()
    df = df.reset_index(drop=True)

    speakers = df.drop_duplicates('speaker_switch').set_index('speaker_switch')['speaker'].to_frame('speaker')
    df =  (df.groupby('speaker_switch')['word'].apply(lambda x:''.join(x)).to_frame('text')
            .join(speakers)
            .join(df.groupby('speaker_switch')['time'].min().to_frame('min'))
            .join(df.groupby('speaker_switch')['time'].max().to_frame('max'))
    )

    df['span'] = df['max']-df['min']
    return df

In [5]:
import pandas as pd
import pickle

# Set options to display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth',None)

# Now, when you display a DataFrame, it will show all rows and columns


In [6]:
# with open('mocks/diarizations.pkl', 'rb') as file:
#     diarizations = pickle.load(file)
# with open('mocks/transcriptions.pkl', 'rb') as file:
#     transcriptions = pickle.load(file)

In [7]:
# first_diarization = diarizations[:5]
# second_diarization = diarizations[5:]

# first_transcriptions = transcriptions[:10]
# second_transcriptions = transcriptions[10:]

In [8]:
# mutable_df = pd.DataFrame(columns = ['time','word','speaker'])
# immutable_df = pd.DataFrame(columns = ['time','word','speaker'])

In [16]:
new_diarizations = []
while True:
    d = await diarization.rpop()
    if not d:
        break
    new_diarizations.append(json.loads(d))
new_transcriptions = [] 
while True:
    d = await transcript.rpop()
    if not d:
        break
    new_transcriptions.append(json.loads(d))

In [26]:


def get_next_seek(result,seek):

    df = pd.DataFrame(result)[[2,3,4]]
    df.columns = ['start','end','speech']
    df['start'] = pd.to_timedelta(df['start'],unit='s') + pd.Timestamp(seek)
    df['end'] = pd.to_timedelta(df['end'],unit='s') + pd.Timestamp(seek)

    if len(result) > 0:
        return df.iloc[-1]['start']

    else:
        return None


In [27]:
new_transcriptions[0][1]

'2024-05-23T17:26:49.942470+00:00'

In [28]:
get_next_seek(new_transcriptions[0][0],new_transcriptions[0][1])

Timestamp('2024-05-23 17:26:57.012470+0000', tz='UTC')

In [None]:
new_transcriptions[0]

In [None]:
wefwef

In [378]:
new_transcriptions[-1][2]

'7c2f45d9-8e74-4c78-b76a-acc4de8011fd'

In [None]:
[new_transcriptions]

In [380]:
for t,seek,connection_id in new_transcriptions:
    df = pd.DataFrame(t)[[2,3,4]]
    df.columns = ['start','end','speech']
    df['start'] = pd.to_timedelta(df['start'],unit='s') + pd.Timestamp(seek)
    df['end'] = pd.to_timedelta(df['end'],unit='s') + pd.Timestamp(seek)

In [382]:
df.iloc[-1]['start']

Timestamp('2024-05-23 17:14:34.567686+0000', tz='UTC')

In [None]:
new_transcriptions

In [315]:
result = new_transcriptions[0][0]

In [316]:
pd.DataFrame(result)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1,390,2.1,3.4,Давай ещё посмотрим.,"[50365, 29196, 13993, 42293, 13, 50560]",0.0,-0.372768,0.822222,0.016266,"[[2.1, 2.72, Давай, 0.8212890625], [2.72, 2.98, ещё, 0.54296875], [2.98, 3.4, посмотрим., 0.99560546875]]"


In [296]:
result[0][-1][4]

' Побольше результатов немножко за 5 секунд.'

In [None]:
if len(result) > 0:
        last_speech = result[-1]

        ended_silence = length - last_speech["end"]

In [257]:
pd.DataFrame(new_transcriptions[0][0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1,310,4.08,6.52,Ровно таким же способом можно идти.,"[50365, 6325, 1055, 1234, 31584, 6151, 23677, 1253, 8885, 17255, 3396, 13, 50520]",0.0,-0.144671,1.0,0.013176,"[[4.08, 4.9, Ровно, 0.9274088541666666], [4.9, 5.1, таким, 0.9970703125], [5.1, 5.3, же, 0.9970703125], [5.3, 5.86, способом, 0.999755859375], [5.86, 6.06, можно, 0.9794921875], [6.06, 6.52, идти., 0.996337890625]]"


In [166]:
def widen_diarization_result(diarizations,l=2):
    if len (diarizations)>0:
        df = pd.DataFrame(diarizations)
        df['start'] = df['start']-l
        df['end'] = df['end']+l
        df['speaker'] = np.where(df['end']-df['start']<l,np.nan,df['speaker'])
        return df.fillna(method='ffill').fillna(method='bfill').to_dict(orient='records')
    else: return []

In [194]:
transcript_dfs = [transcrtiption2df(result,seek) for result, seek,_ in new_transcriptions]
transcript_df = pd.concat(transcript_dfs).sort_values(by=['start', 'end']).reset_index(drop=True)
words = []

# Iterate over the concatenated dataframe and select words with the highest probability
for i in range(len(transcript_df)):
    if i == 0:
        words.append(transcript_df.loc[i])
    else:
        if transcript_df.loc[i]['start'] >= words[-1]['end']:
            words.append(transcript_df.loc[i])
        else:
            if transcript_df.loc[i]['probability'] > words[-1]['probability']:
                words[-1] = transcript_df.loc[i]
transcript_df = pd.DataFrame(words)



In [195]:

diar_df = pd.DataFrame(columns = ['speaker','time'])
for d,t,c in new_diarizations:
   # d = widen_diarization_result(d)
    df = pd.DataFrame(d)
    diar_df = pd.concat([diar_df[diar_df['end']<df['start'].min()],df]) if len(diar_df)>0 else df

diar_df['start'] = pd.to_timedelta(diar_df['start'])+pd.Timestamp(t)
diar_df['end']   = pd.to_timedelta(diar_df['end'])+pd.Timestamp(t)
diar_df

Unnamed: 0,start,end,speaker,score
0,2024-05-23 16:33:47.849663+00:00,2024-05-23 16:33:47.849663004+00:00,1a572ba4-3c9e-4e4b-a3e8-0459e1ff532d,0.938183
1,2024-05-23 16:33:47.849663005+00:00,2024-05-23 16:33:47.849663012+00:00,1a572ba4-3c9e-4e4b-a3e8-0459e1ff532d,0.938183
2,2024-05-23 16:33:47.849663012+00:00,2024-05-23 16:33:47.849663015+00:00,1a572ba4-3c9e-4e4b-a3e8-0459e1ff532d,0.938183
3,2024-05-23 16:33:47.849663015+00:00,2024-05-23 16:33:47.849663026+00:00,1a572ba4-3c9e-4e4b-a3e8-0459e1ff532d,0.938183
4,2024-05-23 16:33:47.849663024+00:00,2024-05-23 16:33:47.849663024+00:00,6b4fe146-5cc1-4ef5-8bc2-82c474de5223,0.978394


In [196]:
transcript_df

Unnamed: 0,word,probability,start,end
0,"Hi,",0.194336,2024-05-23 16:27:17.603485+00:00,2024-05-23 16:27:17.823485+00:00
1,I,0.934082,2024-05-23 16:27:18.023485+00:00,2024-05-23 16:27:19.303485+00:00
2,look,0.858398,2024-05-23 16:27:19.303485+00:00,2024-05-23 16:27:19.943485+00:00
3,at,0.967773,2024-05-23 16:27:19.943485+00:00,2024-05-23 16:27:20.043485+00:00
4,like,0.731445,2024-05-23 16:27:20.043485+00:00,2024-05-23 16:27:20.183485+00:00
5,desktop,0.95752,2024-05-23 16:27:20.183485+00:00,2024-05-23 16:27:20.523485+00:00
6,to,0.95752,2024-05-23 16:27:20.523485+00:00,2024-05-23 16:27:20.783485+00:00
7,"mobile,",0.999512,2024-05-23 16:27:20.783485+00:00,2024-05-23 16:27:21.063485+00:00
8,questions,0.655762,2024-05-23 16:27:21.283485+00:00,2024-05-23 16:27:21.843485+00:00
9,were,0.818359,2024-05-23 16:27:21.843485+00:00,2024-05-23 16:27:22.043485+00:00


In [170]:

merged_df = transcript_df[['start','word']].set_index('start').join(diar_df[['time',"speaker"]].set_index('time')).reset_index().dropna(subset='word').rename(columns = {'start':'time','index':'time'})

In [171]:

# if len(diar_df)>0:
#     merged_df.loc[merged_df['time']>diar_df.time.max(),'speaker']='not diarized'
# else:
    
# merged_df= merged_df.dropna(subset='speaker')

immutable_df = pd.concat([immutable_df,mutable_df[mutable_df['time']<merged_df['time'].min()]])
mutable_df = merged_df[merged_df['time']>mutable_df['time'].max()] if mutable_df['time'].max()== mutable_df['time'].max() else merged_df
mutable_output_df = mutable_df.copy()
#mutable_output_df['speaker'] = mutable_output_df['speaker'].fillna('not diarized')

pd.concat([group_by_speaker(immutable_df),(group_by_speaker(mutable_output_df))])

Unnamed: 0_level_0,text,speaker,min,max,span
speaker_switch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,of easy question. I don't think I'm the expert on it. I think language does,1a572ba4-3c9e-4e4b-a3e8-0459e1ff532d,2024-05-23 16:13:53.190934+00:00,2024-05-23 16:14:00.059423+00:00,0 days 00:00:06.868489
2,Wow.,,2024-05-23 16:14:19.183871+00:00,2024-05-23 16:14:19.183871+00:00,0 days 00:00:00
3,That's,,2024-05-23 16:14:19.783871+00:00,2024-05-23 16:14:19.783871+00:00,0 days 00:00:00
4,not,,2024-05-23 16:14:20.063871+00:00,2024-05-23 16:14:20.063871+00:00,0 days 00:00:00
5,an,,2024-05-23 16:14:20.403871+00:00,2024-05-23 16:14:20.403871+00:00,0 days 00:00:00
6,easy,,2024-05-23 16:14:21.183871+00:00,2024-05-23 16:14:21.183871+00:00,0 days 00:00:00
7,question.,,2024-05-23 16:14:21.383871+00:00,2024-05-23 16:14:21.383871+00:00,0 days 00:00:00
8,I,,2024-05-23 16:14:21.983871+00:00,2024-05-23 16:14:21.983871+00:00,0 days 00:00:00
9,don't,,2024-05-23 16:14:22.643871+00:00,2024-05-23 16:14:22.643871+00:00,0 days 00:00:00
10,think,,2024-05-23 16:14:22.843871+00:00,2024-05-23 16:14:22.843871+00:00,0 days 00:00:00


In [82]:
merged_df

Unnamed: 0,time,word,speaker
0,2024-05-23 16:11:26.994697+00:00,Если,
1,2024-05-23 16:11:27.134697+00:00,"разобраться,",
2,2024-05-23 16:11:27.814697+00:00,что,
3,2024-05-23 16:11:28.054697+00:00,у,
4,2024-05-23 16:11:28.194697+00:00,меня,
5,2024-05-23 16:11:28.454697+00:00,происходит,
6,2024-05-23 16:11:29.014697+00:00,и,
7,2024-05-23 16:11:29.614697+00:00,происходит,
8,2024-05-23 16:11:29.994697+00:00,ли...,


In [83]:
diar_df.time.max()

nan