In [206]:
# 17_search.ipynb

%load_ext autoreload
%autoreload 2

import sys
sys.path.append('/app')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [218]:
from psql_helpers import read_table_async
from psql_helpers import get_session
from psql_models import Transcript, TranscriptAccess
from sqlalchemy import select, delete, func

In [219]:
from psql_models import *

In [220]:
from psql_access import get_meeting_token
from datetime import datetime
from indexing.redis_keys import RedisKeys
from indexing.worker import IndexingWorker
from redis import Redis


In [221]:
from sqlalchemy import update
from psql_helpers import get_session

async def reset_content_indexing():
    """Set ``is_indexed`` to False on every ``Content`` row and commit.

    The UPDATE has no WHERE clause, so it applies to the whole table.
    Opens its own session via ``get_session()``.
    """
    async with get_session() as db:
        clear_indexed_flag = update(Content).values(is_indexed=False)
        await db.execute(clear_indexed_flag)
        await db.commit()

# Intentionally disabled: uncomment to set is_indexed=False on every Content row.
# await reset_content_indexing()

In [222]:

async def _purge_transcripts(session, dry_run):
    """Count all transcript rows and, unless ``dry_run``, delete them via ``session``."""
    # Count records before deletion so the summary reports what was (or would be) removed.
    transcript_count = await session.execute(
        select(func.count()).select_from(Transcript)
    )
    transcript_access_count = await session.execute(
        select(func.count()).select_from(TranscriptAccess)
    )

    total_transcripts = transcript_count.scalar()
    total_transcript_access = transcript_access_count.scalar()

    if not dry_run:
        # Delete transcript access records first (due to foreign key constraint).
        if total_transcript_access > 0:
            await session.execute(delete(TranscriptAccess))

        # Then delete the transcript records themselves.
        if total_transcripts > 0:
            await session.execute(delete(Transcript))

        await session.commit()

    return {
        "status": "DRY RUN - NO CHANGES MADE" if dry_run else "DELETED",
        "total_transcripts": total_transcripts,
        "total_transcript_access": total_transcript_access,
        "action": "would delete" if dry_run else "deleted",
    }


async def delete_all_transcripts(dry_run=True, session=None):
    """Delete every ``TranscriptAccess`` and ``Transcript`` row.

    Args:
        dry_run: When True (the default) only count the rows; nothing is
            deleted or committed.
        session: Optional already-open session to run the statements on.
            It is used as-is and is NOT closed here, so the caller keeps
            ownership of its lifecycle. When omitted, a session is opened
            with ``get_session()`` and closed before returning.

    Returns:
        dict with keys ``status``, ``total_transcripts``,
        ``total_transcript_access`` and ``action``. The totals are the row
        counts taken before any deletion.

    Note:
        When ``dry_run`` is False the deletion is committed on whichever
        session is used, including a caller-supplied one.
    """
    if session is not None:
        # Do not wrap a borrowed session in ``async with``: leaving the block
        # would close the caller's session (even on a dry run).
        return await _purge_transcripts(session, dry_run)

    async with get_session() as owned_session:
        return await _purge_transcripts(owned_session, dry_run)

In [229]:
# DESTRUCTIVE: dry_run=False deletes every TranscriptAccess and Transcript row and commits.
# Call with the default dry_run=True first to see the counts without changing anything.
await delete_all_transcripts(dry_run=False)

{'status': 'DELETED',
 'total_transcripts': 174,
 'total_transcript_access': 174,
 'action': 'deleted'}

In [230]:
df = await read_table_async(User)

In [231]:
df

Unnamed: 0,id,email,username,first_name,last_name,image,created_timestamp,updated_timestamp,is_indexed
0,48de1194-1a9b-4197-9deb-bc1b0cabb03d,test_808d0847aa@example.com,,,,,2025-03-03 16:29:44.011560+00:00,2025-03-03 16:29:44.011564+00:00,False


In [232]:
await read_table_async(TranscriptAccess)

In [233]:
import pandas as pd
pd.options.display.max_colwidth = 1000

In [255]:
# Read Transcript rows via read_table_async, then show the 10 rows with the
# latest start_timestamp (ascending sort, last 10).
# NOTE: this rebinds the notebook-wide name `df`, which other cells also reuse.
df = await read_table_async(Transcript)
df.sort_values(by='start_timestamp').tail(10)

Unnamed: 0,id,content_id,text_content,speaker,start_timestamp,end_timestamp,confidence,word_timing_data,segment_metadata
0,39d484b9-8c5e-46b3-b56f-afda94dc679a,4cc8ca66-6df8-5d79-b158-22552547e62c,он значит не только первая да значит в очереди,Dmitriy Grankin,2025-02-13 15:38:25.984000+00:00,2025-02-13 15:38:32.064000+00:00,0.825,"{'words': [{'word': 'он', 'start': 0.0, 'end': 0.2, 'confidence': 0.1343994140625}, {'word': 'значит', 'start': 0.2, 'end': 1.38, 'confidence': 0.81201171875}, {'word': 'не', 'start': 1.38, 'end': 3.42, 'confidence': 0.374755859375}, {'word': 'только', 'start': 3.42, 'end': 3.74, 'confidence': 0.346435546875}, {'word': 'первая', 'start': 3.74, 'end': 4.3, 'confidence': 0.673095703125}, {'word': 'да', 'start': 4.3, 'end': 4.92, 'confidence': 0.426025390625}, {'word': 'значит', 'start': 4.92, 'end': 5.46, 'confidence': 0.994140625}, {'word': 'в', 'start': 5.46, 'end': 5.66, 'confidence': 0.84912109375}, {'word': 'очереди', 'start': 5.66, 'end': 6.08, 'confidence': 0.969970703125}]}","{'speaker': 'Dmitriy Grankin', 'present_user_ids': ['48de1194-1a9b-4197-9deb-bc1b0cabb03d'], 'server_timestamp': '2025-02-13T15:32:16+00:00'}"
1,7a1d180e-d03e-42d9-91e9-fff25942506a,4cc8ca66-6df8-5d79-b158-22552547e62c,"И получается, что распознавалка возвращает...",Sergey Ryabenko,2025-02-13 15:38:32.104000+00:00,2025-02-13 15:38:38.064000+00:00,0.346309,"{'words': [{'word': 'И', 'start': 0.0, 'end': 2.5, 'confidence': 0.11871337890625}, {'word': 'получается,', 'start': 2.5, 'end': 3.14, 'confidence': 0.923828125}, {'word': 'что', 'start': 3.52, 'end': 3.96, 'confidence': 0.9970703125}, {'word': 'распознавалка', 'start': 3.96, 'end': 5.22, 'confidence': 0.9971516927083334}, {'word': 'возвращает...', 'start': 5.22, 'end': 5.96, 'confidence': 0.8556315104166666}]}","{'speaker': 'Sergey Ryabenko', 'present_user_ids': ['48de1194-1a9b-4197-9deb-bc1b0cabb03d'], 'server_timestamp': '2025-02-13T15:32:16+00:00'}"


In [44]:
df = await read_table_async(Content)

In [45]:
df

Unnamed: 0,id,type,text,timestamp,external_id,external_id_type,last_update,parent_id,is_indexed,content_metadata
0,4cc8ca66-6df8-5d79-b158-22552547e62c,meeting,,2025-03-03 16:55:14.986422+00:00,the-zdjv-byg,google_meet,2025-03-03 16:55:14.986426+00:00,,False,


In [13]:
df[df['is_indexed']]

Unnamed: 0,id,type,text,timestamp,external_id,external_id_type,last_update,parent_id,is_indexed,content_metadata


In [14]:
df = await read_table_async(Content)

In [15]:
df

Unnamed: 0,id,type,text,timestamp,external_id,external_id_type,last_update,parent_id,is_indexed,content_metadata
0,4cc8ca66-6df8-5d79-b158-22552547e62c,meeting,,2025-02-28 13:38:25.722251+00:00,the-zdjv-byg,google_meet,2025-02-28 13:38:25.722259+00:00,,False,


In [16]:
df['is_indexed'].value_counts()


is_indexed
False    1
Name: count, dtype: int64

In [17]:
df['type'].value_counts()


type
meeting    1
Name: count, dtype: int64

In [18]:
# NOTE(review): this cell fails with NameError. `ContentProcessor` is not imported
# by any visible cell, and `self` does not exist at notebook scope; the
# indentation suggests the snippet was pasted from inside a method.
# TODO: import ContentProcessor and pass real engine objects, or delete this cell.
processor = ContentProcessor(
            qdrant_engine=self.qdrant_engine,
            es_engine=self.es_engine
        )

NameError: name 'ContentProcessor' is not defined