In [1]:
# 17_search.ipynb

%load_ext autoreload
%autoreload 2

import sys

# Make packages located under /app importable from this notebook.
# Guarded so that re-running the cell does not keep appending duplicate
# '/app' entries to sys.path.
if '/app' not in sys.path:
    sys.path.append('/app')

In [6]:
from services.psql_helpers import read_table_async
from services.psql_helpers import get_session
from models.db import *
from sqlalchemy import select, delete, func

In [13]:

async def delete_all_transcripts(dry_run=True, session=None):
    """Delete every Transcript and TranscriptAccess row, or only report counts.

    Both tables are counted first. With ``dry_run`` truthy (the default)
    nothing is modified. Otherwise TranscriptAccess rows are deleted before
    Transcript rows and the session is committed.

    Args:
        dry_run: When truthy, only count rows; no delete or commit is issued.
        session: Optional session to run against. When omitted (or falsy) one
            is obtained from ``get_session()``. In both cases the object is
            entered with ``async with`` and therefore exited when this
            function returns.

    Returns:
        A dict with keys ``status``, ``total_transcripts``,
        ``total_transcript_access`` and ``action``. The totals are the row
        counts taken *before* any deletion.
    """
    async with (session or get_session()) as db:
        # Snapshot both table sizes up front; these numbers feed the summary.
        transcript_rows = await db.execute(
            select(func.count()).select_from(Transcript)
        )
        access_rows = await db.execute(
            select(func.count()).select_from(TranscriptAccess)
        )
        n_transcripts = transcript_rows.scalar()
        n_access = access_rows.scalar()

        if dry_run:
            status, action = "DRY RUN - NO CHANGES MADE", "would delete"
        else:
            # Access rows go first (foreign key constraint), then transcripts.
            # Empty tables are skipped.
            for model, row_count in (
                (TranscriptAccess, n_access),
                (Transcript, n_transcripts),
            ):
                if row_count > 0:
                    await db.execute(delete(model))

            await db.commit()
            status, action = "DELETED", "deleted"

        return {
            "status": status,
            "total_transcripts": n_transcripts,
            "total_transcript_access": n_access,
            "action": action,
        }

In [14]:
# DESTRUCTIVE: with dry_run=False this call deletes every TranscriptAccess and
# Transcript row and commits. Leave dry_run at its default (True) to only
# report the row counts.
await delete_all_transcripts(dry_run=False)

{'status': 'DELETED',
 'total_transcripts': 130,
 'total_transcript_access': 130,
 'action': 'deleted'}

In [15]:
# Read the User table into `df` for inspection.
df = await read_table_async(User)

In [16]:
df

Unnamed: 0,id,email,username,first_name,last_name,image,created_timestamp,updated_timestamp,is_indexed
0,48de1194-1a9b-4197-9deb-bc1b0cabb03d,test_808d0847aa@example.com,,,,,2025-03-03 16:29:44.011560+00:00,2025-03-03 16:29:44.011564+00:00,False


In [17]:
# Display the current contents of the TranscriptAccess table.
await read_table_async(TranscriptAccess)

Unnamed: 0,id,transcript_id,user_id,access_level,granted_at,granted_by
0,305,da1407c6-6ecb-497d-b3d5-bf49b679df45,48de1194-1a9b-4197-9deb-bc1b0cabb03d,owner,2025-03-08 17:48:56.571385+00:00,48de1194-1a9b-4197-9deb-bc1b0cabb03d


In [18]:
import pandas as pd
# Let pandas show up to 1000 characters per column so long text values are
# not truncated in the rendered tables.
pd.options.display.max_colwidth = 1000

In [19]:
# Read the Transcript table (rebinding `df`), sort ascending by
# start_timestamp and show the last 10 rows of that ordering.
df = await read_table_async(Transcript)
df.sort_values(by='start_timestamp').tail(10)

Unnamed: 0,id,content_id,text_content,speaker,start_timestamp,end_timestamp,confidence,word_timing_data,segment_metadata
0,da1407c6-6ecb-497d-b3d5-bf49b679df45,4cc8ca66-6df8-5d79-b158-22552547e62c,"Так и остался один endpoint, просто он в облака переехал",Dmitriy Grankin,2025-02-13 15:35:45.400000+00:00,2025-02-13 15:35:49.060000+00:00,1.0,"{'words': [{'word': 'Так', 'start': 0.0, 'end': 0.44, 'confidence': 0.237060546875}, {'word': 'и', 'start': 0.44, 'end': 0.64, 'confidence': 0.83203125}, {'word': 'остался', 'start': 0.64, 'end': 0.96, 'confidence': 0.998291015625}, {'word': 'один', 'start': 0.96, 'end': 1.26, 'confidence': 0.97705078125}, {'word': 'endpoint,', 'start': 1.26, 'end': 1.58, 'confidence': 0.65673828125}, {'word': 'просто', 'start': 1.86, 'end': 2.08, 'confidence': 0.9296875}, {'word': 'он', 'start': 2.08, 'end': 2.34, 'confidence': 0.9765625}, {'word': 'в', 'start': 2.34, 'end': 2.46, 'confidence': 0.98779296875}, {'word': 'облака', 'start': 2.46, 'end': 2.88, 'confidence': 0.9811197916666666}, {'word': 'переехал', 'start': 2.88, 'end': 3.66, 'confidence': 0.9990234375}]}","{'speaker': 'Dmitriy Grankin', 'present_user_ids': ['48de1194-1a9b-4197-9deb-bc1b0cabb03d'], 'server_timestamp': '2025-02-13T15:32:16+00:00'}"


In [44]:
# Read the Content table into `df` (rebinding `df` once more).
df = await read_table_async(Content)

In [45]:
df

Unnamed: 0,id,type,text,timestamp,external_id,external_id_type,last_update,parent_id,is_indexed,content_metadata
0,4cc8ca66-6df8-5d79-b158-22552547e62c,meeting,,2025-03-03 16:55:14.986422+00:00,the-zdjv-byg,google_meet,2025-03-03 16:55:14.986426+00:00,,False,


In [13]:
# Boolean-mask filter: keep only the rows whose is_indexed value is True.
df[df['is_indexed']]

Unnamed: 0,id,type,text,timestamp,external_id,external_id_type,last_update,parent_id,is_indexed,content_metadata


In [14]:
# Read the Content table into `df` again.
df = await read_table_async(Content)

In [15]:
df

Unnamed: 0,id,type,text,timestamp,external_id,external_id_type,last_update,parent_id,is_indexed,content_metadata
0,4cc8ca66-6df8-5d79-b158-22552547e62c,meeting,,2025-02-28 13:38:25.722251+00:00,the-zdjv-byg,google_meet,2025-02-28 13:38:25.722259+00:00,,False,


In [16]:
# Count how many rows carry each is_indexed value.
df['is_indexed'].value_counts()


is_indexed
False    1
Name: count, dtype: int64

In [17]:
# Count how many rows there are per value of the `type` column.
df['type'].value_counts()


type
meeting    1
Name: count, dtype: int64

In [18]:
# NOTE(review): this cell does not run. The recorded output is a NameError
# because ContentProcessor is never imported in the visible cells, and `self`
# does not exist at notebook scope (the snippet looks copied from inside a
# method - confirm). Import the class and pass real engine objects first.
processor = ContentProcessor(
            qdrant_engine=self.qdrant_engine,
            es_engine=self.es_engine
        )

NameError: name 'ContentProcessor' is not defined