In [1]:
# Load IPython's autoreload extension; mode 2 re-imports modules automatically
# before each cell is executed, so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

In [5]:
from datetime import datetime,timedelta
from qdrant_client import models
from typing import Dict, Any, Tuple
from pydantic import BaseModel, Field
from core import BaseCall
from core import user_msg,assistant_msg,system_msg

In [6]:
from vexa import VexaAPI
vexa = VexaAPI()
await vexa.get_user_info()

user_id = vexa.user_id
user_name = vexa.user_name
import asyncio
from vector_search import VectorSearch,build_context_string
from typing import Optional, Union, List

vector_search = VectorSearch()

User information retrieved successfully.


In [113]:
def _coerce_datetime(value: Union[str, datetime]) -> datetime:
    """Return ``value`` as a ``datetime``, parsing ISO-8601 strings when needed.

    Raises:
        ValueError: if ``value`` is a string that is not valid ISO-8601.
    """
    if isinstance(value, datetime):
        return value
    text = value.strip()
    # datetime.fromisoformat only accepts a trailing "Z" from Python 3.11 on,
    # so it is rewritten as an explicit UTC offset first.
    if text.endswith("Z"):
        text = text[:-1] + "+00:00"
    return datetime.fromisoformat(text)


class SearchDocumentsInput(BaseCall):
    """Structured search request that can be executed against the vector store.

    Every field is optional; only fields that are set add a condition to the
    filter built by ``search_documents``.
    """

    vector_search_query: Optional[str] = Field(None, description="Query string for vector search")
    start: Optional[Union[str, datetime]] = Field(None, description="Start date/time for filtering")
    end: Optional[Union[str, datetime]] = Field(None, description="End date/time for filtering")
    speakers_include: Optional[List[str]] = Field(None, description="List of speakers to include in filtering")
    speakers_exclude: Optional[List[str]] = Field(None, description="List of speakers to exclude from filtering")

    async def search_documents(self, vector_search: VectorSearch=vector_search,
                               user_id: Optional[Union[str, List[str]]] = user_id,
                               user_name: Optional[Union[str, List[str]]] = user_name,
                               k: int = 100,
                               include_summary: bool = False) -> List[Tuple[Dict[str, Any], float, str]]:
        """Run this request and return the matching points.

        When ``vector_search_query`` is set, the query is embedded and a
        similarity search is performed; otherwise the collection is scrolled
        with the filter only.

        Note: the defaults for ``vector_search``, ``user_id`` and ``user_name``
        are evaluated once, when the class is defined, from the notebook
        globals of the same name. Re-run this cell after changing those
        globals, or pass the values explicitly.

        Args:
            vector_search: Client providing ``get_embeddings``,
                ``qdrant_client`` and ``collection_name``.
            user_id: A single id (exact match) or a collection of ids
                (match any). Falsy values disable the filter.
            user_name: Same semantics as ``user_id``, applied to ``user_name``.
            k: Maximum number of points to return.
            include_summary: When False, only payloads whose ``type`` is
                ``"point"`` are returned.

        Returns:
            A list of ``(payload, score, id)`` tuples. Results obtained by
            scrolling carry no score, so ``1.0`` is used for them.

        Raises:
            ValueError: if ``start`` or ``end`` is a string that is not valid
                ISO-8601.
        """
        # The requested window is widened by one day on each side
        # (presumably to tolerate timezone / day-boundary mismatches - confirm).
        padding = timedelta(days=1)

        must_conditions = []
        must_not_conditions = []

        if not include_summary:
            must_conditions.append(models.FieldCondition(key="type", match=models.MatchValue(value="point")))

        if self.start:
            window_start = _coerce_datetime(self.start) - padding
            must_conditions.append(models.FieldCondition(
                key="start_datetime",
                range=models.Range(gte=int(window_start.timestamp()))
            ))
        if self.end:
            window_end = _coerce_datetime(self.end) + padding
            must_conditions.append(models.FieldCondition(
                key="start_datetime",
                range=models.Range(lte=int(window_end.timestamp()))
            ))

        # A plain string is matched exactly; anything else is treated as a
        # collection of accepted values.
        for key, value in (("user_id", user_id), ("user_name", user_name)):
            if not value:
                continue
            if isinstance(value, str):
                match = models.MatchValue(value=value)
            else:
                match = models.MatchAny(any=value)
            must_conditions.append(models.FieldCondition(key=key, match=match))

        if self.speakers_include:
            must_conditions.append(models.FieldCondition(
                key="speakers",
                match=models.MatchAny(any=self.speakers_include)
            ))

        if self.speakers_exclude:
            must_not_conditions.append(models.FieldCondition(
                key="speakers",
                match=models.MatchAny(any=self.speakers_exclude)
            ))

        query_filter = models.Filter(
            must=must_conditions,
            must_not=must_not_conditions
        )

        if self.vector_search_query:
            query_vector = await vector_search.get_embeddings([self.vector_search_query])
            hits = await vector_search.qdrant_client.search(
                collection_name=vector_search.collection_name,
                query_vector=query_vector[0].tolist(),
                query_filter=query_filter,
                limit=k
            )
        else:
            # scroll returns a (records, next_page_offset) tuple; only the
            # records are needed. Vectors are not part of the return value,
            # so they are not requested.
            hits, _next_offset = await vector_search.qdrant_client.scroll(
                collection_name=vector_search.collection_name,
                scroll_filter=query_filter,
                limit=k,
                with_payload=True,
                with_vectors=False
            )

        return [(hit.payload, getattr(hit, 'score', 1.0), hit.id) for hit in hits]

In [114]:
# Instantiate the project's Prompts helper.
# NOTE(review): `prompts` is not referenced by any cell visible in this section.
from prompts import Prompts
prompts = Prompts()


In [115]:
# Load the stored summaries for the current user.
summaries = await vector_search.get_summaries(user_id=user_id)
# Build a summaries-only context. [0] keeps the first item of the returned
# value (a later cell unpacks the same function's result as
# `full_context, meeting_ids`, so this is presumably the context string).
general_context = build_context_string(summaries, only_summaries=True)[0]

In [116]:
# Fetch the speakers recorded for this user.
# NOTE(review): presumably the source for the hand-written categorisation below - confirm.
speakers = await vector_search.get_speakers_by_user_id(user_id)

Given the list of counterpart types:

coworkers, candidates, customers, marketing specialists

which of these types is this person?

In [117]:
# Hand-maintained mapping from counterpart category to the speaker names in it.
# The whole dict is interpolated into the "available speakers" system message,
# so the model picks names for speakers_include / speakers_exclude from here.
# Names are kept exactly as spelled in the data; some people appear to be listed
# under more than one spelling (e.g. "Ilia Semukhin" / "Ilya Semukhin") -
# presumably on purpose so that every variant matches; confirm before deduplicating.
# NOTE(review): contains real names - treat as PII when sharing the notebook.
speaker_categories = {
    "me": ["Dmitriy Grankin"],
    
    "coworkers": [
        "Karine Stepanyan",
        "Alex Shevliakov",
        "Andrey Pisankin",
        "Sergey Ryabenko",
        "Sergey Fofanov",
        "Olga Nemirovskaya"
    ],
    
    "candidates": [
        "Jenish Mursidinov",
        "Daniil Andreev",
        "Nick Frolov",
        "Vasilii Glebov",
        "Nikolay",
        "Юрий Б",
        "Xut tuX"
    ],
    
    "customers": [
        "Oleg Maleev",
        "Anastasiia GULIAEVA",
        "Ilia Semukhin",
        "Dmitrii Chistov",
        "Lidiia Abramova",
        "Ilya Semukhin",
        "Tatiana Illarionova-Zervas",
        "Nadya Gorodetskaya",
        "Stephanus Gunawan",
        "Robert Hangu",
        "Ahmed Abdelaziz",
        "Mayank Tayal",
        "Shota Arabuli"
    ],
    
    "marketing_specialists": [
        "Сергей Жилко",
        "Olga Miller",
        "Julia Zakharova",
        "Dmitrii Bashkirov",
        "Igor Vinidiktov",
        "Umar Lateef",
        "Ethan Noah",
        "Andy Black",
        "AI Marketing Directory",
        "Lara Vargas"
    ],
    
    "not_specified": [
        "Sergio Goriachev",
        "Олег",
        "Павел's AI Notetaker"
    ],
    
    "peers": [
        "Angelina Geru",
        "Павел Коркодинов",
        "Olga Mykhoparkina",
        "Tatiana Sukhova",
        "Maksim Jmihov",
        "Igor Bessonov",
        "Max Mironov",
        "Александр Мелихов",
        "Alex Loktev",
        "Eugene Tartakovsky",
        "Slawa Kister",
        "Matt Lewis",
        "David Sterry",
        "Rick Tousseyn"
    ],
}

In [118]:
# Instruction asking the model to turn the user's request into search input.
# NOTE(review): "eather" looks like a typo for "either"; the literal is left
# untouched here because it is sent to the model verbatim.
system_prompt = system_msg(
    'Create search engine input from the user request. You can use eather one of speakers include or speakers exclude'
)

# Second system message listing the known speakers; the dict is rendered with
# its default string form.
speakers_prompt = system_msg('list of available speakers:{}. '.format(speaker_categories))


In [119]:
import pandas as pd

In [120]:
# Build a SearchDocumentsInput from the two system messages plus the user
# request (presumably via an LLM call inside BaseCall.call - confirm).
# The request is Russian for "what do customers say about the vexa product".
# The result is indexed as r[0] in the following cells.
r = await SearchDocumentsInput.call([system_prompt,speakers_prompt, user_msg('что говорят о продукте vexa клиенты')])


In [121]:
# Inspect the first parsed search input as a plain dict.
r[0].model_dump()

{'vector_search_query': 'отзывы о продукте vexa',
 'start': None,
 'end': None,
 'speakers_include': ['Oleg Maleev',
  'Anastasiia GULIAEVA',
  'Ilia Semukhin',
  'Dmitrii Chistov',
  'Lidiia Abramova',
  'Ilya Semukhin',
  'Tatiana Illarionova-Zervas',
  'Nadya Gorodetskaya',
  'Stephanus Gunawan',
  'Robert Hangu',
  'Ahmed Abdelaziz',
  'Mayank Tayal',
  'Shota Arabuli'],
 'speakers_exclude': None}

Extract entities such as people, companies, and products from the text, with a description (up to 50 characters) of the context in which each is mentioned, as well as the relations between them.
Avoid generic words like: Mentioned, Discussing, Suggested, etc.

entity; type; description including facts, numbers and connections mentioned; speaker

In [122]:
points_with_scores =await r[0].search_documents()
points_by_meeting = {}
for point, score, id in points_by_meeting:
    if score > 0.1:
        meeting_id = point['meeting_session_id']
        if meeting_id not in points_by_meeting:
            points_by_meeting[meeting_id] = []
        points_by_meeting[meeting_id].append(point)

full_context, meeting_ids = build_context_string(summaries, points_by_meeting, only_summaries=False, include_all_summaries=False)
df = pd.DataFrame([
    {
        'qoutes': point['qoutes'],
        'score': score,
        'id': id,
        'meeting_session_id': point['meeting_session_id'],
        'start_datetime': point['start_datetime'],
        'speakers': ', '.join(point['speakers']),
        'user_id': point['user_id'],
        'user_name': point['user_name']
    }
    for point, score, id in points_with_scores
])

# Convert start_datetime to a readable format
df['start_datetime'] = pd.to_datetime(df['start_datetime'], unit='s')

# Sort the DataFrame by score in descending order
df = df.sort_values('score', ascending=False)

In [123]:
# Display the hits table (sorted by score, highest first).
# NOTE(review): the rendered output includes real names and user ids - clear it before sharing.
df

Unnamed: 0,qoutes,score,id,meeting_session_id,start_datetime,speakers,user_id,user_name
0,Robert Hangu: So do you understand the proble...,0.704958,4224a021-1e4f-47ba-be52-5ee43ddde4e3,4ff12d56-f520-4391-a283-03d6365b49e6,2024-09-09 10:46:17,"Dmitry Grankin, Robert Hangu",ef7c085b-fdb5-4c94-b7b6-a61a3d04c210,Dmitriy Grankin
1,"Robert Hangu: Like, I always collapse this th...",0.702892,bdfbb3db-00fb-42a0-8cd2-d9cbef30786a,4ff12d56-f520-4391-a283-03d6365b49e6,2024-09-09 10:46:17,"Dmitry Grankin, Robert Hangu",ef7c085b-fdb5-4c94-b7b6-a61a3d04c210,Dmitriy Grankin
2,Dmitry Grankin: use the real-time extension o...,0.681285,b0d5a147-39d3-4a98-8d01-960b34b64ab1,4ff12d56-f520-4391-a283-03d6365b49e6,2024-09-09 10:46:17,"Dmitry Grankin, Robert Hangu",ef7c085b-fdb5-4c94-b7b6-a61a3d04c210,Dmitriy Grankin
3,Dmitry Grankin: rigid frameworks which give y...,0.680950,799c469a-b872-4fdc-acc0-637eb470cf21,4ff12d56-f520-4391-a283-03d6365b49e6,2024-09-09 10:46:17,"Dmitry Grankin, Robert Hangu",ef7c085b-fdb5-4c94-b7b6-a61a3d04c210,Dmitriy Grankin
4,Dmitry Grankin: use the real-time extension o...,0.670869,5d99526d-c59d-4758-ad65-0f088d05b279,4ff12d56-f520-4391-a283-03d6365b49e6,2024-09-09 10:46:17,"Dmitry Grankin, Robert Hangu",ef7c085b-fdb5-4c94-b7b6-a61a3d04c210,Dmitriy Grankin
...,...,...,...,...,...,...,...,...
95,Dmitriy Grankin: как такой pipeline когда ты ...,0.316462,33ee6c7f-7928-45f9-b62d-74802548e064,bf174837-0662-4226-90bf-ca18ad302ae6,2024-09-13 11:30:23,"Ilia Semukhin, Ilia Semukhin (Presentation), D...",ef7c085b-fdb5-4c94-b7b6-a61a3d04c210,Dmitriy Grankin
96,Ilia Semukhin: так это этот активирует у меня...,0.313806,b6c78bf1-2424-489f-a6a1-7031e62c1283,bf174837-0662-4226-90bf-ca18ad302ae6,2024-09-13 11:30:23,"Ilia Semukhin, Ilia Semukhin (Presentation), D...",ef7c085b-fdb5-4c94-b7b6-a61a3d04c210,Dmitriy Grankin
97,David Sterry: So what they do is they take ty...,0.312480,9ffa62e3-7be3-43d8-aa60-283185dfbc19,cdf9078d-87e0-49cb-9b62-71596961f436,2024-10-08 10:33:20,"David Sterry, Nadya Gorodetskaya, Dmitry Grankin",ef7c085b-fdb5-4c94-b7b6-a61a3d04c210,Dmitriy Grankin
98,Olga Nemirovskaya: Тоже много общения с проду...,0.311079,5adbe358-ca90-469a-b652-1ef5eca2e0af,f0d5f231-b866-4cb4-b59d-73899f2e0dc9,2024-09-04 15:00:03,"Dmitriy Grankin, Olga Nemirovskaya, Lidiia Abr...",ef7c085b-fdb5-4c94-b7b6-a61a3d04c210,Dmitriy Grankin
