In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from sampling import fetch_joined_data,WeightedSampler

In [3]:
joined_df = await fetch_joined_data()
joined_df['meeting_time'] = pd.to_datetime(joined_df['meeting_timestamp']).dt.strftime('%Y-%m-%d')

In [4]:
sampler = WeightedSampler(joined_df, date_column='meeting_time',decay_factor=0.2)
sampled_redults = sampler.sample(n_samples=100)  # mode defaults to 'recency'

In [5]:
def get_unique_value_counts(df, columns, format_output=False):
    """
    Returns unique values and their counts for specified columns.
    
    Args:
        df (pd.DataFrame): Input dataframe
        columns (list): List of column names to analyze
        format_output (bool): If True, returns formatted string instead of dict
    
    Returns:
        dict or str: Dictionary of value counts or formatted string if format_output=True
    """
    result = {}
    
    for col in columns:
        try:
            value_counts = df[col].value_counts()
            value_dict_list = [{'value': value, 'count': count} 
                             for value, count in value_counts.items()]
            result[col] = value_dict_list
        except (TypeError, AttributeError):
            value_counts = df[col].astype(str).value_counts()
            value_dict_list = [{'value': value, 'count': count} 
                             for value, count in value_counts.items()]
            result[col] = value_dict_list
    
    if format_output:
        output_str = ""
        for col, value_list in result.items():
            output_str += f"\n{col}:\n"
            for item in value_list:
                output_str += f"  {item['value']}: {item['count']}\n"
        return output_str.strip()
    
    return result

# Get dictionary result
dict_result = get_unique_value_counts(joined_df, ['topic_type', 'topic_name', 'speaker_name'])

# Get formatted string result
str_result = get_unique_value_counts(joined_df, ['topic_type', 'topic_name', 'speaker_name'], format_output=True)

In [6]:
from core import BaseCall,system_msg,user_msg
from pydantic import BaseModel,Field
from datetime import datetime
from typing import Optional,List


In [7]:
class DateRange(BaseModel):
    start_date: Optional[datetime] = Field(None, description="Start date of the range")
    end_date: Optional[datetime] = Field(None, description="End date of the range")

class SearchFilter(BaseModel):
    topic_type: Optional[List[str]] = Field(None, description="List of topic types to filter by")
    topic_name: Optional[List[str]] = Field(None, description="List of topic names to filter by")
    speaker_name: Optional[List[str]] = Field(None, description="List of speaker names to filter by")
    date_range: Optional[DateRange] = Field(None, description="Date range to filter by")
    search_text: Optional[str] = Field(None, description="Text to search for in summary and details columns using similarity search")


In [8]:
def apply_search_filters(df: pd.DataFrame, filters: SearchFilter) -> pd.DataFrame:
    """
    Apply SearchFilter parameters to filter a DataFrame.
    
    Args:
        df (pd.DataFrame): Input DataFrame
        filters (SearchFilter): Search parameters
        
    Returns:
        pd.DataFrame: Filtered DataFrame
    """
    filtered_df = df.copy()
    
    # Apply list-based filters
    if filters.topic_type:
        filtered_df = filtered_df[filtered_df['topic_type'].isin(filters.topic_type)]
    
    if filters.topic_name:
        filtered_df = filtered_df[filtered_df['topic_name'].isin(filters.topic_name)]
        
    if filters.speaker_name:
        filtered_df = filtered_df[filtered_df['speaker_name'].isin(filters.speaker_name)]
    
    # Apply date range filter if provided
    if filters.date_range:
        if filters.date_range.start_date:
            filtered_df = filtered_df[
                pd.to_datetime(filtered_df['meeting_time']) >= filters.date_range.start_date
            ]
        if filters.date_range.end_date:
            filtered_df = filtered_df[
                pd.to_datetime(filtered_df['meeting_time']) <= filters.date_range.end_date
            ]
    
    # Apply text search if provided
    # Note: This assumes you have a text similarity search function
    # You might want to implement this based on your specific needs
    if filters.search_text:
        # Example using simple substring matching
        # Replace this with your actual similarity search implementation
        text_mask = (
            filtered_df['summary'].str.contains(filters.search_text, case=False, na=False) |
            filtered_df['details'].str.contains(filters.search_text, case=False, na=False)
        )
        filtered_df = filtered_df[text_mask]
    
    return filtered_df

In [9]:
class SearchParams(BaseCall):
    plan: SearchPlan = Field(..., description="Search plan with current step and feedback")
    
    @classmethod
    async def extract(cls, user_query: str, table_context: str, sampled_df: pd.DataFrame, 
                     previous_results: Optional[pd.DataFrame] = None, 
                     current_results: Optional[pd.DataFrame] = None,
                     step_number: int = 1,
                     model: str = "gpt-4o-mini", 
                     use_cache: bool = False, 
                     force_store: bool = False):
        current_time = datetime.now()
        
        # Convert sampled results to a readable format
        sample_context = f"""Sample of recent records:
{sampled_df[['meeting_time', 'topic_type', 'topic_name', 'speaker_name']].head().to_markdown()}
"""
        
        # Add current results context if available
        results_context = ""
        if current_results is not None:
            results_context = f"""
Current search results:
{current_results[['meeting_time', 'topic_type', 'topic_name', 'speaker_name', 'summary']].head().to_markdown()}
Total results: {len(current_results)} entries
"""
        
        output = await cls.call([
            system_msg("""Plan and execute a multi-step search strategy to thoroughly answer the user's query.
                      Use the provided table context to identify and handle variations and possible misspellings.
                      
                      For each step:
                      1. Check the table context for all relevant variations of search terms
                      2. Include all valid variations in the search filters
                      3. Analyze current results if available
                      4. Reflect on whether results are sufficient to answer the query
                      5. Determine if additional search steps are needed
                      
                      When reflecting on results, consider:
                      - Do we have enough context about all mentioned entities?
                      - Are we capturing all relevant time periods?
                      - Have we found all important perspectives/opinions?
                      - Are the results specific enough to answer the query?
                      - Would additional filters help focus the results?
                      
                      Provide a quality score (0-1) based on:
                      - Relevance to the query
                      - Completeness of the answer
                      - Specificity of the results
                      - Coverage of different aspects
                      
                      Include key findings that summarize what we've learned from the results."""),
            user_msg(f"""Current datetime: {current_time}

Table context showing available values:
{table_context}

{sample_context}
{results_context}

User query: {user_query}
Current step: {step_number}

Plan the search strategy and reflect on current results.""")
        ], model=model, use_cache=use_cache, force_store=force_store)
        
        return output[0].plan

NameError: name 'SearchPlan' is not defined

In [None]:

class SearchReflection(BaseModel):
    is_sufficient: bool = Field(..., description="Whether current results are sufficient to answer the user query")
    missing_aspects: List[str] = Field(..., description="List of aspects still missing from the results")
    suggested_filters: Optional[SearchFilter] = Field(None, description="Suggested additional filters if needed")
    reasoning: str = Field(..., description="Explanation of why results are/aren't sufficient")
    quality_score: float = Field(..., ge=0, le=1, description="Score indicating how well these results answer the query (0-1)")
    key_findings: List[str] = Field(..., description="List of key insights found in current results")

class SearchPlan(BaseModel):
    steps: List[str] = Field(..., description="List of search steps to execute")
    current_filters: SearchFilter = Field(..., description="Current step search parameters")
    is_final: bool = Field(..., description="Whether this is the final search step")
    feedback: str = Field(..., description="Analysis of current results and next steps needed")
    reflection: Optional[SearchReflection] = Field(None, description="Reflection on search results quality")

async def iterative_search(query: str, joined_df: pd.DataFrame, context: str, sampled_df: pd.DataFrame, verbose: bool = True):
    all_iterations = []  # Store all iteration results and their reflections
    results = None
    step = 1
    best_score = 0
    best_results = None
    
    while True:
        if verbose:
            print(f"\n=== Step {step} ===")
            
        # Get search plan for current step
        search_plan = await SearchParams.extract(
            user_query=query,
            table_context=context,
            sampled_df=sampled_df,
            previous_results=results,
            current_results=results,  # Pass current results for reflection
            step_number=step
        )
        
        if verbose:
            print("\nSearch Plan:")
            print(f"Steps: {search_plan.steps}")
            print(f"Current Filters: {search_plan.current_filters}")
            print(f"Feedback: {search_plan.feedback}")
            if search_plan.reflection:
                print("\nReflection:")
                print(f"Sufficient: {search_plan.reflection.is_sufficient}")
                print(f"Missing aspects: {search_plan.reflection.missing_aspects}")
                print(f"Quality Score: {search_plan.reflection.quality_score}")
                print(f"Key Findings: {search_plan.reflection.key_findings}")
                print(f"Reasoning: {search_plan.reflection.reasoning}")
        
        # Apply current filters
        current_results = apply_search_filters(joined_df, search_plan.current_filters)
        
        if verbose:
            print(f"\nFound {len(current_results)} results in this step")
        
        # Update results - handle list columns by converting to tuples
        if results is None:
            results = current_results
            if verbose:
                print("First step - using initial results")
        else:
            # Convert list columns to tuples for both DataFrames
            for df in [results, current_results]:
                for col in df.columns:
                    if df[col].apply(lambda x: isinstance(x, list)).any():
                        df[col] = df[col].apply(lambda x: tuple(x) if isinstance(x, list) else x)
            
            old_len = len(results)
            results = pd.concat([results, current_results]).drop_duplicates()
            new_len = len(results)
            
            if verbose:
                print(f"Added {new_len - old_len} new unique results")
        
        # Store current iteration results and reflection
        if search_plan.reflection:
            iteration_info = {
                'step': step,
                'results': results.copy(),
                'reflection': search_plan.reflection,
                'filters': search_plan.current_filters,
                'quality_score': search_plan.reflection.quality_score
            }
            all_iterations.append(iteration_info)
            
            # Update best results if current score is higher
            if search_plan.reflection.quality_score > best_score:
                best_score = search_plan.reflection.quality_score
                best_results = results.copy()
                
                if verbose:
                    print(f"\nNew best results found! Score: {best_score}")
        
        # Check if we're done based on reflection
        if search_plan.reflection and search_plan.reflection.is_sufficient:
            if verbose:
                print("\nSearch complete - results deemed sufficient")
            break
            
        step += 1
        if step > 5:  # Safety limit
            if verbose:
                print("\nSearch stopped - reached maximum steps (5)")
            break
    
    # Use best results found during iterations
    results = best_results if best_results is not None else results
    
    # Convert tuple columns back to lists in final results
    for col in results.columns:
        if results[col].apply(lambda x: isinstance(x, tuple)).any():
            results[col] = results[col].apply(lambda x: list(x) if isinstance(x, tuple) else x)
    
    # Calculate relevance scores and sort results
    search_terms = set([term.lower() for term in query.split()])
    
    def calculate_relevance(row):
        text = f"{row['summary']} {row['details']}".lower()
        # Count occurrences of search terms
        term_matches = sum(text.count(term) for term in search_terms)
        # Boost score for more recent dates
        recency_boost = pd.to_datetime(row['meeting_time']).timestamp() / 1e9
        return term_matches + (recency_boost / 1e11)  # Normalize recency boost
    
    results['relevance_score'] = results.apply(calculate_relevance, axis=1)
    results = results.sort_values('relevance_score', ascending=False).drop(columns=['relevance_score'])
    
    if verbose:
        print(f"\nFinal Results: {len(results)} total unique entries")
        print(f"Best Quality Score: {best_score}")
        print("\nKey Findings Across Iterations:")
        for iteration in all_iterations:
            print(f"\nStep {iteration['step']} (Score: {iteration['quality_score']}):")
            for finding in iteration['reflection'].key_findings:
                print(f"- {finding}")
    
    return results, all_iterations

In [10]:
# Example usage
results = await iterative_search(
    query="what users say about vexa",
    joined_df=joined_df,
    context=str_result,
    sampled_df=sampled_redults
)

NameError: name 'iterative_search' is not defined

In [16]:
results[0]

Unnamed: 0,summary_index,summary,details,referenced_text,topic_name,topic_type,meeting_id,meeting_timestamp,speaker_name,other_speakers,meeting_time
6,8,"Dmitriy Grankin is a speaker in the meeting, d...",He is focused on improving the user experience...,Dmitriy Grankin: можно показать тебе кое-что ...,Dmitriy Grankin,person,f5f969d6-675f-4d55-a604-78208e545dcd,2024-09-16 15:35:48.424000,Dmitriy Grankin,[Ilia Semukhin],2024-09-16
9,7,A speaker in the meeting who engages in a dial...,Dmitriy discusses the need for the system to a...,"Sergey Ryabenko: Ну нет, таски – это одно из....",Dmitriy Grankin,person,2be605d6-d98d-475a-8969-838cb44a6fe9,2024-09-16 13:09:25.985999,Dmitriy Grankin,[Sergey Ryabenko],2024-09-16
58,9,Co-speaker in the meeting focusing on product ...,Dmitriy discusses the challenges of defining a...,,Dmitriy Grankin,person,e3dea53c-a0d6-4b10-ab63-8b4c120bd245,2024-09-16 19:02:08.440000,Dmitriy Grankin,[Olga Nemirovskaya],2024-09-16
74,7,Dmitriy Grankin is a speaker in the meeting di...,He expresses opinions on full-time work versus...,Alex Shevliakov: набирать тип как консультант...,Dmitriy Grankin,person,1c71070f-4427-4aae-ace0-2091f70dec5b,2024-09-17 09:26:40.185999,Dmitriy Grankin,[Alex Shevliakov],2024-09-17
101,7,Dmitriy Grankin is a speaker in the meeting wh...,Dmitriy emphasizes the importance of not spamm...,,Dmitriy Grankin,person,f3dd4aba-acde-46e0-9b6c-12591c740e0e,2024-09-17 09:49:06.520000,Dmitriy Grankin,[Olga Nemirovskaya],2024-09-17
...,...,...,...,...,...,...,...,...,...,...,...
3082,10,Dmitry Grankin is a speaker in the meeting who...,He discusses the importance of LinkedIn outrea...,Dmitrii Chistov: Все хорошо.,Dmitry Grankin,person,dbf8ad27-649c-4a16-b5a1-5ab3db002bd9,2024-09-13 15:15:37.764000,Dmitry Grankin,[Dmitrii Chistov],2024-09-13
3107,8,Dmitriy Grankin is a speaker who is developing...,He has been working on this project for the la...,Dmitriy Grankin: или около того последние три...,Dmitriy Grankin,person,6e6e65ac-b626-4326-aab4-f2a8e9b0cdef,2024-09-13 16:03:40.114000,Dmitriy Grankin,"[Дмитрий Гранкин, Александр Мелихов]",2024-09-13
3120,3,"Dmitriy Grankin is the speaker of the meeting,...",Dmitriy Grankin expressed thoughts on work and...,"Dmitriy Grankin: Работать, не работать, работ...",Dmitriy Grankin,person,8e319925-f6d4-4acd-82d3-705d94a824f7,2024-09-15 10:53:01.284000,Dmitriy Grankin,[],2024-09-15
3127,5,"Dmitriy Grankin is the speaker of the meeting,...",Dmitriy initiates the meeting and interacts wi...,Dmitriy Grankin: Hmm. | Dmitriy Grankin: Der...,Dmitriy Grankin,person,24f3dec7-23a0-4278-a21c-fe355332a853,2024-09-16 09:17:52.070000,Dmitriy Grankin,[],2024-09-16


In [22]:
search_params

SearchFilter(topic_type=None, topic_name=['Vexa', 'VEXA', 'VEX.AI'], speaker_name=None, date_range=None, search_text='vexa')

In [None]:
apply_search_filters(joined_df, search_params)

Unnamed: 0,summary_index,summary,details,referenced_text,topic_name,topic_type,meeting_id,meeting_timestamp,speaker_name,other_speakers,meeting_time
11,9,The process of managing tasks and responsibili...,Dmitriy and Sergey discuss the need for a syst...,"Sergey Ryabenko: Ну нет, таски – это одно из....",task management,concept,2be605d6-d98d-475a-8969-838cb44a6fe9,2024-09-16 13:09:25.985999,Dmitriy Grankin,[Sergey Ryabenko],2024-09-16
109,1,The updated code needs to be built and tested ...,Dmitriy Grankin confirmed that the code has be...,"Dmitriy Grankin: Вот, тем не менее, он обнови...",Code Update and Build,task,377bbdf0-3502-4e32-950f-44a1ef0fa0ed,2024-09-17 10:21:16.480000,Dmitriy Grankin,[Sergey Ryabenko],2024-09-17
184,11,Celery is a task queue used for managing async...,Dmitriy clarifies its function as a scheduler ...,"Sergey Ryabenko: Ну, говорю, самая главная пр...",Celery,product,52cd3cf1-7c60-4e77-b861-69ca6ffe914e,2024-09-18 11:57:28.220000,Dmitriy Grankin,[Sergey Ryabenko],2024-09-18
195,0,The team discussed the qualifications of two c...,Dmitriy and Sergey expressed concerns about on...,"Dmitriy Grankin: Слушай, ну, ты слышал, как б...",Discussion on candidate qualifications and fit...,concern,52cd3cf1-7c60-4e77-b861-69ca6ffe914e,2024-09-18 11:57:28.220000,Dmitriy Grankin,[Sergey Ryabenko],2024-09-18
205,9,Andrey is referenced as a good developer with ...,Dmitriy and Sergey discuss Andrey's coding spe...,"Dmitriy Grankin: Реально, Андрей, допустим, х...",Andrey,person,52cd3cf1-7c60-4e77-b861-69ca6ffe914e,2024-09-18 11:57:28.220000,Dmitriy Grankin,[Sergey Ryabenko],2024-09-18
210,15,Artificial Intelligence (AI) refers to the sim...,Dmitriy and Sergey discuss the implications of...,Dmitriy Grankin: Overqualified. Интересная фо...,AI,concept,52cd3cf1-7c60-4e77-b861-69ca6ffe914e,2024-09-18 11:57:28.220000,Dmitriy Grankin,[Sergey Ryabenko],2024-09-18
214,0,The team discussed the current state of a Reac...,Dmitriy and Sergey reviewed the React files an...,Dmitriy Grankin: показать и понять что тут во...,Current state of React project,discussion,8ead624d-bab2-4c6b-b837-0c97d5e6f32a,2024-09-18 17:37:35.184000,Dmitriy Grankin,[Sergey Ryabenko],2024-09-18
217,20,GitHub is a platform for version control and c...,Dmitriy and Sergey discuss the importance of v...,,GitHub,company,ce1988d6-b5a7-4c04-b322-d986523f0e3c,2024-09-23 13:29:51.040000,Dmitriy Grankin,[Sergey Ryabenko],2024-09-23
256,4,The team is open to experimenting with the cur...,Dmitriy and Sergey expressed interest in testi...,": Да, надо попробовать, это практически безоп...",Experimentation with existing systems,opportunity,4336b4b4-7a6d-4ec7-9d62-d4620ecdbe4d,2024-09-19 13:57:01.704000,Dmitriy Grankin,[Sergey Ryabenko],2024-09-19
307,21,JavaScript is a programming language commonly ...,Dmitriy and Sergey reference JavaScript in the...,,JavaScript,programming language,ce1988d6-b5a7-4c04-b322-d986523f0e3c,2024-09-23 13:29:51.040000,Dmitriy Grankin,[Sergey Ryabenko],2024-09-23


In [47]:
search_params

SearchFilter(topic_type=['product', 'company', 'idea', 'discussion'], topic_name=['VEX', 'Vexa', 'VEXA'], speaker_name=None, date_range=None, search_text=None)

In [45]:
joined_df[joined_df['topic_type'].isin(['product', 'company', 'idea', 'discussion'])].sort_values(by='meeting_time',ascending=True)

Unnamed: 0,summary_index,summary,details,referenced_text,topic_name,topic_type,meeting_id,meeting_timestamp,speaker_name,other_speakers,meeting_time
1289,12,Google Ads is a platform being set up for mark...,Olga mentions that Google Ads is in the setup ...,"Olga Nemirovskaya: Google Ads. Короче, все в ...",Google Ads,product,cd43879a-0b16-4473-9744-04ae567626ff,2024-08-26 19:14:18.976,Olga Nemirovskaya,[Dmitriy Grankin],2024-08-26
1293,16,Телеграм is a messaging platform being conside...,Olga expresses concerns about the future of Те...,"Olga Nemirovskaya: Вот. Значит, дальше. Что у...",Телеграм,product,cd43879a-0b16-4473-9744-04ae567626ff,2024-08-26 19:14:18.976,Olga Nemirovskaya,[Dmitriy Grankin],2024-08-26
1321,10,"The dashboard is a key focus of the meeting, w...",Dmitriy emphasizes the need for a user-friendl...,"Dmitriy Grankin: Yeah, but like, it looks lik...",dashboard,product,29fb1a35-e656-4435-89db-cbd8ad221a79,2024-08-27 09:17:03.250,Dmitriy Grankin,[Sergey Ryabenko],2024-08-27
1322,11,The extension is mentioned as a product that i...,Sergey indicates that working on the dashboard...,"Sergey Ryabenko: Um, yeah, then let's, let's ...",extension,product,29fb1a35-e656-4435-89db-cbd8ad221a79,2024-08-27 09:17:03.250,Sergey Ryabenko,[Dmitriy Grankin],2024-08-27
1377,12,Google Analytics is a web analytics service th...,The discussion revolves around integrating Goo...,"Dmitriy Grankin: Своя система аналитики, нам ...",Google Analytics,product,99ade764-fd60-4e9a-9621-66cc037cb865,2024-08-27 19:34:48.996,Dmitriy Grankin,[Olga Nemirovskaya],2024-08-27
...,...,...,...,...,...,...,...,...,...,...,...
1240,12,Zoom is another video conferencing tool mentio...,"Similar to Google Meet, Zoom is referenced as ...",Dmitry Grankin: yes definitely you do and uh ...,Zoom,product,50d5d659-3a57-42f2-8f32-31c7145ebe91,2024-10-24 09:32:32.980,Dmitry Grankin,[Özay Demirezen],2024-10-24
1241,13,Telegram is a messaging app that Dmitry used t...,Dmitry mentions using Telegram in conjunction ...,"Özay Demirezen: Because like, yeah, pretty mu...",Telegram,product,50d5d659-3a57-42f2-8f32-31c7145ebe91,2024-10-24 09:32:32.980,Dmitry Grankin,[Özay Demirezen],2024-10-24
1242,14,A Chrome Extension developed by Dmitry to enha...,Dmitry discusses the development of a Chrome e...,"Özay Demirezen: Okay, yeah, this,",Chrome Extension,product,50d5d659-3a57-42f2-8f32-31c7145ebe91,2024-10-24 09:32:32.980,Dmitry Grankin,[Özay Demirezen],2024-10-24
1243,16,WhatsApp is a messaging platform that Özay men...,Özay discusses the difficulty of recording cal...,Dmitry Grankin: with fred fred you know okay ...,WhatsApp,product,50d5d659-3a57-42f2-8f32-31c7145ebe91,2024-10-24 09:32:32.980,Özay Demirezen,[Dmitry Grankin],2024-10-24
