In [1]:
import json
import random
import logging
import pandas as pd
from tqdm.auto import tqdm

from dotenv import load_dotenv
load_dotenv('../.env')

from kaggle_competition_assistant.utils import create_documents
from kaggle_competition_assistant.index.opensearch_index import OpenSearchIndex

# Set the maximum number of rows pandas renders when a DataFrame is displayed
pd.set_option('display.max_rows', 100)

# Configure logging: only WARNING and above, each record prefixed with timestamp, logger name and level
logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

## Ingestion

In [2]:
def create_index(competition_slug):
    """Build a search index over the documents of one competition.

    The documents are produced by ``create_documents`` from the competition's
    folder under ``../data/`` and are then indexed into a new ``OpenSearchIndex``.

    Args:
        competition_slug: Competition identifier; it is also the name of the
            competition's sub-folder under ``../data/``.

    Returns:
        The ``OpenSearchIndex`` instance after its ``index()`` method has been
        called with the created documents.
    """
    documents = create_documents(competition_slug, '../data/' + competition_slug)

    # 'source', 'section' and 'text' are registered as text fields,
    # 'url' and 'id' as keyword fields
    opensearch_index = OpenSearchIndex(
        text_fields=['source', 'section', 'text'],
        keyword_fields=['url', 'id'],
    )
    opensearch_index.index(documents)
    return opensearch_index

In [3]:
# Slug of the first competition to evaluate; it selects the data folder and the evaluation file names below
competition_slug = 'llm-zoomcamp-2024-competition'

In [4]:
# Build the search index for the selected competition from ../data/<competition_slug>
index = create_index(competition_slug)



Creating document embeddings:   0%|          | 0/91 [00:00<?, ?it/s]

## Retrieval evaluation

In [5]:
# Load the ground-truth questions of this competition (doc_id / question / answer columns, see the preview)
df_question = pd.read_csv(f'../data/evaluation/{competition_slug}-ground-truth.csv')

# One dict per question row; this is the structure `evaluate` iterates over
ground_truth = df_question.to_dict(orient='records')

# Preview five random rows (no random_state is passed, so the rows shown differ between runs)
df_question.sample(5)

Unnamed: 0,doc_id,question,answer
11,7,What is the expected format of the submission ...,The submission file should have two columns: p...
29,20,How many files are provided in the dataset?,7
16,9,Who is the competition host?,ololo
6,3,What languages are the mathematical problems p...,English and Russian
39,29,What is the score of ArturG's submission?,0.93750


In [6]:
def hit_rate(relevance_total):
    """Fraction of queries for which at least one retrieved result is relevant.

    Args:
        relevance_total: One sequence of boolean flags per query; the flag at
            position ``i`` tells whether the result at rank ``i + 1`` is the
            expected document.

    Returns:
        The hit rate as a float in ``[0, 1]``. An empty ``relevance_total``
        yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    if len(relevance_total) == 0:
        return 0.0

    # a query counts as a hit as soon as any of its results is relevant
    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    For each query the score is ``1 / rank`` of the first relevant result
    (ranks start at 1) or ``0`` when no result is relevant; the scores are
    averaged over all queries.

    Args:
        relevance_total: One sequence of boolean flags per query; the flag at
            position ``i`` tells whether the result at rank ``i + 1`` is the
            expected document.

    Returns:
        The MRR as a float in ``[0, 1]``. An empty ``relevance_total`` yields
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # truthiness instead of `is True`, so non-`bool` flags (e.g. numpy booleans) count too
            if is_relevant:
                total_score += 1 / rank
                # only the first relevant result contributes; without this a query
                # with several matching results would be counted more than once
                break

    return total_score / len(relevance_total)

In [7]:
def evaluate(ground_truth, search_function, progress_bar=True):
    """Score a search function against ground-truth questions.

    Args:
        ground_truth: Iterable of question records; each record must provide
            the expected document id under ``'doc_id'``.
        search_function: Callable that receives one record and returns the
            retrieved documents, each exposing its id under ``'id'``.
        progress_bar: When true, the iteration is wrapped in ``tqdm``.

    Returns:
        Dict with the keys ``'hit_rate'`` and ``'mrr'`` computed over all
        records.
    """
    queries = tqdm(ground_truth) if progress_bar else ground_truth

    def relevance_of(query):
        # the expected id is read before the search runs, as one flag per returned document follows
        expected_id = query['doc_id']
        return [doc['id'] == expected_id for doc in search_function(query)]

    relevance_total = [relevance_of(query) for query in queries]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [8]:
# Hand-picked boost weights, one per text field of the index (source, section, text).
# NOTE: the evaluation loop in the next cell uses `boost_dict` as its loop variable and overwrites this value.
boost_dict = {'source': 1.0, 'section': 10.0, 'text': 2.0}

In [9]:
# Light-weight grid: every search type x result count x (hand-picked boosts | no boosts)
results = []

for search_type in ('lexical', 'semantic', 'hybrid_rff'):
    for num_results in (5, 10):
        for boost_dict in ({'source': 1.0, 'section': 10.0, 'text': 2.0}, {}):
            # semantic search is evaluated with the empty boost dict only
            if boost_dict and search_type == 'semantic':
                continue

            # the lambda is consumed inside this iteration, so it sees the current loop values
            evaluation_results = evaluate(
                ground_truth,
                lambda q: index.search(
                    q['question'],
                    search_type=search_type,
                    boost_dict=boost_dict,
                    num_results=num_results,
                ),
            )

            results.append(dict(
                search_type=search_type,
                num_results=num_results,
                boost_dict=json.dumps(boost_dict),
                hit_rate=evaluation_results['hit_rate'],
                mrr=evaluation_results['mrr'],
            ))

# one row per evaluated configuration, shown from worst to best MRR
results = pd.DataFrame(results, columns=['search_type', 'num_results', 'boost_dict', 'hit_rate', 'mrr'])
results.sort_values('mrr')

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

Unnamed: 0,search_type,num_results,boost_dict,hit_rate,mrr
1,lexical,5,{},0.6,0.45
3,lexical,10,{},0.688889,0.459444
0,lexical,5,"{""source"": 1.0, ""section"": 10.0, ""text"": 2.0}",0.666667,0.545185
2,lexical,10,"{""source"": 1.0, ""section"": 10.0, ""text"": 2.0}",0.8,0.562681
7,hybrid_rff,5,{},0.8,0.584815
9,hybrid_rff,10,{},0.933333,0.628113
6,hybrid_rff,5,"{""source"": 1.0, ""section"": 10.0, ""text"": 2.0}",0.822222,0.66
4,semantic,5,{},0.844444,0.667037
5,semantic,10,{},0.933333,0.679691
8,hybrid_rff,10,"{""source"": 1.0, ""section"": 10.0, ""text"": 2.0}",0.911111,0.691878


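After light-weight tuning of the search configuration:
- by MRR: hybrid search with RRF (reciprocal rank fusion, `hybrid_rff` in the code) reranking and field boosts is the best (0.692 with 10 results)
- by hit rate: semantic search and hybrid search without boosts tie for the best (0.933 with 10 results)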

### Search hyperparameters tuning

In [10]:
# select a validation set for tuning
val_size = len(df_question) // 2
df_validation = df_question.sample(n=val_size, random_state=42)

gt_val = df_validation.to_dict(orient='records')

In [11]:
def simple_random_search_optimization(param_ranges, objective_function, n_iterations=10):
    """Maximise an objective by sampling random parameter sets.

    Args:
        param_ranges: Dict mapping a parameter name to a ``(min_val, max_val)``
            pair. When both bounds are ints the value is drawn with
            ``random.randint`` (bounds inclusive), otherwise with
            ``random.uniform``.
        objective_function: Callable that receives one sampled parameter dict
            and returns a score; higher is better.
        n_iterations: Number of parameter sets to sample.

    Returns:
        Tuple ``(best_params, best_score)``. If no sample scored above
        ``-inf`` (e.g. ``n_iterations`` is 0) this is ``(None, -inf)``.
    """
    def draw(low, high):
        # integer bounds -> integer sample; anything else -> float sample
        if isinstance(low, int) and isinstance(high, int):
            return random.randint(low, high)
        return random.uniform(low, high)

    best_params, best_score = None, float('-inf')

    for _ in tqdm(range(n_iterations), total=n_iterations, desc='Random search iterations'):
        candidate = {name: draw(low, high) for name, (low, high) in param_ranges.items()}
        score = objective_function(candidate)

        # strict comparison: on ties the earliest sample is kept
        if score > best_score:
            best_params, best_score = candidate, score

    return best_params, best_score

In [12]:
# Search space for the random search: (min, max) boost per text field.
# The bounds are floats, so simple_random_search_optimization samples them with random.uniform.
param_ranges = {
    'source': (0.0, 10.0),
    'section': (0.0, 10.0),
    'text': (0.0, 10.0)
}

def objective(boost_params):
    """Return the MRR on the validation questions for one set of field boosts.

    Args:
        boost_params: Boost dict passed to ``index.search`` as ``boost_dict``;
            all other search arguments are left at the index defaults.

    Returns:
        The ``'mrr'`` value reported by ``evaluate`` on ``gt_val``.
    """
    scores = evaluate(
        gt_val,
        lambda q: index.search(q['question'], boost_dict=boost_params),
        progress_bar=False,
    )
    return scores['mrr']

Test run

In [13]:
# Quick sanity check of the random search on the validation questions (20 samples).
# NOTE: `random` is not seeded before this call, so the best parameters differ between runs.
simple_random_search_optimization(param_ranges, objective, n_iterations=20)

Random search iterations:   0%|          | 0/20 [00:00<?, ?it/s]

({'source': 2.1384018352401393,
  'section': 9.908948524711427,
  'text': 2.879808810954154},
 0.6833333333333332)

Full run with all search types and num results

In [14]:
def full_run_optimization(index, ground_truth, random_state=42, n_iterations=100) -> pd.DataFrame:
    """Tune the field boosts of every search configuration and evaluate it.

    For each combination of search type ('lexical', 'semantic', 'hybrid_rff')
    and number of results (5, 10):

    * 'semantic' is evaluated with an empty boost dict (no tuning);
    * otherwise the boosts for 'source', 'section' and 'text' are tuned by
      random search in [0, 10], maximising MRR on a validation half of
      ``ground_truth``;
    * the selected boosts are then evaluated on the full ``ground_truth``.

    The validation questions are sampled from ``ground_truth`` itself (not from
    any notebook-level DataFrame), so the function depends only on its
    arguments. Because the validation half is part of ``ground_truth``, the
    reported scores of tuned configurations are partly measured on the
    questions used for tuning.

    Args:
        index: Search index exposing ``search(query, search_type=...,
            boost_dict=..., num_results=...)``.
        ground_truth: List of question records (dicts with at least
            ``'question'`` and ``'doc_id'``).
        random_state: Seed for both the ``random`` module (boost sampling) and
            the validation split.
        n_iterations: Number of random-search samples per tuned configuration.

    Returns:
        DataFrame with one row per configuration and the columns
        ``search_type``, ``num_results``, ``boost_dict`` (JSON string),
        ``hit_rate`` and ``mrr``.
    """
    # seeds the global `random` generator used by the random search
    random.seed(random_state)

    param_ranges = {
        'source': (0.0, 10.0),
        'section': (0.0, 10.0),
        'text': (0.0, 10.0)
    }

    # select a validation set for tuning: a seeded random half of the given questions
    val_size = len(ground_truth) // 2
    gt_val = (
        pd.DataFrame(ground_truth)
        .sample(n=val_size, random_state=random_state)
        .to_dict(orient='records')
    )

    def make_search_function(search_type, num_results, boost_dict):
        # bind the configuration explicitly instead of relying on the loop variables
        def search_function(q):
            return index.search(q['question'], search_type=search_type, boost_dict=boost_dict, num_results=num_results)
        return search_function

    rows = []
    for search_type in ['lexical', 'semantic', 'hybrid_rff']:
        for num_results in [5, 10]:
            if search_type == 'semantic':
                boost_dict = {}
            else:
                def objective(boost_params):
                    scores = evaluate(gt_val,
                                      make_search_function(search_type, num_results, boost_params),
                                      progress_bar=False)
                    return scores['mrr']

                # find the best boosting parameters
                boost_dict, _ = simple_random_search_optimization(param_ranges, objective, n_iterations=n_iterations)

            # rerun evaluation using the full ground-truth set
            evaluation_results = evaluate(ground_truth,
                                          make_search_function(search_type, num_results, boost_dict),
                                          progress_bar=False)

            rows.append({
                'search_type': search_type,
                'num_results': num_results,
                'boost_dict': json.dumps(boost_dict),
                'hit_rate': evaluation_results['hit_rate'],
                'mrr': evaluation_results['mrr'],
            })

    return pd.DataFrame(rows, columns=['search_type', 'num_results', 'boost_dict', 'hit_rate', 'mrr'])

In [15]:
# Tune the boosts and evaluate every search configuration on the first competition
results = full_run_optimization(index, ground_truth)
# order from worst to best MRR, so the best configuration is the last row
results.sort_values(by='mrr', ascending=True, inplace=True)
# persist the comparison table in the same folder as the ground-truth file
results.to_csv(f'../data/evaluation/{competition_slug}-retrieval-results.csv', index=False)
results

Random search iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Random search iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Random search iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Random search iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,search_type,num_results,boost_dict,hit_rate,mrr
0,lexical,5,"{""source"": 6.981393949882269, ""section"": 3.402...",0.688889,0.524815
1,lexical,10,"{""source"": 0.24786361898188725, ""section"": 7.3...",0.777778,0.537222
4,hybrid_rff,5,"{""source"": 4.937952193450843, ""section"": 0.804...",0.844444,0.655556
2,semantic,5,{},0.844444,0.667037
3,semantic,10,{},0.933333,0.679691
5,hybrid_rff,10,"{""source"": 9.152559087431595, ""section"": 2.213...",0.955556,0.705459


After the more thorough tuning of the boost hyperparameters, hybrid search with RRF reranking (`hybrid_rff`, 10 results) turned out to be the best on both hit rate (0.956) and MRR (0.705).

### Evaluation on the 2nd dataset

In [16]:
# Switch to the second competition; this rebinds the notebook-level `competition_slug` and `index`
competition_slug = 'rohlik-orders-forecasting-challenge'
index = create_index(competition_slug)

# Reload the ground truth for the new competition (rebinds `df_question` and `ground_truth`)
df_question = pd.read_csv(f'../data/evaluation/{competition_slug}-ground-truth.csv')
ground_truth = df_question.to_dict(orient='records')
# NOTE: this is not the last expression of the cell, so the preview is not displayed
df_question.head()

# Same tuning + evaluation as for the first competition; full_run_optimization reads the global
# `df_question` for its validation split, which is why it has to be reloaded above
results = full_run_optimization(index, ground_truth)
# order from worst to best MRR, so the best configuration is the last row
results.sort_values(by='mrr', ascending=True, inplace=True)
results.to_csv(f'../data/evaluation/{competition_slug}-retrieval-results.csv', index=False)
results



Creating document embeddings:   0%|          | 0/1117 [00:00<?, ?it/s]

Random search iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Random search iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Random search iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Random search iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,search_type,num_results,boost_dict,hit_rate,mrr
0,lexical,5,"{""source"": 3.380855621474553, ""section"": 5.883...",0.652174,0.520435
1,lexical,10,"{""source"": 6.6945884461991065, ""section"": 5.63...",0.73913,0.533427
4,hybrid_rff,5,"{""source"": 5.077376758096564, ""section"": 8.205...",0.817391,0.68
5,hybrid_rff,10,"{""source"": 8.238553513904476, ""section"": 9.093...",0.886957,0.685059
2,semantic,5,{},0.808696,0.694203
3,semantic,10,{},0.886957,0.70647


For the second, larger dataset, semantic search turned out to be slightly better on MRR (0.706 vs. 0.685 for hybrid search with 10 results), while the hit rate is tied (0.887).

**Note:** both datasets and index sizes are small.