In [1]:
import pandas as pd
import numpy as np
import ast

from dotenv import load_dotenv
from llm_requests import *
from pinecone_db import *

from tqdm import tqdm

load_dotenv()

True

In [2]:
import os
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ['TOKENIZERS_PARALLELISM'] = "false"

In [3]:
with open('TestQueries.txt', 'r', encoding='utf-8') as f:
    search_queries = f.readlines()
    search_queries = list(map(lambda q: q.strip(), search_queries))

search_queries[:3]

['\ufeffAnimation movie toys move into new house, jealous toy sheriff, space ranger action figure, sadistic neighbor, mutant toys, Pizza Planet, Christmas gift-opening scene',
 'Documentary martial arts dance rituals, war dances, sword dances, cultural significance, martial arts performance with music',
 'Dark comedy series based on graphic novel, teen psychopath road trip, rebel adventure, star-crossed teenagers']

In [7]:
from openai import OpenAI
client = OpenAI()

def gpt_evaluate(search_query: str, search_results: list, k: int):
    prompt = ""
    cnt = 1
    for res in search_results:
        title, description = res['title'], res['description']
        if title not in prompt and cnt <= k and len(description) > 20:
            prompt += f"{cnt}. Title: {title}\nDescription: {description}\n\n"
            cnt += 1
    prompt = prompt.strip()

    response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a fair evaluator of movie search system."
         "You will be provided a search query and search results with movies descriptions. "
         "Your task is for each movie to return either 0 or 1 to indicate whether movie is relevant "
         "and good reccomendation based on provided search query or not. Output format should be Python list "
         "of 0s and 1s of length returned movies."},
        {
            "role": "user",
            "content": f"""
Search query: {search_query}\n\nOutput movies:
{prompt}
Provide evaluation list of length {k}.
"""
            },
        ]
    )
    response_str = response.choices[0].message.content
    start, end = response_str.find('['), response_str.find(']')
    return ast.literal_eval(response_str[start:end+1])

In [8]:
def extract_metadata(query: str) -> dict:
    url = f"http://localhost:8085/extract_metadata"
    body = {
        "query": query,
        "parameters": {}
    }
    try:
        response = requests.post(url, json=body)
        if response.status_code == 200:
            return response.json()
        return {"error":
                f"Request failed with status code {response.status_code}\nAPI URL: {url}"
                }
    except Exception as e:
        return {"error": f"An error occurred: {str(e)}"}

def search_movies(
        chunking_strategy: str,
        embedding_model: str,
        query: str,
        metadata: dict,
        k: int,
        min_similarity_score: float) -> dict:
    url = f"http://localhost:8080/search_movies"
    body = {
        "chunking_strategy": chunking_strategy,
        "embedding_model": embedding_model,
        "query": query,
        "metadata": metadata,
        "k": k,
        "min_similarity_score": min_similarity_score
    }
    try:
        response = requests.post(url, json=body)
        if response.status_code == 200:
            return response.json()
        return {"error":
                f"Request failed with status code {response.status_code}"
                }
    except Exception as e:
        return {"error": f"An error occurred: {str(e)}"}

In [6]:
CHUNKING_STRATEGIES = [
    'fixed-size-splitter',
    'recursive-splitter',
    'semantic-splitter'
    ]
EMBEDDING_MODELS = [
    'all-MiniLM-L6-v2',
    'bert-base-nli-mean-tokens',
    'gtr-t5-base'
]

In [87]:
def evaluate_retrieval(search_queries: list, k: int, search: str):
    MAX_ATTEMPTS = 3
    evaluation_results = pd.DataFrame(
        {
            'chunking_strategy': [],
            'embedding_model': [],
            'search_query': [],
            'extracted_metadata': [],
            'scores': []
        }
    )

    for i, query in zip(tqdm(range(len(search_queries))), search_queries):
        attempt = 0
        success = False
        while not success and attempt < MAX_ATTEMPTS:
            try:
                if search == 'hybrid':
                    metadata = extract_metadata(query)
                    metadata['generated_response'] = ast.literal_eval(metadata['generated_response'])
                    metadata = metadata['generated_response']
                else:
                    metadata = {}
                for chunking_strategy in CHUNKING_STRATEGIES:
                    for embedding_model in EMBEDDING_MODELS:
                        search_results = search_movies(
                            chunking_strategy=chunking_strategy,
                            embedding_model=embedding_model,
                            query=query,
                            metadata=metadata,
                            k=k,
                            min_similarity_score=0
                        )['search_results']

                        scores = gpt_evaluate(query, search_results, k)
                        evaluation_results = pd.concat([evaluation_results, pd.DataFrame(
                                {
                                    'chunking_strategy': [chunking_strategy],
                                    'embedding_model': [embedding_model],
                                    'search_query': [query],
                                    'extracted_metadata': [metadata],
                                    'scores': [scores]
                                    }
                        )])
                        success = True
                        evaluation_results.to_csv(f'retrieval_validation/gpt_evaluations_{search}_search.csv', index=False)
            except:
                attempt += 1

    return evaluation_results

evaluation_results = evaluate_retrieval(search_queries=search_queries, k=10, search='hybrid')
evaluation_results.to_csv('retrieval_validation/gpt_evaluations_hybrid_search.csv', index=False)
evaluation_results

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [01:30<00:00, 45.17s/it]


Unnamed: 0,chunking_strategy,embedding_model,search_query,extracted_metadata,scores
0,fixed-size-splitter,all-MiniLM-L6-v2,"﻿Animation movie toys move into new house, jea...","{'title': 'Toy Story', 'genre': 'animation', '...","[1, 1, 0, 1, 0, 0, 1, 1, 0, 0]"
1,fixed-size-splitter,bert-base-nli-mean-tokens,"﻿Animation movie toys move into new house, jea...","{'title': 'Toy Story', 'genre': 'animation', '...","[1, 1, 0, 1, 0, 0, 0, 0, 0, 0]"
2,fixed-size-splitter,gtr-t5-base,"﻿Animation movie toys move into new house, jea...","{'title': 'Toy Story', 'genre': 'animation', '...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
3,recursive-splitter,all-MiniLM-L6-v2,"﻿Animation movie toys move into new house, jea...","{'title': 'Toy Story', 'genre': 'animation', '...","[1, 0, 1, 1, 0, 0, 1, 0, 1, 1]"
4,recursive-splitter,bert-base-nli-mean-tokens,"﻿Animation movie toys move into new house, jea...","{'title': 'Toy Story', 'genre': 'animation', '...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 1]"
...,...,...,...,...,...
0,semantic-splitter,bert-base-nli-mean-tokens,British crime drama centered around a detectiv...,"{'title': '', 'genre': 'british shows', 'min_y...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
0,semantic-splitter,gtr-t5-base,British crime drama centered around a detectiv...,"{'title': '', 'genre': 'british shows', 'min_y...","[1, 1, 0, 1, 1, 1, 0, 1, 1, 1]"
0,semantic-splitter,all-MiniLM-L6-v2,"Spanish-language shows telenovelas, Latin Amer...","{'title': '', 'genre': 'Spanish-language shows...","[1, 1, 0, 1, 0, 1, 0, 1, 0, 0]"
0,semantic-splitter,bert-base-nli-mean-tokens,"Spanish-language shows telenovelas, Latin Amer...","{'title': '', 'genre': 'Spanish-language shows...","[1, 1, 1, 0, 1, 0, 1, 1, 0, 0]"


In [12]:
evaluation_results = evaluate_retrieval(search_queries=search_queries, k=10, search='vector')
evaluation_results.to_csv('retrieval_validation/gpt_evaluations_vector_search.csv', index=False)
evaluation_results

100%|██████████| 100/100 [17:04<00:00, 10.25s/it]


Unnamed: 0,chunking_strategy,embedding_model,search_query,extracted_metadata,scores
0,fixed-size-splitter,all-MiniLM-L6-v2,"﻿Animation movie toys move into new house, jea...",{},"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0]"
0,fixed-size-splitter,bert-base-nli-mean-tokens,"﻿Animation movie toys move into new house, jea...",{},"[1, 1, 0, 0, 1, 1, 0, 0, 0, 0]"
0,fixed-size-splitter,gtr-t5-base,"﻿Animation movie toys move into new house, jea...",{},"[1, 1, 0, 0, 0, 1, 0, 0, 0, 0]"
0,recursive-splitter,all-MiniLM-L6-v2,"﻿Animation movie toys move into new house, jea...",{},"[1, 0, 1, 1, 1, 0, 0, 0, 0, 1]"
0,recursive-splitter,bert-base-nli-mean-tokens,"﻿Animation movie toys move into new house, jea...",{},"[1, 1, 0, 1, 0, 0, 0, 0, 1, 0]"
...,...,...,...,...,...
0,recursive-splitter,bert-base-nli-mean-tokens,"Film-noir classic noir atmosphere, gritty crim...",{},"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
0,recursive-splitter,gtr-t5-base,"Film-noir classic noir atmosphere, gritty crim...",{},"[0, 0, 1, 1, 0, 0, 0, 1, 0, 0]"
0,semantic-splitter,all-MiniLM-L6-v2,"Film-noir classic noir atmosphere, gritty crim...",{},"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
0,semantic-splitter,bert-base-nli-mean-tokens,"Film-noir classic noir atmosphere, gritty crim...",{},"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [3]:
def compute_precision_at_k(evaluation_results: pd.DataFrame) -> float:
    scores = []

    for _, row in evaluation_results.iterrows():
        score = row['scores']
        precision = sum(score) / len(score)
        scores.append(precision)
    return round(np.mean(scores), 4)

def compute_hit_rate(evaluation_results: pd.DataFrame) -> float:
    scores = []

    for _, row in evaluation_results.iterrows():
        score = row['scores']
        if 1 in score:
            scores.append(1)
        else:
            scores.append(0)
    return round(np.mean(scores), 4)

def compute_ndcg(evaluation_results: pd.DataFrame) -> float:
    scores = []

    for _, row in evaluation_results.iterrows():
        score = row['scores']
        dcg = sum([s/np.log(i+2) for i, s in enumerate(score)])
        idcg = sum([1/np.log(i+2) for i, s in enumerate(score)])
        scores.append(dcg/idcg)
    return round(np.mean(scores), 4)

def compute_mrr(evaluation_results: pd.DataFrame) -> float:
    scores = []

    for _, row in evaluation_results.iterrows():
        score = row['scores']
        mrr = [1/(i+1) for i, s in enumerate(score) if s == 1]
        mrr = mrr[0] if len(mrr) > 0 else 0
        scores.append(mrr)
    return round(np.mean(scores), 4)

def compute_all_metrics(evaluation_results: pd.DataFrame) -> pd.DataFrame:
    models = []
    precision_at_k = []
    hit_rate = []
    ndcg = []
    mrr = []

    for chunking_strategy in CHUNKING_STRATEGIES:
        for embedding_model in EMBEDDING_MODELS:
            model = f"{chunking_strategy} + {embedding_model}"
            valid_df = evaluation_results[(evaluation_results['chunking_strategy'] == chunking_strategy) & (evaluation_results['embedding_model'] == embedding_model)]
            models.append(model)
            precision_at_k.append(compute_precision_at_k(valid_df))
            hit_rate.append(compute_hit_rate(valid_df))
            ndcg.append(compute_ndcg(valid_df))
            mrr.append(compute_mrr(valid_df))

    metrics_df = pd.DataFrame(
        {
            'Chunking Strategy + Embedding model': models,
            'Precision@K': precision_at_k,
            'HitRate': hit_rate,
            'NDCG': ndcg,
            'MRR': mrr
        }
    )

    return metrics_df

In [22]:
evaluation_results = pd.read_csv('retrieval_validation/gpt_evaluations_hybrid_search.csv')
evaluation_results['scores'] = evaluation_results['scores'].apply(lambda score: list(map(int, ast.literal_eval(score))))
metrics = compute_all_metrics(evaluation_results)

metrics.to_csv('retrieval_validation/metrics_hybrid_search.csv', index=False)
metrics

Unnamed: 0,Chunking Strategy + Embedding model,Precision@K,HitRate,NDCG,MRR
0,fixed-size-splitter + all-MiniLM-L6-v2,0.4337,0.91,0.4813,0.7907
1,fixed-size-splitter + bert-base-nli-mean-tokens,0.3979,0.86,0.4392,0.7095
2,fixed-size-splitter + gtr-t5-base,0.3863,0.91,0.441,0.7893
3,recursive-splitter + all-MiniLM-L6-v2,0.3919,0.86,0.4401,0.7655
4,recursive-splitter + bert-base-nli-mean-tokens,0.3964,0.91,0.4407,0.7345
5,recursive-splitter + gtr-t5-base,0.412,0.94,0.456,0.7756
6,semantic-splitter + all-MiniLM-L6-v2,0.4293,0.91,0.4679,0.7532
7,semantic-splitter + bert-base-nli-mean-tokens,0.4177,0.91,0.4568,0.734
8,semantic-splitter + gtr-t5-base,0.4132,0.91,0.4589,0.7739


In [7]:
evaluation_results = pd.read_csv('retrieval_validation/gpt_evaluations_vector_search.csv')
evaluation_results['scores'] = evaluation_results['scores'].apply(lambda score: list(map(int, ast.literal_eval(score))))

metrics = compute_all_metrics(evaluation_results)
metrics.to_csv('retrieval_validation/metrics_vector_search.csv', index=False)
metrics

Unnamed: 0,Chunking Strategy + Embedding model,Precision@K,HitRate,NDCG,MRR
0,fixed-size-splitter + all-MiniLM-L6-v2,0.345,0.94,0.3819,0.6874
1,fixed-size-splitter + bert-base-nli-mean-tokens,0.3,0.86,0.3255,0.5582
2,fixed-size-splitter + gtr-t5-base,0.348,0.95,0.3916,0.7128
3,recursive-splitter + all-MiniLM-L6-v2,0.332,0.93,0.3716,0.6805
4,recursive-splitter + bert-base-nli-mean-tokens,0.3,0.83,0.3222,0.5478
5,recursive-splitter + gtr-t5-base,0.3323,0.9091,0.3741,0.676
6,semantic-splitter + all-MiniLM-L6-v2,0.3313,0.9394,0.365,0.6639
7,semantic-splitter + bert-base-nli-mean-tokens,0.2768,0.8485,0.3102,0.5682
8,semantic-splitter + gtr-t5-base,0.3333,0.9596,0.3709,0.6636
