In [8]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

from langchain_elasticsearch import ElasticsearchRetriever
from langchain_google_genai import ChatGoogleGenerativeAI
from typing import Dict
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.embeddings import SentenceTransformerEmbeddings

In [11]:
model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
embeddings = SentenceTransformerEmbeddings(model_name=model_name)
es_url = 'http://localhost:9200'
def hybrid_query(query: str) -> Dict:
    vector = embeddings.embed_query(query)  # same embeddings as for indexing
    return {
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fuzziness": "AUTO",
                        "fields": ["content", "title","description"],
                        "type": "best_fields",
                        "boost": 0.5,
                    }
                },
            }
        },
        "knn": {
            "field": "content_vector",
            "query_vector": vector,
            "k": 3,
            "num_candidates": 10000,
            "boost": 0.5,
        },
        "size": 3,
    }

index_name = "relationship_consult"
hybrid_retriever = ElasticsearchRetriever.from_es_params(
    index_name=index_name,
    body_func=hybrid_query,
    content_field='content',
    url=es_url,
)



In [3]:
query = "เศร้า"
hybrid_results = hybrid_retriever.invoke(query)
for result in hybrid_results:
    print(result.metadata['_source']['title'], result.metadata['_score'])

ทำความรู้จักกับ โรคซึมเศร้า โดยจิตแพทย์ - Alljit Blog 2.001563
แฟนป่วยโรคซึมเศร้า ส่วนเราเอาไงดี? รับมืออย่างไร - Alljit Blog 1.9367619
มีแฟนแต่รู้สึกเหงา นักจิตวิทยามองว่าอย่างไร? - Alljit Blog 0.36788744


In [9]:
eval_df=pd.read_csv('ground_truth.csv')
eval_df.head()

Unnamed: 0,title,section,question
0,แฟนที่เคยแสนดี นอกใจ มาขอโอกาส นักจิตวิทยาช่วย...,relationship,เมื่อแฟนที่เคยแสนดี นอกใจ มาขอโอกาส เราควรพิจา...
1,แฟนที่เคยแสนดี นอกใจ มาขอโอกาส นักจิตวิทยาช่วย...,relationship,การให้โอกาสแฟนที่นอกใจหมายถึงอะไร? มันแตกต่างจ...
2,แฟนที่เคยแสนดี นอกใจ มาขอโอกาส นักจิตวิทยาช่วย...,relationship,ทำไมเราถึงรู้สึกอยากให้โอกาสแฟนที่นอกใจ แม้จะร...
3,แฟนที่เคยแสนดี นอกใจ มาขอโอกาส นักจิตวิทยาช่วย...,relationship,การให้โอกาสแฟนที่นอกใจอาจนำไปสู่ผลลัพธ์อะไรบ้าง?
4,แฟนที่เคยแสนดี นอกใจ มาขอโอกาส นักจิตวิทยาช่วย...,relationship,เราจะรู้ได้อย่างไรว่าการให้โอกาสแฟนที่นอกใจเป็...


In [10]:
def precision_at_k(retrieved_results, relevant_results, k):
    retrieved_at_k = retrieved_results[:k]
    relevant_at_k = [1 if doc in relevant_results else 0 for doc in retrieved_at_k]
    return sum(relevant_at_k) / k

def recall(retrieved_results, relevant_results):
    relevant_retrieved = [1 if doc in relevant_results else 0 for doc in retrieved_results]
    return sum(relevant_retrieved) / len(relevant_results)

def average_precision(retrieved_results, relevant_results):
    relevant_retrieved = [1 if doc in relevant_results else 0 for doc in retrieved_results]
    precisions = [precision_at_k(retrieved_results, relevant_results, k+1) for k in range(len(relevant_retrieved)) if relevant_retrieved[k] == 1]
    if len(precisions) == 0:
        return 0
    return sum(precisions) / len(relevant_results)

def mean_average_precision(retrieved_results_list, relevant_results_list):
    return sum([average_precision(retrieved_results, relevant_results) 
                for retrieved_results, relevant_results in zip(retrieved_results_list, relevant_results_list)]) / len(relevant_results_list)

import numpy as np

# Discounted Cumulative Gain (DCG)
def dcg(relevances, p):
    """
    Compute DCG for the given relevances and position p.
    relevances: List of binary relevance scores (1 for relevant, 0 for non-relevant)
    p: The number of top results to consider for DCG
    """
    return sum((2**relevances[i] - 1) / np.log2(i + 2) for i in range(min(len(relevances), p)))

# Normalized Discounted Cumulative Gain (NDCG)
def ndcg(retrieved_results, relevant_results, p):
    """
    Compute NDCG at position p.
    retrieved_results: List of retrieved document titles
    relevant_results: List of relevant document titles
    p: The cutoff position for evaluation (e.g., NDCG@p)
    """
    # Calculate relevance scores for the retrieved results (1 if relevant, 0 if not)
    relevances = [1 if doc in relevant_results else 0 for doc in retrieved_results]
    
    # Compute DCG for the retrieved results
    dcg_value = dcg(relevances, p)
    
    # Create ideal relevance ordering (all relevant documents ranked first)
    ideal_relevances = sorted(relevances, reverse=True)
    
    # Compute ideal DCG (IDCG)
    idcg_value = dcg(ideal_relevances, p)
    
    # To avoid division by zero, return 0 if IDCG is 0
    if idcg_value == 0:
        return 0
    
    # Compute NDCG by normalizing DCG with IDCG
    return dcg_value / idcg_value


def reciprocal_rank(retrieved_results, relevant_results):
    for i, doc in enumerate(retrieved_results):
        if doc in relevant_results:
            return 1 / (i + 1)
    return 0

def mean_reciprocal_rank(retrieved_results_list, relevant_results_list):
    return sum([reciprocal_rank(retrieved_results, relevant_results)
                for retrieved_results, relevant_results in zip(retrieved_results_list, relevant_results_list)]) / len(relevant_results_list)

def f1_score(precision, recall):
    if precision + recall == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)

from sklearn.metrics import roc_auc_score

def auc_roc(retrieved_relevances, true_relevances):
    return roc_auc_score(true_relevances, retrieved_relevances)


def hit_rate_at_k(retrieved_results, relevant_results, k):
    retrieved_at_k = retrieved_results[:k]
    return 1 if any(doc in relevant_results for doc in retrieved_at_k) else 0

def hit_rate(retrieved_results_list, relevant_results_list, k):
    return sum([hit_rate_at_k(retrieved_results, relevant_results, k)
                for retrieved_results, relevant_results in zip(retrieved_results_list, relevant_results_list)]) / len(relevant_results_list)


def err(retrieved_results, relevant_results):
    err_value = 0
    relevance_probability = 1
    for i, doc in enumerate(retrieved_results):
        relevance = 1 if doc in relevant_results else 0
        err_value += relevance_probability * (relevance / (i + 1))
        relevance_probability *= (1 - relevance)
    return err_value


In [12]:
# Iterate over each question in ground_truth
precision_scores = []
recall_scores = []
map_scores = []
ndcg_scores = []
mrr_scores = []
f1_scores = []
hit_rate_scores = []
err_scores = []

for i, row in eval_df.iterrows():
    question = row['question']
    
    # Use the question as the query in the hybrid retriever
    hybrid_results = hybrid_retriever.invoke(question)
    
    # Extract the retrieved titles
    retrieved_titles = [result.metadata['_source']['title'] for result in hybrid_results]
    
    # Define the relevant title (from the ground_truth row)
    relevant_titles = [row['title']]
    
    # Calculate Precision@k, Recall, and MAP
    precision_k = precision_at_k(retrieved_titles, relevant_titles, k=3)
    recall_score = recall(retrieved_titles, relevant_titles)
    map_score = average_precision(retrieved_titles, relevant_titles)
    
    # Calculate NDCG@k (assuming k=3)
    ndcg_score = ndcg(retrieved_titles, relevant_titles, p=3)
    
    # Calculate Reciprocal Rank (for MRR)
    mrr_score = reciprocal_rank(retrieved_titles, relevant_titles)
    
    # Calculate F1 Score
    f1 = f1_score(precision_k, recall_score)
    
    # Calculate Hit Rate@k (k=3)
    hit_rate_score = hit_rate_at_k(retrieved_titles, relevant_titles, k=3)
    
    # Calculate ERR
    err_score = err(retrieved_titles, relevant_titles)
    
    # Append results for each query
    precision_scores.append(precision_k)
    recall_scores.append(recall_score)
    map_scores.append(map_score)
    ndcg_scores.append(ndcg_score)
    mrr_scores.append(mrr_score)
    f1_scores.append(f1)
    hit_rate_scores.append(hit_rate_score)
    err_scores.append(err_score)

# Calculate overall average metrics
avg_precision = sum(precision_scores) / len(precision_scores)
avg_recall = sum(recall_scores) / len(recall_scores)
avg_map = sum(map_scores) / len(map_scores)
avg_ndcg = sum(ndcg_scores) / len(ndcg_scores)
avg_mrr = sum(mrr_scores) / len(mrr_scores)
avg_f1 = sum(f1_scores) / len(f1_scores)
avg_hit_rate = sum(hit_rate_scores) / len(hit_rate_scores)
avg_err = sum(err_scores) / len(err_scores)

# Print out the final metrics
print(f"Average Precision@3: {avg_precision}")
print(f"Average Recall: {avg_recall}")
print(f"Average MAP: {avg_map}")
print(f"Average NDCG@3: {avg_ndcg}")
print(f"Average MRR: {avg_mrr}")
print(f"Average F1 Score: {avg_f1}")
print(f"Average Hit Rate@3: {avg_hit_rate}")
print(f"Average ERR: {avg_err}")


Average Precision@3: 0.23715651135005864
Average Recall: 0.7114695340501792
Average MAP: 0.6439665471923534
Average NDCG@3: 0.6613311266385073
Average MRR: 0.6439665471923534
Average F1 Score: 0.3557347670250896
Average Hit Rate@3: 0.7114695340501792
Average ERR: 0.6439665471923534


## Elastic Search Hybri Search Result

### Evaluation Metrics

| Metric                 | Score         |
|------------------------|---------------|
| **Average Precision@3** | 0.2389        |
| **Average Recall**      | 0.7168        |
| **Average MAP**         | 0.6478        |
| **Average NDCG@3**      | 0.6656        |
| **Average MRR**         | 0.6478        |
| **Average F1 Score**    | 0.3584        |
| **Average Hit Rate@3**  | 0.7168        |
| **Average ERR**         | 0.6478        |

### Summary:

- **Precision@3 (23.89%)**: The system retrieves only a small fraction of relevant documents in the top 3 positions, indicating low precision.
- **Recall (71.68%)**: The system does a good job of retrieving most relevant documents, but some are still missed.
- **Mean Average Precision (MAP 64.78%)**: Relevant documents tend to be found early in the results, which is a positive indicator of system performance.
- **NDCG@3 (66.56%)**: The system ranks the documents **66.56% as effectively as an ideal ranking**, showing it’s fairly good at ordering results by relevance.
- **MRR (64.78%)**: On average, the first relevant document is found relatively high in the list.
- **F1 Score (35.84%)**: Although recall is high, low precision results in a lower F1 score, indicating a need for improvement in reducing irrelevant results.
- **Hit Rate@3 (71.68%)**: For about **71.68% of the queries**, at least one relevant document is found within the top 3 results.
- **ERR (64.78%)**: Users are likely to find relevant documents without needing to go far down the results, as **64.78% of the first relevant documents are found early** in the list.

### Conclusion:
The system performs well in terms of recall (finding most relevant documents) and ranks relevant documents relatively high. However, there’s room for improvement in terms of precision, as the top results often include irrelevant documents. Improving the precision would also improve the F1 score and overall user satisfaction with the ranking of results.


## Chroma DB Search

In [3]:
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [4]:
api_key='xxxx'
doc_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001", task_type="retrieval_document",google_api_key=api_key
)

In [13]:
vector_store = Chroma(
    collection_name="relationship",
    embedding_function=doc_embeddings,
    persist_directory="vector_stores",
)

In [16]:
retriever = vector_store.as_retriever(
    search_type="mmr", search_kwargs={"k": 3, "fetch_k": 5}
)

In [19]:
result=retriever.invoke("เศร้า")

In [22]:
result[0].metadata['title']

'รักต่างวัย กับ จิตวิทยาความรัก - Alljit Blog'

In [23]:
# Iterate over each question in ground_truth
precision_scores = []
recall_scores = []
map_scores = []
ndcg_scores = []
mrr_scores = []
f1_scores = []
hit_rate_scores = []
err_scores = []

for i, row in eval_df.iterrows():
    question = row['question']
    
    # Use the question as the query in the hybrid retriever
    results = retriever.invoke(question)
    
    # Extract the retrieved titles
    retrieved_titles = [result.metadata['title'] for result in results]
    
    # Define the relevant title (from the ground_truth row)
    relevant_titles = [row['title']]
    
    # Calculate Precision@k, Recall, and MAP
    precision_k = precision_at_k(retrieved_titles, relevant_titles, k=3)
    recall_score = recall(retrieved_titles, relevant_titles)
    map_score = average_precision(retrieved_titles, relevant_titles)
    
    # Calculate NDCG@k (assuming k=3)
    ndcg_score = ndcg(retrieved_titles, relevant_titles, p=3)
    
    # Calculate Reciprocal Rank (for MRR)
    mrr_score = reciprocal_rank(retrieved_titles, relevant_titles)
    
    # Calculate F1 Score
    f1 = f1_score(precision_k, recall_score)
    
    # Calculate Hit Rate@k (k=3)
    hit_rate_score = hit_rate_at_k(retrieved_titles, relevant_titles, k=3)
    
    # Calculate ERR
    err_score = err(retrieved_titles, relevant_titles)
    
    # Append results for each query
    precision_scores.append(precision_k)
    recall_scores.append(recall_score)
    map_scores.append(map_score)
    ndcg_scores.append(ndcg_score)
    mrr_scores.append(mrr_score)
    f1_scores.append(f1)
    hit_rate_scores.append(hit_rate_score)
    err_scores.append(err_score)

# Calculate overall average metrics
avg_precision = sum(precision_scores) / len(precision_scores)
avg_recall = sum(recall_scores) / len(recall_scores)
avg_map = sum(map_scores) / len(map_scores)
avg_ndcg = sum(ndcg_scores) / len(ndcg_scores)
avg_mrr = sum(mrr_scores) / len(mrr_scores)
avg_f1 = sum(f1_scores) / len(f1_scores)
avg_hit_rate = sum(hit_rate_scores) / len(hit_rate_scores)
avg_err = sum(err_scores) / len(err_scores)

# Print out the final metrics
print(f"Average Precision@3: {avg_precision}")
print(f"Average Recall: {avg_recall}")
print(f"Average MAP: {avg_map}")
print(f"Average NDCG@3: {avg_ndcg}")
print(f"Average MRR: {avg_mrr}")
print(f"Average F1 Score: {avg_f1}")
print(f"Average Hit Rate@3: {avg_hit_rate}")
print(f"Average ERR: {avg_err}")

Average Precision@3: 0.03584229390681003
Average Recall: 0.10752688172043011
Average MAP: 0.08811230585424135
Average NDCG@3: 0.09129072990911476
Average MRR: 0.08632019115890084
Average F1 Score: 0.053763440860215055
Average Hit Rate@3: 0.1057347670250896
Average ERR: 0.08632019115890084
