In [125]:
# Import libraries and modules
import pandas as pd
import minsearch
from tqdm.auto import tqdm
from openai import OpenAI
from sentence_transformers import SentenceTransformer


In [144]:
# import data
df=pd.read_csv('../data/clean_data/final_data_with_IDs_new.csv')

In [145]:
documents=df.to_dict(orient='records')

In [146]:
len(documents)

5650

In [129]:
documents[0]

{'id': 0,
 'Chunked_Content': "when you eat it alone?. Good idea to eat it once wuth honey and once alone My kids eat it with the meal.They aren't at home btw meals I will do it during week end Yesterday I had nausea after several hours passed from second meal.Today I had zero hunger until 20h+ passed after previous that meal Today I made only one typical meal enhanced with 4 egg yolks",
 'number of sentences': 2,
 'number of words': 69}

In [130]:
### Transformer download
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



## Indexing with Elasticsearch

In [32]:
# Connect to the local Elasticsearch instance and check that it responds
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()

ObjectApiResponse({'name': 'c6e0ef327706', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'oJUtKKIDSXWCTt_emGDNlg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [33]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "Chunked_Content": {"type": "text"},
            "text_vector":
            {"type": "dense_vector", 
             "dims": 384,
             "index": True, 
             "similarity": "cosine"},
        }
    }
}

In [34]:
index_name = "diet-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'diet-questions'})

In [35]:
len(model.encode('hello world'))

384

In [36]:
# Attach a dense embedding of each chunk's text to its document record.
# Note: the records in `documents` are updated in place, so `operations`
# ends up holding the very same dict objects as `documents`.
operations = []
for doc in tqdm(documents):
    chunk_embedding = model.encode(doc["Chunked_Content"])
    # Store a plain Python list rather than the encoder's return object.
    doc["text_vector"] = chunk_embedding.tolist()
    operations.append(doc)

  0%|          | 0/5650 [00:00<?, ?it/s]

In [38]:
type(operations)

list

In [40]:
import pickle


# Save the list of dictionaries to a pickle file
with open('../data/clean_data/semantic_vector_search.pkl', 'wb') as file:
    pickle.dump(operations, file)

print("Data saved to pickle file successfully.")


Data saved to pickle file successfully.


In [46]:
# Reload the pickled documents-with-embeddings as a sanity check.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('../data/clean_data/semantic_vector_search.pkl', 'rb') as file:
    loaded_data = pickle.load(file)

# Spot-check the first record: its embedding should come back as a plain list.
item=loaded_data[0]['text_vector']
type(item)


list

In [47]:
# Index every document (text fields plus its "text_vector") into Elasticsearch,
# one request per document. This iterates the in-memory `operations` list,
# not the `loaded_data` read back from the pickle file.
for doc in tqdm(operations):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/5650 [00:00<?, ?it/s]

In [51]:
query = 'How can I lose belly fat?'

In [52]:
v_q = model.encode(query)

In [54]:
query = {
    "field": "text_vector",
    "query_vector": v_q,
    "k": 5,
    "num_candidates": 10000, 
}

In [105]:
res = es_client.search(index=index_name, knn=query, source=["Chunked_Content", "id"])
res["hits"]["hits"]

[{'_index': 'diet-questions',
  '_id': 'YvRNTZIBcgRRhKtd9ruE',
  '_score': 0.79868144,
  '_source': {'id': 4407,
   'Chunked_Content': 'Basically holding your breath ?. Breathing less ?. Thank you for good information vilt💪.. I have issues eating rendered fat or fat that has been cooked e.g beef tallow, ghee, etc not really, breathing small portions at a time and more slowly takes time to master I remember how in excercises i was teached to breath true the diaphragm so it moves up and down yeah, thats too you need to breath through your diaprhagm, so the belly moves'}},
 {'_index': 'diet-questions',
  '_id': 'j_ROTZIBcgRRhKtdkL2H',
  '_score': 0.7679534,
  '_source': {'id': 4964, 'Chunked_Content': 'How do I check for body fat?'}},
 {'_index': 'diet-questions',
  '_id': 'wfRLTZIBcgRRhKtdlrLT',
  '_score': 0.7602743,
  '_source': {'id': 2198,
   'Chunked_Content': 'Interesting- so a big belly but still with abs!?. I’m 100% raw if not, you are screwed'}},
 {'_index': 'diet-questions',
  

In [93]:
def elastic_search_knn(field, vector, k=5, num_candidates=10000):
    """Run an approximate kNN search against the Elasticsearch index.

    Uses the module-level ``es_client`` and ``index_name``.

    Args:
        field: Name of the dense-vector field to search (e.g. ``'text_vector'``).
        vector: Query embedding; either a plain sequence of floats or an
            array-like object exposing ``tolist()``.
        k: Number of nearest neighbours to return. Defaults to 5, the value
            that was previously hard-coded.
        num_candidates: Number of candidates considered during the search.
            Defaults to 10000, the value that was previously hard-coded.

    Returns:
        A list with the ``_source`` of each hit (restricted to ``id`` and
        ``Chunked_Content``), in the order Elasticsearch returned them.
    """
    # Send a plain list so the request does not depend on the client being
    # able to serialise array objects.
    query_vector = vector.tolist() if hasattr(vector, "tolist") else vector

    search_query = {
        "knn": {
            "field": field,
            "query_vector": query_vector,
            "k": k,
            "num_candidates": num_candidates,
        },
        "_source": ["id", "Chunked_Content"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [94]:
def question_vector_knn(q):
    """Embed the question of a record and retrieve its nearest chunks.

    Args:
        q: Mapping with a ``'question'`` key holding the question text.

    Returns:
        Whatever ``elastic_search_knn`` returns for a search on the
        ``'text_vector'`` field with the encoded question.
    """
    # `model` here is the module-level embedding model.
    question_embedding = model.encode(q['question'])
    return elastic_search_knn('text_vector', question_embedding)

In [147]:
df=pd.read_csv('../data/clean_data/ground-truth-data.csv')

In [148]:
ground_truth=df.to_dict(orient='records')
ground_truth[0]
len(ground_truth)

1500

In [149]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: Sequence of per-query relevance lists, where each
            entry marks whether the result at that rank was relevant.

    Returns:
        Number of lists containing ``True`` divided by the number of lists.

    Raises:
        ZeroDivisionError: If ``relevance_total`` is empty.
    """
    queries_with_hit = sum(1 for ranking in relevance_total if True in ranking)
    return queries_with_hit / len(relevance_total)

In [150]:
def mrr(relevance_total):
    """Mean reciprocal rank over a set of queries.

    For each query only the FIRST relevant result counts: it contributes
    ``1 / rank`` (rank starting at 1). Queries with no relevant result
    contribute 0.

    Args:
        relevance_total: Sequence of per-query relevance lists, where each
            entry marks whether the result at that rank was relevant.

    Returns:
        The average reciprocal rank, a value between 0.0 and 1.0.

    Raises:
        ZeroDivisionError: If ``relevance_total`` is empty.
    """
    total_score = 0.0

    for ranking in relevance_total:
        for position, is_relevant in enumerate(ranking, start=1):
            if is_relevant == True:  # equality test, same notion of "relevant" as `True in line`
                total_score += 1 / position
                # Stop at the first match; counting later matches too would
                # let a single query score more than 1.
                break

    return total_score / len(relevance_total)

In [151]:
def evaluate(ground_truth, search_function):
    """Score a retrieval function against ground-truth question records.

    Args:
        ground_truth: Iterable of records; each must have an ``'id'`` key
            naming the document the question was generated from. The whole
            record is handed to ``search_function``.
        search_function: Callable taking one record and returning an iterable
            of result mappings, each with an ``'id'`` key.

    Returns:
        Dict with the ``'hit_rate'`` and ``'mrr'`` of the search function.
    """
    def relevance_for(record):
        # Read the expected id first, then mark each returned document by
        # whether it is the expected one.
        expected_id = record['id']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [152]:
evaluate(ground_truth, question_vector_knn)

  0%|          | 0/1500 [00:00<?, ?it/s]

{'hit_rate': 0.5106666666666667, 'mrr': 0.40985555555555536}

In [106]:
{'hit_rate': 0.5106666666666667, 'mrr': 0.40985555555555536} # vector search

{'hit_rate': 0.5106666666666667, 'mrr': 0.40985555555555536}

In [109]:
# ES text only: {'hit_rate': 0.5573333333333333, 'mrr': 0.3611481481481479}

### RAG Evaluation with semantic search

In [153]:
from openai import OpenAI
client=OpenAI()

In [167]:
# Prompt for the LLM judge: it is given only the question and the generated
# answer and is asked to reply with a JSON object holding "Relevance" and
# "Explanation". The doubled braces ({{ }}) become literal braces once
# str.format() fills in {question} and {answer_llm}.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [168]:
def llm(prompt, model='gpt-4o-mini'):
    """Send a single user message to the chat-completions API and return the reply text.

    Uses the module-level ``client``.

    Args:
        prompt: Text sent as the content of the one user message.
        model: Name of the chat model to call.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [169]:
# Example question and its embedding, kept at module level as before.
query='How to lower blood pressure?'
vector=model.encode(query)
def rag(query, model='gpt-4o-mini'):
    """Answer a question with retrieval-augmented generation.

    Args:
        query: The question text.
        model: Name of the chat model used to generate the answer.

    Returns:
        The answer text returned by ``llm``.
    """
    # Retrieve for the question that was actually passed in, not for the
    # module-level example `vector`. The `model` parameter (a chat-model name)
    # shadows the embedding model inside this function, so the encoding is
    # delegated to question_vector_knn, which uses the module-level encoder.
    search_results = question_vector_knn({'question': query})
    # NOTE(review): build_prompt is not defined in the visible part of the
    # notebook; it is called exactly as before.
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer


In [170]:
def rag(query, model='gpt-4o-mini'):
    """Answer a ground-truth record with retrieval-augmented generation.

    Args:
        query: Record passed to ``question_vector_knn``, which reads its
            ``'question'`` key, so a mapping is expected rather than a string.
        model: Name of the chat model used to generate the answer.

    Returns:
        The answer text returned by ``llm``.
    """
    retrieved_docs = question_vector_knn(query)
    # NOTE(review): the whole record (not just the question text) is handed to
    # build_prompt, which is not visible here; confirm that is what it expects.
    return llm(build_prompt(query, retrieved_docs), model=model)

In [171]:
import json
len(df)

1500

In [161]:
df_sample = df.sample(n=200, random_state=1)

In [163]:
sample = df_sample.to_dict(orient='records')
sample[:5]

[{'id': 2629,
  'question': "What should I look for at the farmer's market related to the eggs?"},
 {'id': 5116,
  'question': 'What led Aajonus to compare the appendix to a library?'},
 {'id': 4650,
  'question': 'What made you decide to stop purchasing organic meat?'},
 {'id': 2331, 'question': 'Is it really feasible to avoid eating pizza?'},
 {'id': 2582,
  'question': "What did you mean by saying 'great to know' in response to the mention of mixed babies and Italy?"}]

In [172]:
# Generate an answer for every sampled question with gpt-4o, then have the
# judge model (the default of `llm`) grade its relevance.
evaluations_gpt4o = []
# Records whose judge reply was not valid JSON: (record, answer, raw reply).
# Kept separately so one malformed reply does not abort the whole run.
evaluation_failures = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(record, model='gpt-4o')

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    try:
        evaluation = json.loads(evaluation)
    except json.JSONDecodeError:
        # The judge is asked for bare JSON but may still return something else.
        evaluation_failures.append((record, answer_llm, evaluation))
        continue

    evaluations_gpt4o.append((record, answer_llm, evaluation))

if evaluation_failures:
    # Make skipped records visible, since they are excluded from the statistics.
    print(f"{len(evaluation_failures)} judge responses could not be parsed as JSON; see evaluation_failures")

  0%|          | 0/200 [00:00<?, ?it/s]

In [173]:
# Flatten the (record, answer, evaluation) tuples into one row per question:
# pull the id/question out of the record and the verdict fields out of the
# parsed judge response, then drop the two nested columns.
df_eval = (
    pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])
    .assign(
        id=lambda frame: frame['record'].apply(lambda rec: rec['id']),
        question=lambda frame: frame['record'].apply(lambda rec: rec['question']),
        relevance=lambda frame: frame['evaluation'].apply(lambda verdict: verdict['Relevance']),
        explanation=lambda frame: frame['evaluation'].apply(lambda verdict: verdict['Explanation']),
    )
    .drop(columns=['record', 'evaluation'])
)

In [174]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.680
PARTLY_RELEVANT    0.255
NON_RELEVANT       0.065
Name: proportion, dtype: float64

In [175]:
df_eval.to_csv('../data/clean_data/rag-eval-elastic vector search-gpt4o.csv', index=False)