In [183]:
# Import libraries and modules
import pandas as pd
import minsearch
from tqdm.auto import tqdm
from openai import OpenAI
from sentence_transformers import SentenceTransformer
import pickle

In [184]:
# import data
df=pd.read_csv('../data/clean_data/date_chunked_5s.csv')

In [185]:
documents=df.to_dict(orient='records')

In [186]:
len(documents)

5837

In [187]:
documents[0:2]

[{'id': 0,
  'content': 'this is the first I see an egg in bath ingredients and egg?. The plastic might even be better (especially if you have hard plastic) considering you have the cloth in between, because stainless steel could draw EMFs Your love Better anything non processed.. Sea water, an egg, sea salt, ACV, a little Urine.. .. .',
  'number of sentences': 5,
  'number of words': 56},
 {'id': 1,
  'content': 'My husband is going through his first vomit/diarrhea detox, hydration formula or egg is not what he wants.. He is craving orange juice or lemonade.. What is good to give him to help detox?. is pasteurised milk gonna be better than nothing in the bath?. Впервые попробовал тухлую печень.',
  'number of sentences': 5,
  'number of words': 49}]

In [188]:
### Transformer download
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [189]:
len(model.encode('hello'))

384

In [190]:
# contents=[doc['content'] for doc in tqdm(documents)]
# embeddings=model.encode(contents)
# print(embeddings[0])


In [191]:
#created the dense vector using the pre-trained model
data = []
for doc in tqdm(documents):
    # Transforming the title into an embedding using the model
    doc["text_vector"] = model.encode(doc["content"]).tolist()
    data.append(doc)

  0%|          | 0/5837 [00:00<?, ?it/s]

In [192]:
# Save the list of dictionaries to a pickle file
with open('../data/clean_data/lancedb_vector_search.pkl', 'wb') as file:
    pickle.dump(data, file)

print("Data saved to pickle file successfully.")

Data saved to pickle file successfully.


In [193]:
len(embeddings)


5837

## Indexing with LancedDB

In [194]:
import lancedb
import pyarrow as pa

# Connect to LanceDB
db = lancedb.connect("../data/clean_data/lancedb")

# Define the schema for the table (example: text and embedding columns)
schema = pa.schema([
    ("id", pa.int64()), 
    ("content", pa.string()),               # Text column for storing document content
    ("text_vector", pa.list_(pa.float32(),384))   # Fixed-size list for vectors# 
])



# Create the table with the schema, even if there's no data
table = db.create_table("my_empty_table", 
                        schema=schema, 
                        mode='overwrite'
                       )

# Print message to confirm table creation
print("Empty table created successfully!")




Empty table created successfully!


In [195]:
# Assuming you've already connected to the database and created/opened a table
db = lancedb.connect("../data/clean_data/lancedb")

# Open the existing table (replace "my_vector_table" with your actual table name)
table = db.open_table("my_empty_table")

# Get the schema of the table
schema = table.schema

# Print the schema to see the fields and their data types
print(schema)


id: int64
content: string
text_vector: fixed_size_list<item: float>[384]
  child 0, item: float


In [196]:
data[0:1]

[{'id': 0,
  'content': 'this is the first I see an egg in bath ingredients and egg?. The plastic might even be better (especially if you have hard plastic) considering you have the cloth in between, because stainless steel could draw EMFs Your love Better anything non processed.. Sea water, an egg, sea salt, ACV, a little Urine.. .. .',
  'number of sentences': 5,
  'number of words': 56,
  'text_vector': [-0.06950577348470688,
   0.00620897114276886,
   0.06879696995019913,
   -0.01693076267838478,
   0.006399591453373432,
   0.016069823876023293,
   0.019661109894514084,
   -0.02508622221648693,
   0.004416115581989288,
   -0.08724799752235413,
   -0.04237271845340729,
   -0.07054975628852844,
   -0.07857808470726013,
   0.05781452730298042,
   -0.033999837934970856,
   -0.028500793501734734,
   0.028422120958566666,
   0.09938568621873856,
   -0.009505418129265308,
   -0.006964648608118296,
   0.006569295655936003,
   -0.03321262076497078,
   0.0458974689245224,
   -0.0588070265948

In [197]:
# add data to table
table.add(data)


In [198]:
### performing a vector search

In [199]:
# Let's assume the query is "vector search in LanceDB"
query = "what type of protein to eat to bulk up"

# Generate an embedding for the query using the same model
query_embedding = model.encode([query])[0]
# Perform vector search in LanceDB
results = table.search(query_embedding, vector_column_name="text_vector").limit(10).to_pandas()
print(type(results))
# Display results
#print(results[['content']])


results.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,id,content,text_vector,_distance
0,5232,How do you get raw dairy in Canada?. Accompany...,"[-0.040744204, -0.08202689, 0.018740114, 0.013...",1.004415
1,5473,If fasting is 'bad' there no need to have macr...,"[-0.03963867, -0.06847364, 0.016403785, -0.008...",1.014721
2,3631,Aajonus recommends to wake up at night to eat ...,"[-0.05317117, 0.0363952, 0.05797835, 0.0430706...",1.01625
3,2634,"show=true I’m straight if you paté the meat, a...","[-0.01303589, -0.005889988, 0.011757708, 0.044...",1.021979
4,4470,".. .. ), Aajonus recommends to eat about an eq...","[-0.05279341, -0.09572333, 0.055784415, -0.042...",1.022054


In [220]:
def lancedb_search(query):
    # Step 1: Generate an embedding for the query using your embedding model
    query_embedding = model.encode([query])[0]  # Use your embedding model to generate the vector

    # Step 2: Perform a vector search using LanceDB
    results = table.search(query_embedding, vector_column_name="text_vector").limit(10).to_pandas()

     # Extract the documents and their ids from the search results
    retrieved_docs = results[['id', 'content']].to_dict(orient='records')  # Convert DataFrame to list of dictionaries
    
    
    return retrieved_docs

In [247]:
def lancedb_search(query_text, nprobes=10, refine_factor=2, k=5):
    # Generate the embedding for the query using your model
    query_embedding = model.encode([query_text])[0]
    
    # Perform the vector search in LanceDB with nprobes and refine_factor
    results = table.search(query_embedding, vector_column_name="text_vector") \
        .nprobes(nprobes) \
        .refine_factor(refine_factor) \
        .limit(k) \
        .to_pandas()
     # Extract the documents and their ids from the search results
    retrieved_docs = results[['id', 'content']].to_dict(orient='records')  # Convert DataFrame to list of dictionaries
    
    return retrieved_docs

# # Example usage with the query text
# query_text = "some health-related query"
# search_results = lancedb_search(query_text, nprobes=20, refine_factor=3, k=5)

# print(search_results)


In [248]:
lancedb_search('how do I lose belly fat?')

[{'id': 2556,
  'content': 'More detoxing is necessary I wish I had that much body fat though, I frequently feel sick because I don’t have enough As long as u are overweight its fine 5-15lbs overweight is for women and 15-40lbs for men yeah.. But being fatter as a women is still recommended Funny how many people who find primal are primarily underweight people.. I am the opposite where I put on weight easily Isn’t that for men?. More fat is better Really, since I hit 60-61 kg I don’t gain any weight anymore 🤷🏻\u200d♀️ Fat = fertility Girl stop it 🤣🤣 Don’t flex 🍑 I’m also way fatter than before hahaha, especially my belly No just sunmaxxing, i did gain some muscle though I will but I won’t show it 🤣 How tall Yes, flex w words only !. 🤣🤣 And lose Belly fat by doing nothing?'},
 {'id': 3209,
  'content': ".. There's no way you can convince me that a body like that is healthy There's no way you can convince me that a belly like that is healthy.. 🤷🏻\u200d♂️ protection from what?. Can you pr

In [222]:
## Retrieval Evaluation with lancedb semantic vector search

In [223]:
df=pd.read_csv('../data/clean_data/ground-truth-data_final.csv')

In [224]:
ground_truth=df.to_dict(orient='records')
ground_truth[0]
len(ground_truth)

1000

In [225]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [226]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [227]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [228]:
fun=lambda q:q['id']
print(fun)
fun(ground_truth[0])


<function <lambda> at 0x000002EC123F4A40>


608

In [252]:
evaluate(ground_truth, lambda q: lancedb_search(q['question'],nprobes=40, refine_factor=20, k=5))

  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.527, 'mrr': 0.4236333333333329}

### RAG Evaluation with semantic search

In [210]:
from openai import OpenAI
client=OpenAI()

In [211]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [215]:
def build_prompt(query, search_results):
    prompt_template = """
    You're a primal health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()
    
    entry_template = """
    content: {content}
    """.strip()
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [218]:
query='how to get fat'
search_results=lancedb_search(query)
build_prompt(query, search_results)

"You're a primal health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.\n    Use only the facts from the CONTEXT when answering the QUESTION.\n    \n    QUESTION: how to get fat\n    \n    CONTEXT:\n    content: Have a look at the guide above.. One of the essentials of PD.. Sure.. I generally make full fat.. But I don't keep them for many months any more, like i did before.\n\ncontent: 😳 Find cream Jean 👀 Actually im strugglin to gain weight.. .. .. im not skinny like a stick but i would like more weight.. .\n\ncontent: .. I’ve had the delight of good fat before.. But not always easy to find good ones, unfortunately same I buy 2 kilos at once and eat 1 pound per day, so on the week i only spend 2 dollars on my fat source, alongside the milk I put it on a jar of glass and dont mind to much One time i forget to put on the fridge and it got fermented very quickly Yeah, he isn’t on primal diet though so he doesn’t know I noticed that fat can be stored for a

In [230]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [231]:
query='How to lower blood pressure?'
vector=model.encode(query)
def rag(query, model='gpt-4o-mini'):
    search_results = lancedb_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer


In [233]:
rag(query)

"To lower blood pressure, consider incorporating raw garlic into your diet, as it can help regulate blood pressure according to your body's needs. Additionally, consuming half a grapefruit may also be beneficial, but be cautious of compatibility issues with heart medications. Foods that act as solvents, such as vinegar or unripe pineapple, might also help; it's recommended to consume them with fat, like in a sauce made with raw butter. These natural dietary changes can support your efforts to manage blood pressure effectively."

In [234]:
import json
len(df)

1000

In [235]:
df_sample = df.sample(n=200, random_state=1)

In [236]:
sample = df_sample.to_dict(orient='records')
sample[:5]

[{'id': 5385,
  'question': 'In Europe, does everything including milk come in glass containers?'},
 {'id': 871,
  'question': 'Can eating raw foods save time compared to more processed food preparation?'},
 {'id': 5593,
  'question': 'Why might you experience high cholesterol and ketones with certain diets?'},
 {'id': 3682,
  'question': 'Is pasteurized butter from non-toxic animals safe for consumption given concerns about mad cow disease?'},
 {'id': 604, 'question': 'Who is Lina and what does she run?'}]

In [240]:
evaluations_gpt4o_mini = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o-mini') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o_mini.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [241]:
df_eval = pd.DataFrame(evaluations_gpt4o_mini, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [242]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.715
PARTLY_RELEVANT    0.270
NON_RELEVANT       0.015
Name: proportion, dtype: float64

In [244]:
df_eval.to_csv('../data/clean_data/rag-eval-lancedb_vector_search-gpt4o_mini.csv', index=False)