In [6]:
# Import libraries and modules
import pandas as pd
import minsearch
from tqdm.auto import tqdm
from openai import OpenAI
from sentence_transformers import SentenceTransformer
import pickle

### Step 1: Prepare documents

In [7]:
# import data
df=pd.read_csv('../App/data/Mental_Health_FAQ.csv')

In [8]:
documents=df.to_dict(orient='records')

In [9]:
len(documents)

98

In [10]:
documents[0]

{'Question_ID': 1590140,
 'Questions': 'What does it mean to have a mental illness?',
 'Answers': 'Mental illnesses are health conditions that disrupt a personâ€™s thoughts, emotions, relationships, and daily functioning. They are associated with distress and diminished capacity to engage in the ordinary activities of daily life.\nMental illnesses fall along a continuum of severity: some are fairly mild and only interfere with some aspects of life, such as certain phobias. On the other end of the spectrum lie serious mental illnesses, which result in major functional impairment and interference with daily life. These include such disorders as major depression, schizophrenia, and bipolar disorder, and may require that the person receives care in a hospital.\nIt is important to know that mental illnesses are medical conditions that have nothing to do with a personâ€™s character, intelligence, or willpower. Just as diabetes is a disorder of the pancreas, mental illness is a medical condit

### Step 2: Create Embeddings using Pretrained Models

Sentence Transformers documentation here: https://www.sbert.net/docs/sentence_transformer/pretrained_models.html

In [15]:
### Transformer download
model= SentenceTransformer('all-mpnet-base-v2')




In [16]:

len(model.encode("This is a simple sentence"))

768

In [17]:
#created the dense vector using the pre-trained model
data = []
for doc in tqdm(documents):
    # Transforming the title into an embedding using the model
    doc["text_vector"] = model.encode(doc["Answers"]).tolist()
    data.append(doc)

  0%|          | 0/98 [00:00<?, ?it/s]

In [18]:
# documents[0]
len(data[0]['text_vector'])

768

### Step 3a: Setup Lancedb connection

In [19]:
#documents[0]   why has the document been changed

In [20]:
# Save the list of dictionaries to a pickle file
with open('../App/data/lancedb_vector_search.pkl', 'wb') as file:
    pickle.dump(data, file)

print("Data saved to pickle file successfully.")

Data saved to pickle file successfully.


In [21]:
#len(embeddings)
len(documents)
documents==data
#data[0]

True

## Step 4: Create Mappings and Index with LancedDB

In [23]:
import lancedb
import pyarrow as pa

# Connect to LanceDB
db = lancedb.connect("../App/data/lancedb")

# Define the schema for the table (example: text and embedding columns)
schema = pa.schema([
    ("Question_ID", pa.int64()), 
    ("Questions", pa.string()),               # Text column for storing document questions
    ("Answers", pa.string()),             #Text column for storing documents answers
    ("text_vector", pa.list_(pa.float32(),768))   # Fixed-size list for vectors# 
])



# # Create the table with the schema, even if there's no data
# table = db.create_table("mental_health_table", 
#                         schema=schema, 
#                         mode='overwrite'
#                        )

# # Print message to confirm table creation
# print("Empty table created successfully!")




In [24]:
# Assuming you've already connected to the database and created/opened a table
db = lancedb.connect("../App/data/lancedb")

# Open the existing table (replace "my_vector_table" with your actual table name)
table = db.open_table("mental_health_table")

# Get the schema of the table
schema = table.schema

# Print the schema to see the fields and their data types
print(schema)


Question_ID: int64
Questions: string
Answers: string
text_vector: fixed_size_list<item: float>[768]
  child 0, item: float


In [25]:
# add data to table
table.add(data)


In [34]:
### performing a vector search

In [26]:
def lancedb_search(query_text, nprobes=10, refine_factor=2, k=5):
    # Generate the embedding for the query using your model
    query_embedding = model.encode([query_text])[0]
    
    # Perform the vector search in LanceDB with nprobes and refine_factor
    results = table.search(query_embedding, vector_column_name="text_vector") \
        .nprobes(nprobes) \
        .refine_factor(refine_factor) \
        .limit(k) \
        .to_pandas()
     # Extract the documents and their ids from the search results
    retrieved_docs = results[['Question_ID', 'Questions','Answers']].to_dict(orient='records')  # Convert DataFrame to list of dictionaries
    
    return retrieved_docs

# # Example usage with the query text
# query_text = "some health-related query"
# search_results = lancedb_search(query_text, nprobes=20, refine_factor=3, k=5)

# print(search_results)


In [27]:
lancedb_search('What are the symptoms of mental illness?')

[{'Question_ID': 9434130,
  'Answers': 'Symptoms of mental health disorders vary depending on the type and severity of the condition. The following is a list of general symptoms that may suggest a mental health disorder, particularly when multiple symptoms are expressed at once.\nIn adults:\nConfused thinking\nLong-lasting sadness or irritability\nExtreme highs and lows in mood\nExcessive fear, worrying, or anxiety\nSocial withdrawal\nDramatic changes in eating or sleeping habits\nStrong feelings of anger\nDelusions or hallucinations (seeing or hearing things that are not really there)\nIncreasing inability to cope with daily problems and activities\nThoughts of suicide\nDenial of obvious problems\nMany unexplained physical problems\nAbuse of drugs and/or alcohol\n  In older children and pre-teens:\nAbuse of drugs and/or alcohol\nInability to cope with daily problems and activities\nChanges in sleeping and/or eating habits\nExcessive complaints of physical problems\nDefying authority, 

In [28]:
## Retrieval Evaluation with lancedb semantic vector search

In [29]:
df=pd.read_csv('../App/data/ground-truth-data.csv')

In [30]:
ground_truth=df.to_dict(orient='records')
ground_truth[0]
len(ground_truth)

485

In [31]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [32]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [33]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['Question_ID'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [34]:
fun=lambda q:q['id']
print(fun)
fun(ground_truth[0])


<function <lambda> at 0x000002BF11B9EF80>


1590140

In [35]:
evaluate(ground_truth, lambda q: lancedb_search(q['question'],nprobes=5, refine_factor=5, k=10))

  0%|          | 0/485 [00:00<?, ?it/s]

{'hit_rate': 0.954639175257732, 'mrr': 0.7636630666012109}

### Step 3b Setup Elasticsearch Connection

In [73]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()

ObjectApiResponse({'name': 'b7b3d653cd0a', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'EYVcJd1UQ-emitJRCfIzug', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

## Step 4: Create Mappings and Index

- Mapping is the process of defining how a document, and the fields it contains, are stored and indexed.

- Each document is a collection of fields, which each have their own data type.

- We can compare mapping to a database schema in how it describes the fields and properties that documents hold, the datatype of each field (e.g., string, integer, or date), and how those fields should be indexed and stored

In [139]:
index_settings={
              "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0
              },
              "mappings": {
                "properties": {
                  "Question_ID": {
                    "type": "integer"
                  },
                  "Questions": {
                    "type": "text"
                  },
                  "Answers": {
                    "type": "text"
                  },
                  "question_vector": {
                    "type": "dense_vector",
                    "dims": 768, # Ensure this matches the dimensions of your vectors
                    "index": True,
                    "similarity": "cosine"
                  },
                  "answer_vector": {
                    "type": "dense_vector",
                    "dims": 768,
                    "index": True,
                    "similarity": "cosine"
                  },
                 "question_answer_vector": {
                    "type": "dense_vector",
                    "dims": 768,
                    "index": True,
                    "similarity": "cosine"
                  }
                }
              }
            }

In [140]:
index_name = "mental-health-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'mental-health-questions'})

### Step 5: Add documents into index

In [107]:
for doc in tqdm(documents):
    question = doc['Questions']
    text = doc['Answers']
    qt = question + ' ' + text

    doc['question_vector'] = model.encode(question)
    doc['answer_vector'] = model.encode(text)
    doc['question_answer_vector'] = model.encode(qt)

  0%|          | 0/98 [00:00<?, ?it/s]

In [141]:
documents[0].keys()

dict_keys(['Question_ID', 'Questions', 'Answers', 'question_vector', 'answer_vector', 'question_answer_vector'])

In [142]:
for doc in documents:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

### Step 6: Create end user query

In [143]:
search_term = "is there a cure for mental illness?"
vector_search_term = model.encode(search_term)

In [144]:
def elastic_search_knn(field, vector):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
       
    }

    search_query = {
        "knn": knn,
        "_source": ["Question_ID", "Questions", "Answers"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs 

In [145]:
def question_vector_knn(q):
    question = q['question']
    

    v_q = model.encode(question)

    return elastic_search_knn('question_vector', v_q)

In [146]:
def answer_vector_knn(q):
    question = q['question']
    

    v_q = model.encode(question)

    return elastic_search_knn('answer_vector', v_q)


In [147]:
def question_answer_vector_knn(q):
    question = q['question']
    

    v_q = model.encode(question)

    return elastic_search_knn('question_answer_vector', v_q)


In [148]:
evaluate(ground_truth, question_vector_knn)

  0%|          | 0/485 [00:00<?, ?it/s]

{'hit_rate': 0.7134020618556701, 'mrr': 0.5273883161512031}

In [149]:
evaluate(ground_truth, answer_vector_knn)

  0%|          | 0/485 [00:00<?, ?it/s]

{'hit_rate': 0.8948453608247423, 'mrr': 0.7554639175257732}

In [150]:
evaluate(ground_truth, question_answer_vector_knn)

  0%|          | 0/485 [00:00<?, ?it/s]

{'hit_rate': 0.8742268041237113, 'mrr': 0.7168728522336769}

In [None]:
# answer_vector performed best 

## Perform Keyword search with Semantic Search (Hybrid/Advanced Search)

### RAG Evaluation with semantic search

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()  # This loads the variables from .env into the environment

True

In [2]:
from openai import OpenAI
client=OpenAI()

In [3]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [4]:
def build_prompt(query, search_results):
    prompt_template = """
    You're a mental health psychiatrist. Answer the QUESTION based on the CONTEXT from our mental questions and answer database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()
    
    entry_template = """
    ANSWER: {Answers}
    """.strip()
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [38]:
query='how to get fat'
search_results=lancedb_search(query)
#build_prompt(query, search_results)

In [39]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [40]:
query='How to cure mental illness?'
vector=model.encode(query)
def rag(query, model='gpt-4o-mini'):
    search_results = lancedb_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer


In [41]:
rag(query)

"To cure mental illness, it is crucial to ensure early identification and treatment. There is a range of effective treatments depending on the nature of the illness, including medication, therapy, self-help plans, and peer support. Proactivity and engagement in the recovery process are essential for the affected individual. While many people respond well to treatment, it's important to be mindful that some may experience a return of symptoms. Nonetheless, with careful monitoring and management, individuals can lead fulfilled and productive lives. Support from friends, family, and mental health professionals is also vital in encouraging those affected to seek help and begin their recovery journey."

In [42]:
import json
len(df)

485

In [43]:
df_sample = df.sample(n=200, random_state=1)

In [44]:
sample = df_sample.to_dict(orient='records')
sample[:5]

[{'id': 1030153,
  'question': 'How can I find a suitable mental health professional for my child?'},
 {'id': 6981545,
  'question': 'Can you provide details about the Bounce Back program and how to access it?'},
 {'id': 9250044,
  'question': 'What are some community support opportunities I can participate in to help others while also improving my mental health?'},
 {'id': 1667863,
  'question': 'Why is it important to seek help early for mental health challenges?'},
 {'id': 4211025,
  'question': 'What options are available for individuals experiencing a crisis?'}]

In [45]:
evaluations_gpt4o_mini = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o-mini') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o_mini.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [51]:
df_eval = pd.DataFrame(evaluations_gpt4o_mini, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [52]:
df_eval.relevance.value_counts()

relevance
RELEVANT           195
PARTLY_RELEVANT      3
NON_RELEVANT         2
Name: count, dtype: int64

In [53]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.975
PARTLY_RELEVANT    0.015
NON_RELEVANT       0.010
Name: proportion, dtype: float64

In [54]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.975
PARTLY_RELEVANT    0.015
NON_RELEVANT       0.010
Name: proportion, dtype: float64

In [50]:
evaluations_gpt4o_mini[0]

({'id': 1030153,
  'question': 'How can I find a suitable mental health professional for my child?'},
 "To find a suitable mental health professional for your child, you can take several steps:\n\n1. **Talk to your family doctor**: They can provide referrals and recommend appropriate mental health professionals based on your child's needs.\n\n2. **Consult the school**: If your child's difficulties affect their behavior or performance at school, speak with a school counsellor who can suggest resources and community services.\n\n3. **Use the Kelty Mental Health Resource Centre**: Their Help Finder tool can assist you in navigating the mental health system and finding specific services and providers.\n\n4. **Contact local services**: Reach out to local Child & Youth Mental Health offices on their walk-in days for immediate support.\n\n5. **Explore resources from the Institute of Families for Child and Youth Mental Health**: They offer guidance and support through their Youth-in-Residence 

In [46]:
df_eval.to_csv('../App/data/rag-eval-lancedb_vector_search-gpt4o_mini.csv', index=False)

NameError: name 'df_eval' is not defined

In [47]:
df_eval

NameError: name 'df_eval' is not defined

In [1]:
import pandas as pd

In [3]:
df=pd.read_csv('../data/clean_data/rag-eval-lancedb_vector_search-gpt4o_mini.csv')

In [6]:
df.relevance.value_counts()

relevance
RELEVANT           143
PARTLY_RELEVANT     54
NON_RELEVANT         3
Name: count, dtype: int64

In [5]:
df.head()

Unnamed: 0,answer,id,question,relevance,explanation
0,"No, not everything including milk comes in gla...",5385,"In Europe, does everything including milk come...",RELEVANT,The generated answer directly addresses the qu...
1,"Yes, eating raw foods can save time compared t...",871,Can eating raw foods save time compared to mor...,RELEVANT,The generated answer directly addresses the qu...
2,You might experience high cholesterol and keto...,5593,Why might you experience high cholesterol and ...,RELEVANT,The generated answer directly addresses the qu...
3,"Based on the context provided, pasteurized but...",3682,Is pasteurized butter from non-toxic animals s...,PARTLY_RELEVANT,The generated answer discusses pasteurized but...
4,Lina is a person mentioned in the context who ...,604,Who is Lina and what does she run?,PARTLY_RELEVANT,The answer provides some context about Lina's ...
