In [2]:
# Import Libraries

import os
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch, exceptions
from tqdm.auto import tqdm
from openai import OpenAI

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load data
# A forward slash is used as the separator: it works on every OS, and it avoids
# the invalid "\c" escape sequence that a backslash path would put in the literal.

df = pd.read_csv('data/cleaned_data.csv')
df.sample(5)

Unnamed: 0,question,answer,source,focus_area
226,What is (are) Rectal Cancer ?,Key Points\n - Rectal cance...,CancerGov,Rectal Cancer
191,Who is at risk for Pancreatic Neuroendocrine T...,Having certain syndromes can increase the risk...,CancerGov,Pancreatic Neuroendocrine Tumors (Islet Cell T...
183,How to diagnose Oropharyngeal Cancer ?,Tests that examine the mouth and throat are us...,CancerGov,Oropharyngeal Cancer
10,What is (are) Adult Central Nervous System Tum...,Key Points\n - An adult cen...,CancerGov,Adult Central Nervous System Tumors
96,How to diagnose Endometrial Cancer ?,Tests that examine the endometrium are used to...,CancerGov,Endometrial Cancer


In [4]:
# Dataset dimensions, followed by per-column dtypes and non-null counts
print(df.shape)
df.info()

(292, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   question    292 non-null    object
 1   answer      292 non-null    object
 2   source      292 non-null    object
 3   focus_area  292 non-null    object
dtypes: object(4)
memory usage: 9.2+ KB


In [5]:
# Summary statistics for every column; include='all' also covers the
# non-numeric (object) columns: count, unique, top and freq
df.describe(include='all')

Unnamed: 0,question,answer,source,focus_area
count,292,292,292,292
unique,292,291,9,292
top,What is (are) 21-hydroxylase deficiency ?,New types of treatment are being tested in cli...,CancerGov,21-hydroxylase deficiency
freq,1,2,72,1


In [6]:
# Number of records contributed by each value of the 'source' column
common_sources = df['source'].value_counts()
common_sources

source
CancerGov            72
NIDDK                66
GARD                 51
NHLBI                47
GHR                  26
NIHSeniorHealth      23
MPlusHealthTopics     5
CDC                   1
NINDS                 1
Name: count, dtype: int64

In [7]:
# Character counts of each question and each answer
question_len = df['question'].map(len)
answer_len = df['answer'].map(len)

# Put both length series side by side and summarise their distributions
lengths_df = pd.DataFrame(
    {
        'question_length': question_len,
        'answer_length': answer_len,
    }
)
lengths_df.describe()

Unnamed: 0,question_length,answer_length
count,292.0,292.0
mean,46.119863,2193.965753
std,15.951042,2669.869616
min,22.0,31.0
25%,35.0,581.5
50%,43.0,1128.0
75%,52.25,2859.0
max,154.0,17810.0


#### Elasticsearch for Retrieval

In [8]:
# Client bound to the Elasticsearch node at localhost:9200
es = Elasticsearch("http://localhost:9200")

# Name of the index that stores the question/answer documents
index_name = "health-questions"

# One shard, no replicas; every document field is mapped as "text"
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            field: {"type": "text"}
            for field in ("question", "answer", "source", "focus_area")
        }
    },
}

In [9]:
# Create the index if it doesn't exist.
# Settings and mappings are passed as individual keyword arguments rather than
# through the deprecated catch-all `body=` parameter, matching the
# keyword-style API already used when indexing documents (`document=`).

try:
    if not es.indices.exists(index=index_name):
        es.indices.create(
            index=index_name,
            settings=index_settings["settings"],
            mappings=index_settings["mappings"],
        )
        print("Index created successfully")
    else:
        print("Index already exists")
except exceptions.ConnectionError as e:
    # Raised when the Elasticsearch node cannot be reached
    print(f"Failed to create index: {e}")

Index already exists


In [10]:
# Convert DataFrame to dictionary format for Elasticsearch
documents = df.to_dict(orient='records')

# Index the documents under a deterministic id (the row position).
# Without an explicit id Elasticsearch generates a new one on every call, so
# re-running this cell would store every record again and search results would
# fill up with duplicates. With a fixed id a re-run overwrites the same document.
# NOTE: documents stored by earlier runs under auto-generated ids are not
# removed by this cell; delete and recreate the index once to clear them.
for doc_id, doc in enumerate(tqdm(documents)):
    es.index(index=index_name, id=str(doc_id), document=doc)

  0%|          | 0/292 [00:00<?, ?it/s]

In [11]:
# Persist the dataset as line-delimited JSON (one record per line)
df.to_json(
    path_or_buf='data/data.json',
    orient='records',
    lines=True,
)

In [12]:
# Define a function that retrieves documents and matches user queries

def search(query, max_results=5):
    """Retrieve the documents that best match a user query.

    Runs a ``multi_match`` (``best_fields``) query over the ``question``,
    ``answer`` and ``focus_area`` fields of the ``index_name`` index, with
    matches in ``question`` boosted by a factor of 3.

    Args:
        query: Free-text user question.
        max_results: Maximum number of hits to return.

    Returns:
        A list with the ``_source`` dict of each hit, in the order returned
        by Elasticsearch.
    """
    es_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "answer", 'focus_area'],
                    "type": "best_fields"
                }
            }
        }
    }

    # `query=` / `size=` are passed as keyword arguments instead of the
    # deprecated catch-all `body=` parameter.
    response = es.search(index=index_name, query=es_query, size=max_results)
    return [hit['_source'] for hit in response['hits']['hits']]

In [13]:
# Define a function that creates a prompt for an LLM to answer health-related questions based on the given data

def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and the retrieved documents.

    Each document in ``search_results`` must provide the keys ``question``,
    ``answer`` and ``source``; they are rendered one after another into the
    CONTEXT section of the prompt.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of document dicts used as context.

    Returns:
        The formatted prompt with leading and trailing whitespace removed.
    """
    template = (
        "You're a healthcare assistant AI. Answer the QUESTION based on the CONTEXT provided from a health FAQ database.\n"
        "Use only the facts from the CONTEXT to provide an accurate, clear, and concise answer.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per document, each terminated by a blank line
    entries = [
        f"Question: {doc['question']}\nAnswer: {doc['answer']}\nSource: {doc['source']}\n\n"
        for doc in search_results
    ]

    filled = template.format(question=query, context="".join(entries))
    return filled.strip()

In [14]:
# Set up an OpenAI client
# NOTE(review): no API key is passed here — presumably the client picks it up
# from the environment (e.g. OPENAI_API_KEY); confirm before running.

client = OpenAI()

def llm(prompt, model='gpt-4o'):
    """Send a single-turn prompt to a chat model and return the reply text.

    Args:
        prompt: Full prompt text, sent as one ``user`` message.
        model: Name of the chat model to call. Defaults to ``'gpt-4o'``,
            the model that was previously hard-coded, so existing callers
            are unaffected.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [15]:
def rag(query):
    """Answer a question end to end: retrieve, build the prompt, generate.

    Args:
        query: The user's question.

    Returns:
        Whatever ``llm`` returns for the prompt built from the retrieved documents.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [None]:
# Sample query: passes one question through rag() (search -> build_prompt -> llm)

query = 'how do i prevent diabetes?'
rag(query)  