In [1]:
# Import Libraries

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from elasticsearch import Elasticsearch, exceptions
from tqdm.auto import tqdm
from openai import OpenAI

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data

df = pd.read_csv('data\medquad.csv')
df.head()

Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


In [3]:
print(df.shape)
df.info()

(16412, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16412 entries, 0 to 16411
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   question    16412 non-null  object
 1   answer      16407 non-null  object
 2   source      16412 non-null  object
 3   focus_area  16398 non-null  object
dtypes: object(4)
memory usage: 513.0+ KB


In [4]:
# Summary statistics
df.describe(include='all')

Unnamed: 0,question,answer,source,focus_area
count,16412,16407,16412,16398
unique,14984,15817,9,5126
top,What causes Causes of Diabetes ?,This condition is inherited in an autosomal re...,GHR,Breast Cancer
freq,20,348,5430,53


In [5]:
# Check for null values
df[df.isnull().any(axis=1)]

Unnamed: 0,question,answer,source,focus_area
3591,What is (are) HELLP syndrome ?,,GARD,HELLP syndrome
3840,What is (are) X-linked lymphoproliferative syn...,,GARD,X-linked lymphoproliferative syndrome
4200,What is (are) Familial HDL deficiency ?,,GARD,Familial HDL deficiency
4433,What is (are) Emery-Dreifuss muscular dystroph...,,GARD,"Emery-Dreifuss muscular dystrophy, X-linked"
6693,What is (are) Emery-Dreifuss muscular dystroph...,,GARD,"Emery-Dreifuss muscular dystrophy, dominant type"
7885,What is (are) ?,On this Page General Information What is vanco...,CDC,
7886,what is vancomycin-resistant enterococci?,On this Page General Information What is vanco...,CDC,
7887,what types of infections does vancomycin-resis...,On this Page General Information What is vanco...,CDC,
7888,are certain people at risk of getting vancomyc...,On this Page General Information What is vanco...,CDC,
7889,what is the treatment for vancomycin-resistant...,On this Page General Information What is vanco...,CDC,


In [6]:
df = df.dropna(subset=['answer']) # drop rows where the answer column is null 
df.fillna('', inplace=True) # prevent loss of data

In [7]:
#duplicates

In [8]:
frequent_questions = df['question'].value_counts().head(10)
frequent_questions

question
What causes Causes of Diabetes ?                                           20
What is (are) High Blood Cholesterol ?                                     19
What is (are) Medicare and Continuing Care ?                               14
What is (are) Stroke ?                                                     13
What is (are) Skin Cancer ?                                                13
What is (are) Colorectal Cancer ?                                          12
What are the treatments for Breast Cancer ?                                12
What is (are) Breast Cancer ?                                              12
What is (are) Kidney Failure: Eat Right to Feel Right on Hemodialysis ?    12
What is (are) Parkinson's Disease ?                                        11
Name: count, dtype: int64

In [9]:
common_sources = df['source'].value_counts()
common_sources

source
GHR                  5430
GARD                 5389
NIDDK                1192
NINDS                1088
MPlusHealthTopics     981
NIHSeniorHealth       769
CancerGov             729
NHLBI                 559
CDC                   270
Name: count, dtype: int64

In [10]:
focus_area_dist = df['focus_area'].value_counts().head(10)
focus_area_dist

focus_area
Breast Cancer             53
Prostate Cancer           43
Stroke                    35
Skin Cancer               34
Alzheimer's Disease       30
Colorectal Cancer         29
Lung Cancer               29
Causes of Diabetes        28
Heart Failure             28
High Blood Cholesterol    28
Name: count, dtype: int64

In [11]:
question_len= df['question'].apply(len)
answer_len= df['answer'].apply(len)

lengths_df = pd.DataFrame({'question_length': question_len,'answer_length': answer_len})
lengths_df.describe()

Unnamed: 0,question_length,answer_length
count,16407.0,16407.0
mean,50.684952,1303.452673
std,16.926465,1656.694326
min,16.0,6.0
25%,38.0,487.0
50%,48.0,890.0
75%,61.0,1589.0
max,191.0,29046.0


In [12]:
fields = ['question','answer','source', 'focus_area']
vectorizers = {}
matrices = {}

for field in fields:
    v = TfidfVectorizer(stop_words='english', min_df=3)
    X = v.fit_transform(df[field])

    vectorizers[field] = v
    matrices[field] = X

In [13]:
query = 'What causes of diabetes and how do i prevent diabetes?'

q = vectorizers['question'].transform([query])
score = cosine_similarity(matrices['question'], q).flatten()

In [14]:
indx = np.argsort(-score)[:10]
df.iloc[indx]

Unnamed: 0,question,answer,source,focus_area
121,How to prevent Diabetes ?,Your weight affects your health in many ways. ...,NIHSeniorHealth,Diabetes
105,How to prevent Diabetes ?,The two most common forms of diabetes are type...,NIHSeniorHealth,Diabetes
116,How to prevent Diabetes ?,The two most common forms of diabetes are type...,NIHSeniorHealth,Diabetes
15249,What to do for Causes of Diabetes ?,- Diabetes is a complex group of diseases with...,NIDDK,Causes of Diabetes
15243,What is (are) Causes of Diabetes ?,Diabetes is a complex group of diseases with a...,NIDDK,Causes of Diabetes
15348,What to do for Causes of Diabetes ?,- Diabetes is a complex group of diseases with...,NIDDK,Causes of Diabetes
16021,What to do for Causes of Diabetes ?,- Diabetes is a complex group of diseases with...,NIDDK,Causes of Diabetes
16015,What is (are) Causes of Diabetes ?,Diabetes is a complex group of diseases with a...,NIDDK,Causes of Diabetes
113,What causes Diabetes ?,Type 1 diabetes is an autoimmune disease. In a...,NIHSeniorHealth,Diabetes
16205,What is (are) Causes of Diabetes ?,Diabetes is a complex group of diseases with a...,NIDDK,Causes of Diabetes


In [24]:
# Convert the DataFrame to JSON
documents = df.to_json(orient='records')

df.to_json('data/data.json', orient='records', lines=True)

In [15]:
es = Elasticsearch("http://localhost:9200")

In [16]:
# Create an index

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "source": {"type": "text"},
            "focus_area": {"type": "keyword"} 
        }
    }
}

In [19]:
index_name = "health-questions"

try:
    if not es.indices.exists(index=index_name):
        es.indices.create(index=index_name, body=index_settings)
        print("Index created successfully")
    else:
        print("Index already exists")
except exceptions.ConnectionError as e:
    print(f"Failed to create index: {e}")

Index already exists


In [22]:
# Index the documents

for doc in tqdm(df.to_dict(orient='records')):
    es.index(index=index_name, document=doc)

  8%|▊         | 1285/16407 [05:03<59:36,  4.23it/s]  


KeyboardInterrupt: 

In [None]:
# Define a function that retrieves documents and matches user queries

def search(query, max_results=5):
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                }
            }
        }
    }
    
    response = es.search(index=index_name, body=search_query)
    documents = [hit['_source'] for hit in response['hits']['hits']]
    return documents