## Setup

In [3]:
import ast
import os
import sys
from typing import Optional, TypedDict, cast

import pandas as pd
import pyarrow.parquet as pq
from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents.aio import SearchClient
from azure.search.documents.models import (
    QueryCaptionResult,
    QueryType,
    VectorizedQuery,
    VectorQuery,
)
from dotenv import load_dotenv
from openai import AsyncAzureOpenAI
from openai_messages_token_helper import build_messages, get_token_limit

sys.path.append(os.path.abspath(os.path.join("..")))
from app.backend.approaches.prompts import general_prompt

In [4]:
# Build the path with os.path.join so the notebook also works on non-Windows hosts:
# a raw backslash path is a single literal filename on POSIX, so the .env would not be found.
# On Windows this yields exactly the original path.
load_dotenv(dotenv_path=os.path.join("..", ".azure", "hhgai-dev-eastasia-002", ".env"))

True

In [5]:
def _env_flag(name: str) -> bool:
    """Read a boolean switch: True only when the variable equals "true" (any letter case)."""
    return os.getenv(name, "").lower() == "true"


# Azure AI Search target. Both are mandatory: a missing variable raises KeyError here.
AZURE_SEARCH_SERVICE = os.environ["AZURE_SEARCH_SERVICE"]
AZURE_SEARCH_INDEX = os.environ["AZURE_SEARCH_INDEX"]

# OpenAI host, models and deployments.
OPENAI_HOST = os.getenv("OPENAI_HOST", "azure")
_is_azure_host = OPENAI_HOST.startswith("azure")
OPENAI_CHATGPT_MODEL = os.environ["AZURE_OPENAI_CHATGPT_MODEL"]
OPENAI_EMB_MODEL = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-ada-002")
OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", 1536))
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
# Deployment names are only looked up when the host string starts with "azure".
AZURE_OPENAI_CHATGPT_DEPLOYMENT = os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT") if _is_azure_host else None
# An unset or empty API version falls back to the pinned preview version.
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION") or "2024-03-01-preview"
AZURE_OPENAI_EMB_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT") if _is_azure_host else None
AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_ORGANIZATION = os.getenv("OPENAI_ORGANIZATION")

# Authentication / access-control settings.
AZURE_TENANT_ID = os.getenv("AZURE_TENANT_ID")
AZURE_USE_AUTHENTICATION = _env_flag("AZURE_USE_AUTHENTICATION")
AZURE_ENFORCE_ACCESS_CONTROL = _env_flag("AZURE_ENFORCE_ACCESS_CONTROL")
AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS = _env_flag("AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS")
AZURE_ENABLE_UNAUTHENTICATED_ACCESS = _env_flag("AZURE_ENABLE_UNAUTHENTICATED_ACCESS")
AZURE_SERVER_APP_ID = os.getenv("AZURE_SERVER_APP_ID")
AZURE_SERVER_APP_SECRET = os.getenv("AZURE_SERVER_APP_SECRET")
AZURE_CLIENT_APP_ID = os.getenv("AZURE_CLIENT_APP_ID")
# The auth tenant defaults to the general tenant id when not set separately.
AZURE_AUTH_TENANT_ID = os.getenv("AZURE_AUTH_TENANT_ID", AZURE_TENANT_ID)

# Search query options.
AZURE_SEARCH_QUERY_LANGUAGE = os.getenv("AZURE_SEARCH_QUERY_LANGUAGE", "en-us")
AZURE_SEARCH_QUERY_SPELLER = os.getenv("AZURE_SEARCH_QUERY_SPELLER", "lexicon")
AZURE_SEARCH_SEMANTIC_RANKER = os.getenv("AZURE_SEARCH_SEMANTIC_RANKER", "free").lower()

# Token limit looked up for the chat model; the generation loops subtract RESPONSE_TOKEN_LIMIT from it.
CHATGPT_TOKEN_LIMIT = get_token_limit(OPENAI_CHATGPT_MODEL)

In [6]:
# Fail fast on a missing service name: AZURE_OPENAI_SERVICE is read with os.getenv and may be
# None, which would otherwise be interpolated into the endpoint as the host "None.openai.azure.com".
if not AZURE_OPENAI_SERVICE:
    raise ValueError("AZURE_OPENAI_SERVICE must be set to build the Azure OpenAI endpoint")

# A single credential object is shared by the OpenAI token provider and the search client.
azure_credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True)
token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")

# Async Azure OpenAI client authenticated through the bearer-token provider (no API key).
openai_client = AsyncAzureOpenAI(
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    azure_ad_token_provider=token_provider,
)

# Async search client bound to the configured service and index.
search_client = SearchClient(
    endpoint=f"https://{AZURE_SEARCH_SERVICE}.search.windows.net",
    index_name=AZURE_SEARCH_INDEX,
    credential=azure_credential,
)

In [7]:
# Parameters
SEARCH_MAX_RESULTS = 30
TEMPERATURE_QNS = 0.3
TEMPERATURE_ANS = 0.0
SEED = 1234
USE_TEXT_SEARCH = "Hybrid"
USE_VECTOR_SEARCH = "Hybrid"
USE_SEMANTIC_RANKER = True
USE_SEMANTIC_CAPTIONS = False
MINIMUM_SEARCH_SCORE = 0.0
MINIMUM_RERANKER_SCORE = 0.0

RESPONSE_TOKEN_LIMIT = 512

In [8]:
class Document:
    """Annotation-only description of one search hit.

    The class declares fields but defines no constructor or methods. In this notebook it is
    only used as the element type in the `get_sources_content` signature, where hits are read
    with key access (``doc["id"]``) rather than attribute access.
    """

    # Fields read by get_sources_content: id, parent_id, title, pr_name, full_url, chunks, captions.
    id: Optional[str]
    parent_id: Optional[str]
    title: Optional[str]
    pr_name: Optional[str]
    cover_image_url: Optional[str]  # not read by any cell in this notebook
    full_url: Optional[str]
    content_category: Optional[str]  # used as a filter field in build_filter, not read from hits
    chunks: Optional[str]
    embedding: Optional[list[float]]  # name matches the vector field queried in compute_text_embedding
    captions: list[QueryCaptionResult]
    # The two scores are the only fields with a default value; they are not read in this notebook.
    score: Optional[float] = None
    reranker_score: Optional[float] = None

## Functions

In [9]:
def build_filter(filter_category: Optional[str]) -> Optional[str]:
    """Build the search filter that restricts results to one content category.

    Args:
        filter_category: Value to match against the `content_category` field, or None
            to apply no category filter.

    Returns:
        A filter expression of the form ``content_category eq '<value>'`` with single quotes
        in the value doubled, or None when `filter_category` is None.
    """
    # None means "no filter"; previously this crashed on `.replace` and the None return was unreachable.
    if filter_category is None:
        return None
    escaped = filter_category.replace("'", "''")
    return f"content_category eq '{escaped}'"


def build_filter_article_search(id):
    """Build a filter matching the content and table chunks of a single article.

    The article id is suffixed with ``_content`` and ``_table``; each resulting value has its
    single quotes doubled and is compared against `parent_id`. The two clauses are OR-ed.
    """
    clauses = [
        "parent_id eq '{}'".format(f"{id}_{suffix}".replace("'", "''"))
        for suffix in ("content", "table")
    ]
    return " or ".join(clauses) if clauses else None


async def compute_text_embedding(q: str):
    """Embed `q` and wrap the vector in a query against the `embedding` field.

    The `dimensions` request argument is only sent for models flagged as supporting it in the
    table below; a model name missing from the table raises KeyError.

    Returns:
        A VectorizedQuery over the `embedding` field asking for 50 nearest neighbours.
    """

    class ExtraArgs(TypedDict, total=False):
        dimensions: int

    # Whether each known embedding model accepts the `dimensions` argument.
    accepts_dimensions = {
        "text-embedding-ada-002": False,
        "text-embedding-3-small": True,
        "text-embedding-3-large": True,
    }

    extra_args: ExtraArgs = {}
    if accepts_dimensions[OPENAI_EMB_MODEL]:
        extra_args = {"dimensions": OPENAI_EMB_DIMENSIONS}

    # Azure OpenAI takes the deployment name as the model name
    model_name = AZURE_OPENAI_EMB_DEPLOYMENT or OPENAI_EMB_MODEL
    response = await openai_client.embeddings.create(model=model_name, input=q, **extra_args)  # noqa: F704

    # Only the first returned embedding is used.
    return VectorizedQuery(vector=response.data[0].embedding, k_nearest_neighbors=50, fields="embedding")


def get_citation(sourcepage: str, use_image_citation: bool) -> str:
    """Turn a source page reference into the citation string to report.

    Args:
        sourcepage: Source reference, e.g. a URL or a page-image name such as ``doc-3.png``.
        use_image_citation: When True, `sourcepage` is returned unchanged.

    Returns:
        For a ``<name>-<page>.png`` reference (extension matched case-insensitively) with image
        citations disabled, ``<name>.pdf#page=<page>``. In every other case, including ``.png``
        names without a numeric ``-<page>`` suffix, `sourcepage` unchanged.
    """
    if use_image_citation:
        return sourcepage

    stem, ext = os.path.splitext(sourcepage)
    if ext.lower() != ".png":
        return sourcepage

    # Split on the LAST dash. Without a dash the old slicing either raised ValueError or
    # silently dropped the final character of the name (e.g. "12.png" -> "1.pdf#page=12").
    base, dash, page = stem.rpartition("-")
    if not dash:
        return sourcepage
    try:
        page_number = int(page)
    except ValueError:
        # Suffix after the dash is not a page number; leave the reference untouched.
        return sourcepage
    return f"{base}.pdf#page={page_number}"


def nonewlines(s: str) -> str:
    """Return `s` with every line feed and carriage return replaced by a single space each."""
    return s.translate(str.maketrans({"\n": " ", "\r": " "}))


def _hit_to_source(doc, use_semantic_captions: bool, use_image_citation: bool) -> dict[str, str]:
    """Flatten one search hit into the record shape read by `concat_sources`."""
    if use_semantic_captions:
        # Join the caption texts of the hit; a missing caption list counts as no captions.
        chunk = " . ".join([cast(str, c.text) for c in (doc["captions"] or [])])
    else:
        chunk = doc["chunks"] or ""
    return {
        "index_id": doc["id"] or "",
        "article_id": doc["parent_id"] or "",
        "title": doc["title"] or "",
        "pr_name": doc["pr_name"] or "",
        "url": get_citation((doc["full_url"] or ""), use_image_citation),
        "chunk": nonewlines(chunk),
    }


async def get_sources_content(
    results: list[Document], use_semantic_captions: bool, use_image_citation: bool
) -> list[dict[str, str]]:
    """Collect search hits into flat source records.

    Args:
        results: Search results; they are consumed with ``async for`` and each hit is read
            with key access (``doc["id"]``).
        use_semantic_captions: When True the record's `chunk` is built from the hit's captions,
            otherwise from its `chunks` field.
        use_image_citation: Forwarded to `get_citation` when building the `url`.

    Returns:
        One dict per hit with keys `index_id`, `article_id`, `title`, `pr_name`, `url` and
        `chunk`; missing values become empty strings and newlines are removed from `chunk`.

    Both modes now emit the same keys. The captions mode previously used ``"id"`` instead of
    ``"index_id"``, which made `concat_sources` raise KeyError when captions were enabled.
    """
    return [_hit_to_source(doc, use_semantic_captions, use_image_citation) async for doc in results]

In [10]:
def concat_sources(sources_content, start_idx, end_idx):
    """Merge the source records in ``sources_content[start_idx:end_idx]`` column-wise.

    Returns a dict with one list per field (`index_ids`, `article_ids`, `titles`, `pr_names`,
    `urls`, `chunks`), each holding that field's values in record order. An empty slice
    yields empty lists.
    """
    # (output key, key read from each record), in the order of the output dict.
    field_pairs = (
        ("index_ids", "index_id"),
        ("article_ids", "article_id"),
        ("titles", "title"),
        ("pr_names", "pr_name"),
        ("urls", "url"),
        ("chunks", "chunk"),
    )
    window = sources_content[start_idx:end_idx]
    return {plural: [record[singular] for record in window] for plural, singular in field_pairs}


def get_combined_sources(sources_content, step=5, total_combined=3):
    """Split the source records into consecutive groups of `step` records.

    Args:
        sources_content: Source records as produced by `get_sources_content`.
        step: Number of records per group.
        total_combined: Maximum number of groups to return.

    Returns:
        Up to `total_combined` dicts from `concat_sources`, covering records ``[0:step]``,
        ``[step:2*step]`` and so on. Fewer groups are returned when the records run out, and
        the last group may hold fewer than `step` records. Non-positive `step` or
        `total_combined` yields an empty list.
    """
    if step <= 0 or total_combined <= 0:
        return []

    combined_sources = []
    # Stride by `step`: the previous hard-coded stride of 5 made groups overlap or leave gaps
    # for any other `step` value.
    for start in range(0, total_combined * step, step):
        if start >= len(sources_content):
            # No records left; do not emit empty groups for the caller to prompt on.
            break
        combined_sources.append(concat_sources(sources_content, start, start + step))
    return combined_sources

## Prompt

In [11]:
# System prompt for question generation. `{keyword}` is its only format placeholder and is
# filled with `.format(keyword=keywords)` in the generation loops. The requested output is a
# Python-style list of quoted strings because the loops parse the reply with ast.literal_eval.
# NOTE(review): the rule numbering skips 6, and rules 2 and 12 say nearly the same thing; the
# text is left untouched here because changing it changes the model input.
qns_generation_prompt = """Your task is to formulate a set of 3 unique questions from given context, satisfying the rules given below:
1. All generated questions should precisely pertain to the keywords {keyword}, and it is imperative that the topic is explicitly included as an integral part of each question.
2. The generated questions should be straightforward, using simple language that is accessible to a broad audience. 
3. The generated questions should make sense to humans even when read without the given context.
4. Prioritize clarity and brevity, ensuring that the questions are formulated in a way that reflect common language and would be easily comprehensible to the general public. 
5. Ensure that the questions generated are meaningful and relevant to the general public in understanding or exploring more about the given topic.
7. Only generate questions that can be derived from the given context, including text and tables.
8. Importantly, ensure uniqueness and non-repetition in the questions. 
9. Additionally, all questions must have answers found within the given context.
10. Do not use phrases like 'provided context', etc in the generated questions.
11. A generated question should contain less than 15 words.
12. Use simple language in the questions generated that are accessible to a broad audience.
13. Each question should be enclosed in ' '.
14. Output as a list of questions separated by , and enclosed by [ ]. 

Example of output:
['How can MediSave be used for outpatient treatments for newborns?', 'What are the MediSave withdrawal limits for assisted conception procedures?', 'How does MediShield Life help with payments for costly outpatient treatments?']
"""

## Question Generation by topics

In [10]:
# Category processed by the topic-based cells below: it selects the spreadsheet sheet, the
# search filter and the output file name. `subpage` is copied as-is into every generated row.
content_category = "cost-and-financing"
subpage = "cost-and-financing"
# NOTE(review): `filter` shadows the Python builtin; the name is kept because the search loop reads it.
filter = build_filter(content_category)
filter

"content_category eq 'cost-and-financing'"

In [11]:
# Load the sheet named after the current category and keep only rows marked "yes" in `to_include`.
df = pd.read_excel("Topics by content category for Qns Generation.xlsx", sheet_name=content_category)
include_mask = df["to_include"] == "yes"
df_filtered = df[include_mask]
# One entry per kept row, taken from the "final keywords" column.
keywords_list = [entry for entry in df_filtered["final keywords"]]
keywords_list

["['medisave', 'medication', 'outpatient','medishield', 'treatments', 'payments', 'appointments']",
 "['disability', 'seniors', 'eldershield', 'rehabilitation', 'assistive', 'residential', 'eligibility', 'mobility', 'allowances']",
 "['insurance', 'insurers', 'medishield', ''coverage', 'cpf', 'mammograms', 'screening', 'mammogram', 'screen for life']",
 "['medifund', 'insurance', 'grants', 'fund', 'payments', 'needy', 'savings']",
 "['caregivers', 'caregiving', 'caregiver', 'grant', 'resident', 'disabilities', 'nursing homes']",
 "['merdeka generation cardholders', 'assessment fee', 'medishield life coverage', 'lasik prices', 'healthcare cost', 'regular health screenings', 'mammogram screenings', 'hpbs screening programmes', 'breast cancer screening', 'breast cancer screening subsidies']"]

In [12]:
# Shared question bank file. The name must match the file the article-based section reads and
# writes ("questions_bank.csv"); the previous "question_bank.csv" never matched it, so an
# existing bank was never picked up here.
qns_bank_path = "questions_bank.csv"
if os.path.exists(qns_bank_path):
    # Read the CSV file into a DataFrame if it exists
    df = pd.read_csv(qns_bank_path)
else:
    # Define an empty DataFrame with the bank's columns if the file doesn't exist
    df = pd.DataFrame(
        columns=[
            "content_category",
            "subpage",
            "keywords",
            "source_num",
            "index_ids",
            "article_ids_unique",
            "titles_unique",
            "content_contributors",
            "urls_unique",
            "chunks",
            "question",
            "answer",
        ]
    )

In [13]:
for keywords in keywords_list:
    vectors: list[VectorQuery] = []
    if USE_VECTOR_SEARCH:
        vectors.append(await compute_text_embedding(keywords))  # noqa: F704

    if USE_SEMANTIC_RANKER:
        results = await search_client.search(  # noqa: F704
            search_text=keywords,
            filter=filter,
            top=SEARCH_MAX_RESULTS,
            query_caption="extractive|highlight-false" if USE_SEMANTIC_CAPTIONS else None,
            vector_queries=vectors,
            query_type=QueryType.SEMANTIC,
            query_language=AZURE_SEARCH_QUERY_LANGUAGE,
            query_speller=AZURE_SEARCH_QUERY_SPELLER,
            semantic_configuration_name="default",
            semantic_query=keywords,
        )
    else:
        results = await search_client.search(  # noqa: F704
            search_text=keywords,
            filter=filter,
            top=SEARCH_MAX_RESULTS,
            vector_queries=vectors,
        )

    sources_content = await get_sources_content(results, USE_SEMANTIC_CAPTIONS, use_image_citation=False)  # noqa: F704
    combined_sources = get_combined_sources(sources_content, step=5, total_combined=3)

    for n in range(len(combined_sources)):
        content = "\n".join(combined_sources[n]["chunks"])

        messages = build_messages(
            model=OPENAI_CHATGPT_MODEL,
            system_prompt=qns_generation_prompt.format(keyword=keywords),
            new_user_content=f"Please generate 3 unique questions on keywords '{keywords}' using the following provided source. \n\nSource:\n {content}",
            max_tokens=CHATGPT_TOKEN_LIMIT - RESPONSE_TOKEN_LIMIT,
        )
        chat_coroutine = await openai_client.chat.completions.create(  # noqa: F704
            # Azure OpenAI takes the deployment name as the model name
            model=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
            messages=messages,
            temperature=TEMPERATURE_QNS,
            max_tokens=RESPONSE_TOKEN_LIMIT,
            n=1,
            stream=False,
            seed=SEED,
        )

        data = []
        response_text_qns = chat_coroutine.choices[0].message.content
        questions_list = ast.literal_eval(response_text_qns)
        for question in questions_list:
            messages_ans_generation = build_messages(
                model=OPENAI_CHATGPT_MODEL,
                system_prompt=general_prompt.format(language="ENGLISH"),
                new_user_content=question + "\n\nSources:\n" + content,
                max_tokens=CHATGPT_TOKEN_LIMIT - RESPONSE_TOKEN_LIMIT,
            )
            chat_coroutine = await openai_client.chat.completions.create(  # noqa: F704
                # Azure OpenAI takes the deployment name as the model name
                model=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
                messages=messages_ans_generation,
                temperature=TEMPERATURE_ANS,
                max_tokens=RESPONSE_TOKEN_LIMIT,
                n=1,
                stream=False,
                seed=SEED,
            )

            response_text_ans = chat_coroutine.choices[0].message.content
            data.append(
                {
                    "content_category": content_category,
                    "subpage": subpage,
                    "keywords": keywords,
                    "source_num": f"source_{n+1}",
                    "index_ids": combined_sources[n]["index_ids"],
                    "article_ids_unique": list(set(combined_sources[n]["article_ids"])),
                    "titles_unique": list(set(combined_sources[n]["titles"])),
                    "content_contributors": list(set(combined_sources[n]["pr_names"])),
                    "urls_unique": list(set(combined_sources[n]["urls"])),
                    "chunks": content,
                    "question": question,
                    "answer": response_text_ans,
                }
            )
        df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)

In [14]:
# Quick check of the accumulated question bank: its dimensions, then the first two rows.
print(df.shape)
df.head(2)

(54, 12)


Unnamed: 0,content_category,subpage,keywords,source_num,index_ids,article_ids_unique,titles_unique,content_contributors,urls_unique,chunks,question,answer
0,cost-and-financing,cost-and-financing,"['medisave', 'medication', 'outpatient','medis...",source_1,"[e096252aa674_1434993_content_content_pages_0,...","[1435063_content, 1435010_content, 1435032_con...","[​Costs and financing, MediSave Claims for Pol...","[National Healthcare Group, Khoo Teck Puat Hos...",[https://www.healthhub.sg/a-z/costs-and-financ...,"What is MediSave? MediSave, introduced in Apri...",How can MediSave be used for outpatient treatm...,MediSave can be used for outpatient treatments...
1,cost-and-financing,cost-and-financing,"['medisave', 'medication', 'outpatient','medis...",source_1,"[e096252aa674_1434993_content_content_pages_0,...","[1435063_content, 1435010_content, 1435032_con...","[​Costs and financing, MediSave Claims for Pol...","[National Healthcare Group, Khoo Teck Puat Hos...",[https://www.healthhub.sg/a-z/costs-and-financ...,"What is MediSave? MediSave, introduced in Apri...",What are the co-payment requirements for outpa...,"For outpatient treatments under MediSave, ther..."


In [15]:
# Number of generated rows for every (keywords, source group) combination.
group_sizes = df.groupby(["keywords", "source_num"]).size()
count_df = group_sizes.reset_index(name="count")
count_df

Unnamed: 0,keywords,source_num,count
0,"['caregivers', 'caregiving', 'caregiver', 'gra...",source_1,3
1,"['caregivers', 'caregiving', 'caregiver', 'gra...",source_2,3
2,"['caregivers', 'caregiving', 'caregiver', 'gra...",source_3,3
3,"['disability', 'seniors', 'eldershield', 'reha...",source_1,3
4,"['disability', 'seniors', 'eldershield', 'reha...",source_2,3
5,"['disability', 'seniors', 'eldershield', 'reha...",source_3,3
6,"['insurance', 'insurers', 'medishield', ''cove...",source_1,3
7,"['insurance', 'insurers', 'medishield', ''cove...",source_2,3
8,"['insurance', 'insurers', 'medishield', ''cove...",source_3,3
9,"['medifund', 'insurance', 'grants', 'fund', 'p...",source_1,3


In [16]:
def _list_to_str(value):
    """Return the string form of list cells (lists are unhashable); other values pass through."""
    return str(value) if isinstance(value, list) else value


# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1) and emits a
# FutureWarning; use `map` when available and fall back to `applymap` on older versions.
_elementwise = df.map if hasattr(df, "map") else df.applymap
df_copy = _elementwise(_list_to_str)
# keep=False marks every member of a duplicate group, so the counts include all copies.
duplicate_qns = df_copy[df_copy.duplicated(subset="question", keep=False)]
duplicate_row = df_copy[df_copy.duplicated(keep=False)]
print(f"No. of duplicate questions: {len(duplicate_qns)}")
print(f"No. of duplicate rows: {len(duplicate_row)}")

No. of duplicate questions: 4
No. of duplicate rows: 0


  df_copy = df.applymap(lambda x: str(x) if isinstance(x, list) else x)


In [23]:
# df_drop = df.drop_duplicates(subset='question')
# df_drop.shape
# df = df.drop_duplicates(subset='question')

In [18]:
# Persist the topic-based results to a per-category CSV file (row index omitted).
df.to_csv(f"questions_bank_{content_category}.csv", index=False)
# df.to_csv(f"questions_bank.csv", index=False)

## Question Generation by articles
Applicable to the content categories `health-statistics` and `medications`.

In [24]:
# to shift up to parameters later
SEARCH_MAX_RESULTS_ARTICLE = 10
article_search_params = {
    "health-statistics": {"keywords": ["healthcare statistics"], "percentile": 0.75},  # 4 out of 15 articles
    "medications": {
        "keywords": [
            "dosage",
            "usage and adherence",
            "drug safety",
            "drug interaction",
            "food interaction",
            "side effects",
            "storage",
            "prescription",
            "symptoms",
            "purpose of drug",
            "precaution",
        ],
        "percentile": 0.90,  # 58 out of 579 articles
    },
}

In [25]:
content_category = "health-statistics"
# Load the merged article table straight into pandas.
df = pq.read_table("merged_data.parquet").to_pandas()

# `remove_type` labels that exclude an article from question generation.
remove_type_list = [
    "No Extracted Content",
    "NaN",
    "No relevant content and mainly links",
    "Table of Contents",
    "No HTML Tags",
]
# Keep rows of the selected category whose `remove_type` is not one of the labels above.
in_category = df["content_category"] == content_category
is_excluded = df["remove_type"].isin(remove_type_list)
df_filtered = df[in_category & ~is_excluded]
df_filtered.shape

(15, 39)

In [26]:
# Page-view cut-off for this category; only articles strictly above it are kept.
category_percentile = article_search_params[content_category]["percentile"]
page_views = df_filtered["page_views"]
percentile_value = page_views.quantile(category_percentile)
df_percentile = df_filtered[page_views > percentile_value]
print(f"Percentile value: {percentile_value}, Number of articles: {df_percentile.shape[0]}")

Percentile value: 1279.5, Number of articles: 4


In [28]:
qns_bank_path = "questions_bank.csv"
# Column layout of the question bank, used when no saved bank exists yet.
bank_columns = [
    "content_category",
    "subpage",
    "keywords",
    "source_num",
    "index_ids",
    "article_ids_unique",
    "titles_unique",
    "content_contributors",
    "urls_unique",
    "chunks",
    "question",
    "answer",
]
# Resume from the saved bank when the file is present; otherwise start from an empty frame.
df = pd.read_csv(qns_bank_path) if os.path.exists(qns_bank_path) else pd.DataFrame(columns=bank_columns)

In [30]:
cnt = 1
for index, row in df_percentile.iterrows():
    title = row["title"]
    id = row["id"]
    keywords = article_search_params[content_category]["keywords"]
    filter = build_filter_article_search(id)

    vectors: list[VectorQuery] = []
    if USE_VECTOR_SEARCH:
        vectors.append(await compute_text_embedding(keywords))  # noqa: F704

    if USE_SEMANTIC_RANKER:
        results = await search_client.search(  # noqa: F704
            search_text=keywords,
            filter=filter,
            top=SEARCH_MAX_RESULTS_ARTICLE,
            query_caption="extractive|highlight-false" if USE_SEMANTIC_CAPTIONS else None,
            vector_queries=vectors,
            query_type=QueryType.SEMANTIC,
            query_language=AZURE_SEARCH_QUERY_LANGUAGE,
            query_speller=AZURE_SEARCH_QUERY_SPELLER,
            semantic_configuration_name="default",
            semantic_query=keywords,
        )
    else:
        results = await search_client.search(  # noqa: F704
            search_text=keywords,
            filter=filter,
            top=SEARCH_MAX_RESULTS_ARTICLE,
            vector_queries=vectors,
        )

    sources_content = await get_sources_content(results, USE_SEMANTIC_CAPTIONS, use_image_citation=False)  # noqa: F704
    combined = concat_sources(sources_content, 0, len(sources_content))
    content = "\n".join(combined["chunks"])

    messages = build_messages(
        model=OPENAI_CHATGPT_MODEL,
        system_prompt=qns_generation_prompt.format(keyword=keywords),
        new_user_content=f"Please generate 3 unique questions on keywords '{keywords}' using the following provided source. \n\nSource:\n {content}",
        max_tokens=CHATGPT_TOKEN_LIMIT - RESPONSE_TOKEN_LIMIT,
    )

    chat_coroutine = await openai_client.chat.completions.create(  # noqa: F704
        # Azure OpenAI takes the deployment name as the model name
        model=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
        messages=messages,
        temperature=TEMPERATURE_QNS,
        max_tokens=RESPONSE_TOKEN_LIMIT,
        n=1,
        stream=False,
        seed=SEED,
    )

    response_text_qns = chat_coroutine.choices[0].message.content
    questions_list = ast.literal_eval(response_text_qns)
    data = []
    for question in questions_list:
        messages_ans_generation = build_messages(
            model=OPENAI_CHATGPT_MODEL,
            system_prompt=general_prompt.format(language="ENGLISH"),
            new_user_content=question + "\n\nSources:\n" + content,
            max_tokens=CHATGPT_TOKEN_LIMIT - RESPONSE_TOKEN_LIMIT,
        )
        chat_coroutine = await openai_client.chat.completions.create(  # noqa: F704
            # Azure OpenAI takes the deployment name as the model name
            model=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
            messages=messages_ans_generation,
            temperature=TEMPERATURE_ANS,
            max_tokens=RESPONSE_TOKEN_LIMIT,
            n=1,
            stream=False,
            seed=SEED,
        )
        response_text_ans = chat_coroutine.choices[0].message.content
        data.append(
            {
                "content_category": content_category,
                "subpage": "",
                "keywords": "",
                "source_num": f"source_{cnt}",
                "index_ids": list(set(combined["index_ids"])),
                "article_ids_unique": list(set(combined["article_ids"])),
                "titles_unique": list(set(combined["titles"])),
                "content_contributors": list(set(combined["pr_names"])),
                "urls_unique": list(set(combined["urls"])),
                "chunks": content,
                "question": question,
                "answer": response_text_ans,
            }
        )
    df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)
    cnt += 1

1437960 Admissions: Top 10 Reasons for Being Admitted to Hospital
['healthcare statistics']
parent_id eq '1437960_content' or parent_id eq '1437960_table'
1437954 Principal Causes of Death
['healthcare statistics']
parent_id eq '1437954_content' or parent_id eq '1437954_table'
1437958 Disease Burden Statistics for Singapore
['healthcare statistics']
parent_id eq '1437958_content' or parent_id eq '1437958_table'
1437942 Healthcare Workforce Statistics
['healthcare statistics']
parent_id eq '1437942_content' or parent_id eq '1437942_table'


In [32]:
# Write the accumulated bank back to the file the article-based section loaded it from (row index omitted).
df.to_csv("questions_bank.csv", index=False)

## End