## Setup

In [19]:
import ast
import os
import re
import sys

# import time
from typing import cast

import pandas as pd
from dotenv import load_dotenv

sys.path.append(os.path.abspath(os.path.join("..")))
from typing import Optional, TypedDict

from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents.aio import SearchClient
from azure.search.documents.models import (
    QueryCaptionResult,
    QueryType,
    VectorizedQuery,
    VectorQuery,
)
from openai import AsyncAzureOpenAI
from openai_messages_token_helper import build_messages, get_token_limit

from app.backend.approaches.prompts import general_prompt

In [2]:
# Load environment variables from the .env file under ../.azure/hhgai-dev-eastasia-002.
# The path is assembled with os.path.join so it resolves with the host's separator
# instead of relying on hard-coded Windows backslashes.
load_dotenv(dotenv_path=os.path.join("..", ".azure", "hhgai-dev-eastasia-002", ".env"))

True

In [4]:
def _env_flag(name):
    """Report whether environment variable ``name`` holds "true" (case-insensitive); unset counts as False."""
    return os.getenv(name, "").lower() == "true"


# --- Search index location (required: a missing variable raises KeyError) ---
AZURE_SEARCH_SERVICE = os.environ["AZURE_SEARCH_SERVICE"]
AZURE_SEARCH_INDEX = os.environ["AZURE_SEARCH_INDEX"]

# --- OpenAI host, models and deployments ---
OPENAI_HOST = os.getenv("OPENAI_HOST", "azure")
# Deployment names are only read when the host setting starts with "azure".
_uses_azure_host = OPENAI_HOST.startswith("azure")
OPENAI_CHATGPT_MODEL = os.environ["AZURE_OPENAI_CHATGPT_MODEL"]
OPENAI_EMB_MODEL = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-ada-002")
OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "1536"))
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_CHATGPT_DEPLOYMENT = os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT") if _uses_azure_host else None
# An empty value falls back to the default API version as well as an unset one.
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION") or "2024-03-01-preview"
AZURE_OPENAI_EMB_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT") if _uses_azure_host else None
AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_ORGANIZATION = os.getenv("OPENAI_ORGANIZATION")

# --- Authentication / access-control settings ---
AZURE_TENANT_ID = os.getenv("AZURE_TENANT_ID")
AZURE_USE_AUTHENTICATION = _env_flag("AZURE_USE_AUTHENTICATION")
AZURE_ENFORCE_ACCESS_CONTROL = _env_flag("AZURE_ENFORCE_ACCESS_CONTROL")
AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS = _env_flag("AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS")
AZURE_ENABLE_UNAUTHENTICATED_ACCESS = _env_flag("AZURE_ENABLE_UNAUTHENTICATED_ACCESS")
AZURE_SERVER_APP_ID = os.getenv("AZURE_SERVER_APP_ID")
AZURE_SERVER_APP_SECRET = os.getenv("AZURE_SERVER_APP_SECRET")
AZURE_CLIENT_APP_ID = os.getenv("AZURE_CLIENT_APP_ID")
# The auth tenant defaults to the general tenant id when not given separately.
AZURE_AUTH_TENANT_ID = os.getenv("AZURE_AUTH_TENANT_ID", AZURE_TENANT_ID)

# --- Search query options ---
AZURE_SEARCH_QUERY_LANGUAGE = os.getenv("AZURE_SEARCH_QUERY_LANGUAGE", "en-us")
AZURE_SEARCH_QUERY_SPELLER = os.getenv("AZURE_SEARCH_QUERY_SPELLER", "lexicon")
AZURE_SEARCH_SEMANTIC_RANKER = os.getenv("AZURE_SEARCH_SEMANTIC_RANKER", "free").lower()

# Token limit looked up for the chat model; used to size the prompt budget.
CHATGPT_TOKEN_LIMIT = get_token_limit(OPENAI_CHATGPT_MODEL)

In [5]:
# One credential is shared by the Azure OpenAI client (through a bearer-token
# provider) and by the search client.
azure_credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True)
token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")

# NOTE(review): AZURE_OPENAI_SERVICE is read with os.getenv and may be None, which
# would produce the endpoint "https://None.openai.azure.com" — confirm it is always set.
openai_endpoint = f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com"
search_endpoint = f"https://{AZURE_SEARCH_SERVICE}.search.windows.net"

openai_client = AsyncAzureOpenAI(
    azure_endpoint=openai_endpoint,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_ad_token_provider=token_provider,
)

search_client = SearchClient(
    endpoint=search_endpoint,
    credential=azure_credential,
    index_name=AZURE_SEARCH_INDEX,
)

In [6]:
# Parameters

# --- Retrieval ---
SEARCH_MAX_RESULTS = 20  # passed as `top` to the search call
USE_TEXT_SEARCH = False
USE_VECTOR_SEARCH = True  # when True, an embedding of the keywords is added to the query
USE_SEMANTIC_RANKER = True  # when True, the search runs with QueryType.SEMANTIC
USE_SEMANTIC_CAPTIONS = False  # when True, captions are used as source text instead of chunks
MINIMUM_SEARCH_SCORE = 0.0
MINIMUM_RERANKER_SCORE = 0.0

# --- Generation ---
TEMPERATURE_QNS = 0.3  # temperature for the question-generation call
TEMPERATURE_ANS = 0.0  # temperature for the answer-generation call
SEED = 1234  # seed passed to both chat completion calls
RESPONSE_TOKEN_LIMIT = 1024  # max_tokens per completion; also subtracted from the prompt budget

In [7]:
class Document:
    """Annotation-only description of the fields of one search result.

    The class declares field annotations (and ``None`` class-level defaults for
    the two score fields) but no ``__init__``. In this notebook it appears only
    as the element type in the signature of ``get_sources_content``.

    NOTE(review): ``get_sources_content`` reads each result with string
    subscripts such as ``doc["full_url"]``, so the objects actually iterated
    appear to be mappings rather than instances of this class — confirm.
    """

    id: Optional[str]
    parent_id: Optional[str]
    title: Optional[str]
    cover_image_url: Optional[str]
    full_url: Optional[str]  # passed to get_citation to build the source label
    content_category: Optional[str]  # field name used in build_filter's filter expression
    chunks: Optional[str]  # source text used when semantic captions are off
    embedding: Optional[list[float]]  # same name as the vector field queried in compute_text_embedding
    captions: list[QueryCaptionResult]  # joined into the source text when semantic captions are on
    score: Optional[float] = None
    reranker_score: Optional[float] = None

## Functions

In [8]:
def build_filter(filter_category):
    """Build the search filter expression for a content category.

    Args:
        filter_category: Category value to match against ``content_category``.
            A falsy value (``None`` or ``""``) means "no filtering".

    Returns:
        The filter string, or ``None`` when no category was given.
    """
    if not filter_category:
        return None

    # Single quotes are doubled so the value stays inside the quoted literal.
    escaped_category = filter_category.replace("'", "''")
    clauses = ["content_category eq '{}'".format(escaped_category)]
    return " and ".join(clauses)


async def compute_text_embedding(q: str):
    """Embed ``q`` and wrap the vector in a query against the ``embedding`` field.

    Args:
        q: Text to embed.

    Returns:
        A ``VectorizedQuery`` over the ``embedding`` field requesting 50 nearest
        neighbours.

    Raises:
        KeyError: If ``OPENAI_EMB_MODEL`` is not listed in the lookup table below.
    """

    class ExtraArgs(TypedDict, total=False):
        dimensions: int

    # Whether each known embedding model is sent the `dimensions` argument.
    accepts_dimensions = {
        "text-embedding-ada-002": False,
        "text-embedding-3-small": True,
        "text-embedding-3-large": True,
    }

    extra_kwargs: ExtraArgs = {}
    if accepts_dimensions[OPENAI_EMB_MODEL]:
        extra_kwargs["dimensions"] = OPENAI_EMB_DIMENSIONS

    # Azure OpenAI takes the deployment name as the model name
    deployment_or_model = AZURE_OPENAI_EMB_DEPLOYMENT or OPENAI_EMB_MODEL
    response = await openai_client.embeddings.create(
        model=deployment_or_model,
        input=q,
        **extra_kwargs,
    )
    return VectorizedQuery(
        vector=response.data[0].embedding,
        k_nearest_neighbors=50,
        fields="embedding",
    )


def get_citation(sourcepage: str, use_image_citation: bool) -> str:
    """Turn a source page reference into the citation string shown to the model.

    A page image named ``<name>-<page>.png`` is cited as ``<name>.pdf#page=<page>``
    unless image citations are requested. Everything else is returned unchanged.

    Args:
        sourcepage: File name or URL of the source page.
        use_image_citation: When True, always cite ``sourcepage`` as-is.

    Returns:
        The citation string.
    """
    if use_image_citation:
        return sourcepage

    path, ext = os.path.splitext(sourcepage)
    if ext.lower() != ".png":
        return sourcepage

    stem, separator, page_suffix = path.rpartition("-")
    # A .png without a "-<page>" suffix (e.g. "12.png" or ".../my-logo.png") is not a
    # page image. Previously the first case produced a mangled citation and the
    # second raised ValueError; both are now cited unchanged.
    if not separator:
        return sourcepage
    try:
        page_number = int(page_suffix)
    except ValueError:
        return sourcepage

    return f"{stem}.pdf#page={page_number}"


def nonewlines(s: str) -> str:
    """Return ``s`` with every line feed and carriage return replaced by a space."""
    return s.translate({ord("\n"): " ", ord("\r"): " "})


async def get_sources_content(
    results: list[Document], use_semantic_captions: bool, use_image_citation: bool
) -> list[str]:
    """Format each search result as ``"<citation>: <text>"``.

    Args:
        results: Search results, consumed with ``async for`` and read with
            string subscripts (``"full_url"``, ``"captions"``, ``"chunks"``).
            NOTE(review): the ``list[Document]`` annotation does not match that
            usage (async iteration, mapping access) — confirm the real type.
        use_semantic_captions: When True the text is the result's captions joined
            with ``" . "``; otherwise it is the result's ``chunks`` field.
        use_image_citation: Forwarded to ``get_citation``.

    Returns:
        One formatted string per result, with line breaks in the text replaced
        by spaces.
    """
    formatted_sources = []
    async for doc in results:
        citation = get_citation((doc["full_url"] or ""), use_image_citation)
        if use_semantic_captions:
            caption_texts = [cast(str, c.text) for c in (doc["captions"] or [])]
            source_text = " . ".join(caption_texts)
        else:
            source_text = doc["chunks"] or ""
        formatted_sources.append(citation + ": " + nonewlines(source_text))
    return formatted_sources

In [9]:
def separate_url_and_chunk(source_content):
    """Split ``"<url>: <chunk>"`` strings into ``(url, chunk)`` tuples.

    Args:
        source_content: Iterable of strings, each expected to start with an
            http(s) URL followed by a colon and the chunk text.

    Returns:
        A list with one ``(url, chunk)`` tuple per input item, both parts
        stripped. An item that does not match the expected shape is returned
        as ``(item, "")``.
    """
    url_then_chunk = re.compile(r"^(https?://[^\s]+):\s*(.*)$")

    def _split(entry):
        found = url_then_chunk.match(entry)
        if found is None:
            # No leading URL: keep the whole entry in the URL slot, with no chunk.
            return (entry, "")
        url_part, chunk_part = found.groups()
        return (url_part.strip(), chunk_part.strip())

    return [_split(entry) for entry in source_content]

In [10]:
def combine_chunks_and_urls(separated_items, group_size=5, num_groups=2):
    """Bundle consecutive ``(url, chunk)`` pairs into fixed-size groups.

    With the defaults the first five items form group one and the next five form
    group two; items beyond ``group_size * num_groups`` are ignored.

    Args:
        separated_items: Sequence of ``(url, chunk)`` tuples.
        group_size: Number of items per group (default 5).
        num_groups: Number of groups to produce (default 2).

    Returns:
        A ``(source_chunks, source_urls)`` tuple. ``source_chunks[i]`` is the
        newline-joined chunk text of group ``i`` and ``source_urls[i]`` is the
        list of that group's URLs. Both lists always have ``num_groups`` entries;
        a group with no items yields ``""`` and ``[]``.

    Raises:
        ValueError: If ``group_size`` is less than 1 or ``num_groups`` is negative.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")
    if num_groups < 0:
        raise ValueError("num_groups must not be negative")

    source_chunks = []
    source_urls = []
    for group_index in range(num_groups):
        start = group_index * group_size
        # Slicing past the end simply yields a shorter (possibly empty) group.
        group = separated_items[start : start + group_size]
        source_chunks.append("\n".join(chunk for _, chunk in group))
        source_urls.append([url for url, _ in group])

    return source_chunks, source_urls

## Prompt

In [11]:
# System prompt for question generation. `{keyword}` is filled in with str.format
# by the generation loop, so the text must not contain any other literal braces.
# NOTE(review): the rule numbering skips 6 and rule 12 largely repeats rule 2. The
# text is left untouched because editing it would change what the model is sent.
qns_generation_prompt = """Your task is to formulate a set of 3 unique questions from given context, satisfying the rules given below:
1. All generated questions should precisely pertain to the keywords {keyword}, and it is imperative that the topic is explicitly included as an integral part of each question.
2. The generated questions should be straightforward, using simple language that is accessible to a broad audience. 
3. The generated questions should make sense to humans even when read without the given context.
4. Prioritize clarity and brevity, ensuring that the questions are formulated in a way that reflect common language and would be easily comprehensible to the general public. 
5. Ensure that the questions generated are meaningful and relevant to the general public in understanding or exploring more about the given topic.
7. Only generate questions that can be derived from the given context, including text and tables.
8. Importantly, ensure uniqueness and non-repetition in the questions. 
9. Additionally, all questions must have answers found within the given context.
10. Do not use phrases like 'provided context', etc in the generated questions.
11. A generated question should contain less than 15 words.
12. Use simple language in the questions generated that are accessible to a broad audience.
13. Output as a list of questions separated by , and enclosed by [ ].

Example of output:
['How can MediSave be used for outpatient treatments for newborns?', 'What are the MediSave withdrawal limits for assisted conception procedures?', 'How does MediShield Life help with payments for costly outpatient treatments?']
"""

## Question Generation

In [12]:
# Category to generate questions for: `filter_category` restricts the search,
# `sheet_name` selects the worksheet the topic keywords are read from.
filter_category = "cost-and-financing"
sheet_name = "cost-and-financing"

# Filter expression passed to the search call (note: this name shadows the builtin `filter`).
filter = build_filter(filter_category)
filter

In [13]:
# Read the topic sheet for the selected category and keep only the rows marked
# for inclusion. Each kept "final keywords" cell becomes one search query.
df = pd.read_excel("Topics by content category for Qns Generation.xlsx", sheet_name=sheet_name)
include_mask = df["to_include"] == "yes"
df_filtered = df[include_mask]
keywords_list = list(df_filtered["final keywords"])
keywords_list

["['medisave', 'medication', 'outpatient','medishield', 'treatments', 'payments', 'appointments']",
 "['disability', 'seniors', 'eldershield', 'rehabilitation', 'assistive', 'residential', 'eligibility', 'mobility', 'allowances']",
 "['insurance', 'insurers', 'medishield', ''coverage', 'cpf', 'mammograms', 'screening', 'mammogram', 'screen for life']",
 "['medifund', 'insurance', 'grants', 'fund', 'payments', 'needy', 'savings']",
 "['caregivers', 'caregiving', 'caregiver', 'grant', 'resident', 'disabilities', 'nursing homes']"]

In [15]:
df = pd.DataFrame(
    columns=["content_category", "subpage", "keywords", "chunk_num", "chunks", "urls", "question", "answer"]
)

for keywords in keywords_list:
    vectors: list[VectorQuery] = []
    if USE_VECTOR_SEARCH:
        vectors.append(await compute_text_embedding(keywords))  # noqa: F704

    if USE_SEMANTIC_RANKER:
        results = await search_client.search(  # noqa: F704
            search_text=keywords,
            filter=filter,
            top=SEARCH_MAX_RESULTS,
            query_caption="extractive|highlight-false" if USE_SEMANTIC_CAPTIONS else None,
            vector_queries=vectors,
            query_type=QueryType.SEMANTIC,
            query_language=AZURE_SEARCH_QUERY_LANGUAGE,
            query_speller=AZURE_SEARCH_QUERY_SPELLER,
            semantic_configuration_name="default",
            semantic_query=keywords,
        )
    else:
        results = await search_client.search(  # noqa: F704
            search_text=keywords,
            filter=filter,
            top=SEARCH_MAX_RESULTS,
            vector_queries=vectors,
        )

    sources_content = await get_sources_content(results, USE_SEMANTIC_CAPTIONS, use_image_citation=False)  # noqa: F704
    separated_items = separate_url_and_chunk(sources_content)
    source_info = combine_chunks_and_urls(separated_items)

    for n in range(len(source_info)):
        chunks = source_info[0][n]
        urls = source_info[1][n]

        messages = build_messages(
            model=OPENAI_CHATGPT_MODEL,
            system_prompt=qns_generation_prompt.format(keyword=keywords),
            new_user_content=f"Please generate 3 unique questions on keywords '{keywords}' using the following provided source. \n\nSource:\n {chunks}",
            max_tokens=CHATGPT_TOKEN_LIMIT - RESPONSE_TOKEN_LIMIT,
        )
        chat_coroutine = await openai_client.chat.completions.create(  # noqa: F704
            # Azure OpenAI takes the deployment name as the model name
            model=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
            messages=messages,
            temperature=TEMPERATURE_QNS,
            max_tokens=RESPONSE_TOKEN_LIMIT,
            n=1,
            stream=False,
            seed=SEED,
        )

        data = []
        response_text_qns = chat_coroutine.choices[0].message.content
        questions_list = ast.literal_eval(response_text_qns)
        for question in questions_list:
            messages_ans_generation = build_messages(
                model=OPENAI_CHATGPT_MODEL,
                system_prompt=general_prompt.format(language="ENGLISH"),
                new_user_content=question + "\n\nSources:\n" + chunks,
                max_tokens=CHATGPT_TOKEN_LIMIT - RESPONSE_TOKEN_LIMIT,
            )

            chat_coroutine = await openai_client.chat.completions.create(  # noqa: F704
                # Azure OpenAI takes the deployment name as the model name
                model=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
                messages=messages_ans_generation,
                temperature=TEMPERATURE_ANS,
                max_tokens=RESPONSE_TOKEN_LIMIT,
                n=1,
                stream=False,
                seed=SEED,
            )

            response_text_ans = chat_coroutine.choices[0].message.content
            data.append(
                {
                    "content_category": filter_category,
                    "subpage": sheet_name,
                    "keywords": keywords,
                    "chunk_num": f"chunk_{n+1}",
                    "chunks": chunks,
                    "urls": urls,
                    "question": question,
                    "answer": response_text_ans,
                }
            )
        df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)

        # time.sleep(15)

In [16]:
# Sanity check: number of generated questions per keyword set and chunk group.
group_sizes = df.groupby(["keywords", "chunk_num"]).size()
count_df = group_sizes.reset_index(name="count")
count_df

Unnamed: 0,keywords,chunk_num,count
0,"['caregivers', 'caregiving', 'caregiver', 'gra...",chunk_1,3
1,"['caregivers', 'caregiving', 'caregiver', 'gra...",chunk_2,3
2,"['disability', 'seniors', 'eldershield', 'reha...",chunk_1,3
3,"['disability', 'seniors', 'eldershield', 'reha...",chunk_2,3
4,"['insurance', 'insurers', 'medishield', ''cove...",chunk_1,3
5,"['insurance', 'insurers', 'medishield', ''cove...",chunk_2,3
6,"['medifund', 'insurance', 'grants', 'fund', 'p...",chunk_1,3
7,"['medifund', 'insurance', 'grants', 'fund', 'p...",chunk_2,3
8,"['medisave', 'medication', 'outpatient','medis...",chunk_1,3
9,"['medisave', 'medication', 'outpatient','medis...",chunk_2,3


In [17]:
# Preview the first rows only; the full frame repeats the long chunk text on every row.
df.head(10)

Unnamed: 0,content_category,subpage,keywords,chunk_num,chunks,urls,question,answer
0,cost-and-financing,cost-and-financing,"['medisave', 'medication', 'outpatient','medis...",chunk_1,"What is MediSave? MediSave, introduced in Apri...",[https://www.healthhub.sg/a-z/costs-and-financ...,How can MediSave be used for outpatient treatm...,MediSave can be used for outpatient treatments...
1,cost-and-financing,cost-and-financing,"['medisave', 'medication', 'outpatient','medis...",chunk_1,"What is MediSave? MediSave, introduced in Apri...",[https://www.healthhub.sg/a-z/costs-and-financ...,What are the MediSave withdrawal limits for ou...,The MediSave withdrawal limits for outpatient ...
2,cost-and-financing,cost-and-financing,"['medisave', 'medication', 'outpatient','medis...",chunk_1,"What is MediSave? MediSave, introduced in Apri...",[https://www.healthhub.sg/a-z/costs-and-financ...,How does MediShield Life assist with payments ...,MediShield Life assists with payments for cost...
3,cost-and-financing,cost-and-financing,"['medisave', 'medication', 'outpatient','medis...",chunk_2,MediSave Maternity Package With the MediSave M...,[https://www.healthhub.sg/a-z/costs-and-financ...,How can MediSave be used for pre-delivery medi...,"Under the MediSave Maternity Package (MMP), yo..."
4,cost-and-financing,cost-and-financing,"['medisave', 'medication', 'outpatient','medis...",chunk_2,MediSave Maternity Package With the MediSave M...,[https://www.healthhub.sg/a-z/costs-and-financ...,What are the MediSave withdrawal limits for as...,"For Assisted Conception Procedures (ACP), Medi..."
5,cost-and-financing,cost-and-financing,"['medisave', 'medication', 'outpatient','medis...",chunk_2,MediSave Maternity Package With the MediSave M...,[https://www.healthhub.sg/a-z/costs-and-financ...,How does MediShield Life help with payments fo...,MediShield Life helps with payments for costly...
6,cost-and-financing,cost-and-financing,"['disability', 'seniors', 'eldershield', 'reha...",chunk_1,Understanding ElderShield ElderShield is a sev...,[https://www.healthhub.sg/a-z/costs-and-financ...,What is the eligibility criteria for ElderShield?,"To be eligible for ElderShield, you must meet ..."
7,cost-and-financing,cost-and-financing,"['disability', 'seniors', 'eldershield', 'reha...",chunk_1,Understanding ElderShield ElderShield is a sev...,[https://www.healthhub.sg/a-z/costs-and-financ...,How can seniors apply for the Interim Disabili...,To apply for the Interim Disability Assistance...
8,cost-and-financing,cost-and-financing,"['disability', 'seniors', 'eldershield', 'reha...",chunk_1,Understanding ElderShield ElderShield is a sev...,[https://www.healthhub.sg/a-z/costs-and-financ...,What are the monthly cash payouts under ElderS...,"Under ElderShield 400, policyholders receive a..."
9,cost-and-financing,cost-and-financing,"['disability', 'seniors', 'eldershield', 'reha...",chunk_2,Introduction to Pioneer Generation Disability ...,[https://www.healthhub.sg/a-z/costs-and-financ...,What is the eligibility criteria for the Pione...,To be eligible for the Pioneer Generation Disa...


In [18]:
# Write the generated question/answer pairs to a per-category CSV in the working directory.
output_csv = f"questions_generated_{filter_category}.csv"
df.to_csv(output_csv, index=False)