## Setup

In [1]:
import asyncio
import os
import time

import pandas as pd
from dotenv import load_dotenv

In [6]:
# Load environment variables from the project's .env file. The relative path is
# assembled with os.path.join so it resolves on every OS; a literal backslash
# path is a single (non-existent) filename on POSIX and load_dotenv would then
# quietly load nothing.
load_dotenv(dotenv_path=os.path.join("..", ".azure", "hhgai-dev-eastus-001", ".env"))

True

In [3]:
from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents.aio import SearchClient

from typing import TypedDict, Optional
from openai import AsyncAzureOpenAI

from openai_messages_token_helper import build_messages, get_token_limit
from openai.types.chat import (
    ChatCompletion,
    ChatCompletionMessageParam,
    ChatCompletionToolParam,
)

from azure.search.documents.models import (
    QueryCaptionResult,
    QueryType,
    VectorizedQuery,
    VectorQuery,
)

In [7]:
# Configuration read from the environment.
#
# Required settings index os.environ directly, so a missing variable raises
# KeyError in this cell rather than failing later inside a request.
#
# Optional settings with a fallback use `os.getenv(NAME) or fallback` (the form
# AZURE_OPENAI_API_VERSION already used): a getenv default only applies when the
# variable is absent, so a variable that is present but empty would otherwise
# yield "" (and int("") raises ValueError for the embedding dimensions).
AZURE_SEARCH_SERVICE = os.environ["AZURE_SEARCH_SERVICE"]
AZURE_SEARCH_INDEX = os.environ["AZURE_SEARCH_INDEX"]
OPENAI_HOST = os.getenv("OPENAI_HOST") or "azure"
OPENAI_CHATGPT_MODEL = os.environ["AZURE_OPENAI_CHATGPT_MODEL"]
OPENAI_EMB_MODEL = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME") or "text-embedding-ada-002"
OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS") or 1536)
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
# Deployment names are only read when the host setting starts with "azure".
AZURE_OPENAI_CHATGPT_DEPLOYMENT = (
    os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT") if OPENAI_HOST.startswith("azure") else None
)
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION") or "2024-03-01-preview"
AZURE_OPENAI_EMB_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT") if OPENAI_HOST.startswith("azure") else None
AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_ORGANIZATION = os.getenv("OPENAI_ORGANIZATION")
AZURE_TENANT_ID = os.getenv("AZURE_TENANT_ID")
# Boolean flags: enabled only by the (case-insensitive) string "true".
AZURE_USE_AUTHENTICATION = os.getenv("AZURE_USE_AUTHENTICATION", "").lower() == "true"
AZURE_ENFORCE_ACCESS_CONTROL = os.getenv("AZURE_ENFORCE_ACCESS_CONTROL", "").lower() == "true"
AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS = os.getenv("AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS", "").lower() == "true"
AZURE_ENABLE_UNAUTHENTICATED_ACCESS = os.getenv("AZURE_ENABLE_UNAUTHENTICATED_ACCESS", "").lower() == "true"
AZURE_SERVER_APP_ID = os.getenv("AZURE_SERVER_APP_ID")
AZURE_SERVER_APP_SECRET = os.getenv("AZURE_SERVER_APP_SECRET")
AZURE_CLIENT_APP_ID = os.getenv("AZURE_CLIENT_APP_ID")
# Falls back to the general tenant id when no dedicated auth tenant is set.
AZURE_AUTH_TENANT_ID = os.getenv("AZURE_AUTH_TENANT_ID") or AZURE_TENANT_ID

AZURE_SEARCH_QUERY_LANGUAGE = os.getenv("AZURE_SEARCH_QUERY_LANGUAGE") or "en-us"
AZURE_SEARCH_QUERY_SPELLER = os.getenv("AZURE_SEARCH_QUERY_SPELLER") or "lexicon"
AZURE_SEARCH_SEMANTIC_RANKER = (os.getenv("AZURE_SEARCH_SEMANTIC_RANKER") or "free").lower()

# Token limit reported by openai_messages_token_helper for the chat model;
# used later to budget the prompt size.
CHATGPT_TOKEN_LIMIT = get_token_limit(OPENAI_CHATGPT_MODEL)

In [7]:
# A single credential object is shared by the search client and the
# Azure OpenAI token provider.
azure_credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True)

# Async client bound to the configured search service and index.
search_client = SearchClient(
    credential=azure_credential,
    index_name=AZURE_SEARCH_INDEX,
    endpoint=f"https://{AZURE_SEARCH_SERVICE}.search.windows.net",
)

# The OpenAI client authenticates with bearer tokens obtained from the
# credential (no API key is passed).
token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
openai_client = AsyncAzureOpenAI(
    azure_ad_token_provider=token_provider,
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    api_version=AZURE_OPENAI_API_VERSION,
)

In [None]:
# Parameters

# --- Retrieval -------------------------------------------------------------
SEARCH_MAX_RESULTS = 10  # passed as `top` to the search request
USE_VECTOR_SEARCH = True  # when True, the topic is embedded and sent as a vector query
USE_TEXT_SEARCH = False  # NOTE(review): confirm the search cell consults this flag
USE_SEMANTIC_RANKER = False  # selects the semantic query variant of the search request
USE_SEMANTIC_CAPTIONS = False  # use captions instead of chunk text as source content
# NOTE(review): no cell shown here reads these two thresholds.
MINIMUM_SEARCH_SCORE = MINIMUM_RERANKER_SCORE = 0.0

# --- Generation ------------------------------------------------------------
TEMPERATURE, SEED = 0.0, 1234  # forwarded to the chat completion request
RESPONSE_TOKEN_LIMIT = 1024  # max_tokens of the response; also subtracted from the prompt budget

In [None]:
class Document:
    """Field declarations describing one search result.

    The class only declares annotated attributes; it defines no ``__init__``
    or methods.

    NOTE(review): ``get_sources_content`` reads results with subscripting
    (e.g. ``doc['full_url']``), so this class appears to serve as a type hint
    for mapping-like results rather than being instantiated -- confirm.
    """

    # Identifier fields.
    id: Optional[str]
    parent_id: Optional[str]
    # Descriptive fields; `full_url` is what the citation is built from.
    title: Optional[str]
    cover_image_url: Optional[str]
    full_url: Optional[str]
    content_category: Optional[str]
    # Text used as the source content when semantic captions are disabled.
    chunks: Optional[str]
    embedding: Optional[list[float]]
    # Caption objects whose `.text` is joined when semantic captions are enabled.
    captions: list[QueryCaptionResult]
    # The only two attributes with class-level defaults.
    score: Optional[float] = None
    reranker_score: Optional[float] = None

## Functions

In [16]:
async def compute_text_embedding(q: str):
    """Embed ``q`` and wrap the resulting vector in a ``VectorizedQuery``.

    Args:
        q: Text to embed.

    Returns:
        A ``VectorizedQuery`` over the ``embedding`` field requesting 50
        nearest neighbours.

    Raises:
        KeyError: if ``OPENAI_EMB_MODEL`` is not one of the models listed in
            the lookup table below.
    """

    class ExtraArgs(TypedDict, total=False):
        dimensions: int

    # Per model: whether the `dimensions` argument is sent with the request.
    sends_dimensions = {
        "text-embedding-ada-002": False,
        "text-embedding-3-small": True,
        "text-embedding-3-large": True,
    }

    extra_args: ExtraArgs = {}
    if sends_dimensions[OPENAI_EMB_MODEL]:
        extra_args["dimensions"] = OPENAI_EMB_DIMENSIONS

    # Azure OpenAI takes the deployment name as the model name
    model_name = AZURE_OPENAI_EMB_DEPLOYMENT or OPENAI_EMB_MODEL
    response = await openai_client.embeddings.create(model=model_name, input=q, **extra_args)

    return VectorizedQuery(
        vector=response.data[0].embedding,
        k_nearest_neighbors=50,
        fields="embedding",
    )

def get_citation(sourcepage: str, use_image_citation: bool) -> str:
    """Return the citation string for a source page.

    A page image named ``<name>-<N>.png`` is rewritten to
    ``<name>.pdf#page=<N>`` unless image citations are requested. Every other
    input is returned unchanged, including ``.png`` names that carry no
    numeric ``-<N>`` suffix (these used to raise ``ValueError`` or lose a
    character of the name).

    Args:
        sourcepage: File name, path or URL of the source.
        use_image_citation: When True, cite ``sourcepage`` as-is.

    Returns:
        The citation string.
    """
    if use_image_citation:
        return sourcepage

    path, ext = os.path.splitext(sourcepage)
    if ext.lower() != ".png":
        return sourcepage

    # rpartition yields an empty separator when the name contains no "-".
    base, dash, page_text = path.rpartition("-")
    if not dash:
        return sourcepage
    try:
        page_number = int(page_text)
    except ValueError:
        # The text after the last "-" is not a page number; keep the original.
        return sourcepage
    return f"{base}.pdf#page={page_number}"

def nonewlines(s: str) -> str:
    """Return ``s`` with every line feed and carriage return replaced by a space."""
    to_space = {ord("\n"): " ", ord("\r"): " "}
    return s.translate(to_space)
    
async def get_sources_content(results: list[Document], use_semantic_captions: bool, use_image_citation: bool) -> list[str]:
    """Render each search result as a ``"<citation>: <text>"`` string.

    The citation is built from the result's ``full_url``. The text is either
    the result's captions joined with ``" . "`` (semantic captions) or its
    ``chunks`` value, with line breaks replaced by spaces.

    The captions branch no longer calls ``typing.cast``, which this notebook
    never imports and which therefore raised ``NameError``; a caption whose
    text is missing now contributes an empty string.

    Args:
        results: Search results. They are consumed with ``async for`` and read
            by subscripting (``doc['full_url']``, ``doc['chunks']``,
            ``doc['captions']``).
        use_semantic_captions: Use caption text instead of chunk text.
        use_image_citation: Forwarded to ``get_citation``.

    Returns:
        One formatted string per result, in iteration order.
    """
    sources: list[str] = []
    async for doc in results:
        citation = get_citation(doc['full_url'] or "", use_image_citation)
        if use_semantic_captions:
            body = " . ".join((c.text or "") for c in (doc['captions'] or []))
        else:
            body = doc['chunks'] or ""
        sources.append(citation + ": " + nonewlines(body))
    return sources

## Prompt

In [12]:
# System prompt template for question generation. `{subtopic}` is substituted
# with str.format before each request. Written as adjacent string literals so
# the trailing spaces that are part of the prompt stay visible.
# NOTE(review): the rule numbering jumps from 5 to 7 and rule 13 largely
# repeats rule 2; the text is kept exactly as-is so model output is unchanged.
qns_generation_prompt = (
    "Your task is to formulate a set of 2 unique questions from given context, satisfying the rules given below:\n"
    "1. All generated questions should precisely pertain to the topic of {subtopic}, and it is imperative that the topic is explicitly included as an integral part of each question.\n"
    "2. The generated questions should be straightforward, using simple language that is accessible to a broad audience. \n"
    "3. The generated questions should make sense to humans even when read without the given context.\n"
    "4. Prioritize clarity and brevity, ensuring that the questions are formulated in a way that reflect common language and would be easily comprehensible to the general public. \n"
    "5. Ensure that the questions generated are meaningful and relevant to the general public in understanding or exploring more about the given topic.\n"
    "7. Only generate questions that can be derived from the given context, including text and tables.\n"
    "8. Importantly, ensure uniqueness and non-repetition in the questions. \n"
    "9. Additionally, all questions must have answers found within the given context.\n"
    "10. Do not use phrases like 'provided context', etc in the generated questions.\n"
    "11. A generated question should contain less than 15 words.\n"
    "12. Each question must be followed with the source url.\n"
    "13. Use simple language in the questions generated that are accessible to a broad audience.\n"
)

## Question Generation

In [None]:
# Topics to generate questions for; each one is used as the search query and
# substituted into the prompt.
topics = [
    "Diabetes",
    "Breast cancer",
    "Vaccination",
]

In [53]:
df = pd.DataFrame(columns = ["Topic", "Question", "URL", "Chunk"])

for topic in topics:
    print(topic)
    vectors: list[VectorQuery] = []
    if USE_VECTOR_SEARCH:
        vectors.append(await compute_text_embedding(topic))

    if USE_SEMANTIC_RANKER:
        results = await search_client.search(
            search_text=topic,
            filter=None,
            top=SEARCH_MAX_RESULTS,
            query_caption="extractive|highlight-false" if USE_SEMANTIC_CAPTIONS else None,
            vector_queries=vectors,
            query_type=QueryType.SEMANTIC,
            query_language=AZURE_SEARCH_QUERY_LANGUAGE,
            query_speller=AZURE_SEARCH_QUERY_SPELLER,
            semantic_configuration_name="default",
            semantic_query=topic,
        )
    else:
        results = await search_client.search(
            search_text=topic,
            filter=None,
            top=SEARCH_MAX_RESULTS,
            vector_queries=vectors,
        )
    
    sources_content = await get_sources_content(results, USE_SEMANTIC_CAPTIONS, use_image_citation=False)
    print(f"- number of sources retrieved: {len(sources_content)}")

    cnt=1
    for source in sources_content:
        messages = build_messages(
            model=OPENAI_CHATGPT_MODEL,
            system_prompt=qns_generation_prompt.format(subtopic=topic),
            new_user_content=f"Please generate 2 unique questions on topic '{topic}' using the following provided source. \n\nSource:\n {source}",
            max_tokens=CHATGPT_TOKEN_LIMIT - RESPONSE_TOKEN_LIMIT,
        )
        chat_coroutine = await openai_client.chat.completions.create(
            # Azure OpenAI takes the deployment name as the model name
            model=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
            messages=messages,
            temperature=TEMPERATURE,
            max_tokens=RESPONSE_TOKEN_LIMIT,
            n=1,
            stream=False,
            seed=SEED,
        )
        
        response_text = chat_coroutine.choices[0].message.content
        entries = response_text.strip().split("\n\n")
        print(f"- number of qns generated for source {cnt}: {len(entries)}")
        
        data = []
        for entry in entries:
            entry = entry.replace("Source:","")
            question, url = entry.split('https://', 1)
            question = question.split(". ",1)[1].strip()
            data.append({"Topic": topic, "Question": question, "URL": url, "Chunk": source})
        df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)
        cnt+=1

        time.sleep(15)

Diabetes
- number of sources retrieved: 10
- number of qns generated for source 1: 2
- number of qns generated for source 2: 2
- number of qns generated for source 3: 2
- number of qns generated for source 4: 2
- number of qns generated for source 5: 2
- number of qns generated for source 6: 2
- number of qns generated for source 7: 2
- number of qns generated for source 8: 1
- number of qns generated for source 9: 2
- number of qns generated for source 10: 2
Breast cancer
- number of sources retrieved: 10
- number of qns generated for source 1: 2
- number of qns generated for source 2: 2
- number of qns generated for source 3: 2
- number of qns generated for source 4: 2
- number of qns generated for source 5: 2
- number of qns generated for source 6: 2
- number of qns generated for source 7: 2
- number of qns generated for source 8: 2
- number of qns generated for source 9: 2
- number of qns generated for source 10: 2
Vaccination
- number of sources retrieved: 10
- number of qns gener

In [None]:
# Keep the first row for each distinct question text, then write the result
# to CSV without the index column.
df_unique = df.drop_duplicates(subset="Question", keep="first")
df_unique.to_csv("generated_questions.csv", index=False)