In [15]:
import os

# Show the kernel's working directory: the relative output paths configured
# further down in this notebook are resolved against it.
working_dir = os.getcwd()
print(working_dir)

e:\NLP\GraphRag_Eval\autoq


In [16]:
# Load (or reload, if it is already loaded) the dotenv IPython extension, then
# run its %dotenv magic so that variables defined in a .env file become
# available through the process environment. OPENAI_API_KEY is read from the
# environment when the models are configured.
%reload_ext dotenv
%dotenv

In [4]:
import json
import logging
import os

import pandas as pd
import tiktoken
from pydantic import SecretStr

from benchmark_qed.autod.data_processor.embedding import TextEmbedder
from benchmark_qed.autod.io.text_unit import load_text_units
from benchmark_qed.autoq.io.activity import (
    save_activity_context,
)
from benchmark_qed.autoq.io.question import (
    load_questions,
    save_questions,
)
from benchmark_qed.config.llm_config import LLMConfig, LLMProvider
from benchmark_qed.llm.factory import ModelFactory

# Show INFO-level progress messages from the pipeline in the cell output.
logging.basicConfig(level=logging.INFO)

# Only surface errors from the "httpx" logger so request-level messages do not
# flood the output. logging.getLogger() always returns a Logger (creating it
# on first use), so no None check is needed before setting the level.
logging.getLogger("httpx").setLevel(logging.ERROR)

In [None]:
# DATA CONFIGS
# Locations: where input documents are read from, and where the processed
# sample and the generated questions are written.
INPUT_DATA_PATH = "/data"
OUTPUT_DATA_PATH = "autoq/processed_data"
OUTPUT_QUESTIONS_PATH = "autoq/questions"
FILE_ENCODING = "utf-8-sig"

# Input schema (AP News dataset): the text field and the metadata fields that
# are handed to the sampling step as text_tag / metadata_tags.
TEXT_COLUMN = "body_nitf"
METADATA_COLUMNS = ["headline", "firstcreated"]

# Chunking: tokenizer and chunk settings used to split documents into text units.
ENCODING_MODEL = "o200k_base"
CHUNK_OVERLAP = 100
CHUNK_SIZE = 600

# DATA SAMPLING CONFIGS
# Breadth (number of clusters) and depth (samples per cluster) of the data
# sample; the final sample size is NUM_CLUSTERS * NUM_SAMPLES_PER_CLUSTER.
# Tune these to your data size and to how many questions you want, e.g. raise
# the number of clusters to get more diverse questions.
RANDOM_SEED = 42
NUM_SAMPLES_PER_CLUSTER = 10
NUM_CLUSTERS = 10

# GENERAL QUESTION GENERATION CONFIGS
# Over-generation factor for candidate questions (may differ per question
# class). Candidates are ranked and filtered by a question sampler to pick the
# final questions.
OVERSAMPLE_FACTOR = 2.0
# How many questions to produce for each question class (may also differ per
# class).
NUM_QUESTIONS = 2

# CONFIGS SPECIFIC TO ACTIVITY QUESTIONS
# Scale these with the number of questions to be generated; increase them when
# more questions are needed.
NUM_ENTITIES_PER_TASK = 5
NUM_TASKS_PER_PERSONA = 2
NUM_PERSONAS = 5

In [6]:
# MODEL CONFIGS
# The key comes from the environment. Fail fast with a clear message when it
# is missing or blank instead of silently building clients around an empty
# key (which would presumably only surface later as an authentication failure
# on the first request).
_raw_openai_api_key = os.getenv("OPENAI_API_KEY", "")
if not _raw_openai_api_key.strip():
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env "
        "file before running this cell."
    )
API_KEY = SecretStr(_raw_openai_api_key)

EMBEDDING_MODEL = "text-embedding-3-large"
LLM_MODEL = "gpt-4.1-nano"
# Adjust these call arguments to your model; for example, some reasoning
# models do not support temperature settings.
LLM_PARAMS = {
    "temperature": 0.0,
    "seed": 42,
}
# Request concurrency used by the generators below. Adjust this based on your
# model capacity.
CONCURRENT_REQUESTS = 8

# Embedding model wrapper shared by the sampling and question generation steps.
text_embedder = TextEmbedder(
    ModelFactory.create_embedding_model(
        LLMConfig(
            model=EMBEDDING_MODEL,
            api_key=API_KEY,
            llm_provider=LLMProvider.OpenAIEmbedding,
        )
    )
)
# Chat model shared by all question generators.
llm = ModelFactory.create_chat_model(
    model_config=LLMConfig(
        model=LLM_MODEL,
        api_key=API_KEY,
        llm_provider=LLMProvider.OpenAIChat,
        call_args=LLM_PARAMS,
    )
)
# tiktoken encoder for ENCODING_MODEL; passed to the activity context generator.
token_encoder = tiktoken.get_encoding(ENCODING_MODEL)

In [8]:
from benchmark_qed.autod.sampler.sample_gen import acreate_clustered_sample

clustered_sample = await acreate_clustered_sample(
    input_path=INPUT_DATA_PATH,
    output_path=OUTPUT_DATA_PATH,
    text_embedder=text_embedder,
    num_clusters=NUM_CLUSTERS,
    num_samples_per_cluster=NUM_SAMPLES_PER_CLUSTER,
    input_type="json",
    text_tag=TEXT_COLUMN,
    metadata_tags=METADATA_COLUMNS,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    file_encoding=FILE_ENCODING,
    token_encoding=ENCODING_MODEL,
    random_seed=RANDOM_SEED,
)
print(
    f"Sampled {len(clustered_sample.sample_texts)} samples from {len(clustered_sample.text_units)} text units in {len(clustered_sample.documents)} documents."
)

INFO:benchmark_qed.autod.sampler.sample_gen:Document count: 1083
INFO:benchmark_qed.autod.sampler.sample_gen:Text unit count: 2851


INFO:benchmark_qed.autod.sampler.clustering.kmeans:Cluster sizes: min=104, max=516, mean=285.1
INFO:benchmark_qed.autod.sampler.sample_gen:Sampled text unit count: 100


Sampled 100 samples from 2851 text units in 1083 documents.


Data Local

In [10]:
from benchmark_qed.autoq.question_gen.data_questions.local_question_gen import (
    DataLocalQuestionGen,
)

# load clustered text sample (result from the data sampling step)
# If you have previously run the data sampling step, you can load the sample from disk instead of re-running the data sampling step as the below example.
# Otherwise, you can use clustered_sample.sample_texts directly
sample_texts_df = pd.read_parquet(f"{OUTPUT_DATA_PATH}/sample_texts.parquet")
sample_texts = load_text_units(df=sample_texts_df)

data_local_generator = DataLocalQuestionGen(
    llm=llm,
    text_embedder=text_embedder,
    text_units=sample_texts,
    concurrent_coroutines=CONCURRENT_REQUESTS,
    random_seed=RANDOM_SEED,
)

data_local_question_results = await data_local_generator.agenerate(
    num_questions=NUM_QUESTIONS,
    oversample_factor=OVERSAMPLE_FACTOR,
)

# save both candidate questions and the final selected questions
save_questions(
    data_local_question_results.selected_questions,
    f"{OUTPUT_QUESTIONS_PATH}/data_local_questions/",
    "selected_questions",
)
save_questions(
    data_local_question_results.selected_questions,
    f"{OUTPUT_QUESTIONS_PATH}/data_local_questions/",
    "selected_questions_text",
    question_text_only=True,
)
save_questions(
    data_local_question_results.candidate_questions,
    f"{OUTPUT_QUESTIONS_PATH}/data_local_questions/",
    "candidate_questions",
)

INFO:benchmark_qed.autoq.question_gen.data_questions.local_question_gen:Processing clusters 0 to 8 of 10 clusters...
  0%|          | 0/8 [00:00<?, ?it/s]INFO:benchmark_qed.autoq.question_gen.data_questions.local_question_gen:Question: What specific type of cancer was publicly announced as diagnosed in King Charles III during his treatment in early 2024 in the United Kingdom?. Intra-inter Similarity: 5.1593895592740635. Reference Coverage: 0.2
 12%|█▎        | 1/8 [00:25<02:56, 25.16s/it]INFO:benchmark_qed.autoq.question_gen.data_questions.local_question_gen:Question: What factors have contributed to the decline of pharmacies in West Virginia, and what measures are being considered to improve access to prescription medications in rural communities?. Intra-inter Similarity: 1.7171281526773072. Reference Coverage: 0.2
 25%|██▌       | 2/8 [00:29<01:15, 12.64s/it]INFO:benchmark_qed.autoq.question_gen.data_questions.local_question_gen:Question: What are the main environmental, social, and 

Data Global

In [11]:
from benchmark_qed.autoq.question_gen.data_questions.global_question_gen import (
    DataGlobalQuestionGen,
)

# Load candidate questions (result from the data local question generation step)
# Please note that we load all the candidate local questions (not just the selected ones) as that gives us a bigger pool of local questions to aggregate from.
# If you have previously run the data local question generation step, you can load the candidate questions from disk instead of re-running the data local question generation step as the below example.
# Otherwise, you can use data_local_question_results.candidate_questions directly
local_questions = load_questions(
    f"{OUTPUT_QUESTIONS_PATH}/data_local_questions/candidate_questions.json"
)
print(f"Loaded {len(local_questions)} candidate local questions.")

data_global_generator = DataGlobalQuestionGen(
    llm=llm,
    text_embedder=text_embedder,
    local_questions=local_questions,
    concurrent_coroutines=CONCURRENT_REQUESTS,
    random_seed=RANDOM_SEED,
)

data_global_question_results = await data_global_generator.agenerate(
    num_questions=NUM_QUESTIONS,
    oversample_factor=OVERSAMPLE_FACTOR,
)

# save both candidate questions and the final selected questions
save_questions(
    data_global_question_results.selected_questions,
    f"{OUTPUT_QUESTIONS_PATH}/data_global_questions/",
    "selected_questions",
)
save_questions(
    data_global_question_results.selected_questions,
    f"{OUTPUT_QUESTIONS_PATH}/data_global_questions/",
    "selected_questions_text",
    question_text_only=True,
)
save_questions(
    data_global_question_results.candidate_questions,
    f"{OUTPUT_QUESTIONS_PATH}/data_global_questions/",
    "candidate_questions",
)

INFO:benchmark_qed.autoq.question_gen.data_questions.global_question_gen:Number of initial categories: 97
Number of valid candidate categories (i.e. categories with more than one input question): 12
Number of questions to generate per candidate category: 1
INFO:benchmark_qed.autoq.question_gen.data_questions.global_question_gen:Processing categories 0 to 8 of 12 categories...


Loaded 27 candidate local questions.


ERROR:benchmark_qed.autoq.question_gen.data_questions.global_question_gen:Exception occurred while generating questions for category: judicial rulings
Traceback (most recent call last):
  File "c:\Users\shash\anaconda3\envs\benchmark\Lib\site-packages\benchmark_qed\autoq\question_gen\data_questions\global_question_gen.py", line 243, in _agenerate_single_chain
    await self.claim_extractor.aextract_claims(
  File "c:\Users\shash\anaconda3\envs\benchmark\Lib\site-packages\benchmark_qed\autoq\question_gen\data_questions\claim_extractor\global_claim_extractor.py", line 66, in aextract_claims
    claims = [
             ^
  File "c:\Users\shash\anaconda3\envs\benchmark\Lib\site-packages\benchmark_qed\autoq\question_gen\data_questions\claim_extractor\global_claim_extractor.py", line 70, in <listcomp>
    if claim["statement"] != "" and len(claim["source_ids"]) > 0
                                        ~~~~~^^^^^^^^^^^^^^
KeyError: 'source_ids'
ERROR:benchmark_qed.autoq.question_gen.data_q

In [12]:
from benchmark_qed.autoq.question_gen.activity_questions.context_gen.activity_context_gen import (
    ActivityContextGen,
)

# load clustered text sample (result from the data sampling step)
# If you have previously run the data sampling step, you can load the sample from disk instead of re-running the data sampling step as the below example.
# Otherwise, you can use clustered_sample.sample_texts directly
sample_texts_df = pd.read_parquet(f"{OUTPUT_DATA_PATH}/sample_texts.parquet")
sample_texts = load_text_units(
    df=sample_texts_df, attributes_cols=["is_representative"]
)

activity_generator = ActivityContextGen(
    llm=llm,
    text_embedder=text_embedder,
    token_encoder=token_encoder,
    text_units=sample_texts,
    concurrent_coroutines=CONCURRENT_REQUESTS,
)

activity_context = await activity_generator.agenerate(
    num_personas=NUM_PERSONAS,
    num_tasks=NUM_TASKS_PER_PERSONA,
    num_entities_per_task=NUM_ENTITIES_PER_TASK,
    oversample_factor=OVERSAMPLE_FACTOR,
    use_representative_samples_only=True,  # if True, we will only use a subset of representative samples from the clustered texts to generate activity context (for efficiency). If False, we will use all the samples in the clustered texts.
)

save_activity_context(activity_context, f"{OUTPUT_QUESTIONS_PATH}/context/")

INFO:benchmark_qed.autoq.question_gen.activity_questions.context_gen.activity_context_gen:Generating dataset summary from 10 representative texts...
INFO:benchmark_qed.autod.summarization.global_summarizer:Generating 1 map responses...
100%|██████████| 1/1 [00:09<00:00,  9.26s/it]
INFO:benchmark_qed.autoq.question_gen.activity_questions.context_gen.activity_context_gen:Dataset summary: The dataset primarily draws from news articles and legislative reports, focusing on legal, political, health, and environmental issues. Its prominent theme centers on reproductive rights and related legislative and judicial developments, including the impact of major court rulings and state-level restrictions, as well as social and political debates surrounding transgender healthcare and abortion policies. Other key topics include public health concerns such as disease control, environmental health impacts from incidents like train derailments, and healthcare policy debates on drug pricing, Medicaid expa

In [13]:
from pathlib import Path

from benchmark_qed.autoq.data_model.activity import ActivityContext
from benchmark_qed.autoq.question_gen.activity_questions.local_question_gen import (
    ActivityLocalQuestionGen,
)

# load activity context (result from the activity context generation step)
# If you have previously run the activity context generation step, you can load the context from disk instead of re-running the activity context generation step as the below example.
activity_context = ActivityContext(
    **json.loads(
        Path(f"{OUTPUT_QUESTIONS_PATH}/context/activity_context_full.json").read_text()
    )
)
print(f"Loaded {len(activity_context.task_contexts)} tasks.")

activity_local_generator = ActivityLocalQuestionGen(
    llm=llm,
    text_embedder=text_embedder,
    activity_context=activity_context,
    concurrent_coroutines=CONCURRENT_REQUESTS,
    random_seed=RANDOM_SEED,
)

activity_local_question_results = await activity_local_generator.agenerate(
    num_questions=NUM_QUESTIONS,
    oversample_factor=OVERSAMPLE_FACTOR,
)

# save both candidate questions and the final selected questions
save_questions(
    activity_local_question_results.selected_questions,
    f"{OUTPUT_QUESTIONS_PATH}/activity_local_questions/",
    "selected_questions",
)
save_questions(
    activity_local_question_results.selected_questions,
    f"{OUTPUT_QUESTIONS_PATH}/activity_local_questions/",
    "selected_questions_text",
    question_text_only=True,
)
save_questions(
    activity_local_question_results.candidate_questions,
    f"{OUTPUT_QUESTIONS_PATH}/activity_local_questions/",
    "candidate_questions",
)

Loaded 20 tasks.


INFO:benchmark_qed.autoq.question_gen.activity_questions.local_question_gen:Generated 23 candidate questions
INFO:benchmark_qed.autoq.question_gen.activity_questions.local_question_gen:Number of questions per entity: 1
Number of entities: 19
INFO:benchmark_qed.autoq.question_gen.activity_questions.local_question_gen:Retained 14 questions post-filtering by entity distribution
INFO:benchmark_qed.autod.sampler.clustering.kmeans:Cluster sizes: min=4, max=10, mean=7.0
INFO:benchmark_qed.autoq.sampler.question_sampler:Selected 2 questions from 14 candidates.


In [14]:
from benchmark_qed.autoq.question_gen.activity_questions.global_question_gen import (
    ActivityGlobalQuestionGen,
)

# load activity context (result from the activity context generation step)
# If you have previously run the activity context generation step, you can load the context from disk instead of re-running the activity context generation step as the below example.
activity_context = ActivityContext(
    **json.loads(
        Path(f"{OUTPUT_QUESTIONS_PATH}/context/activity_context_full.json").read_text()
    )
)
print(f"Loaded {len(activity_context.task_contexts)} tasks.")

activity_global_generator = ActivityGlobalQuestionGen(
    llm=llm,
    text_embedder=text_embedder,
    activity_context=activity_context,
    concurrent_coroutines=CONCURRENT_REQUESTS,
    random_seed=RANDOM_SEED,
)

activity_global_question_results = await activity_global_generator.agenerate(
    num_questions=NUM_QUESTIONS,
    oversample_factor=OVERSAMPLE_FACTOR,
)

# save both candidate questions and the final selected questions
save_questions(
    activity_global_question_results.selected_questions,
    f"{OUTPUT_QUESTIONS_PATH}/activity_global_questions/",
    "selected_questions",
)
save_questions(
    activity_global_question_results.selected_questions,
    f"{OUTPUT_QUESTIONS_PATH}/activity_global_questions/",
    "selected_questions_text",
    question_text_only=True,
)
save_questions(
    activity_global_question_results.candidate_questions,
    f"{OUTPUT_QUESTIONS_PATH}/activity_global_questions/",
    "candidate_questions",
)

Loaded 20 tasks.


INFO:benchmark_qed.autoq.question_gen.activity_questions.global_question_gen:Generated 20 candidate questions for 20 tasks
INFO:benchmark_qed.autod.sampler.clustering.kmeans:Cluster sizes: min=5, max=15, mean=10.0
INFO:benchmark_qed.autoq.sampler.question_sampler:Selected 2 questions from 20 candidates.
