<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/2_Question_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate Questions
With this notebook we generate the questions for the evaluation dataset.
- Use this notebook with at least 22 GB of GPU RAM, preferably an L4 GPU in Colab
- Push only to GitHub if necessary
- To run this notebook you need access to the GitHub repository and must create an access token, which has to be added to Google Colab as a secret with the name **GITHUB**
- If you want to generate questions for the baseline model, you have to register on Hugging Face for the Meta Llama 3.1 model family because it is a gated repository. After that, you have to add your HF token to Google Colab as a secret with the name **HF_TOKEN**

## Setup

In [None]:
# Install/upgrade the Hugging Face stack (transformers, accelerate, bitsandbytes, peft)
# in the current runtime.
# NOTE(review): no versions are pinned, so the installed releases can differ between
# runs — consider pinning for reproducibility.
!pip install -U transformers
!pip install accelerate bitsandbytes
!pip install -U peft

In [2]:
import torch
from google.colab import userdata
import logging
import transformers
import os
import gc
from bitsandbytes import nn as bnb_nn
import pandas as pd
import json

# Allocator option for PyTorch's CUDA memory manager; presumably chosen to reduce
# fragmentation-related out-of-memory errors.
# NOTE(review): this is set after `import torch` — confirm it is still picked up.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

Clone GitHub repository for data access

In [3]:
# Read the GitHub access token from the Colab secret named 'GITHUB'.
token = userdata.get('GITHUB')
# The token is embedded in the HTTPS URL so the clone can authenticate.
# NOTE(review): the URL (including the token) is presumably stored as the clone's
# remote — avoid printing `repo_url` or sharing the cloned directory.
repo_url = f"https://{token}@github.com/RicoStaedeli/NLP2025_CQG.git"

# Clone into ./NLP2025_CQG relative to the current working directory.
!git clone {repo_url}

Cloning into 'NLP2025_CQG'...
remote: Enumerating objects: 1945, done.[K
remote: Counting objects: 100% (210/210), done.[K
remote: Compressing objects: 100% (53/53), done.[K
remote: Total 1945 (delta 194), reused 157 (delta 157), pack-reused 1735 (from 2)[K
Receiving objects: 100% (1945/1945), 52.59 MiB | 23.64 MiB/s, done.
Resolving deltas: 100% (1173/1173), done.


In [4]:
################################################################################
#######################   PATH VARIABLES        ################################
################################################################################

# Registry of the models that can be evaluated. Each entry holds the Hugging Face
# model id and the stem used to name the result and log files of that model.
models = {
    "baseline": {
        "id": "meta-llama/Llama-3.1-8B-Instruct",
        "result_file": "Meta-Llama-3.1-8B-Instruct_Baseline_schema_prompt"
    },
    "dpo": {
        "id": "ricostaedeli/Meta-Llama-3.1-8B-Instruct_DPO",
        "result_file": "Meta-Llama-3.1-8B-Instruct_DPO_schema_prompt"
    },
    "dpo_sft": {
        "id": "ricostaedeli/Meta-Llama-3.1-8B-Instruct_SFT_DPO",
        "result_file": "Meta-Llama-3.1-8B-Instruct_DPO_SFT_schema_prompt"
    },
    "orpo": {
        "id": "ricostaedeli/Meta-Llama-3.1-8B-Instruct_ORPO",
        "result_file": "Meta-Llama-3.1-8B-Instruct_ORPO_schema_prompt"
    },
    "orpo_sft": {
        "id": "ricostaedeli/Meta-Llama-3.1-8B-Instruct_ORPO_SFT",
        "result_file": "Meta-Llama-3.1-8B-Instruct_ORPO_SFT_schema_prompt"
    }
}

# Choose model for inference (must be one of the keys of `models`)
selected_model = "baseline"

model_id = models[selected_model]["id"]
result_file_name = models[selected_model]["result_file"]

test_dataset_path = "/content/NLP2025_CQG/Data/Processed/test.csv"

# Absolute on purpose: the result location must not depend on the current
# working directory, which other cells may change.
results_path = f"/content/NLP2025_CQG/Evaluation/Results/results_{result_file_name}.json"
# Make sure the target directory exists before any cell tries to write results.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

log_base_path = "/content/NLP2025_CQG/Logs/"
os.makedirs(log_base_path, exist_ok=True)

log_path = log_base_path + f"4_cqs_generation_{result_file_name}.log"


################################################################################
#######################   STATIC VARIABLES      ################################
################################################################################

# When True, the example questions of each schema are appended to the prompt.
USE_EXAMPLES_IN_PROMPT = True
# When True, the final cell commits and pushes the repository.
PUSH_TO_GITHUB = False


In [5]:
# Setup logger manually: records are written to the log file of the selected model.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Drop file handlers left over from an earlier run of this cell that point at a
# different log file. Without this, changing `selected_model` and re-running the
# cell would keep writing into the previous model's log.
target_log_file = os.path.abspath(log_path)
for stale_handler in [
    h for h in logger.handlers
    if isinstance(h, logging.FileHandler) and h.baseFilename != target_log_file
]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Create file handler (only if not already added)
if not logger.handlers:
    fh = logging.FileHandler(log_path)
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

# Detect device: prefer Apple MPS, then CUDA, and fall back to the CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# Log the run configuration. The banner text is fixed, so the model id is logged
# separately to make the log unambiguous for non-baseline runs.
logger.info("--------  Start with Baseline Generation  -------------")
logger.info(f'Model: {model_id}')
logger.info(f'Device selected: {device}')
logger.info(f'Results Path: {results_path}')
logger.info(f'Log Path: {log_path}')
logger.info("--------------------------------------------------------")

INFO:__main__:--------  Start with Baseline Generation  -------------
INFO:__main__:Device selected: cuda
INFO:__main__:Results Path: /content/NLP2025_CQG/Evaluation/Results/results_Meta-Llama-3.1-8B-Instruct_Baseline_schema_prompt.json
INFO:__main__:Log Path: /content/NLP2025_CQG/Logs/4_cqs_generation_Meta-Llama-3.1-8B-Instruct_Baseline_schema_prompt.log
INFO:__main__:--------------------------------------------------------


## Generate Answers

Preprocess dataset

In [6]:
# Load the evaluation contexts; the columns 'input' and 'id' are read below.
df = pd.read_csv(test_dataset_path)

# Define the schema types: schema name -> example questions shown in the prompt.
# NOTE(review): the "ExpertOpinion" and "FearAppeal" texts begin with a stray
# apostrophe ('Examples are:) that ends up in the prompt. It is left untouched
# here so the generated prompts stay exactly as before — confirm whether it is intended.
schemas = {
    "CauseToEffect": """Examples are:
    What if the job is one that will cause their illness to manifest and affect their job performance?
    If so, do you think the lack of any of the pieces influence the decisions they make about whether to create the digital widget or which digital widget to create?
    If miners and the working class are such a small percentage of the population as to not make a difference in a straight up popular vote, then why do we let them have so much influence?
    """,

    "ExpertOpinion": """'Examples are:
    Do you have a scientific source that confirm that women only wants one partner to a greater degree than men?
    If you are just stating facts, the surely you can point to some studies that show most women are narcissistic sociopaths?
    Where in any scientific textbook or journal have you seen evidence of sex being described as a spectrum?
    """,

    "Analogy": """Examples are:
    If so, would these groups resemble what are commonly referred to as races?
    If whites are guilty of enjoying the benefits of stolen land after being here for two generations, why would that not mean a 2nd generation American POC not have a similar benefit?
    But why is the way her hair naturally grows from her head seen as less professional than chemically altering to mimic straight white European hair?
    """,

    "FearAppeal": """'Examples are:
    If he did such a great job making peace, why do we need sanctions to pressure North Korea into stopping its violent threats?
    And if we should strive to stop all killings then why would the implement of this murder be spared if it causes a very large number of deaths but was intended to cause none?
    If you say you can, then are you implying that you see Islam as being no worse?
    """
}


def _build_entry(original_id, context, schema_key, schema_value):
    """Return the prompt record for one (context, schema) combination.

    The record holds the chat messages (system + user turn), the source row id
    as a string, the schema name and the stripped context text.
    """
    if USE_EXAMPLES_IN_PROMPT:
        schema_text = f"\n\nSchema definition:\n{schema_value}"
    else:
        schema_text = ""

    system_turn = {
        "role": "system",
        "content": "You generate concise, critical, single-sentence questions for argumentative contexts, matching specified question schemas.",
    }
    user_turn = {
        "role": "user",
        "content": f"Generate one critical question addressing the provided context. Ensure it matches the schema:{schema_key}{schema_text}.\n\nContext: {context} Respond only with the question and nothing else.",
    }
    return {
        "prompt": [system_turn, user_turn],
        "id": f"{original_id}",
        "schema": schema_key,
        "input": context,
    }


# Prepare the JSON structure: one record per dataset row and schema, row-major.
json_data = [
    _build_entry(row['id'], row['input'].strip(), schema_key, schema_value)
    for _, row in df.iterrows()
    for schema_key, schema_value in schemas.items()
]

# Save the JSON data to a file (read back by the generation cells).
temp_output_file = 'processed_dataset.json'
with open(temp_output_file, 'w') as out_file:
    json.dump(json_data, out_file, indent=4)

## Generation with transformers

This cell is only needed if you have to clear the GPU cache.

In [None]:
# del model
# gc.collect()
# torch.cuda.empty_cache()

Load pipeline with quantized model for less GPU usage


In [None]:
# 4-bit quantization settings, used to lower the GPU memory needed by the model.
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_compute_dtype": torch.float16,
}

# Keyword arguments forwarded to the model loader.
model_loading_options = {
    "torch_dtype": torch.float16,
    "quantization_config": quantization_settings,
    "low_cpu_mem_usage": True,
}

# Text-generation pipeline for the selected model; device placement is automatic.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    model_kwargs=model_loading_options,
    device_map="auto",
)

### Single generation
This cell is only for debugging purposes: it gives a quick overview of the questions generated by the evaluated model.

In [8]:
def get_response(query, max_tokens=128, temperature=0.7, top_p=0.9):
    """Generate one sampled completion for a chat-style query.

    Args:
        query: Chat messages passed to the tokenizer's chat template.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.

    Returns:
        The generated text that follows the rendered prompt (not stripped).
        The text is also printed.
    """
    chat_tokenizer = pipeline.tokenizer
    # Render the messages into a single prompt string ending with the generation cue.
    # NOTE(review): confirm the pipeline does not add a second BOS token when it
    # tokenizes this already-templated string.
    prompt = chat_tokenizer.apply_chat_template(
        query, tokenize=False, add_generation_prompt=True
    )

    sampling_options = dict(
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
    first_candidate = pipeline(prompt, **sampling_options)[0]

    # The returned text is expected to start with the prompt; keep only what follows.
    response = first_candidate["generated_text"][len(prompt):]
    print(response)
    return response

# Number of prompts to run in this debugging cell.
DEBUG_SAMPLE_SIZE = 10

with open(temp_output_file, "r") as f:
    data = json.load(f)

# Group the generated questions by source id: {id: {"input": ..., "cqs": [...]}}.
combined_output = {}
for i, item in enumerate(data[:DEBUG_SAMPLE_SIZE]):
    print(f"Number: {i}")
    item_id = item["id"]
    question = get_response(item["prompt"]).strip()

    # The first question for an id also stores the context it was generated for.
    # (The context is read from the item directly so the builtin `input` is not shadowed.)
    combined_output.setdefault(item_id, {"input": item["input"], "cqs": []})
    combined_output[item_id]["cqs"].append({"schema": item["schema"], "cq": question})


# Save the combined questions
# NOTE(review): this writes the partial debug sample to `results_path`, the same
# file the batch-processing cell writes to. Running this cell after the batch
# cell replaces the full results with the sample.
with open(results_path, "w") as f:
    json.dump(combined_output, f, indent=4)
print(f"Combined questions saved to {results_path}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Number: 0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Does increased cooperation with the Muslim community in America actually lead to a more effective counter-terrorism strategy?
Number: 1


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Can you provide evidence from any scientific study that suggests Muslims are more likely to provide valuable intelligence to counter-terrorism efforts than other demographic groups?
Number: 2


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


If we rely heavily on intelligence from Europe and the Middle East to combat terrorism, wouldn't it be wise to prioritize building trust with our Muslim allies and communities, rather than dismissing or alienating them?
Number: 3


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


If we're to believe that our Muslim allies are essential for combating terrorism, why have Trump's divisive rhetoric towards Muslims potentially undermined our ability to gather crucial intelligence from them?
Number: 4


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Does her plan to make the wealthy pay their fair share and close corporate loopholes guarantee that the profits will be equitably distributed among all employees?
Number: 5


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Can you provide evidence from peer-reviewed studies that supports your claim that paid family leave, earned sick days, affordable child care, and debt-free college are essential for supporting people who are struggling to balance family and work?
Number: 6


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


If we are to provide paid family leave, earned sick days, and affordable child care, why should corporate executives, who have been responsible for creating profits, not be required to share some of those profits with their employees?
Number: 7


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


If we are truly committed to helping those struggling to balance family and work, why not implement these policies immediately rather than making them a bargaining chip for the wealthy to pay their fair share?
Number: 8


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Will pursuing a policy of clean energy lead to a stronger economy in the long run?
Number: 9
Do you have an independent, unbiased scientific source that supports the claim that climate change is a hoax perpetrated by the Chinese?
Combined questions saved to /content/NLP2025_CQG/Evaluation/Results/results_Meta-Llama-3.1-8B-Instruct_Baseline_schema_prompt.json


### Batch processing
This cell generates the questions for the whole evaluation dataset in batches.

In [None]:
def get_responses(queries, max_tokens=512, temperature=0.6, top_p=0.9, batch_size=50):
    """Generate one sampled completion per chat-style query.

    Args:
        queries: List of chat-message lists, one per prompt.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        batch_size: Number of prompts handed to the pipeline per call. This only
            limits the size of each input list; no `batch_size` is forwarded to
            the pipeline itself.

    Returns:
        A list of stripped completions in the same order as `queries`.
        Each completion is also printed.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Prepare the prompts
    prompts = [
        pipeline.tokenizer.apply_chat_template(q, tokenize=False, add_generation_prompt=True)
        for q in queries
    ]

    # Hand the prompts to the pipeline chunk by chunk
    all_responses = []
    for start in range(0, len(prompts), batch_size):
        batch_prompts = prompts[start:start + batch_size]

        outputs = pipeline(
            batch_prompts,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )

        # Extract responses
        for prompt, output_list in zip(batch_prompts, outputs):
            # Each output_list is a list with a single dictionary whose text is
            # expected to start with the prompt; keep only what follows it.
            generated_text = output_list[0]["generated_text"]
            response = generated_text[len(prompt):].strip()
            all_responses.append(response)
            print(response)

    return all_responses

# Load the data
with open(temp_output_file, "r") as f:
    data = json.load(f)

queries = [item["prompt"] for item in data]
responses = get_responses(queries)

# zip() would silently drop items on a length mismatch, so fail loudly instead.
if len(responses) != len(data):
    raise RuntimeError(
        f"Expected {len(data)} responses but received {len(responses)}."
    )

# Combine the outputs: {id: {"input": ..., "cqs": [{"schema": ..., "cq": ...}, ...]}}
combined_output = {}
for item, response in zip(data, responses):
    item_id = item["id"]

    # The first question for an id also stores the context it was generated for.
    # (The context is read from the item directly so the builtin `input` is not shadowed.)
    combined_output.setdefault(item_id, {"input": item["input"], "cqs": []})
    combined_output[item_id]["cqs"].append({"schema": item["schema"], "cq": response})

# Save the combined questions
with open(results_path, "w") as f:
    json.dump(combined_output, f, indent=4)

print(f"Combined questions saved to {results_path}")

In [None]:
# Remove the intermediate prompt file. Every outcome is reported with a message;
# no exception is propagated.
try:
    os.remove(temp_output_file)
except FileNotFoundError:
    cleanup_status = f"File '{temp_output_file}' not found."
except PermissionError:
    cleanup_status = f"Permission denied to delete '{temp_output_file}'."
except Exception as e:
    cleanup_status = f"Error deleting file: {e}"
else:
    cleanup_status = f"File '{temp_output_file}' deleted successfully."
print(cleanup_status)

## Commit & Push

In [None]:
if PUSH_TO_GITHUB:
  os.chdir("NLP2025_CQG")
  !ls

  !git config --global user.name "Rico Städeli"
  !git config --global user.email "rico@yabrriga.ch"


  commit_message = f"Finetuned generation for {result_file_name}"
  !git add .
  !git commit -m "{commit_message}"
  !git push