<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/2_Baseline_CQS_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline Predictions
In this notebook we generate the baseline predictions.

## Setup

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import json
import logging
import tqdm
import re
import torch
from getpass import getpass
from google.colab import userdata, drive
import os

In [None]:
# Mount Google Drive at /content/drive; the model paths configured later in
# this notebook point below that mount point.
drive.mount('/content/drive')

In [None]:
# Read the GitHub access token from the Colab secret named 'GITHUB' and embed
# it in the HTTPS clone URL.
token = userdata.get('GITHUB')
repo_url = f"https://{token}@github.com/RicoStaedeli/NLP2025_CQG.git"

# NOTE(review): the token is part of the URL handed to git, so it presumably
# ends up in the clone's remote configuration — avoid printing repo_url or
# sharing the checkout.
!git clone {repo_url}

In [None]:
# Work from inside the cloned repository so the relative paths used by the
# following cells (Data/, Evaluation/, Logs/) resolve against it.
os.chdir("NLP2025_CQG")
# List the repository contents as a quick check that the clone succeeded.
!ls

In [None]:
################################################################################
#######################   PATH VARIABLES        ################################
################################################################################

# Paths without a leading slash are relative to the current working directory.
test_dataset_path = "Data/Processed/test.json"
model_path_llama = "/content/drive/MyDrive/HSG/NLP/Project NLP/Models/Meta-Llama-3.1-8B-Instruct"
model_path_qwen = "/content/drive/MyDrive/HSG/NLP/Project NLP/Models/Qwen2.5-7B-Instruct"
results_path = "Evaluation/Results/"
log_path = "Logs/2_baseline_predictions.log"

################################################################################
#######################   STATIC VARIABLES      ################################
################################################################################

# Setup logger manually
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Attach the file handler only once so that re-running this cell does not
# duplicate every log line.
if not logger.handlers:
    # FileHandler does not create missing parent directories, so make sure the
    # log folder exists before the file is opened.
    os.makedirs(os.path.dirname(log_path) or ".", exist_ok=True)
    # The log receives generated text, which may contain non-ASCII characters;
    # pin the encoding instead of relying on the platform default.
    fh = logging.FileHandler(log_path, encoding="utf-8")
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

# Detect device: prefer Apple MPS, then CUDA, and fall back to the CPU.
# In this cell the value is only reported in the log.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# Log the device info
logger.info("--------  Start with Baseline Predictions  -------------")
logger.info(f'Device selected: {device}')

## Zero Shot prompting
In this section we generate critical questions with different pretrained vanilla models. We use these generated questions as a baseline to compare against our results. The following models were used to generate the baseline results:
- Llama 3.1 8B Instruct
- Qwen 2.5 7B Instruct

In [None]:
# One entry per baseline model: a short name, the path the model is loaded
# from, and the JSON file that receives its generated questions.
models = [
    dict(
        name="llama",
        model_id=model_path_llama,
        output_file=f"{results_path}results_zeroshot_llama_3.1-8B-instruct.json",
    ),
    dict(
        name="qwen",
        model_id=model_path_qwen,
        output_file=f"{results_path}results_zeroshot_qwen2.5-7b-instruction.json",
    ),
]

## Generate critical Questions

In [None]:
# Number of dataset items sent through the model per generation call;
# adjust it to the available GPU memory.
batch_size = 8

# A line counts as a question when it contains a '?' that is followed only by
# an optional closing quote, an optional space, an optional parenthetical and
# an optional short tail of letters/punctuation.
# The parenthetical's character class used to contain the accidental range
# '-, (i.e. ' ( ) * + ,), which left out the literal hyphen. The hyphen is now
# listed explicitly; the characters the range admitted are kept so that every
# line accepted before is still accepted.
_CQ_LINE_RE = re.compile(
    r'.*\?(\")?( )?(\([a-zA-Z0-9\.\'()*+,\-\? ]*\))?([a-zA-Z \.,\"\']*)?(\")?$'
)
# Used to drop numbering, bullets and quotes in front of a question.
_FIRST_UPPER_RE = re.compile(r'[A-Z]')


def structure_output(whole_text):
    """Extract the first three critical questions from raw generated text.

    The text is split into lines. Lines that look like a question are kept
    as they are; from the remaining lines, every segment that ends in ``?"``
    is recovered as a quoted question. Each kept question is then trimmed so
    that it starts at its first uppercase ASCII letter; candidates without
    one are discarded.

    Args:
        whole_text: Generated text, expected to hold one question per line.

    Returns:
        A list of exactly three ``{'cq': question}`` dicts when at least
        three questions were found, otherwise the string ``'Missing CQs'``.
    """
    valid = []
    not_valid = []
    for line in whole_text.split('\n'):
        (valid if _CQ_LINE_RE.match(line) else not_valid).append(line)

    # Second chance for rejected lines: several quoted questions on one line.
    # 'end' is appended so that text after the last ?" forms the final piece,
    # which is dropped; a line without ?" yields one piece and adds nothing.
    for text in not_valid:
        pieces = re.split(r'\?\"', text + 'end')
        valid.extend(piece + '?"' for piece in pieces[:-1])

    final = []
    for cq in valid:
        first_upper = _FIRST_UPPER_RE.search(cq)
        if first_upper:
            final.append(cq[first_upper.start():])

    if len(final) < 3:
        return 'Missing CQs'
    return [{'cq': cq} for cq in final[:3]]

In [None]:
def generate_critical_questions_batch(model, tokenizer, model_name, batch_data):
    """Generate critical questions for a batch of interventions.

    Builds one zero-shot prompt per item, samples a continuation for the
    whole batch in a single ``generate`` call and post-processes each
    continuation with ``structure_output``.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; it needs a pad token because
            the prompts are padded to a common length.
        model_name: Short model label. Currently unused; kept so existing
            calls keep working.
        batch_data: Sequence of dicts, each holding the text under the
            ``'intervention'`` key.

    Returns:
        One entry per item of ``batch_data``, in the same order: whatever
        ``structure_output`` returns for that item's generated text.
    """
    prompts = [
        f"""Suggest 3 critical questions that should be raised before accepting the arguments in this text:\n\n\"{item['intervention']}\"\n\nGive one question per line. Make the questions simple, and do not give any explanation regarding why the question is relevant."""
        for item in batch_data
    ]
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # All rows are padded to the same width, so the newly generated tokens
    # start at this column in every row of the output.
    prompt_width = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )

    # Decode only the continuation. Cutting the decoded string by
    # len(prompt) is wrong whenever the decoded prompt differs from the input
    # text, e.g. after truncation or tokenizer clean-up.
    completions = tokenizer.batch_decode(outputs[:, prompt_width:], skip_special_tokens=True)

    # Free the batch tensors before the next batch is built.
    del inputs, outputs
    torch.cuda.empty_cache()

    return [structure_output(text.strip()) for text in completions]

In [None]:
import gc

# Load the test set: a mapping from item id to the item's data.
with open(test_dataset_path, 'r', encoding="utf-8") as f:
    data = json.load(f)

# The item list is the same for every model, so build it once.
items = list(data.items())

for model_info in models:
    logger.info(f"Loading model: {model_info['model_id']}")

    tokenizer = AutoTokenizer.from_pretrained(model_info["model_id"])
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Decoder-only models continue from the last position of each row, so
    # batched generation needs the padding on the left; with right padding
    # the shorter prompts would be continued after their pad tokens.
    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(
        model_info["model_id"],
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

    output_data = []

    for i in range(0, len(items), batch_size):
        batch = items[i:i+batch_size]
        batch_ids = [item_id for item_id, _ in batch]
        batch_data = [item for _, item in batch]

        questions_list = generate_critical_questions_batch(model, tokenizer, model_info["name"], batch_data)

        for item_id, questions in zip(batch_ids, questions_list):
            output_entry = {
                item_id: {
                    "cqs": questions
                }
            }
            logger.info(f"Generated {item_id}: {questions}")
            output_data.append(output_entry)

    # Make sure the results folder exists so a full generation run is not
    # lost to a missing directory at write time.
    os.makedirs(os.path.dirname(model_info["output_file"]) or ".", exist_ok=True)
    with open(model_info["output_file"], 'w', encoding="utf-8") as f:
        json.dump(output_data, f, indent=2)

    logger.info(f"Output saved to {model_info['output_file']}")

    # Drop this model before the next checkpoint is loaded; otherwise both
    # models are held in memory at the same time.
    del model, tokenizer
    gc.collect()
    torch.cuda.empty_cache()

## Commit & Push

In [None]:
# Set the global git author name and e-mail used for the commit made in the
# following cell.
!git config --global user.name "Rico Städeli"
!git config --global user.email "rico@yabriga.ch"

In [None]:
# Placeholder text: replace it with a meaningful message before running.
commit_message = "your commit message"
# Stage everything under the current directory, commit it with the message
# above and push the commit.
!git add .
!git commit -m "{commit_message}"
!git push