<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/2_Baseline_CQS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline Predictions
In this notebook we generate the baseline predictions.

## Setup

In [7]:
import gc
import json
import logging
import os
import re
from getpass import getpass

import pandas as pd
import torch
import tqdm
import transformers
from google.colab import userdata

In [4]:
# Read the secret stored under the name 'GITHUB' and splice it into the HTTPS
# clone URL so that the clone can authenticate.
# NOTE(review): the token becomes part of the remote URL; git presumably keeps
# that URL in the clone's configuration, and it could surface in command
# output or error messages — confirm and consider a credential helper instead.
token = userdata.get('GITHUB')
repo_url = f"https://{token}@github.com/RicoStaedeli/NLP2025_CQG.git"

# Clone the project repository into ./NLP2025_CQG.
!git clone {repo_url}

Cloning into 'NLP2025_CQG'...
remote: Enumerating objects: 287, done.[K
remote: Counting objects: 100% (287/287), done.[K
remote: Compressing objects: 100% (200/200), done.[K
remote: Total 287 (delta 115), reused 229 (delta 61), pack-reused 0 (from 0)[K
Receiving objects: 100% (287/287), 24.01 MiB | 18.38 MiB/s, done.
Resolving deltas: 100% (115/115), done.


In [8]:
os.chdir("NLP2025_CQG")
!ls

1_Preprocessing.ipynb	      4_Evaluation_Analytics.ipynb  Logs
2a_Baseline_Evaluation.ipynb  Data			    README.md
2_Baseline_CQS.ipynb	      Development		    requirements.txt
3a_Finetuned_CQS.ipynb	      Doc			    Training
3b_Finetune_Evaluation.ipynb  Evaluation		    Utils
3_Training.ipynb	      LICENSE


In [9]:
# Write a small sample text file into the current working directory.
filename = "test.txt"
sample_text = "This is a sample file created in Google Colab."
with open(filename, mode="w") as handle:
    handle.write(sample_text)

In [None]:
################################################################################
#######################   PATH VARIABLES        ################################
################################################################################

# All paths are relative to the current working directory.
test_dataset_path = "Data/Processed/test.json"
model_path_llama = "Models/Meta-Llama-3.1-8B-Instruct"
model_path_qwen = "Models/Qwen2.5-7B-Instruct"
results_path = "Evaluation/Results/"
log_file_path = "Logs/baseline_predictions.log"

################################################################################
#######################   STATIC VARIABLES      ################################
################################################################################

# Setup logger manually
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create file handler (only if not already added, so that re-running this cell
# does not attach a second handler and duplicate every log line)
if not logger.handlers:
    # FileHandler does not create missing parent directories; make sure the
    # log directory exists instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

# Detect device: prefer Apple MPS, then CUDA, and fall back to the CPU
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# Log the device info
logger.info("--------  Start with Baseline Predictions  -------------")
logger.info(f'Device selected: {device}')

## Zero Shot prompting
In this section we generate critical questions with different pretrained vanilla models. We use these generated questions as a baseline against which we compare our results. The following models were used to generate the baseline results:
- Llama 3.1 8B Instruct
- Qwen 2.5 7B Instruct

In [None]:
# One row per baseline model: (short name, model location, results file name).
_baseline_specs = (
    ("llama", model_path_llama, "results_zeroshot_llama_3.1-8B-instruct.json"),
    ("qwen", model_path_qwen, "results_zeroshot_qwen2.5-7b-instruction.json"),
)

# Expand the rows into the dictionaries consumed by the inference loop; the
# results file name is appended to the shared results directory.
models = [
    {
        "name": short_name,
        "model_id": location,
        "output_file": results_path + results_file,
    }
    for short_name, location, results_file in _baseline_specs
]

In [None]:
def structure_output(whole_text):
    """Extract the first three critical questions from raw model output.

    The text is split into lines. A line is accepted as a question when it
    contains a '?' that is followed only by an optional closing quote, an
    optional parenthesised remark and optional trailing words or punctuation.
    Lines that are rejected are given a second chance: if they contain one or
    more quoted questions (ending in '?"'), each of those is recovered.
    Everything before the first uppercase letter of an accepted question
    (for example list numbering) is stripped; candidates without any uppercase
    letter are dropped.

    Args:
        whole_text: The generated text, with one question per line expected.

    Returns:
        A list of three dicts ``{'id': 0..2, 'cq': <question>}`` when at least
        three questions were found, otherwise the string ``'Missing CQs'``.
    """
    # The parenthesised-remark class previously contained `\'-\,`, which the
    # regex engine reads as the range from "'" to ",". That range excluded the
    # hyphen, so a remark such as "(follow-up)" made the whole line invalid.
    # The characters that range covered are kept as literals, and the hyphen
    # is now included (placed last so it cannot form a range).
    question_line = re.compile(
        r'.*\?(\")?( )?(\([a-zA-Z0-9.\'()*+,? -]*\))?([a-zA-Z \.,\"\']*)?(\")?$'
    )

    valid = []
    not_valid = []
    for line in whole_text.split('\n'):
        if question_line.match(line):
            valid.append(line)
        else:
            not_valid.append(line)

    # Recover quoted questions from rejected lines. Appending 'end' guarantees
    # a trailing piece after the last '?"', so every piece except the last one
    # is a question that lost its '?"' terminator in the split.
    for text in not_valid:
        pieces = re.split(r'\?\"', text + 'end')
        valid.extend(piece + '?"' for piece in pieces[:-1])

    # Drop whatever precedes the first uppercase letter of each question.
    final = []
    for cq in valid:
        first_capital = re.search(r'[A-Z]', cq)
        if first_capital:
            final.append(cq[first_capital.start():])

    if len(final) < 3:
        return 'Missing CQs'
    return [{'id': i, 'cq': final[i]} for i in range(3)]

In [None]:
def generate_critical_questions(pipe, model_name, intervention_text):
    """Generate three critical questions for one intervention text.

    Builds a zero-shot chat prompt, runs it through ``pipe`` and parses the
    assistant's reply with ``structure_output``.

    Args:
        pipe: Callable invoked with the chat messages and sampling arguments;
            its result is read as ``outputs[0]["generated_text"]``.
        model_name: Short model name; must be ``"llama"`` or ``"qwen"``.
        intervention_text: The text whose arguments should be questioned.

    Returns:
        Whatever ``structure_output`` returns for the assistant's reply.

    Raises:
        ValueError: If ``model_name`` is not a supported model name.
    """
    # Fail before spending time on generation instead of after it.
    if model_name not in ("llama", "qwen"):
        raise ValueError(f"Unsupported model name: {model_name}")

    prompt = f"""Suggest 3 critical questions that should be raised before accepting the arguments in this text:\n\n\"{intervention_text}\"\n\nGive one question per line. Make the questions simple, and do not give any explanation regarding why the question is relevant."""

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]

    outputs = pipe(
        messages,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

    generated = outputs[0]["generated_text"]
    if isinstance(generated, list):
        # A list is the whole conversation (system, user, assistant). Only the
        # last message is the model's reply. Joining every message, as the
        # qwen branch used to do, passed the prompt and the intervention text
        # to the parser, so questions quoted in the input could be reported
        # as generated questions.
        assistant_response = generated[-1]["content"]
    else:
        assistant_response = generated

    return structure_output(assistant_response)

In [None]:
# Load the test set once; the same interventions are fed to every model.
# `data` is iterated as a mapping from item id to a record that carries the
# text under the "intervention" key.
with open(test_dataset_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

for model in models:
    print(f"Loading model: {model['model_id']}")
    logger.info(f"Loading model: {model['model_id']}")

    # The tokenizer is only needed here to look up the EOS token id, which is
    # passed to the pipeline as the padding token id.
    tokenizer = transformers.AutoTokenizer.from_pretrained(model["model_id"])
    pipe = transformers.pipeline(
        "text-generation",
        model=model["model_id"],
        model_kwargs={"torch_dtype": torch.bfloat16},
        device=device,
        pad_token_id=tokenizer.eos_token_id,
    )

    output_data = []

    for item_id, item in data.items():
        intervention_text = item["intervention"]
        questions = generate_critical_questions(pipe, model["name"], intervention_text)

        output_entry = {
             item_id: {
                 "cqs": questions
             }
        }
        logger.info(f"Generated {item_id}: {questions}")
        output_data.append(output_entry)

    # Make sure the results directory exists before writing into it.
    output_dir = os.path.dirname(model["output_file"])
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(model["output_file"], 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2)

    logger.info(f"Output saved to {model['output_file']}")

    # Drop this model before the next one is loaded. Without this the old
    # pipeline stays referenced by `pipe` until the new one has been fully
    # constructed, so both models would be held in memory at the same time.
    del pipe, tokenizer
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()
    elif device.type == "mps" and hasattr(torch, "mps"):
        torch.mps.empty_cache()

Loading model: Models/Meta-Llama-3.1-8B-Instruct


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use mps
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading model: Models/Qwen2.5-7B-Instruct


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use mps


## Commit & Push

In [11]:
# Set the global git author name and e-mail used for the commit below.
# NOTE(review): the name appears mis-encoded ("Ã¤" where "ä" was presumably
# intended) — confirm the value stored in the notebook and fix it there.
!git config --global user.name "Rico StÃ¤deli"
!git config --global user.email "rico@yabriga.ch"

In [12]:
# Stage everything in the working directory, commit it and push.
# NOTE(review): `commit_message` is still the placeholder text — replace it
# with a description of the change before running this cell.
# NOTE(review): `git add .` stages every new or modified file, including
# scratch files such as test.txt — consider adding specific paths instead.
commit_message = "your commit message"
!git add .
!git commit -m "{commit_message}"
!git push

[main 89f1f3d] your commit message
 1 file changed, 1 insertion(+)
 create mode 100644 test.txt
Enumerating objects: 4, done.
Counting objects: 100% (4/4), done.
Delta compression using up to 2 threads
Compressing objects: 100% (2/2), done.
Writing objects: 100% (3/3), 320 bytes | 320.00 KiB/s, done.
Total 3 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/RicoStaedeli/NLP2025_CQG.git
   ab4c2ae..89f1f3d  main -> main
