<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/4_Finetuned_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetuned Predictions
In this file we generate the finetuned predictions

## Setup

In [1]:
!pip install -U transformers
!pip install accelerate bitsandbytes
!pip install -U peft

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.

In [2]:
import torch
from google.colab import userdata
import logging
import transformers
import os
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer
from bitsandbytes import nn as bnb_nn
from tqdm.auto import tqdm
import pandas as pd
import json
from collections import defaultdict
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
token = userdata.get('GITHUB')
repo_url = f"https://{token}@github.com/RicoStaedeli/NLP2025_CQG.git"

!git clone {repo_url}

Cloning into 'NLP2025_CQG'...
remote: Enumerating objects: 961, done.[K
remote: Counting objects: 100% (224/224), done.[K
remote: Compressing objects: 100% (165/165), done.[K
remote: Total 961 (delta 146), reused 88 (delta 59), pack-reused 737 (from 2)[K
Receiving objects: 100% (961/961), 46.17 MiB | 17.34 MiB/s, done.
Resolving deltas: 100% (498/498), done.
Updating files: 100% (95/95), done.


In [4]:
os.chdir("NLP2025_CQG")
!ls

1_Information_preprocessing.md	      Doc
1_Preprocessing.ipynb		      Evaluation
2_Baseline_Generation.ipynb	      INFORMATION.md
2_Information_Baseline_Generation.md  LICENSE
3_Evaluation.ipynb		      Logs
4_Finetuned_Generation.ipynb	      README.md
5_Evaluation_Analytics.ipynb	      requirements.txt
Data				      Training
Development			      Utils


In [18]:
################################################################################
#######################   PATH VARIABLES        ################################
################################################################################

BASE_MODEL = "unsloth/Meta-Llama-3.1-8B-Instruct-unsloth-bnb-4bit"
model_name= "Meta-Llama-3.1-1B-Instruct_SFT_2-lora"
model_id= f"ricostaedeli/{model_name}"

test_dataset_path = f"Data/Processed/test.csv"

results_path = os.path.join(os.getcwd(), f"Evaluation/Results/results_{model_name}.json")

log_base_path = f"Logs/"
os.makedirs(log_base_path, exist_ok=True)

log_path = log_base_path + "4_cqs_generation_SFT_2.log"


################################################################################
#######################   STATIC VARIABLES      ################################
################################################################################


In [6]:
# Setup logger manually
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create file handler (only if not already added)
if not logger.handlers:
    fh = logging.FileHandler(log_path)
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

# Detect device
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# Log the device info
logger.info("--------  Start with Baseline Generation  -------------")
logger.info(f'Device selected: {device}')
logger.info(f'Results Path: {results_path}')
logger.info(f'Log Path: {log_path}')
logger.info("--------------------------------------------------------")

INFO:__main__:--------  Start with Baseline Generation  -------------
INFO:__main__:Device selected: cuda
INFO:__main__:Results Path: /content/NLP2025_CQG/Evaluation/Results/results_Meta-Llama-3.1-1B-Instruct_SFT_2-lora.json
INFO:__main__:Log Path: Logs/4_cqs_generation_SFT_2.log
INFO:__main__:--------------------------------------------------------


## Generate Answers

Merge Lora weights and Base model

In [11]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="cuda",
    quantization_config={
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": torch.float16,
    },
    low_cpu_mem_usage=True,)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Load LoRA adapters from Hugging Face directly
model = PeftModel.from_pretrained(base_model, model_id)



Do this cell only when you load the complete model

In [None]:
'''
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="cuda",
    quantization_config={
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": torch.float16,
    },
    low_cpu_mem_usage=True,
)
'''

In [12]:
def generate_response(prompt_texts, max_new_tokens=64):
    inputs = tokenizer(prompt_texts, return_tensors="pt", padding=True, truncation=True).to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
    )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    responses = []
    for full_output in decoded:
        print(full_output)
        # Extract text after ### Response:
        if "### Response:" in full_output:
            response = full_output.split("### Response:")[-1].strip()
        else:
            response = full_output.strip()

        # Remove leading 'assistant\n\n' if present
        if response.lower().startswith("assistant"):
            response = response[len("assistant"):].lstrip("\n ").strip()

        responses.append(response)
    return responses

In [None]:
# del model
# gc.collect()
# torch.cuda.empty_cache()

In [13]:
schemas = {
"CauseToEffect": """'Cause to Effect' with the examples:
How strong is the generalisation that if <eventA> then <eventB>?
Are there other factors in this particular case that could have interfered with the event of‘<eventB>’?""",

"ExpertOpinion": """'Expert Opinion' with the examples:
Is <expertE> a genuine expert in <domainD>?
Is <eventA> consistent with what other experts in <domainD> say? """,

"Analogy": """'Analogy' with the examples:
Are <C1> and <C2> similar in the respect cited?
Is <eventA> true in <C1>?""",

"FearAppeal": """'Fear Appeal' with the examples:
Is <eventB> bad? Why and to whom is it bad?
Is <eventA> away to prevent <eventB>?"""
}

In [16]:
'''
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{% if message['role'] == 'system' %}"
    "<|start_header_id|>system<|end_header_id|>\n{{ message['content'] }}<|eot_id|>\n"
    "{% elif message['role'] == 'user' %}"
    "<|start_header_id|>user<|end_header_id|>\n{{ message['content'] }}<|eot_id|>\n"
    "{% elif message['role'] == 'assistant' %}"
    "<|start_header_id|>assistant<|end_header_id|>\n{{ message['content'] }}<|eot_id|>\n"
    "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "<|start_header_id|>assistant<|end_header_id|>\n"
    "{% endif %}"
)
'''

In [17]:
results = {}
chunk_size = 20

for chunk in pd.read_csv(test_dataset_path, chunksize=chunk_size):
    contexts = chunk['input'].tolist()
    ids = chunk['id'].tolist()
    prompts = []
    input_ids = []
    schema_ids = []


    for idx, input_text in enumerate(contexts):
        input_id = ids[idx]
        for schema_name, schema_template in schemas.items():
            messages = [
                {"role": "system", "content": "You are a system designed to generate critical questions for a given argumentative context. You write clear and human understandable one sentence questions."},
                {"role": "user", "content": f"""Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
Write a question that fits one of the given question schema types. If you are not able to create a useful and understandable question out of the given context please indicate with None.

### Question Schema Types::
{schema_template}

Your answer is just the question without anything else.

This is the given context to relate the question to:

### Context:
{input_text}

### Response:

"""}
            ]
            prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            prompts.append(prompt_text)
            input_ids.append(input_id)
            schema_ids.append(schema_name)

    batch_size = 64
    num_batches = (len(prompts) + batch_size - 1) // batch_size

    with tqdm(total=num_batches, desc="Generating Critical Questions", leave=True) as pbar:
        for batch_start in range(0, len(prompts), batch_size):
            batch_prompts = prompts[batch_start:batch_start+batch_size]
            batch_outputs = generate_response(batch_prompts, max_new_tokens=256)

            for curr_id, schema_name, output in zip(
                input_ids[batch_start:batch_start + batch_size],
                schema_ids[batch_start:batch_start + batch_size],
                batch_outputs
                ):
                if curr_id not in results:
                    results[curr_id] = {
                        "input": chunk.loc[chunk['id'] == curr_id, 'input'].values[0],
                        "cqs": []
                    }
                results[curr_id]['cqs'].append({
                    "schema": schema_name,
                    "cq": output
                    })

            torch.cuda.empty_cache()
            gc.collect()
            pbar.update(1)

with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Results saved to {results_path}")


['<|start_header_id|>system<|end_header_id|>\nYou are a system designed to generate critical questions for a given argumentative context. You write clear and human understandable one sentence questions.<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\nBelow is an instruction that describes a task, paired with an input that provides further context.\n\n### Instruction:\nWrite a question that fits one of the given question schema types. If you are not able to create a useful and understandable question out of the given context please indicate with None.\n\n### Question Schema Types::\n\'Cause to Effect\' with the examples:\nHow strong is the generalisation that if <eventA> then <eventB>?\nAre there other factors in this particular case that could have interfered with the event of‘<eventB>’?\n\nYour answer is just the question without anything else.\n\nThis is the given context to relate the question to:\n\n### Context:\nCLINTON: "which may prove to be an intelligence benefit\r\nwe\'v

Generating Critical Questions:   0%|          | 0/2 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistantıldığında
・━・━assistant
>
assistant
assistant
lásilassistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant
assistant

system
You are a system designed to generate critical questions for a given argumentative context. You write clear and human understandable one sentence questions.
user
Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
Write a question that fits one of the given question schema types. If you are not able to create a useful and understandable question out of the given context please indicate with None.

### Question Schema Types::
'Analogy' with the examples:
Are <C1> and <C2> similar in the respect cited?
Is <

Generating Critical Questions:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Commit & Push

In [None]:
!git config --global user.name "Rico Städeli"
!git config --global user.email "rico@yabrriga.ch"


commit_message = f"Finetuned generation"
!git add .
!git commit -m "{commit_message}"
!git push

[main dcd4e0e] Finetuned generation
 2 files changed, 3913 insertions(+), 3908 deletions(-)
 rewrite Evaluation/Results/results_Meta-Llama-3.1-1B-Instruct_SFT_2.json (73%)
Enumerating objects: 13, done.
Counting objects: 100% (13/13), done.
Delta compression using up to 12 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (7/7), 5.99 KiB | 558.00 KiB/s, done.
Total 7 (delta 4), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.[K
To https://github.com/RicoStaedeli/NLP2025_CQG.git
   d919344..dcd4e0e  main -> main
