<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/4_Finetuned_Generation_unsloth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetuned Predictions
In this file we generate the finetuned predictions

## Setup

In [1]:
!pip install unsloth_zoo
!pip install --no-deps unsloth
!pip install -U transformers
!pip install accelerate bitsandbytes
!pip install -U peft

Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.5.7-py3-none-any.whl.metadata (8.0 kB)
Collecting tyro (from unsloth_zoo)
  Downloading tyro-0.9.20-py3-none-any.whl.metadata (10 kB)
Collecting datasets>=3.4.1 (from unsloth_zoo)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9 (from unsloth_zoo)
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting protobuf<4.0.0 (from unsloth_zoo)
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Collecting cut_cross_entropy (from unsloth_zoo)
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting msgspec (from unsloth_zoo)
  Downloading msgspec-0.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=3.4.1->unsloth_zoo)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 

Collecting unsloth
  Downloading unsloth-2025.5.6-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading unsloth-2025.5.6-py3-none-any.whl (265 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.6/265.6 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unsloth
Successfully installed unsloth-2025.5.6
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency con

In [2]:
import torch
from google.colab import userdata
import logging
import transformers
import os
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer
from bitsandbytes import nn as bnb_nn
from unsloth import FastLanguageModel
from tqdm.auto import tqdm
import pandas as pd
import json
from collections import defaultdict
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
token = userdata.get('GITHUB')
repo_url = f"https://{token}@github.com/RicoStaedeli/NLP2025_CQG.git"

!git clone {repo_url}

Cloning into 'NLP2025_CQG'...
remote: Enumerating objects: 1176, done.[K
remote: Counting objects: 100% (263/263), done.[K
remote: Compressing objects: 100% (176/176), done.[K
remote: Total 1176 (delta 180), reused 127 (delta 84), pack-reused 913 (from 1)[K
Receiving objects: 100% (1176/1176), 48.44 MiB | 16.26 MiB/s, done.
Resolving deltas: 100% (656/656), done.
Updating files: 100% (113/113), done.


In [4]:
################################################################################
#######################   PATH VARIABLES        ################################
################################################################################

BASE_MODEL = "unsloth/Meta-Llama-3.1-8B-Instruct"
model_name = "Meta-Llama-3.1-8B-Instruct_DPO"
model_id = "ricostaedeli/Meta-Llama-3.1-8B-Instruct_DPO" #"meta-llama/Llama-3.1-8B-Instruct" # ricostaedeli/Meta-Llama-3.1-8B-Instruct_DPO

test_dataset_path = f"/content/NLP2025_CQG/Data/Processed/test.csv"

results_path = os.path.join(os.getcwd(), f"/content/NLP2025_CQG/Evaluation/Results/results_{model_name}.json")

log_base_path = f"/content/NLP2025_CQG/Logs/"
os.makedirs(log_base_path, exist_ok=True)

log_path = log_base_path + f"4_cqs_generation_{model_name}.log"


################################################################################
#######################   STATIC VARIABLES      ################################
################################################################################


In [5]:
# Setup logger manually
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create file handler (only if not already added)
if not logger.handlers:
    fh = logging.FileHandler(log_path)
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

# Detect device
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# Log the device info
logger.info("--------  Start with Baseline Generation  -------------")
logger.info(f'Device selected: {device}')
logger.info(f'Results Path: {results_path}')
logger.info(f'Log Path: {log_path}')
logger.info("--------------------------------------------------------")

INFO:__main__:--------  Start with Baseline Generation  -------------
INFO:__main__:Device selected: cuda
INFO:__main__:Results Path: /content/NLP2025_CQG/Evaluation/Results/results_Meta-Llama-3.1-8B-Instruct_DPO.json
INFO:__main__:Log Path: /content/NLP2025_CQG/Logs/4_cqs_generation_Meta-Llama-3.1-8B-Instruct_DPO.log
INFO:__main__:--------------------------------------------------------


## Generate Answers

Preprocess dataset

In [6]:
df = pd.read_csv(test_dataset_path)

# Define the schema types
schemas = ["CauseToEffect", "ExpertOpinion", "Analogy", "FearAppeal"]

# Prepare the JSON structure
json_data = []
for idx, row in df.iterrows():
    context = row['input'].strip()
    original_id = row['id']

    for schema in schemas:
        entry = {
            "prompt": [
                {
                    "role": "system",
                    "content": "You generate concise, critical, single-sentence questions for argumentative contexts, matching specified question schemas."
                },
                 {
                    "role": "user",
                    "content": f"Generate one critical question addressing the provided context. Ensure it matches the schema:{schema}\n\nContext: {context} Respond only with the question and nothing else."
                },
                 {
                    "role": "assistant",
                    "content": "Question: "
                }

            ],
            "id": f"{original_id}",
            "schema" : schema

        }
        json_data.append(entry)


# Save the JSON data to a file
temp_output_file = 'processed_dataset.json'
with open(temp_output_file, 'w') as f:
    json.dump(json_data, f, indent=4)

## Generation with unsloth shit

In [None]:
# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=8192,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)

In [8]:
with open(temp_output_file, 'r') as file:
    data = json.load(file)

# Prepare inputs in batch mode with a batch size of 5
batch_size = 30
prompts = []
prompt_ids = []
results = {}

for entry in data:
    entry_id = entry['id'].rsplit('_', 1)[0]
    schema = entry['id'].rsplit('_', 1)[1]
    messages = entry['prompt']
    formatted_messages = [
        {'role': msg['role'], 'content': msg['content']} for msg in messages
    ]
    prompt_text = tokenizer.apply_chat_template(
        formatted_messages,
        tokenize=False,
        add_generation_prompt=True
    )
    prompts.append(prompt_text)
    prompt_ids.append((entry_id, schema, prompt_text))

    # Initialize structure if not already present
    if entry_id not in results:
        results[entry_id] = {"input": formatted_messages[0]['content'], "cqs": []}

In [9]:
# Process in batches to prevent memory overflow
for i in range(0, len(prompts), batch_size):
    batch_data = prompt_ids[i:i+batch_size]
    batch_prompts = [item[2] for item in batch_data]

    # Tokenize with padding for batch processing
    inputs = tokenizer(batch_prompts, return_tensors='pt', padding=True).to('cuda')

    # Generate outputs
    outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Store results
    for j, output in enumerate(decoded_outputs):
        entry_id, schema, prompt_text = batch_data[j]
        results[entry_id]["cqs"].append({"schema": schema, "cq": output.strip()})

# Save the results to a JSON file
with open(results_path, "w") as outfile:
    json.dump(results, outfile, indent=4)

## Generation with transformers

In [22]:
# del model
gc.collect()
torch.cuda.empty_cache()

In [None]:
pipeline = transformers.pipeline(
    "text-generation",
    model = model_id,
    model_kwargs={"torch_dtype": torch.float16},
)

In [7]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={
        "torch_dtype": torch.float16,
        "quantization_config": {"load_in_4bit": True},
        "low_cpu_mem_usage": True,
    },
    device_map="auto",
)

config.json:   0%|          | 0.00/924 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Device set to use cuda:0


In [9]:
def get_response(query, max_tokens=512, temperature=0.6, top_p=0.9):

    prompt = pipeline.tokenizer.apply_chat_template(
        query, tokenize=False, add_generation_prompt=True
    )

    outputs = pipeline(
        prompt,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
    response = outputs[0]["generated_text"][len(prompt):]
    return response

In [11]:
with open(temp_output_file, "r") as f:
    data = json.load(f)

combined_output = {}
i = 0
for item in data:
    print(f"Number: {i}")
    item_id = item["id"]
    schema = item["schema"]
    # Extract context from the user turn
    message = item["prompt"]
    question = get_response(message).strip()

    if item_id not in combined_output:
        combined_output[item_id] = {"input": context, "cqs": []}

    combined_output[item_id]["cqs"].append({"schema": schema, "cq": question})
    i = i + 1
    if i == 10:
      break


# Save the combined questions
with open(results_path, "w") as f:
    json.dump(combined_output, f, indent=4)
print(f"Combined questions saved to {results_path}")

Number: 0
Number: 1
Number: 2
Number: 3
Number: 4


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Number: 5
Number: 6
Number: 7
Number: 8
Number: 9
Combined questions saved to /content/NLP2025_CQG/Evaluation/Results/results_Meta-Llama-3.1-8B-Instruct_DPO.json


In [None]:
def get_responses(queries, max_tokens=512, temperature=0.6, top_p=0.9):
    # Prepare the prompts
    prompts = [pipeline.tokenizer.apply_chat_template(q, tokenize=False, add_generation_prompt=True) for q in queries]

    # Run generation in batches to avoid OOM
    all_responses = []
    batch_size = 50
    for i in range(0, len(prompts), batch_size):
        batch_prompts = prompts[i:i+batch_size]

        # Run the pipeline in batch mode
        outputs = pipeline(
            batch_prompts,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )

        # Extract responses
        for prompt, output_list in zip(batch_prompts, outputs):
            # Each output_list is a list with a single dictionary
            generated_text = output_list[0]["generated_text"]
            response = generated_text[len(prompt):].strip()
            all_responses.append(response)
            print(response)

    return all_responses

# Load the data
with open(temp_output_file, "r") as f:
    data = json.load(f)

queries = [item["prompt"] for item in data]
responses = get_responses(queries)

# Combine the outputs
combined_output = {}
for item, response in zip(data, responses):
    item_id = item["id"]
    schema = item["schema"]

    if item_id not in combined_output:
        combined_output[item_id] = {"input": item["prompt"], "cqs": []}

    combined_output[item_id]["cqs"].append({"schema": schema, "cq": response})

# Save the combined questions
with open(results_path, "w") as f:
    json.dump(combined_output, f, indent=4)

print(f"Combined questions saved to {results_path}")

Does the dismissal of Muslim nations and communities by Donald Trump hinder the collection of crucial intelligence that could potentially prevent terrorist attacks?
Does the President's dismissal of Muslim allies and communities hinder or enhance the collection of crucial intelligence in the fight against terrorism?
Does the president's rhetoric towards Muslim nations and communities undermine the potential benefits of intelligence gathering and cooperation that could be achieved through more inclusive and respectful approaches?
Doesn't the President's divisive rhetoric towards Muslim nations and communities undermine the very intelligence and cooperation that are crucial to effectively combating terrorism?
Does Clinton's proposed plan for redistributing wealth and closing corporate loopholes actually address the root causes of economic inequality, or is it merely a superficial solution?
Can a system that prioritizes corporate profits over worker benefits truly foster a more equitable 

shit not working

In [15]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
import json

MODEL_PATH = model_id
DATASET_PATH = test_dataset_path
OUTPUT_PATH = results_path

# Load the model and tokenizer
print("Loading model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LlamaForCausalLM.from_pretrained(MODEL_PATH).to(device)
tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH, legacy=False)

Loading model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

TypeError: expected str, bytes or os.PathLike object, not NoneType

In [None]:
def generate_question(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_length=256, do_sample=True, top_p=0.95, temperature=0.8)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Loading dataset...")
with open(DATASET_PATH, "r") as f:
    data = json.load(f)

combined_output = {}
for item in data:
    print(item)
    item_id = item["id"]
    schema = item["schema"]
    # Extract context from the user turn
    context = next(turn["content"] for turn in item["prompt"] if turn["role"] == "user")
    print(f"Generating question for ID: {item_id}, Schema: {schema}...")
    question = generate_question(context).strip()

    if item_id not in combined_output:
        combined_output[item_id] = {"input": context, "cqs": []}

    combined_output[item_id]["cqs"].append({"schema": schema, "cq": question})
    break

# Save the combined questions
with open(OUTPUT_PATH, "w") as f:
    json.dump(combined_output, f, indent=4)
print(f"Combined questions saved to {OUTPUT_PATH}")

In [10]:
try:
    os.remove(temp_output_file)
    print(f"File '{temp_output_file}' deleted successfully.")
except FileNotFoundError:
    print(f"File '{temp_output_file}' not found.")
except PermissionError:
    print(f"Permission denied to delete '{temp_output_file}'.")
except Exception as e:
    print(f"Error deleting file: {e}")

File 'processed_dataset.json' deleted successfully.


## Commit & Push

In [11]:
os.chdir("NLP2025_CQG")
!ls

1_a_Generate_DPO_Dataset.ipynb		Data
1_Information_preprocessing.md		Development
1_Preprocessing.ipynb			Doc
2_Baseline_Generation.ipynb		Evaluation
2_Information_Baseline_Generation.md	INFORMATION.md
3_Evaluation.ipynb			LICENSE
3_Training_1_SFT_3.ipynb		Logs
4_Finetuned_Generation.ipynb		README.md
4_Finetuned_Generation_unsloth_2.ipynb	requirements.txt
4_Finetuned_Generation_unsloth.ipynb	Training
5_Evaluation_Analytics.ipynb		Utils


In [12]:
!git config --global user.name "Rico Städeli"
!git config --global user.email "rico@yabrriga.ch"


commit_message = f"Finetuned generation"
!git add .
!git commit -m "{commit_message}"
!git push

[main db8a461] Finetuned generation
 2 files changed, 3913 insertions(+)
 create mode 100644 Evaluation/Results/results_eta-Llama-3.1-8B-Instruct_DPO.json
 create mode 100644 Logs/4_cqs_generation_eta-Llama-3.1-8B-Instruct_DPO.log
Enumerating objects: 11, done.
Counting objects: 100% (11/11), done.
Delta compression using up to 12 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (7/7), 105.83 KiB | 3.65 MiB/s, done.
Total 7 (delta 3), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To https://github.com/RicoStaedeli/NLP2025_CQG.git
   4eea9af..db8a461  main -> main
