<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/4_Finetuned_Generation_unsloth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetuned Predictions
In this file we generate the finetuned predictions

## Setup

In [1]:
!pip install unsloth_zoo
!pip install --no-deps unsloth
!pip install -U transformers
!pip install accelerate bitsandbytes
!pip install -U peft

Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.5.7-py3-none-any.whl.metadata (8.0 kB)
Collecting tyro (from unsloth_zoo)
  Downloading tyro-0.9.20-py3-none-any.whl.metadata (10 kB)
Collecting datasets>=3.4.1 (from unsloth_zoo)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9 (from unsloth_zoo)
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting protobuf<4.0.0 (from unsloth_zoo)
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Collecting cut_cross_entropy (from unsloth_zoo)
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting msgspec (from unsloth_zoo)
  Downloading msgspec-0.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=3.4.1->unsloth_zoo)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 

Collecting unsloth
  Downloading unsloth-2025.5.6-py3-none-any.whl.metadata (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading unsloth-2025.5.6-py3-none-any.whl (265 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/265.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.6/265.6 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unsloth
Successfully installed unsloth-2025.5.6
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00

In [2]:
import torch
from google.colab import userdata
import logging
import transformers
import os
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer
from bitsandbytes import nn as bnb_nn
from unsloth import FastLanguageModel
from tqdm.auto import tqdm
import pandas as pd
import json
from collections import defaultdict
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
token = userdata.get('GITHUB')
repo_url = f"https://{token}@github.com/RicoStaedeli/NLP2025_CQG.git"

!git clone {repo_url}

Cloning into 'NLP2025_CQG'...
remote: Enumerating objects: 1163, done.[K
remote: Counting objects: 100% (250/250), done.[K
remote: Compressing objects: 100% (164/164), done.[K
remote: Total 1163 (delta 172), reused 122 (delta 83), pack-reused 913 (from 1)[K
Receiving objects: 100% (1163/1163), 48.33 MiB | 25.89 MiB/s, done.
Resolving deltas: 100% (648/648), done.
Updating files: 100% (111/111), done.


In [5]:
os.chdir("NLP2025_CQG")
!ls

1_a_Generate_DPO_Dataset.ipynb		Data
1_Information_preprocessing.md		Development
1_Preprocessing.ipynb			Doc
2_Baseline_Generation.ipynb		Evaluation
2_Information_Baseline_Generation.md	INFORMATION.md
3_Evaluation.ipynb			LICENSE
3_Training_1_SFT_3.ipynb		Logs
4_Finetuned_Generation.ipynb		README.md
4_Finetuned_Generation_unsloth_2.ipynb	requirements.txt
4_Finetuned_Generation_unsloth.ipynb	Training
5_Evaluation_Analytics.ipynb		Utils


In [6]:
################################################################################
#######################   PATH VARIABLES        ################################
################################################################################

BASE_MODEL = "unsloth/Meta-Llama-3.1-8B-Instruct"
model_name = "Meta-Llama-3.1-1B-Instruct_DPO"
model_id = f"unsloth/Meta-Llama-3.1-8B-Instruct"

test_dataset_path = f"Data/Processed/test.csv"

results_path = os.path.join(os.getcwd(), f"Evaluation/Results/results_{model_name}.json")

log_base_path = f"Logs/"
os.makedirs(log_base_path, exist_ok=True)

log_path = log_base_path + "4_cqs_generation_SFT_2.log"


################################################################################
#######################   STATIC VARIABLES      ################################
################################################################################


In [7]:
# Setup logger manually
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create file handler (only if not already added)
if not logger.handlers:
    fh = logging.FileHandler(log_path)
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

# Detect device
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# Log the device info
logger.info("--------  Start with Baseline Generation  -------------")
logger.info(f'Device selected: {device}')
logger.info(f'Results Path: {results_path}')
logger.info(f'Log Path: {log_path}')
logger.info("--------------------------------------------------------")

INFO:__main__:--------  Start with Baseline Generation  -------------
INFO:__main__:Device selected: cuda
INFO:__main__:Results Path: /content/NLP2025_CQG/Evaluation/Results/results_Meta-Llama-3.1-1B-Instruct_DPO.json
INFO:__main__:Log Path: Logs/4_cqs_generation_SFT_2.log
INFO:__main__:--------------------------------------------------------


## Generate Answers

In [8]:
# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=8192,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.5.6: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((409

Preprocess dataset

In [9]:
df = pd.read_csv(test_dataset_path)

# Define the schema types
schemas = ["CauseToEffect", "ExpertOpinion", "Analogy", "FearAppeal"]

# Prepare the JSON structure
json_data = []
for idx, row in df.iterrows():
    context = row['input'].strip()
    original_id = row['id']

    for schema in schemas:
        entry = {
            "prompt": [
                {
                    "role": "system",
                    "content": "You generate concise, critical, single-sentence questions for argumentative contexts, matching specified question schemas."
                },
                 {
                    "role": "user",
                    "content": f"Generate one critical question addressing the provided context. Ensure it matches the schema:{schema}\n\nContext: {context} Respond only with the question and nothing else.\n\nQuestion: "
                }

            ],
            "id": f"{original_id}_{schema}",
            "schema" : schema

        }
        json_data.append(entry)


# Save the JSON data to a file
temp_output_file = 'processed_dataset.json'
with open(temp_output_file, 'w') as f:
    json.dump(json_data, f, indent=4)

In [12]:
with open(temp_output_file, 'r') as file:
    data = json.load(file)

# Prepare inputs in batch mode with a batch size of 5
batch_size = 30
prompts = []
prompt_ids = []
results = {}

for entry in data:
    entry_id = entry['id'].rsplit('_', 1)[0]
    schema = entry['id'].rsplit('_', 1)[1]
    messages = entry['prompt']
    formatted_messages = [
        {'role': msg['role'], 'content': msg['content']} for msg in messages
    ]
    prompt_text = tokenizer.apply_chat_template(
        formatted_messages,
        tokenize=False,
        add_generation_prompt=True
    )
    prompts.append(prompt_text)
    prompt_ids.append((entry_id, schema, prompt_text))

    # Initialize structure if not already present
    if entry_id not in results:
        results[entry_id] = {"input": formatted_messages[0]['content'], "cqs": []}

In [None]:
# Process in batches to prevent memory overflow
for i in range(0, len(prompts), batch_size):
    batch_data = prompt_ids[i:i+batch_size]
    batch_prompts = [item[2] for item in batch_data]

    # Tokenize with padding for batch processing
    inputs = tokenizer(batch_prompts, return_tensors='pt', padding=True).to('cuda')

    # Generate outputs
    outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Store results
    for j, output in enumerate(decoded_outputs):
        entry_id, schema, prompt_text = batch_data[j]
        results[entry_id]["cqs"].append({"schema": schema, "cq": output.strip()})

# Save the results to a JSON file
with open(results_path, "w") as outfile:
    json.dump(results, outfile, indent=4)

In [None]:
try:
    os.remove(temp_output_file)
    print(f"File '{temp_output_file}' deleted successfully.")
except FileNotFoundError:
    print(f"File '{temp_output_file}' not found.")
except PermissionError:
    print(f"Permission denied to delete '{temp_output_file}'.")
except Exception as e:
    print(f"Error deleting file: {e}")

## Commit & Push

In [None]:
!git config --global user.name "Rico Städeli"
!git config --global user.email "rico@yabrriga.ch"


commit_message = f"Finetuned generation"
!git add .
!git commit -m "{commit_message}"
!git push