In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q datasets

In [3]:
from transformers import TrainerCallback

## **Model Modeling**

In [None]:
!pip install -q bitsandbytes>=0.41.0 accelerate
!pip install -q peft transformers datasets

import os
import json
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    TrainerCallback
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(42)
print(f"Using device: {device}")

MODEL_NAME = "Equall/Saul-7B-Instruct-v1"  # Legal specialized model
OUTPUT_DIR = "/content/drive/MyDrive/pfa_finetuning/gdpr/finetuned-saul-legal-model"  # Local path to save model
DATASET_PATH = "/content/drive/MyDrive/pfa_finetuning/gdpr/articles/preprocessed/gdpr_QA_316.jsonl"  # Local path to dataset
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
LEARNING_RATE = 2e-4
BATCH_SIZE = 1  
GRADIENT_ACCUMULATION_STEPS = 16  
NUM_EPOCHS = 1
MAX_SEQ_LEN = 768  
SAVE_STEPS = 100
FORMAT_TYPE = "qa_format"  #

import gc
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

try:
    from google.colab import userdata
    hf_token = userdata.get('HF_TOKEN')
    print("HF token found, using for authentication")
except:
    hf_token = None
    print("No HF token found, proceeding without authentication (only works for public models)")

try:
    import bitsandbytes as bnb
    from peft import (
        LoraConfig,
        get_peft_model,
        prepare_model_for_kbit_training,
        TaskType
    )
    from transformers import BitsAndBytesConfig
    QUANTIZATION_AVAILABLE = True
    PEFT_AVAILABLE = True  # Explicitly set when imports succeed
    print("Quantization and PEFT libraries successfully imported")
except ImportError:
    QUANTIZATION_AVAILABLE = False
    PEFT_AVAILABLE = False
    print("Quantization not available - will load model in full precision (requires more memory)")
    try:
        from peft import (
            LoraConfig,
            get_peft_model,
            TaskType
        )
        PEFT_AVAILABLE = True
        print("PEFT available without quantization")
    except ImportError:
        print("PEFT not available - will fine-tune model without parameter-efficient methods")

print(f"Loading tokenizer for {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
    token=hf_token,
    padding_side="right",
    model_max_length=MAX_SEQ_LEN
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Loading model {MODEL_NAME}...")
model_loading_args = {
    "token": hf_token,
    "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
    "low_cpu_mem_usage": True,
}

if QUANTIZATION_AVAILABLE and torch.cuda.is_available():
    print("Loading model with 4-bit quantization...")
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )
    model_loading_args["quantization_config"] = quantization_config
    if torch.cuda.device_count() > 1:
        model_loading_args["device_map"] = "auto"

try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        **model_loading_args
    )

    if QUANTIZATION_AVAILABLE and torch.cuda.is_available():
        model = prepare_model_for_kbit_training(model)
    elif device.type != "cpu":
        model = model.to(device)

    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Trying with less memory-intensive options...")

    # Try with lower precision and more memory optimizations
    try:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            token=hf_token,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
        )
        if device.type != "cpu":
            model = model.to(device)
        print("Model loaded successfully with fallback options")
    except Exception as e2:
        print(f"Still failed to load model: {e2}")
        print("Cannot proceed without loading model.")
        raise

# Apply LoRA if PEFT is available
if PEFT_AVAILABLE:
    print("Applying LoRA for parameter-efficient fine-tuning")
    # Target modules for LoRA
    if device.type == "cpu":
        target_modules = ["q_proj", "v_proj"]  # Target fewer modules on CPU
        LORA_R = 4  # Lower rank for CPU training
    else:
        target_modules = ["q_proj", "v_proj", "k_proj", "o_proj"]

    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        target_modules=target_modules,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()  # Show trainable parameters
else:
    print("PEFT not available - fine-tuning full model (requires much more memory)")



Using device: cuda
No HF token found, proceeding without authentication (only works for public models)
Quantization and PEFT libraries successfully imported
Loading tokenizer for Equall/Saul-7B-Instruct-v1...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading model Equall/Saul-7B-Instruct-v1...
Loading model with 4-bit quantization...


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Model loaded successfully
Applying LoRA for parameter-efficient fine-tuning
trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940


# **Data Preparation**

In [5]:
# Sample dataset fallback
def create_sample_dataset():
    print("Creating a tiny sample dataset for testing purposes...")
    sample_data = [
        {"input": "What is the GDPR?", "output": "The GDPR is a regulation..."},
        {"input": "Detail the specific obligations...", "output": "Under the CTR..."}
    ]
    df = pd.DataFrame(sample_data)
    df["formatted_text"] = df.apply(
        lambda row: f"<|user|>\n{row['input']}\n<|assistant|>\n{row['output']}",
        axis=1
    )
    return Dataset.from_pandas(df[["formatted_text"]])

# Load and prepare dataset
def load_legal_dataset(path, format_type):
    if not os.path.exists(path):
        print(f"Dataset file {path} not found!")
        return create_sample_dataset()

    data = []
    chunk_size = 100
    if path.endswith('.jsonl'):
        with open(path, 'r', encoding='utf-8') as f:
            chunk = []
            for i, line in enumerate(f):
                chunk.append(json.loads(line))
                if (i + 1) % chunk_size == 0:
                    data.extend(chunk)
                    chunk = []
                    gc.collect()
            if chunk:
                data.extend(chunk)
    elif path.endswith('.json'):
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

    df = pd.DataFrame(data)

    possible_question_keys = ['input', 'question', 'prompt', 'instruction']  # Prioritize 'input'
    possible_answer_keys = ['output', 'answer', 'response']  # Prioritize 'output'
    question_key = next((key for key in possible_question_keys if key in df.columns), None)
    answer_key = next((key for key in possible_answer_keys if key in df.columns), None)

    if format_type == "instruction_input_output":
        df["formatted_text"] = df.apply(
            lambda row: f"<|user|>\n{row['instruction']}\n\n{row['input']}\n<|assistant|>\n{row['output']}",
            axis=1
        )
    elif format_type == "text":
        df["formatted_text"] = df["text"]
    elif format_type == "discussion_text":
        df["formatted_text"] = df["discussion_text"].apply(
            lambda text: text.replace("[INST]", "<|user|>\n").replace("[/INST]", "\n<|assistant|>\n")
        )
    elif format_type == "qa_format":
        if question_key and answer_key:
            print(f"Using '{question_key}' as question key and '{answer_key}' as answer key")
            df["formatted_text"] = df.apply(
                lambda row: f"<|user|>\n{row[question_key]}\n<|assistant|>\n{row[answer_key]}",
                axis=1
            )
        else:
            raise KeyError(f"Could not find suitable question/answer keys in dataset. Found columns: {list(df.columns)}")
    else:
        raise ValueError(f"Unknown format type: {format_type}")

    return Dataset.from_pandas(df[["formatted_text"]])

# Load dataset
print(f"Loading dataset with format: {FORMAT_TYPE}")
dataset = load_legal_dataset(DATASET_PATH, FORMAT_TYPE)
print(f"Dataset loaded with {len(dataset)} examples")

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(
        examples["formatted_text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_tensors=None,
    )

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    remove_columns=["formatted_text"],
    desc="Tokenizing dataset",
)

tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
print(f"Split dataset into {len(tokenized_dataset['train'])} training and {len(tokenized_dataset['test'])} test examples")

del dataset
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Loading dataset with format: qa_format
Using 'input' as question key and 'output' as answer key
Dataset loaded with 316 examples
Tokenizing dataset...


Tokenizing dataset:   0%|          | 0/316 [00:00<?, ? examples/s]

Split dataset into 284 training and 32 test examples


# **Training**

In [6]:
# Custom callback
class ProgressCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            print(f"Step {state.global_step} - Training Loss: {logs['loss']:.4f}")
            if torch.cuda.is_available():
                gpu_memory = torch.cuda.memory_allocated() / 1024**3
                print(f"GPU Memory: {gpu_memory:.2f} GB")
        if logs and "eval_loss" in logs:
            print(f"Step {state.global_step} - Eval Loss: {logs['eval_loss']:.4f}")

# Gradient checkpointing
gradient_checkpointing = False
if hasattr(model, "gradient_checkpointing_enable"):
    model.gradient_checkpointing_enable()
    gradient_checkpointing = True
    print("Gradient checkpointing enabled for memory efficiency")

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.03,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    eval_steps=SAVE_STEPS,
    logging_dir="./logs",
    logging_steps=2,
    fp16=torch.cuda.is_available(),
    bf16=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8,
    optim="adamw_torch",
    gradient_checkpointing=gradient_checkpointing,
    report_to="tensorboard",
    disable_tqdm=False,
    dataloader_num_workers=0,
    dataloader_pin_memory=True if torch.cuda.is_available() else False,
    ddp_find_unused_parameters=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    callbacks=[ProgressCallback()],
)

# Train
print("Starting training...")
try:
    trainer.train()
    training_successful = True
except Exception as e:
    training_successful = False
    print(f"Training error: {e}")
    if torch.cuda.is_available():
        print(f"Current GPU Memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"Max GPU Memory: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
    print("\nTrying to reduce model size or batch size might help.")

# Save model
if training_successful:
    print("Saving model...")
    if PEFT_AVAILABLE:
        model.save_pretrained(os.path.join(OUTPUT_DIR, "final_model"))
    else:
        model.save_pretrained(os.path.join(OUTPUT_DIR, "final_model"))
    tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final_model"))
    print(f"Model saved to {os.path.join(OUTPUT_DIR, 'final_model')}")



Gradient checkpointing enabled for memory efficiency


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


Step,Training Loss
2,1.3245
4,1.3274
6,1.2333
8,1.1409
10,1.1191
12,1.1148
14,1.1403
16,0.992
18,1.0798
20,1.0717


Step 2 - Training Loss: 1.3245
GPU Memory: 4.42 GB
Step 4 - Training Loss: 1.3274
GPU Memory: 4.42 GB
Step 6 - Training Loss: 1.2333
GPU Memory: 4.42 GB
Step 8 - Training Loss: 1.1409
GPU Memory: 4.42 GB
Step 10 - Training Loss: 1.1191
GPU Memory: 4.42 GB
Step 12 - Training Loss: 1.1148
GPU Memory: 4.42 GB
Step 14 - Training Loss: 1.1403
GPU Memory: 4.42 GB
Step 16 - Training Loss: 0.9920
GPU Memory: 4.42 GB
Step 18 - Training Loss: 1.0798
GPU Memory: 4.42 GB
Step 20 - Training Loss: 1.0717
GPU Memory: 4.42 GB
Step 22 - Training Loss: 0.9726
GPU Memory: 4.42 GB
Step 24 - Training Loss: 1.0675
GPU Memory: 4.42 GB
Step 26 - Training Loss: 1.0584
GPU Memory: 4.42 GB
Step 28 - Training Loss: 1.0808
GPU Memory: 4.42 GB
Step 30 - Training Loss: 1.0757
GPU Memory: 4.42 GB
Step 32 - Training Loss: 1.0352
GPU Memory: 4.42 GB
Step 34 - Training Loss: 0.9751
GPU Memory: 4.42 GB
Saving model...
Model saved to /content/drive/MyDrive/pfa_finetuning/gdpr/finetuned-saul-legal-model/final_model


# **Testing**

In [7]:
import torch

# Instruction and section to test
instruction = "Based on the following section, does it comply with GDPR Article 65(1)(a)? Respond with 'Compliant' or 'Not Compliant' and briefly explain why."
test_section = "No data processing agreement has been concluded with the company whose servers contained the resources of the Public Information Bulletin (BIP) of the Municipal Office in Aleksandrów Kujawski. For this reason, a fine of 40.000 PLN (9400 EUR) was imposed on the mayor of the city."

# Construct prompt
test_prompt = f"<|user|>\n{instruction}\n\nSection:\n{test_section}\n<|assistant|>"
print(f"\nTesting model with prompt:\n{test_prompt}")

# Tokenize and prepare inputs
inputs = tokenizer(test_prompt, return_tensors="pt").to(device)

# Generate response
try:
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode and extract response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("<|assistant|>")[1].strip() if "<|assistant|>" in response else response
    print(f"\nResponse: {response}")

except Exception as e:
    print(f"Error generating test response: {e}")
    if torch.cuda.is_available():
        print(f"GPU Memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.



Testing model with prompt:
<|user|>
Based on the following section, does it comply with GDPR Article 65(1)(a)? Respond with 'Compliant' or 'Not Compliant' and briefly explain why.

Section:
No data processing agreement has been concluded with the company whose servers contained the resources of the Public Information Bulletin (BIP) of the Municipal Office in Aleksandrów Kujawski. For this reason, a fine of 40.000 PLN (9400 EUR) was imposed on the mayor of the city.
<|assistant|>





Response: **Compliance with GDPR Article 65(1)(a):**
- **Legal Basis:** GDPR Article 65(1)(a) states that Member States may specify the tasks and powers of the data protection authority.
- **Compliance Explanation:** The provided section outlines a situation where a data processing agreement has not been concluded with a company whose servers contained resources of a Public Information Bulletin (BIP). This action led to a fine being imposed on the mayor of the city. This scenario is in line with GDPR Article 65(1)(a) as it demonstrates the tasks and powers of the data protection authority, specifically enforcing compliance with GDPR data processing agreements.

**Relevant GDPR Article:** Article 65(1)(a) - Tasks and powers of the supervisory authority.

**Source of Compliance:** GDPR Article 65(1)(a) - Legal Basis for Data Protection Authority Tasks and Powers.

**Assessment:** The provided section demonstrates how a data protection authority enforces GDPR compliance by imposing a fin