# LegalBERT Fine-Tuning for GDPR Compliance

## Introduction and Environment Setup

This notebook fine-tunes LegalBERT on GDPR articles and compliance datasets. The goal is to enhance the model's ability to identify GDPR compliance and non-compliance in legal texts. The notebook is structured as follows:
1. Environment Setup
2. Data Loading and Preprocessing
3. Model Fine-Tuning
4. Evaluation and Results
5. Conclusions and Next Steps

### Check GPU Memory

In [None]:
# Shell out to nvidia-smi to list the attached GPU(s) and their current memory usage.
!nvidia-smi

### Install Required Libraries

In [None]:
# `evaluate` is imported later in the question-answering section of this notebook.
# NOTE(review): version is not pinned; consider pinning for reproducibility.
!pip install evaluate

In [None]:
# Quietly install/upgrade the training stack.
# NOTE(review): transformers, peft and accelerate are installed from the GitHub default
# branch rather than a pinned release, so results may change between runs of this notebook.
# NOTE(review): bitsandbytes and peft are not referenced in the cells visible here — confirm
# they are still needed.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

## Data Loading and Preprocessing

This section loads the GDPR articles used for fine-tuning the LegalBERT model.

**Load data** (mount Google Drive, then read the preprocessed GDPR articles and recitals)

In [None]:
# Mount Google Drive at /content/drive; every dataset and checkpoint path used below
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import transformers
import torch
from datasets import Dataset, load_dataset

# Path (on the mounted Drive) of the preprocessed GDPR articles + recitals corpus in JSON Lines format
gdpr_texts = '/content/drive/MyDrive/pfa_finetuning/gdpr/articles/preprocessed/gdpr_articles_recitals_preprocessed.jsonl'

# Load the JSONL file as a single "train" split; tokenize_function below reads its "text" column
recitals_dataset = load_dataset("json", data_files=gdpr_texts, split="train")


# LegalBERT tokenizer; this module-level name is read by tokenize_function and the MLM data collator
tokenizer = transformers.AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch with the notebook-level ``tokenizer``.

    Each sequence is truncated and padded to exactly 512 tokens, so all
    resulting features have the same length.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    encoded_batch = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    return encoded_batch

# Tokenize the whole corpus in batches; the result keeps the original columns plus the tokenizer outputs
tokenized_dataset = recitals_dataset.map(tokenize_function, batched=True)

# Data collator for masked-language-modelling (MLM): applies random masking when batches are built
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # Mask 15% of the tokens
)

# Load the base LegalBERT checkpoint with a masked-LM head
model = transformers.AutoModelForMaskedLM.from_pretrained("nlpaueb/legal-bert-base-uncased")

# Set up training arguments (checkpoints are written to Drive)
training_args = transformers.TrainingArguments(
    output_dir="/content/drive/MyDrive/pfa_finetuning/gdpr/gdpr-legalbert",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=50,  # Log every 50 steps
    eval_steps=200,  # NOTE(review): no evaluation strategy is set and no eval_dataset is passed to the Trainer below, so this presumably has no effect — confirm
    report_to="none"  # Avoid sending logs to external tools
)

# Initialize trainer (training data only; no evaluation set is provided)
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)



In [None]:
# Run MLM fine-tuning on the articles + recitals corpus using the trainer configured above
trainer.train()

In [None]:
# Save the model AND the tokenizer into the same Drive directory.
# The tokenizer used to be written to "./content/..." (a path relative to the working
# directory), so the later `AutoTokenizer.from_pretrained(model_path)` could not find it.
step1_recitals_dir = "/content/drive/MyDrive/pfa_finetuning/gdpr/gdpr-legalbert-step1-articles-recitals-dataset"
model.save_pretrained(step1_recitals_dir)
tokenizer.save_pretrained(step1_recitals_dir)

# Simple validation script: fill a single [MASK] with the most likely token.
# The model is still in training mode after trainer.train(); switch to eval mode so
# dropout is disabled and the prediction is deterministic.
model.eval()
test_text = "According to the GDPR, data subjects have the right to [MASK] their personal data."
inputs = tokenizer(test_text, return_tensors="pt").to(model.device) # Move inputs to the same device as the model
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

# Inference only: no gradients needed
with torch.no_grad():
    outputs = model(**inputs)
predictions = outputs.logits
predicted_token_id = predictions[0, mask_token_index].argmax(axis=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print(f"Predicted: {predicted_token}")  # Should predict "access" or similar relevant term


In [None]:
# Simple validation script with several [MASK] tokens: each mask is filled independently
# with its most likely token, and the predictions are printed in sentence order.
# Make sure dropout is off (the model may still be in training mode after trainer.train()).
model.eval()
test_text = "Article 1 from GDPR can be summarized : Individuals can [MASK] their data and data [MASK] should be [MASK]."
inputs = tokenizer(test_text, return_tensors="pt").to(model.device) # Move inputs to the same device as the model
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

# Inference only: no gradients needed
with torch.no_grad():
    outputs = model(**inputs)
predictions = outputs.logits
predicted_token_id = predictions[0, mask_token_index].argmax(axis=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print(f"Predicted: {predicted_token}")

## Load the previously fine-tuned model

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Reload the stage-1 checkpoint (articles + recitals) from Drive.
# Both the model and the tokenizer are read from the same directory, so the tokenizer
# files must have been saved there as well.
model_path = "/content/drive/MyDrive/pfa_finetuning/gdpr/gdpr-legalbert-step1-articles-recitals-dataset"
model = AutoModelForMaskedLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [None]:
# Second MLM corpus: preprocessed GDPR chapter contents (JSON Lines), loaded as a single "train" split
chapters_path = '/content/drive/MyDrive/pfa_finetuning/gdpr/articles/preprocessed/gdpr_article_chapters_content_preprocessed.jsonl'
chapters_dataset = load_dataset("json", data_files=chapters_path, split="train")


In [None]:
# Tokenize the chapters corpus with the same helper; this rebinds `tokenized_dataset`
# (previously the recitals corpus) to the chapters data.
tokenized_dataset = chapters_dataset.map(tokenize_function, batched=True)

In [None]:
# Second MLM stage: same hyper-parameters as the first stage, but intermediate
# checkpoints are written to the local runtime disk rather than to Drive.
training_args = transformers.TrainingArguments(
    output_dir="./gdpr-legalbert",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=50,  # Log every 50 steps
    eval_steps=200,  # Only takes effect with a step-based evaluation strategy; none is configured here
    prediction_loss_only=True,
    report_to="none"  # Avoid sending logs to external tools
)


# Initialize trainer (reuses the MLM data collator defined for the first stage)
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# Train the model
trainer.train()

# Save the resulting model AND tokenizer into the same Drive directory.
# The tokenizer used to be written to a local "./..." folder, so the QA cells that load
# both from the Drive path could not find the tokenizer files.
step1_chapters_dir = "/content/drive/MyDrive/pfa_finetuning/gdpr/gdpr-legalbert-step1-articles-chapters-dataset"
model.save_pretrained(step1_chapters_dir)
tokenizer.save_pretrained(step1_chapters_dir)

# Simple validation script: prepare the masked prompt; the next cell runs the prediction
test_text = "According to the GDPR, data subjects have the right to [MASK] their personal data."
inputs = tokenizer(test_text, return_tensors="pt").to(model.device) # Move inputs to the same device as the model
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

In [None]:

# Predict the masked token for the prompt prepared in the previous cell
# (`inputs` and `mask_token_index` come from there).
# The model is still in training mode after trainer.train(); disable dropout and
# gradient tracking so the prediction is deterministic and cheap.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
predictions = outputs.logits
predicted_token_id = predictions[0, mask_token_index].argmax(axis=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print(f"Predicted: {predicted_token}")  # Should predict "access" or similar relevant term


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Load the stage-2 MLM checkpoint (chapters) with a question-answering head.
# NOTE(review): the checkpoint was saved from a masked-LM model, so the QA head is
# presumably newly initialised here and only becomes useful after QA fine-tuning — confirm.
# NOTE(review): the same three statements are repeated in a later cell.
model_path = "/content/drive/MyDrive/pfa_finetuning/gdpr/gdpr-legalbert-step1-articles-chapters-dataset"
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
from datasets import load_dataset

# SQuAD-style GDPR question/answer file on Drive, loaded as a single "train" split
qa_path = '/content/drive/MyDrive/pfa_finetuning/gdpr/articles/preprocessed/gdpr_articles_qa_squad_preprocessed.json'
qa_dataset = load_dataset("json", data_files=qa_path, split="train")


In [None]:
# Quick sanity check of the nested structure: row -> 'data' -> 'paragraphs' -> 'qas'.
# Only the first row of the dataset is inspected here.
paragraphs = qa_dataset[0]['data']['paragraphs']

# Print the number of QA pairs across all paragraphs of that first row
print(f"Number of QAs: {sum(len(p['qas']) for p in paragraphs)}")

# Print the first QA pair as an example
print(paragraphs[0]['qas'][0])


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate
import numpy as np

# Load the stage-2 MLM checkpoint (chapters) with a question-answering head.
# NOTE(review): this repeats the model/tokenizer load done in an earlier cell.
model_path = "/content/drive/MyDrive/pfa_finetuning/gdpr/gdpr-legalbert-step1-articles-chapters-dataset"
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the SQuAD-style dataset (same file as loaded in an earlier cell)
qa_path = '/content/drive/MyDrive/pfa_finetuning/gdpr/articles/preprocessed/gdpr_articles_qa_squad_preprocessed.json'
qa_dataset = load_dataset("json", data_files=qa_path, split="train")

# Preprocess the dataset to flatten and format it correctly
def preprocess_squad_dataset(examples):
    """Flatten a batch of nested SQuAD-style records into one row per question.

    Args:
        examples: Batch mapping whose ``'data'`` entry is a list of articles.
            Each article has ``'paragraphs'``; each paragraph has a ``'context'``
            string and a ``'qas'`` list of ``{'id', 'question', 'answers'}``
            items, where every answer carries ``'text'`` and ``'answer_start'``.

    Returns:
        Dict of four equally long lists: ``"question"``, ``"context"``, ``"id"``
        and ``"answers"``. Each ``"answers"`` entry is a dict with parallel
        ``"text"`` and ``"answer_start"`` lists (empty when the question has
        no answers).
    """
    questions, contexts, question_ids, answer_sets = [], [], [], []

    for article in examples['data']:
        for paragraph in article['paragraphs']:
            passage = paragraph['context']
            for qa in paragraph['qas']:
                gold_answers = qa['answers']
                questions.append(qa['question'])
                # The paragraph text is repeated for every question asked about it
                contexts.append(passage)
                question_ids.append(qa['id'])
                answer_sets.append({
                    "text": [entry['text'] for entry in gold_answers],
                    "answer_start": [entry['answer_start'] for entry in gold_answers],
                })

    return {
        "question": questions,
        "context": contexts,
        "id": question_ids,
        "answers": answer_sets,
    }

# Flatten the nested records into one row per question.
# The original columns are dropped because the mapped function returns a different
# number of rows than it receives.
flat_qa_dataset = qa_dataset.map(
    preprocess_squad_dataset,
    batched=True,
    remove_columns=qa_dataset.column_names
)

# Tokenize the dataset
def prepare_train_features(examples):
    """Tokenize question/context pairs and label every window with its answer span.

    Contexts longer than the model limit are split into overlapping windows
    (``max_length=512``, ``stride=128``), so one example may produce several
    features. For each window the first annotated answer is converted from
    character offsets to token indices.

    Only context tokens are considered when locating the answer: the offsets
    of question tokens are relative to the question string and must not be
    compared with the answer's character positions in the context.

    A window is labelled ``(0, 0)`` (the first token, i.e. ``[CLS]`` for
    BERT-style tokenizers) when the example has no answer or when the answer
    is not completely contained in that window.

    Args:
        examples: Batch mapping with ``"question"``, ``"context"`` and
            ``"answers"`` entries; each ``"answers"`` item holds parallel
            ``"text"`` and ``"answer_start"`` lists.

    Returns:
        The tokenizer output for the batch with two extra lists,
        ``"start_positions"`` and ``"end_positions"``, one entry per window.
    """
    # Tokenize questions and contexts; only the context (second sequence) is truncated
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    # Character offsets per token, and the source example index of every window
    offset_mapping = tokenized_examples.pop("offset_mapping")
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # Label used when a window does not (fully) contain the answer
    no_answer_index = 0
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Match the window back to the example it was cut from
        answers = examples["answers"][sample_mapping[i]]
        token_start_index = None
        token_end_index = None

        if len(answers["answer_start"]) > 0:
            # By default, use the first answer
            answer_start_char = answers["answer_start"][0]
            answer_end_char = answer_start_char + len(answers["text"][0])

            # 0 = question token, 1 = context token, None = special/padding token
            sequence_ids = tokenized_examples.sequence_ids(i)

            for idx, (start, end) in enumerate(offsets):
                if sequence_ids[idx] != 1:
                    continue
                if token_start_index is None and start <= answer_start_char < end:
                    token_start_index = idx
                if start < answer_end_char <= end:
                    token_end_index = idx
                    break

        span_found = (
            token_start_index is not None
            and token_end_index is not None
            and token_start_index <= token_end_index
        )
        if span_found:
            start_positions.append(token_start_index)
            end_positions.append(token_end_index)
        else:
            # Never emit a half-located span: label the whole window as "no answer"
            start_positions.append(no_answer_index)
            end_positions.append(no_answer_index)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

# Apply feature preparation; the text columns are dropped so that only model inputs
# and span labels remain
tokenized_qa = flat_qa_dataset.map(
    prepare_train_features,
    batched=True,
    remove_columns=flat_qa_dataset.column_names
)

# Split the data 90/10 with a fixed seed so the train/eval partition (and therefore the
# reported metrics) is reproducible across kernel restarts.
# NOTE(review): the split is done on tokenized windows, so windows cut from the same
# question/context pair can end up on both sides of the split.
split_dataset = tokenized_qa.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]



In [None]:
# Define training arguments for extractive-QA fine-tuning (checkpoints are written to Drive)
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/pfa_finetuning/gdpr/gdpr-legalbert-qa",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",  # Consistent with the MLM stages: do not send logs to external tools
)

# Initialize trainer; the evaluation split is used by trainer.evaluate()
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)



In [None]:
# Fine-tune the question-answering model with the trainer configured above
trainer.train()

In [None]:
# Save the fine-tuned QA model and its tokenizer into the same Drive directory,
# which is the directory the inference pipeline below loads from
model.save_pretrained("/content/drive/MyDrive/pfa_finetuning/gdpr/gdpr-legalbert-qa-final")
tokenizer.save_pretrained("/content/drive/MyDrive/pfa_finetuning/gdpr/gdpr-legalbert-qa-final")



In [None]:
# Evaluation function for QA
def compute_metrics(eval_pred):
    """Compute span-level exact match and F1 from start/end logits.

    The tokenized evaluation features only carry token-level labels, so the
    scores are computed on token index spans rather than on answer strings:
    the predicted span is ``(argmax(start_logits), argmax(end_logits))`` and
    it is compared with the labelled ``(start_position, end_position)``.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` is
            ``(start_logits, end_logits)`` and ``labels`` is
            ``(start_positions, end_positions)``; logits are arrays with one
            row per feature.

    Returns:
        Dict with ``"exact_match"`` and ``"f1"``, both in the range 0-100.
        ``"f1"`` is the mean token-overlap F1 between predicted and labelled
        spans. Both are 0.0 when there are no features.
    """
    predictions, labels = eval_pred
    # Extract start and end logits / labels
    start_logits, end_logits = predictions
    gold_start, gold_end = (np.asarray(part) for part in labels)

    pred_start = np.argmax(np.asarray(start_logits), axis=-1)
    pred_end = np.argmax(np.asarray(end_logits), axis=-1)

    if pred_start.size == 0:
        return {"exact_match": 0.0, "f1": 0.0}

    exact = (pred_start == gold_start) & (pred_end == gold_end)

    # Number of tokens shared by the predicted and labelled spans (inclusive indices).
    # An inverted prediction (end before start) has length 0 and therefore no overlap.
    overlap = np.clip(
        np.minimum(pred_end, gold_end) - np.maximum(pred_start, gold_start) + 1, 0, None
    )
    pred_len = np.clip(pred_end - pred_start + 1, 0, None)
    gold_len = np.clip(gold_end - gold_start + 1, 0, None)
    # F1 = 2 * overlap / (pred_len + gold_len); guard against a zero denominator
    f1 = 2.0 * overlap / np.maximum(pred_len + gold_len, 1)

    return {
        "exact_match": float(100.0 * exact.mean()),
        "f1": float(100.0 * f1.mean()),
    }

# Evaluate on the held-out split passed to the Trainer as eval_dataset.
# NOTE(review): the Trainer was constructed without a `compute_metrics` argument, so this
# call presumably reports only the evaluation loss and runtime statistics — confirm.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

In [None]:
from transformers import pipeline

# Build an inference pipeline from the saved QA checkpoint; model and tokenizer are
# both read from the same Drive directory
qa_pipeline = pipeline(
    "question-answering",
    model="/content/drive/MyDrive/pfa_finetuning/gdpr/gdpr-legalbert-qa-final",
    tokenizer="/content/drive/MyDrive/pfa_finetuning/gdpr/gdpr-legalbert-qa-final"
)



In [None]:
# Smoke-test the pipeline: each entry pairs a question with the context to answer it from
test_examples = [
    {
        "question": "What is the purpose of GDPR?",
        "context": "The GDPR aims to strengthen the protection of personal data within the EU. It lays down rules relating to the protection of natural persons with regard to the processing of personal data and rules relating to the free movement of personal data."
    },
    # Add more test examples
]

for example in test_examples:
    result = qa_pipeline(example)
    # One report per example: question, extracted answer, confidence score, separator
    print(
        f"Question: {example['question']}",
        f"Answer: {result['answer']}",
        f"Score: {result['score']:.4f}",
        "-" * 50,
        sep="\n",
    )