## Setting up

In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 
dtype = None 
load_in_4bit = True 

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!


## Loading the model and tokenizer

In [4]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = True,
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.2.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 6.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

## Model inference before fine-tuning

In [6]:
pile_prompt_style = """Assume you are an accomplished legal professional. Below is a legal document from a US court or regulatory filing.
Process the text and summarize its key points in your own words to understand its legal implications.
Before answering, carefully analyze the legal issues, provide a detailed legal reasoning, and cite relevant laws or precedents to support your conclusions. Develop a step-by-step chain of thought to ensure a logical and accurate response.

### Legal Document:
{text}

### Summary:"""


In [8]:
question = "A tenant has been occupying an apartment for five years under a lease agreement that was recently terminated by the landlord without proper notice. The tenant files a lawsuit claiming wrongful eviction and demands compensation for damages. Based on legal precedents and tenant protection laws, what would the court most likely rule regarding the validity of the termination and the tenant’s right to compensation?"

FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!
inputs = tokenizer([pile_prompt_style.format(text=question)], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs)
print(response)


['<｜begin▁of▁sentence｜>Assume you are an accomplished legal professional. Below is a legal document from a US court or regulatory filing.\nProcess the text and summarize its key points in your own words to understand its legal implications.\nBefore answering, carefully analyze the legal issues, provide a detailed legal reasoning, and cite relevant laws or precedents to support your conclusions. Develop a step-by-step chain of thought to ensure a logical and accurate response.\n\n### Legal Document:\nA tenant has been occupying an apartment for five years under a lease agreement that was recently terminated by the landlord without proper notice. The tenant files a lawsuit claiming wrongful eviction and demands compensation for damages. Based on legal precedents and tenant protection laws, what would the court most likely rule regarding the validity of the termination and the tenant’s right to compensation?\n\n### Summary: The landlord terminated the lease without proper notice, leading 

## Loading and processing the dataset

In [7]:
from datasets import load_dataset, concatenate_datasets, Dataset
import logging
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define subsets
subsets = ["courtlistener_opinions", "edgar"]

# Define your prompt style (based on your previous output)
# pile_prompt_style = """Assume you are an accomplished legal professional and, below is a legal document from a US court or regulatory filing. 
# Process the text and summarize its key points in your own words to understand its legal implications.

# ### Legal Document:
# {text}

# ### Summary:
# {text}
# """.strip()

def format_pile_prompt(data_point):
    # Truncate text to 1000 characters and format with the prompt style
    truncated_text = data_point['text'][:2000]
    return pile_prompt_style.format(text=truncated_text) + EOS_TOKEN

# Function to load and process a subset with streaming
def load_and_process_subset(subset, num_examples=6000):
    try:
        logger.info(f"Loading subset: {subset}")
        dataset = load_dataset("pile-of-law/pile-of-law", subset, split="train", streaming=True)
        limited_dataset = dataset.take(num_examples)
        data_list = [example for example in limited_dataset]
        
        if not data_list:
            raise ValueError(f"No data retrieved for subset {subset}")
        
        in_memory_dataset = Dataset.from_list(data_list)
        formatted_dataset = in_memory_dataset.map(lambda x: {"text": format_pile_prompt(x)})
        
        logger.info(f"Successfully processed {len(formatted_dataset)} examples from {subset}")
        return formatted_dataset
    
    except Exception as e:
        logger.error(f"Failed to process subset {subset}: {str(e)}")
        return Dataset.from_dict({"text": []})

# Load and concatenate subsets
pile_of_law_datasets = [load_and_process_subset(subset) for subset in subsets]
pile_of_law = concatenate_datasets(pile_of_law_datasets)
pile_of_law_dataset = pile_of_law

# Verify the total number of examples
print(f"Total examples in pile_of_law: {len(pile_of_law)}")
print("First example:", pile_of_law[0])

# Count examples from each subset
def count_subset_sources(dataset):
    subset_counts = {"courtlistener_opinions": 0, "edgar": 0, "unknown": 0}
    
    for example in dataset:
        url = example.get("url", "").lower()
        if "courtlistener" in url:
            subset_counts["courtlistener_opinions"] += 1
        elif "edgar" in url or "sec.gov" in url:  # Adjust based on actual Edgar URLs
            subset_counts["edgar"] += 1
        else:
            subset_counts["unknown"] += 1  # For examples with ambiguous or missing URLs
    
    return subset_counts

# Get and print the counts
counts = count_subset_sources(pile_of_law)
print("Number of examples per subset:")
print(f"Courtlistener Opinions: {counts['courtlistener_opinions']}")
print(f"Edgar: {counts['edgar']}")
print(f"Unknown: {counts['unknown']}")

# Optional: Sample check for Edgar examples
print("\nChecking for an Edgar example:")
for i, example in enumerate(pile_of_law):
    if "edgar" in example.get("url", "").lower() or "sec.gov" in example.get("url", "").lower():
        print(f"Found Edgar example at index {i}:", example)
        break
else:
    print("No Edgar examples found in the dataset.")

README.md:   0%|          | 0.00/25.6k [00:00<?, ?B/s]

pile-of-law.py:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

The repository for pile-of-law/pile-of-law contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/pile-of-law/pile-of-law.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Loading Dataset Infos from /root/.cache/huggingface/modules/datasets_modules/datasets/pile-of-law--pile-of-law/c1090502f95031ebfad49ede680394da5532909fa46b7a0452be8cddecc9fa60


Error reading file: https://huggingface.co/datasets/pile-of-law/pile-of-law/resolve/main/data/train.courtlisteneropinions.0.jsonl.xz


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Loading Dataset Infos from /root/.cache/huggingface/modules/datasets_modules/datasets/pile-of-law--pile-of-law/c1090502f95031ebfad49ede680394da5532909fa46b7a0452be8cddecc9fa60


Error reading file: https://huggingface.co/datasets/pile-of-law/pile-of-law/resolve/main/data/train.edgar.jsonl.xz


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Total examples in pile_of_law: 12000
First example: {'text': 'Assume you are an accomplished legal professional. Below is a legal document from a US court or regulatory filing.\nProcess the text and summarize its key points in your own words to understand its legal implications.\nBefore answering, carefully analyze the legal issues, provide a detailed legal reasoning, and cite relevant laws or precedents to support your conclusions. Develop a step-by-step chain of thought to ensure a logical and accurate response.\n\n### Legal Document:\n—Appeal by the defendant from a judgment of the Supreme Court, Queens County (Brennan, J.), rendered December 1, 1983, adjudicating him a youthful offender, upon his plea of guilty to robbery in the first degree (two counts), robbery in the second degree, and assault in the first degree (two counts), and imposing sentence.\nOrdered that the judgment is affirmed.\nWe have reviewed the record and agree with the defendant’s assigned counsel that there are

In [8]:
print(pile_of_law_dataset.to_pandas().head())

                                                text created_timestamp  \
0  Assume you are an accomplished legal professio...        01-13-2022   
1  Assume you are an accomplished legal professio...        01-13-2022   
2  Assume you are an accomplished legal professio...        01-13-2022   
3  Assume you are an accomplished legal professio...        01-13-2022   
4  Assume you are an accomplished legal professio...        01-13-2022   

  downloaded_timestamp                                                url  
0           01-03-2023  https://www.courtlistener.com/api/rest/v3/opin...  
1           01-03-2023  https://www.courtlistener.com/api/rest/v3/opin...  
2           01-03-2023  https://www.courtlistener.com/api/rest/v3/opin...  
3           01-03-2023  https://www.courtlistener.com/api/rest/v3/opin...  
4           01-03-2023  https://www.courtlistener.com/api/rest/v3/opin...  


In [10]:
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question. 

### Question:
{}

### Response:
<think>
{}
</think>
{}"""


In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Rank of the LoRA adaptation
    target_modules=[               # Layers to apply LoRA to
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha=16,          # Scaling factor for LoRA
    lora_dropout=0,         # Dropout rate for LoRA layers (0 means no dropout)
    bias="none",            # No bias adaptation in LoRA
    use_gradient_checkpointing="unsloth",  # Gradient checkpointing for memory efficiency
    random_state=3407,      # Random seed for reproducibility
    use_rslora=False,       # Use standard LoRA (not rank-stabilized LoRA)
    loftq_config=None,      # No LoFTQ configuration
)

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Setting up the model

In [11]:
from trl import SFTTrainer
from transformers import TrainingArguments, get_cosine_schedule_with_warmup
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=pile_of_law_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs = 1
        warmup_steps=5,
        #max_steps=300,
        learning_rate=1e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        lr_scheduler_type="cosine", 
        weight_decay=0.01,
        seed=3407,
        output_dir="outputs",
        report_to=["none"],  # Disable WandB integration
    ),
)


Spawning 2 processes


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/12000 [00:00<?, ? examples/s]

Concatenating 2 shards


## Model training

In [12]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 12,000 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.2302
20,1.6177
30,1.4158
40,1.2718
50,1.2896
60,1.2246
70,1.1864
80,1.2111
90,1.1845
100,1.1804


In [13]:
trainer_stats

TrainOutput(global_step=300, training_loss=1.2035038312276205, metrics={'train_runtime': 9277.7982, 'train_samples_per_second': 0.259, 'train_steps_per_second': 0.032, 'total_flos': 5.78761737965568e+16, 'train_loss': 1.2035038312276205})

In [14]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/tokenizer.json')

In [15]:
!zip -r /kaggle/working/fine_tuned_model.zip ./fine_tuned_model

  adding: fine_tuned_model/ (stored 0%)
  adding: fine_tuned_model/adapter_config.json (deflated 56%)
  adding: fine_tuned_model/tokenizer.json (deflated 85%)
  adding: fine_tuned_model/README.md (deflated 66%)
  adding: fine_tuned_model/tokenizer_config.json (deflated 95%)
  adding: fine_tuned_model/adapter_model.safetensors (deflated 7%)
  adding: fine_tuned_model/special_tokens_map.json (deflated 69%)


In [16]:
# Load the dataset
dataset = load_dataset("dzunggg/legal-qa-v1")

# Preprocessing function to clean and format the data
def preprocess_example(example):
    # Remove 'Q:' and 'A:' prefixes and clean text
    question = re.sub(r'http\S+|www\S+|https\S+', '', example['question'].lstrip('Q:').strip())
    answer = re.sub(r'http\S+|www\S+|https\S+', '', example['answer'].lstrip('A:').strip())
    question = question.replace('\n', ' ').replace('\r', ' ')
    answer = answer.replace('\n', ' ').replace('\r', ' ')
    
    # Combine question and answer into a single string
    conversation = (
        f"You are a legal expert. Provide a detailed and accurate answer to the following question in a professional tone: {question}\n\n"
        f"Answer: {answer}"
    )
    return {"text": conversation}

# Apply preprocessing to the training dataset
train_dataset = dataset['train'].map(preprocess_example)


Map:   0%|          | 0/3742 [00:00<?, ? examples/s]

In [None]:
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Training arguments
training_args = TrainingArguments(
    output_dir="/kaggle/working/deepseek_r1_lora",
    num_train_epochs=3,
    #max_steps=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,
    logging_steps=10,
    save_steps=1000,
    save_total_limit=2,
    report_to="none",
    max_steps=-1,
    gradient_checkpointing=True,
    optim="adamw_8bit",
)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",  # This should match the key in your preprocessed data
    max_seq_length=1024,
    args=training_args,
    packing=False,
)
# Start training
print("Starting fine-tuning...")
stats=trainer.train()

# Save fine-tuned LoRA adapter
model.save_pretrained("/kaggle/working/deepseek_r1_lora")
tokenizer.save_pretrained("/kaggle/working/deepseek_r1_lora")
print("Fine-tuned LoRA adapter saved to /kaggle/working/deepseek_r1_lora")

Starting fine-tuning...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,742 | Num Epochs = 3 | Total steps = 702
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Step,Training Loss
10,2.4793
20,2.3105
30,2.1838
40,2.2669
50,2.2003
60,2.1295
70,2.1364
80,2.1172
90,2.0707
100,2.1498


In [8]:
print(stats)

                                               input  \
0  Assume you are the most accomplished legal pro...   
1  Assume you are the most accomplished legal pro...   
2  Assume you are the most accomplished legal pro...   
3  Assume you are the most accomplished legal pro...   
4  Assume you are the most accomplished legal pro...   

                                              output  
0  It might be legal. Lawsuits often seek an awar...  
1  nd he deletes his account 35mins later....thin...  
2  Some places have court dates on the ticket.  O...  
3  Yes. The landlord is the owner. The owner can ...  
4  Common sense goes a long way. Sometimes you sh...  


In [None]:
!pip install evaluate


In [None]:
import evaluate

# Load ROUGE metric
rouge = evaluate.load("rouge")

# Example list of predictions and references
# Replace this with your model's predictions and actual answers
predictions = [
    "Legal contracts are enforceable by courts.",
    "A minor cannot enter into a legal agreement."
]

references = [
    "Courts can enforce legal contracts.",
    "Legal agreements involving minors are not binding."
]

# Compute ROUGE scores
results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
print(f"ROUGE-L: {results['rougeL']:.4f}")


In [None]:
import torch
from transformers import pipeline
from rouge import Rouge
import re

# Assuming model, tokenizer, and train_dataset are already loaded
# train_dataset should be equivalent to dataset in the original code

# Use pipeline with CPU (since model is already on CPU)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # CPU
    max_new_tokens=512,
    do_sample=False,
    temperature=0.0
)

# Preprocess function
def clean(text):
    text = re.sub(r'Q:\s*', '', text)
    text = re.sub(r'A:\s*', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = text.replace('\n', ' ').replace('\r', ' ')
    return text.strip()

# Subset for quick evaluation (10 samples)
subset = train_dataset.select(range(10))

# Generate and compare
rouge = Rouge()
scores = []

for example in subset:
    prompt = f"You are a legal expert. Provide a detailed and accurate answer to the following question in a professional tone: {clean(example['question'])}"
    reference = clean(example['answer'])

    try:
        output = generator(prompt)[0]["generated_text"]
        # Remove prompt from output
        generated = output.replace(prompt, '').strip()
        score = rouge.get_scores(generated, reference)[0]["rouge-l"]["f"]
        scores.append(score)
    except Exception as e:
        print(f"Skipping example due to error: {e}")
        continue

# Calculate and display average ROUGE-L score
if scores:
    average_rouge_l = sum(scores) / len(scores)
    print(f"\n✅ Average ROUGE-L Score: {average_rouge_l:.4f}")
else:
    print("\n❌ No valid scores calculated.")

In [None]:
!pip install rouge-score

In [1]:
dataset = load_dataset("dzunggg/legal-qa-v1")
test_data = dataset["train"] 
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

predictions = []
references = []

# Step 6: Generate predictions for a subset to manage memory usage
for sample in tqdm(test_data.select(range(50))):  # Adjust range as needed
    question = sample["question"]
    reference = sample["answer"]

    prompt = f"Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = decoded_output.split("Answer:")[-1].strip()

    predictions.append(pred)
    references.append(reference)

# Step 7: Compute and print evaluation metrics
bleu_result = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_result = rouge.compute(predictions=predictions, references=references)

print("✅ BLEU Score:", bleu_result["bleu"])
print("✅ ROUGE Scores:", rouge_result)

Loading checkpoint shards: 100%
 9/9 [01:12<00:00,  6.97s/it]
100%|██████████| 50/50 [10:35<00:00, 12.71s/it]
✅ BLEU Score: 0.0587767782433838
✅ ROUGE Scores: {'rouge1': np.float64(0.3452739972537895), 'rouge2': np.float64(0.08962874604671196), 'rougeL
