* https://huggingface.co/docs/bitsandbytes/v0.43.2/index
* https://huggingface.co/docs/peft/en/index

In [None]:
import json
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"

import datasets
from datasets import Dataset
import huggingface_hub
import peft
import torch
from torch import cuda as torch_cuda
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import trl
from utils import jupyter_formatting

jupyter_formatting.setup_notebook_formatting()

torch_cuda.is_available()

In [5]:
# bitsandbytes quantization settings; handed to `from_pretrained` below via
# `quantization_config` so the base model is loaded in 4-bit.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",  # use the NF4 4-bit data type
    bnb_4bit_use_double_quant=True,  # enable nested ("double") quantization
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
)

* https://huggingface.co/NousResearch/Meta-Llama-3.1-8B-Instruct

In [None]:
# Hub repository that both the model weights and the tokenizer are loaded from.
MODEL_ID = "NousResearch/Meta-Llama-3.1-8B-Instruct"

# Load the causal LM with the 4-bit quantization config defined above;
# device_map="auto" lets the library decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)

# Tokenizer from the same repository as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [None]:
# Optional diagnostic, disabled by default: flip the condition to True to print
# the CUDA memory currently allocated/reserved, the number of visible GPUs and
# the total memory of GPU 0 (all values converted from bytes to MB).
if False:
    first_gpu = torch_cuda.get_device_properties(0)

    print(
        """\
  Memory allocated: {0:.2f} MB \
  Memory reserved: {1:.2f} MB\

  Total GPUs: {2}
  GPU's memory: {3:.1f} MB
  """.format(
            torch_cuda.memory_allocated() / 1_024**2,
            torch_cuda.memory_reserved() / 1_024**2,
            torch_cuda.device_count(),
            first_gpu.total_memory / 1_024**2,
        )
    )

In [1]:
# Optional inspection step, disabled by default: downloads the model's
# config.json and safetensors index from the Hugging Face hub and prints the
# local paths they were saved to. The cell that pretty-prints these files
# relies on `config_file_path` / `index_file_path` being set here.
if False:
    # Replace the model id with your specific model
    MODEL_ID = "NousResearch/Meta-Llama-3.1-8B-Instruct"

    # Download config.json file from the Hugging Face hub
    config_file_path = huggingface_hub.hf_hub_download(MODEL_ID, "config.json")
    print(config_file_path)

    # Download the index that maps weight names to safetensors shards
    index_file_path = huggingface_hub.hf_hub_download(
        MODEL_ID, "model.safetensors.index.json"
    )
    print(index_file_path)

In [None]:
# Optional inspection step, disabled by default: pretty-prints the downloaded
# config.json and safetensors index.
# NOTE(review): `config_file_path` and `index_file_path` are only assigned by
# the (also disabled) download cell, so both cells must be enabled together.
if False:
    # NOTE(review): `json` is already imported in the notebook's import cell.
    import json

    with open(config_file_path, "r") as config_file:
        config_data = json.load(config_file)
        print(json.dumps(config_data, indent=4))

    with open(index_file_path, "r") as index_file:
        index_data = json.load(index_file)
        print(json.dumps(index_data, indent=4))

In [None]:
# Reuse the end-of-sequence token as the padding token
# (presumably because the tokenizer ships without a dedicated pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

In [None]:
print(model)

In [None]:
!git clone https://github.com/lauramanor/legal_summarization

In [None]:
# Read the TL;DRLegal file from the cloned repository. The file is a JSON object
# whose values are the individual records; the keys are not needed downstream.
# The encoding is stated explicitly so the result does not depend on the
# platform's default text encoding.
with open("legal_summarization/tldrlegal_v1.json", encoding="utf-8") as f:
    data = json.load(f)

# One list entry per record, in the file's original order.
jsonl_array = list(data.values())

In [None]:
jsonl_array[0].keys()

* https://huggingface.co/docs/datasets/en/index

In [None]:
legal_dataset = Dataset.from_list(jsonl_array)

In [None]:
legal_dataset

In [None]:
# Hold out 20% of the records. The seed is fixed so that the split (and therefore
# the `legal_dataset["test"][i]` examples inspected before and after fine-tuning)
# is the same on every Restart & Run All.
legal_dataset = legal_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Split the held-out portion in half (validation / test). The seed is fixed so
# the examples compared before and after fine-tuning are reproducible.
legal_dataset_test_valid = legal_dataset["test"].train_test_split(
    test_size=0.5, seed=42
)

In [None]:
# Assemble the final three-way split: the original training portion, plus the
# two halves of the held-out portion ("train" half -> validation, "test" half -> test).
split_assignment = {
    "train": legal_dataset["train"],
    "test": legal_dataset_test_valid["test"],
    "validation": legal_dataset_test_valid["train"],
}
legal_dataset = datasets.DatasetDict(split_assignment)

In [None]:
legal_dataset

In [None]:
legal_dataset["train"][0]

### Instruction Templating

https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1/

In [None]:
# Prompt up to (and including) the assistant header. Per the Llama 3.1 prompt
# format, every `<|end_header_id|>` is followed by a blank line ("\n\n") before
# the message content, so the template ends with that separator: the response
# (during training) or the generated text (during inference) starts right after it.
INSTRUCTION_PROMPT_TEMPLATE = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "Please convert the following legal content into a short human-readable summary"
    "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    "[LEGAL_DOC]{LEGAL_TEXT}[END_LEGAL_DOC]"
    "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
)

# Assistant turn appended to the prompt when building training examples.
RESPONSE_TEMPLATE = """\
{NATURAL_LANGUAGE_SUMMARY}<|eot_id|><|end_of_text|>"""

In [None]:
def create_prompt(sample: dict, include_response: bool = True) -> str:
    """Render one dataset record with the notebook's prompt templates.

    Args:
        sample: Mapping that provides an "original_text" entry and, when
            ``include_response`` is true, a "reference_summary" entry.
        include_response: When true (the default) the reference summary is
            appended via ``RESPONSE_TEMPLATE``, producing a full training
            example. When false only the instruction part is returned, i.e.
            the text the model should continue from.

    Returns:
        The formatted prompt string.

    Raises:
        KeyError: If a required entry is missing from ``sample``.
    """
    prompt_parts = [
        INSTRUCTION_PROMPT_TEMPLATE.format(LEGAL_TEXT=sample["original_text"])
    ]

    if include_response:
        prompt_parts.append(
            RESPONSE_TEMPLATE.format(
                NATURAL_LANGUAGE_SUMMARY=sample["reference_summary"]
            )
        )

    return "".join(prompt_parts)

In [None]:
print(create_prompt(legal_dataset["test"][1]))

In [None]:
def generate_response(prompt, model, tokenizer):
    """
    Parameters:
      - prompt: str representing formatted prompt; it must already contain the
        chat special tokens (including <|begin_of_text|>)
      - model: model object exposing `.device` and `.generate(...)`
      - tokenizer: tokenizer object matching the model

    Functionality:
      Tokenizes the prompt, samples up to 256 new tokens from the model and
      decodes only the newly generated tokens.

    Returns:
      - str response of the model (the prompt is excluded; special tokens
        emitted by the model are kept in the text)
    """

    # convert str input into tokenized input; the prompt template already
    # starts with <|begin_of_text|>, so don't let the tokenizer add another one
    encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

    # send the tokenized inputs to whichever device the model lives on
    model_inputs = encoded_input.to(model.device)
    prompt_length = model_inputs["input_ids"].shape[-1]

    # generate response and set desired generation parameters
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=256,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + completion; keep only the completion tokens
    # instead of relying on a string split over the decoded text
    completion_ids = generated_ids[:, prompt_length:]

    # decode output from tokenized output to str output
    return tokenizer.batch_decode(completion_ids)[0]

In [None]:
def generate_response(
    prompt: str,
    model: "transformers.PreTrainedModel",
    tokenizer: "transformers.PreTrainedTokenizerBase",
) -> str:
    """Sample a completion for an already-formatted prompt.

    Args:
        prompt: Fully formatted prompt string. It must already contain the chat
            special tokens (including ``<|begin_of_text|>``), because the
            tokenizer is told not to add any.
        model: Causal language model exposing ``.device`` and ``.generate``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        excluded; special tokens emitted by the model are kept in the text).
    """
    # convert str input into tokenized input; the prompt template already
    # starts with <|begin_of_text|>, so don't let the tokenizer add another one
    encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

    # send the tokenized inputs to whichever device the model lives on
    model_inputs = encoded_input.to(model.device)
    prompt_length = model_inputs["input_ids"].shape[-1]

    # generate response and set desired generation parameters
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=256,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + completion; keep only the completion tokens
    # instead of relying on a string split over the decoded text
    completion_ids = generated_ids[:, prompt_length:]

    # decode output from tokenized output to str output
    return tokenizer.batch_decode(completion_ids)[0]

In [None]:
generate_response(
    create_prompt(legal_dataset["test"][1], include_response=False), model, tokenizer
)

In [None]:
# Ground Truth Summary
legal_dataset["test"][1]["reference_summary"]

In [None]:
# Let's try another just to see how the model responds to a different prompt.
generate_response(
    create_prompt(legal_dataset["test"][3], include_response=False), model, tokenizer
)

In [None]:
# Ground Truth Summary
legal_dataset["test"][3]["reference_summary"]

The response, while an accurate *recounting* of the legal text, is certainly not what would be considered a summary.

Let's see how fine-tuning can improve these responses.

### Required Post Processing

In [None]:
# Keep a handle on the model's configuration object.
# NOTE(review): `model_config` is not referenced again in the visible notebook.
model_config = model.config
# PEFT helper that prepares a quantized (k-bit) model for training; `model` is
# rebound to the prepared model.
model = peft.prepare_model_for_kbit_training(model)

## Task #3: Setting up PEFT LoRA

In [None]:
def print_trainable_parameters(model: torch.nn.Module) -> None:
    """Print how many of the model's parameters are trainable.

    Counts elements (``numel()``) over all parameters of ``model`` and prints
    the trainable count, the total count and the trainable percentage. A model
    without any parameters reports 0.0% instead of dividing by zero.

    Args:
        model: Any ``torch.nn.Module`` (plain, quantized or PEFT-wrapped).

    NOTE(review): for 4-bit quantized layers ``numel()`` may reflect the packed
    storage size rather than the logical weight count, which would understate
    the total — confirm against PEFT's own ``print_trainable_parameters``.
    """
    all_param = 0
    trainable_params = 0
    for param in model.parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the empty-model case so the helper never raises ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(
        "trainable params: {0} || all params: {1} || trainable%: {2}".format(
            trainable_params, all_param, trainable_pct
        )
    )

#### Initializing LoRA Config

* https://huggingface.co/docs/peft/main/en/package_reference/lora#peft.LoraConfig
* https://huggingface.co/docs/peft/main/en/package_reference/peft_model#peft.get_peft_model

In [None]:
# set our rank (higher value is more memory/better performance)
lora_r = 16

# set our dropout (note: this is not the library default; PEFT's LoraConfig
# defaults lora_dropout to 0.0)
lora_dropout = 0.1

# rule of thumb: alpha should be (lora_r * 2)
lora_alpha = 32

# construct our LoraConfig with the above hyperparameters
peft_config = peft.LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",  # do not train bias terms
    # PEFT shorthand for adapting the model's linear layers (see the LoraConfig
    # docs for which layers are excluded)
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

# wrap the prepared base model with LoRA adapters; `model` is rebound to the
# PEFT-wrapped model from here on
model = peft.get_peft_model(model, peft_config)

In [None]:
print_trainable_parameters(model)

In [None]:
print(model)

## Task #4: Training the Model


### Setting up Training

In [None]:
# Maximum sequence length (in tokens) handed to the SFT configuration below.
max_seq_length = 2_048

# Supervised fine-tuning configuration.
args = trl.SFTConfig(
    # local output directory for this run
    output_dir="llama381binstruct_summarize_short",
    max_steps=500,  # total number of optimizer steps
    per_device_train_batch_size=1,
    warmup_steps=30,  # learning-rate warmup steps
    logging_steps=10,  # log every 10 steps
    eval_strategy="steps",  # evaluate on a step schedule...
    eval_steps=25,  # ...every 25 steps
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    dataset_kwargs={
        # the prompt template already contains <|begin_of_text|> and the other
        # special tokens, so none should be added during dataset preparation
        "add_special_tokens": False,
        "append_concat_token": False,
    },
    max_seq_length=max_seq_length,
    # NOTE(review): with packing enabled, examples are presumably concatenated
    # into sequences of up to max_seq_length tokens — confirm against the TRL docs
    packing=True,
)

* https://huggingface.co/docs/trl/en/sft_trainer

In [None]:
# Build the supervised fine-tuning trainer. `create_prompt` turns each dataset
# record into the full prompt + reference summary text used for training.
# NOTE(review): `model` was already wrapped by `peft.get_peft_model` earlier in
# the notebook, so passing `peft_config` again looks redundant — confirm how the
# installed TRL version treats an already-wrapped PEFT model.
trainer = trl.SFTTrainer(
    model=model,
    peft_config=peft_config,
    tokenizer=tokenizer,
    formatting_func=create_prompt,
    args=args,
    train_dataset=legal_dataset["train"],
    eval_dataset=legal_dataset["validation"],
)

In [None]:
trainer.train()

## Task #5: Share Your Model!

In [None]:
huggingface_hub.notebook_login()

In [None]:
# Trainer.push_to_hub's first positional parameter is the commit message, not a
# repository id, so a repo name must not be passed here. The destination
# repository is taken from the training arguments (hub_model_id, falling back
# to the name of output_dir).
trainer.push_to_hub(commit_message="End of training")

### Compare Outputs

In [None]:
# Fold the LoRA adapter weights into the base model and drop the PEFT wrappers.
# NOTE(review): the base model was loaded in 4-bit; merging adapters into
# quantized weights may introduce rounding error — confirm this is acceptable,
# or merge into a full/half-precision copy of the base model instead.
merged_model = model.merge_and_unload()

In [None]:
merged_model.push_to_hub(
    "SethWeidman/llama381binstruct_summarize_short_merged_test",
    safe_serialization=True,
)

In [None]:
tokenizer.push_to_hub("SethWeidman/llama381binstruct_summarize_short_merged_test")

In [None]:
generate_response(
    create_prompt(legal_dataset["test"][1], include_response=False),
    merged_model,
    tokenizer,
)

In [None]:
legal_dataset["test"][3]["original_text"]

In [None]:
generate_response(
    create_prompt(legal_dataset["test"][3], include_response=False),
    merged_model,
    tokenizer,
)

In [None]:
# Ground Truth Summary
legal_dataset["test"][3]["reference_summary"]