In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainerCallback, Trainer, TrainingArguments, BitsAndBytesConfig
from awq import AutoAWQForCausalLM
from datasets import load_dataset
import torch
from dataset import ConstantLengthDataset
from tqdm import tqdm
from contextlib import nullcontext
from trl import SFTTrainer
import os
from huggingface_hub import HfApi
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
torch.manual_seed(0)

In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [3]:
model_path = "microsoft/Phi-3-mini-4k-instruct"

# Model Initialization

## Model Quantizing

### Peft

In [4]:
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

In [5]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1, target_modules = "all-linear"
)

### AWQ

In [7]:
# from transformers import AwqConfig, AutoConfig
# quant_path = model_path + "-quant"
# quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version":"GEMM"}

# # Load model
# model = AutoAWQForCausalLM.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# # Quantize
# model.quantize(tokenizer, quant_config=quant_config)


# # modify the config file so that it is compatible with transformers integration
# quantization_config = AwqConfig(
#     bits=quant_config["w_bit"],
#     group_size=quant_config["q_group_size"],
#     zero_point=quant_config["zero_point"],
#     version=quant_config["version"].lower(),
# ).to_dict()

# # the pretrained transformers model is stored in the model attribute + we need to pass a dict
# model.model.config.quantization_config = quantization_config
# # a second solution would be to use Autoconfig and push to hub (what we do at llm-awq)


# # save model weights
# model.save_quantized(quant_path)
# tokenizer.save_pretrained(quant_path)
# api = HfApi()
# api.upload_folder(
#     folder_path=quant_path,
#     repo_id="TommyBark/Phi-3-mini-4k-instruct-awq",
#     repo_type="model",
# )

Fetching 19 files: 100%|██████████| 19/19 [00:00<00:00, 70586.16it/s]
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]
Downloading readme: 100%|██████████| 167/167 [00:00<00:00, 733kB/s]
Repo card metadata block was not found. Setting CardData to empty.
Downloading data: 100%|██████████| 471M/471M [00:05<00:00, 90.2MB/s] 
Generating validation split: 100%|██████████| 214670/214670 [00:21<00:00, 10002.73 examples/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (8321 > 4096). Running this sequence through the model will result in indexing errors
AWQ: 100%|██████████| 32/32 [16:00<00:00, 30.00s/it]


## Model Loading

### Peft

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_path , trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path , trust_remote_code=True, quantization_config=bnb_config)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.13s/it]


In [8]:
model = prepare_model_for_kbit_training(model)

In [9]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 12,582,912 || all params: 3,833,662,464 || trainable%: 0.3282


### Original

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(model_path , trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained(model_path , trust_remote_code=True).to(device)

### AWQ Quantized - don't do this for finetuning

In [None]:
hf_model_path = "TommyBark/Phi-3-mini-4k-instruct-awq"
local_model_path = "./microsoft/Phi-3-mini-4k-instruct-quant/"
if os.path.exists(local_model_path):
    model_path = local_model_path
else:
    model_path = hf_model_path
    
model = AutoAWQForCausalLM.from_quantized(model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Data Loading

In [10]:
def chars_token_ratio(dataset, tokenizer, nb_examples=200):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = example["document"]
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    return total_characters / total_tokens

def create_datasets(tokenizer, dataset_name, split, streaming = True, seq_length = 1024,size_valid_set = 100):
    dataset = load_dataset(
        dataset_name,
        split=split,
        use_auth_token=True,
        num_proc=None,
        streaming=streaming,
    )
    if streaming:
        shuffle_buffer = 4000
        print("Loading the dataset in streaming mode")
        valid_data = dataset.take(size_valid_set)
        train_data = dataset.skip(size_valid_set)
        train_data = train_data.shuffle(buffer_size=shuffle_buffer, seed=None)
    else:
        dataset = dataset.train_test_split(test_size=0.005, seed=None)
        train_data = dataset["train"]
        valid_data = dataset["test"]
        print(
            f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}"
        )

    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        formatting_func=lambda x: x["document"],
        infinite=True,
        seq_length=seq_length,
        chars_per_token=chars_per_token,
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        formatting_func=lambda x: x["document"],
        infinite=False,
        seq_length=seq_length,
        chars_per_token=chars_per_token,
    )
    return train_dataset, valid_dataset

In [11]:
#ds = load_dataset("HFforLegal/case-law",split='us', streaming=True)
train_ds, eval_ds = create_datasets(tokenizer, "HFforLegal/case-law", "us", streaming=True, seq_length=1024, size_valid_set=100)



Loading the dataset in streaming mode


  0%|          | 1/200 [00:06<22:58,  6.93s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (9045 > 4096). Running this sequence through the model will result in indexing errors
100%|██████████| 200/200 [00:08<00:00, 23.47it/s] 

The character to token ratio of the dataset is: 3.38





In [12]:
for i in eval_ds:
    print(i)
    break

{'input_ids': tensor([29906,   501, 29889,  ...,  9245,   756,  2217], device='cuda:0'), 'labels': tensor([29906,   501, 29889,  ...,  9245,   756,  2217], device='cuda:0')}


In [13]:
for i in train_ds:
    print(i)
    break

{'input_ids': tensor([ 1212,  2519, 29892,  ...,   342,   403,  2545], device='cuda:0'), 'labels': tensor([ 1212,  2519, 29892,  ...,   342,   403,  2545], device='cuda:0')}


# Finetuning

In [16]:
from utils import FinetuningArguments

In [17]:
output_dir = "./finetuning"

In [18]:
class ProfilerCallback(TrainerCallback):
    def __init__(self, profiler):
        self.profiler = profiler

    def on_step_end(self, *args, **kwargs):
        self.profiler.step()

In [19]:
enable_profiler = True
if enable_profiler:
    wait, warmup, active, repeat = 1, 1, 2, 1
    total_steps = (wait + warmup + active) * (1 + repeat)
    schedule = torch.profiler.schedule(
        wait=wait, warmup=warmup, active=active, repeat=repeat
    )
    profiler = torch.profiler.profile(
        schedule=schedule,
        on_trace_ready=torch.profiler.tensorboard_trace_handler(
            f"{output_dir}/logs/tensorboard"
        ),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
    )

    profiler_callback = ProfilerCallback(profiler)
else:
    profiler = nullcontext()

In [20]:
script_args = FinetuningArguments(model_name = model_path)
peft_config = script_args.peft_config
training_args = script_args.training_args

In [25]:
training_config = {
    "bf16": True,
    "do_eval": False,
    "learning_rate": 1.0e-04,
    "log_level": "info",
    "logging_steps": 100,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    "output_dir": "./finetuning",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 2,
    "per_device_train_batch_size": 3,
    "remove_unused_columns": False,
    "save_steps": 100,
    "save_total_limit": 3,
    "seed": 0,
    #    "gradient_checkpointing": True,
    #    "gradient_checkpointing_kwargs":{"use_reentrant": False},
    "gradient_accumulation_steps": 1,
    "warmup_ratio": 0.2,
    "report_to": "wandb",
    "run_name": "ft-phi-3-mini-4k-instruct",
    "max_steps": 1500,
}
training_args = TrainingArguments(**training_config)

PyTorch: setting up devices


In [None]:
with profiler:
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        args=training_args,
        peft_config=peft_config,
        callbacks=[profiler_callback] if enable_profiler else [],
    )
    trainer.train()

trainer.save_model(output_dir)

In [24]:
trainer.evaluate()


***** Running Evaluation *****
  Num examples: Unknown
  Batch size = 1


{'eval_loss': 1.543197512626648,
 'eval_runtime': 242.9595,
 'eval_samples_per_second': 1.877,
 'eval_steps_per_second': 1.877,
 'epoch': 1.0}

In [None]:
test_input = tokenizer.decode(i["input_ids"], skip_special_tokens=True, clean_up_tokenization_spaces=False)

In [34]:
tokenizer.batch_decode(trainer.model.generate(tokenizer(test_input[:100], return_tensors="pt").to(device).input_ids, max_length=50), skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

"law, a trial court may consider in a postjudgment review of a\njury's punitive-damages award. Factor 10, which is not listed in the\ntext of the statute, is not a factor"

In [35]:
test_input[:150]

'law, a trial court may consider in a postjudgment review of a\njury\'s punitive-damages award. Factor number "4" is "the\nfinancial position of the defen'

In [32]:
test_input[:50] 

'law, a trial court may consider in a postjudgment '

# Loading finetuned model

In [None]:
model = AutoAWQForCausalLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)

# Rest

In [23]:
prompt = "The court held that"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

In [11]:
generate_ids = model.to(device).generate(inputs.input_ids, max_length=300)

In [24]:
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

"The court held that the defendant's actions were not protected under the First Amendment because they constituted a true threat, which is not a form of protected speech.\n\nStep 2: Understand the context of the case.\nThe case involves a defendant who made statements that were interpreted as a threat. The court had to determine whether these statements were protected speech under the First Amendment or if they crossed the line into unprotected true threats.\n\nStep 3: Analyze the court's reasoning.\nThe court's analysis focused on the nature of the statements made by the defendant. It considered whether the statements were made in a context that would lead a reasonable person to perceive them as a serious expression of intent to inflict harm.\n\nStep 4: Consider the precedent.\nThe court referenced the precedent set in United States v. Alvarez, which established that the First Amendment does not protect speech that constitutes a true threat. This precedent is crucial in understanding 

In [30]:
generate_ids.shape

torch.Size([1, 300])