In [1]:
# Imports for model loading, AWQ quantization, dataset handling, and training
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainerCallback, Trainer, TrainingArguments
from awq import AutoAWQForCausalLM
from datasets import load_dataset
import torch
from dataset import ConstantLengthDataset
from tqdm import tqdm
from contextlib import nullcontext
from trl import SFTTrainer
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use the first CUDA device when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [3]:
# Identifier of the base (unquantized) checkpoint. Note that a later cell
# rebinds `model_path` to the AWQ-quantized copy of this model.
model_path = "microsoft/Phi-3-mini-4k-instruct"

# Model Loading

In [7]:
# tokenizer = AutoTokenizer.from_pretrained(model_path , trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained(model_path , trust_remote_code=True).to(device)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.90s/it]


## Model Quantizing

In [7]:
# quant_path = model_path + "-quant"
# quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version":"GEMM"}

# # Load model
# model = AutoAWQForCausalLM.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# # Quantize
# model.quantize(tokenizer, quant_config=quant_config)

Fetching 19 files: 100%|██████████| 19/19 [00:00<00:00, 70586.16it/s]
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]
Downloading readme: 100%|██████████| 167/167 [00:00<00:00, 733kB/s]
Repo card metadata block was not found. Setting CardData to empty.
Downloading data: 100%|██████████| 471M/471M [00:05<00:00, 90.2MB/s] 
Generating validation split: 100%|██████████| 214670/214670 [00:21<00:00, 10002.73 examples/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (8321 > 4096). Running this sequence through the model will result in indexing errors
AWQ: 100%|██████████| 32/32 [16:00<00:00, 30.00s/it]


In [8]:
# from transformers import AwqConfig, AutoConfig
# from huggingface_hub import HfApi

# # modify the config file so that it is compatible with transformers integration
# quantization_config = AwqConfig(
#     bits=quant_config["w_bit"],
#     group_size=quant_config["q_group_size"],
#     zero_point=quant_config["zero_point"],
#     version=quant_config["version"].lower(),
# ).to_dict()

# # the pretrained transformers model is stored in the model attribute + we need to pass a dict
# model.model.config.quantization_config = quantization_config
# # a second solution would be to use Autoconfig and push to hub (what we do at llm-awq)


# # save model weights
# model.save_quantized(quant_path)
# tokenizer.save_pretrained(quant_path)

Note that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using split_torch_state_dict_into_shards from huggingface_hub library


('microsoft/Phi-3-mini-4k-instruct-quant/tokenizer_config.json',
 'microsoft/Phi-3-mini-4k-instruct-quant/special_tokens_map.json',
 'microsoft/Phi-3-mini-4k-instruct-quant/tokenizer.json')

In [13]:
# api = HfApi()
# api.upload_folder(
#     folder_path=quant_path,
#     repo_id="TommyBark/Phi-3-mini-4k-instruct-awq",
#     repo_type="model",
# )

model.safetensors: 100%|██████████| 2.28G/2.28G [01:10<00:00, 32.2MB/s]


CommitInfo(commit_url='https://huggingface.co/TommyBark/Phi-3-mini-4k-instruct-awq/commit/fb66cd9d1bcad4142b6f652b28d16eaa10c03eb2', commit_message='Upload folder using huggingface_hub', commit_description='', oid='fb66cd9d1bcad4142b6f652b28d16eaa10c03eb2', pr_url=None, pr_revision=None, pr_num=None)

In [4]:
# Prefer the quantized checkpoint saved on disk and fall back to the uploaded
# copy only when the local directory does not exist.
quant_model_path = "TommyBark/Phi-3-mini-4k-instruct-awq"
local_model_path = "./microsoft/Phi-3-mini-4k-instruct-quant/"
model_path = local_model_path if os.path.exists(local_model_path) else quant_model_path

# NOTE(review): the recorded output warns that the model is loaded on CPU even
# though CUDA is available; the generation cell later moves it with `.to(device)`.
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [4]:
# Load state dict from previous finetuning
# state_dict_path = "./finetuning"



# model = AutoAWQForCausalLM.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

OSError: ./finetuning does not appear to have a file named config.json. Checkout 'https://huggingface.co/./finetuning/tree/None' for available files.

# Data Loading

In [5]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Compute the mean characters-per-token ratio over a sample of the dataset.

    Reads the ``"document"`` field of at most ``nb_examples`` leading examples,
    tokenizes each one, and returns the summed character count divided by the
    summed token count.
    """
    def count_tokens(text):
        # Fast tokenizers expose the tokens on the returned encoding; otherwise
        # fall back to the plain ``tokenize`` method.
        if tokenizer.is_fast:
            return len(tokenizer(text).tokens())
        return len(tokenizer.tokenize(text))

    char_count = 0
    token_count = 0
    # ``range`` is the first zip argument, so iteration stops after
    # ``nb_examples`` items without pulling an extra example from the dataset.
    sample = zip(range(nb_examples), iter(dataset))
    for _, example in tqdm(sample, total=nb_examples):
        document = example["document"]
        char_count += len(document)
        token_count += count_tokens(document)

    return char_count / token_count

def create_datasets(
    tokenizer,
    dataset_name,
    split,
    streaming=True,
    seq_length=1024,
    size_valid_set=100,
    seed=None,
    shuffle_buffer=4000,
):
    """
    Load a text dataset and wrap it into train/validation ``ConstantLengthDataset``s.

    Args:
        tokenizer: Tokenizer passed to ``chars_token_ratio`` and to both
            ``ConstantLengthDataset`` instances.
        dataset_name: Dataset identifier forwarded to ``load_dataset``.
        split: Split name forwarded to ``load_dataset``.
        streaming: When true, the first ``size_valid_set`` examples become the
            validation data and the remainder is shuffled through a buffer.
            Otherwise the loaded dataset is split with ``train_test_split``
            using ``test_size=0.005``.
        seq_length: Forwarded to both ``ConstantLengthDataset`` instances.
        size_valid_set: Number of leading examples reserved for validation.
            Only used in streaming mode.
        seed: Seed for the streaming shuffle or the train/test split. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            integer for a reproducible split.
        shuffle_buffer: Buffer size for the streaming shuffle (default 4000,
            the previously hard-coded value).

    Returns:
        A ``(train_dataset, valid_dataset)`` tuple. The training dataset is
        built with ``infinite=True``, the validation dataset with
        ``infinite=False``.
    """
    # NOTE(review): `use_auth_token` is deprecated in recent `datasets`
    # releases in favour of `token` — confirm against the installed version.
    dataset = load_dataset(
        dataset_name,
        split=split,
        use_auth_token=True,
        num_proc=None,
        streaming=streaming,
    )
    if streaming:
        print("Loading the dataset in streaming mode")
        valid_data = dataset.take(size_valid_set)
        train_data = dataset.skip(size_valid_set)
        train_data = train_data.shuffle(buffer_size=shuffle_buffer, seed=seed)
    else:
        dataset = dataset.train_test_split(test_size=0.005, seed=seed)
        train_data = dataset["train"]
        valid_data = dataset["test"]
        print(
            f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}"
        )

    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    def document_text(example):
        # Both datasets read the raw text from the same "document" field.
        return example["document"]

    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        formatting_func=document_text,
        infinite=True,
        seq_length=seq_length,
        chars_per_token=chars_per_token,
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        formatting_func=document_text,
        infinite=False,
        seq_length=seq_length,
        chars_per_token=chars_per_token,
    )
    return train_dataset, valid_dataset

In [6]:
#ds = load_dataset("HFforLegal/case-law",split='us', streaming=True)
# Build the train/validation datasets from the streamed "us" split; in
# streaming mode the first `size_valid_set` examples form the validation set.
train_ds, eval_ds = create_datasets(
    tokenizer,
    dataset_name="HFforLegal/case-law",
    split="us",
    streaming=True,
    seq_length=1024,
    size_valid_set=100,
)



Loading the dataset in streaming mode


  0%|          | 1/400 [00:01<13:08,  1.98s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (10379 > 4096). Running this sequence through the model will result in indexing errors
100%|██████████| 400/400 [00:04<00:00, 82.26it/s] 

The character to token ratio of the dataset is: 3.40





In [7]:
# Sanity check: print only the first item yielded by the evaluation dataset.
for i in eval_ds:
    print(i)
    break

{'input_ids': tensor([ 1663, 29884,  4543,  ...,  1105, 29889, 29871], device='cuda:0'), 'labels': tensor([ 1663, 29884,  4543,  ...,  1105, 29889, 29871], device='cuda:0')}


In [14]:
# Sanity check: print only the first item yielded by the training dataset.
for i in train_ds:
    print(i)
    break

{'input_ids': tensor([ 263, 2022,  310,  ...,  379, 3437,  325], device='cuda:0'), 'labels': tensor([ 263, 2022,  310,  ...,  379, 3437,  325], device='cuda:0')}


In [8]:
class ProfilerCallback(TrainerCallback):
    """Trainer callback that advances a profiler whenever ``on_step_end`` fires."""

    def __init__(self, profiler):
        # The only requirement visible here is that the object has ``step()``.
        self.profiler = profiler

    def on_step_end(self, *_args, **_kwargs):
        """Call ``step()`` on the wrapped profiler; all hook arguments are ignored."""
        self.profiler.step()

In [9]:
enable_profiler = True
output_dir = "./finetuning"

if not enable_profiler:
    # No-op stand-in so the training cell can always use `with profiler:`.
    profiler = nullcontext()
else:
    # Step counts for each phase of the profiler schedule.
    wait, warmup, active, repeat = 1, 1, 2, 1
    # NOTE(review): `total_steps` is not referenced anywhere else in the
    # visible notebook — confirm whether it is still needed.
    total_steps = (wait + warmup + active) * (1 + repeat)
    schedule = torch.profiler.schedule(
        wait=wait,
        warmup=warmup,
        active=active,
        repeat=repeat,
    )
    profiler = torch.profiler.profile(
        schedule=schedule,
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        # Traces are handed to the TensorBoard handler under the output directory.
        on_trace_ready=torch.profiler.tensorboard_trace_handler(
            f"{output_dir}/logs/tensorboard"
        ),
    )

    # The callback advances the profiler from inside the training loop.
    profiler_callback = ProfilerCallback(profiler)

In [10]:
from utils import FinetuningArguments

In [11]:
# Build the project's fine-tuning arguments for the selected checkpoint and
# pull out its two sub-configs. `training_args` is rebound by a later cell
# that constructs `TrainingArguments` directly.
script_args = FinetuningArguments(model_name=model_path)
peft_config, training_args = script_args.peft_config, script_args.training_args

In [12]:
# Keyword arguments for `TrainingArguments`, kept in a dict so they can be
# inspected or logged separately from the constructed object.
training_config = dict(
    bf16=True,
    do_eval=False,
    learning_rate=5.0e-06,
    log_level="info",
    logging_steps=20,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    # The recorded training log states that `max_steps` overrides this value.
    num_train_epochs=1,
    output_dir="./results_ft",
    overwrite_output_dir=True,
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    remove_unused_columns=False,
    save_steps=100,
    save_total_limit=1,
    seed=0,
    # gradient_checkpointing=True,
    # gradient_checkpointing_kwargs={"use_reentrant": False},
    gradient_accumulation_steps=1,
    warmup_ratio=0.2,
    report_to="wandb",
    run_name="ft-phi-3-mini-4k-instruct",
    max_steps=100,
)
training_args = TrainingArguments(**training_config)

In [13]:
# Trainer construction and the training run both happen inside the profiler
# context; `profiler` is a no-op context manager when profiling is disabled.
with profiler:
    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        callbacks=[] if not enable_profiler else [profiler_callback],
    )
    trainer.train()

# NOTE(review): the recorded log says the model is not a `PreTrainedModel`, so
# only its state dict is written here; an earlier cell's recorded error shows
# `./finetuning` has no config.json — confirm how this checkpoint is reloaded.
trainer.save_model(output_dir)

max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
***** Running training *****
  Num examples = 100
  Num Epochs = 9,223,372,036,854,775,807
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 197,200,896
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtomas-t[0m ([33mda-zealots[0m). Use [1m`wandb login --relogin`[0m to force relogin


Could not log the number of model parameters in Weights & Biases.
You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss
20,1.0113
40,0.0
60,0.0
80,0.0
100,0.0


STAGE:2024-07-29 14:19:42 35997:35997 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
[W CPUAllocator.cpp:249] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event
STAGE:2024-07-29 14:19:44 35997:35997 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-07-29 14:19:44 35997:35997 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
Saving model checkpoint to ./results_ft/checkpoint-100
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in ./results_ft/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./results_ft/checkpoint-100/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./finetuning
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in ./finetuning/tokenizer_

In [16]:
# Run evaluation with the trainer; the return value is shown as the cell output.
# The recorded run of this cell was interrupted (KeyboardInterrupt).
trainer.evaluate()


***** Running Evaluation *****
  Num examples: Unknown
  Batch size = 1


KeyboardInterrupt: 

In [9]:
# Tokenize a short legal-style prompt into PyTorch tensors and move them to
# the device chosen at the top of the notebook.
prompt = "The court held that"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

In [11]:
# Move the model to the same device as the inputs, then generate from the
# prompt ids with `max_length=300`.
generate_ids = model.to(device).generate(inputs.input_ids, max_length=300)

In [24]:
# Decode the generated ids back to text (special tokens skipped) and display
# the first decoded sequence as the cell output.
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

"The court held that the defendant's actions were not protected under the First Amendment because they constituted a true threat, which is not a form of protected speech.\n\nStep 2: Understand the context of the case.\nThe case involves a defendant who made statements that were interpreted as a threat. The court had to determine whether these statements were protected speech under the First Amendment or if they crossed the line into unprotected true threats.\n\nStep 3: Analyze the court's reasoning.\nThe court's analysis focused on the nature of the statements made by the defendant. It considered whether the statements were made in a context that would lead a reasonable person to perceive them as a serious expression of intent to inflict harm.\n\nStep 4: Consider the precedent.\nThe court referenced the precedent set in United States v. Alvarez, which established that the First Amendment does not protect speech that constitutes a true threat. This precedent is crucial in understanding 