In [1]:
from datasets import load_dataset

# download dataset
dataset = load_dataset("csv", data_files="finetune.csv")
print(dataset)

# print a sample triplet
print(dataset["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Input', 'Context', 'Output'],
        num_rows: 98
    })
})
{'Input': 'What is the primary purpose of patronage by the European Parliament?', 'Context': '[Patronage is a form of moral support. No financial undertaking nor material obligation are linked to the granting of patronage.] [Patronage is granted by decision of the President of the Parliament to events that meet the conditions laid down in these rules.] [Patronage is a way for Parliament to associate itself with a selected number of events which satisfy the conditions for eligibility in order to increase public interest in and visibility of its activities and those of the European Union among citizens the media and civil society.] [Patronage shall only be granted to specific and confirmed events.] [Requests for patronage shall be submitted to the President preferably through the web form or by email or by post.]', 'Output': 'Patronage is a way for Parliament to associate i

In [2]:
def format_instruction(sample):
    return f"""You are an expert on European Parliament administration. Provide a correct and up to date response based on the given query.
        ### Input:
        {sample["Input"]}

        ### Context:
        {sample["Context"]}

        ### Response:
        {sample["Output"]}
    """

In [3]:
sample = dataset["train"][0]
print(format_instruction(sample))

You are an expert on European Parliament administration. Provide a correct and up to date response based on the given query.
        ### Input:
        What is the primary purpose of patronage by the European Parliament?

        ### Context:
        [Patronage is a form of moral support. No financial undertaking nor material obligation are linked to the granting of patronage.] [Patronage is granted by decision of the President of the Parliament to events that meet the conditions laid down in these rules.] [Patronage is a way for Parliament to associate itself with a selected number of events which satisfy the conditions for eligibility in order to increase public interest in and visibility of its activities and those of the European Union among citizens the media and civil society.] [Patronage shall only be granted to specific and confirmed events.] [Requests for patronage shall be submitted to the President preferably through the web form or by email or by post.]

        ### Respons

In [4]:
from transformers import BitsAndBytesConfig
import torch

# BitsAndBytesConfig to quantize the model int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# base model id to fine-tune
model_id = "mistralai/Mistral-7B-v0.3"
access_token="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"

# load model 
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config, 
    use_cache=False, 
    device_map="auto",
    token=access_token
)
model.config.pretraining_tp = 1

# load tokenizer, pad short samples with end of sentence token
tokenizer = AutoTokenizer.from_pretrained(model_id, force_download=True, token=access_token)
tokenizer.pad_token = tokenizer.eos_token



config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [6]:
from peft import LoraConfig

# LoRA config based on QLoRA paper
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

In [7]:
from peft import prepare_model_for_kbit_training, get_peft_model

# prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

# get frozen vs trainable model param statistics
print_trainable_parameters(model)

trainable params: 85065728 || all params: 3843428352 || trainable%: 2.213277319342624


In [13]:
from trl import SFTTrainer
from transformers import TrainingArguments

model_args = TrainingArguments(
    output_dir="mistral-7b-style",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True,
    tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=False
)

# Supervised Fine-Tuning Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    peft_config=peft_config,
    max_seq_length=32,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=format_instruction,
    args=model_args,
)

# train
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]



Step,Training Loss
10,2.61
20,1.89
30,1.2632
40,1.2035
50,1.0994
60,1.1832
70,1.176
80,1.0724
90,1.0278
100,0.9001




TrainOutput(global_step=288, training_loss=0.7661446920699544, metrics={'train_runtime': 545.3632, 'train_samples_per_second': 4.208, 'train_steps_per_second': 0.528, 'total_flos': 3172110760673280.0, 'train_loss': 0.7661446920699544, 'epoch': 3.0})

In [14]:
# save model to output_dir in TrainingArguments
model.merge_and_unload()
trainer.save_model()

# login to HF hub
from huggingface_hub import login
login("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")
    
# push model and tokenizer to HF hub under your username
trainer.model.push_to_hub("mistral-7b-eugpt")
tokenizer.push_to_hub("mistral-7b-eugpt")



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/splacintescu/.cache/huggingface/token
Login successful


adapter_model.safetensors:   0%|          | 0.00/877M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/xX-FANE-Xx/mistral-7b-eugpt/commit/51c44d8d22e124cc6906459c969c4f6150e59daa', commit_message='Upload tokenizer', commit_description='', oid='51c44d8d22e124cc6906459c969c4f6150e59daa', pr_url=None, pr_revision=None, pr_num=None)