In [None]:
# import torch
# torch.cuda.is_available()
# !pip uninstall torch

In [None]:
# Install the notebook's dependencies into the current environment.
# None of these installs is pinned: bitsandbytes and the last line's packages are
# upgraded to their latest release, and transformers / peft / accelerate are built
# from the tip of their GitHub repositories, so results may differ between runs.
!pip install -q -U bitsandbytes
# Prebuilt EETQ 1.0.0 wheel; its filename targets cu121, torch 2.1.2 and cp310 on linux x86_64.
# NOTE(review): EETQ is not referenced by any cell visible in this notebook — confirm it is still needed.
!pip install --no-cache-dir https://github.com/NetEase-FuXi/EETQ/releases/download/v1.0.0/EETQ-1.0.0+cu121+torch2.1.2-cp310-cp310-linux_x86_64.whl
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy ipywidgets matplotlib

In [None]:
from datasets import load_dataset

# Read the training and validation JSON-lines files with identical loader settings.
# Each file is loaded on its own, and in both cases the 'train' split of the
# resulting dataset is requested.
train_dataset, eval_dataset = (
    load_dataset('json', data_files=jsonl_path, split='train')
    for jsonl_path in ('data_train.jsonl', 'data_validate.jsonl')
)

In [None]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

# The model and optimizer state-dict configs use the same two settings:
# offload_to_cpu=True and rank0_only=False.
model_state_cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
optim_state_cfg = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False)

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=model_state_cfg,
    optim_state_dict_config=optim_state_cfg,
)

# Accelerator instance that later wraps the model via accelerator.prepare_model(...).
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [None]:
!pip install -q wandb -U

import wandb, os
wandb.login(key="a43a8f4c6ff88763479fd91dadde1b04a0a424be")

wandb_project = "journal-finetune"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

In [None]:
def formatting_func(example):
    """Render one dataset record as a single training text.

    Args:
        example: Mapping providing the 'question', 'wrong_answer' and
            'found_mistakes' entries.

    Returns:
        A string made of three sections headed '### Question:', '### Answer:'
        and '### Found mistakes:', with consecutive sections separated by a
        newline followed by one space.
    """
    sections = (
        f"### Question: {example['question']}",
        f"### Answer: {example['wrong_answer']}",
        f"### Found mistakes: {example['found_mistakes']}",
    )
    return "\n ".join(sections)

In [None]:
!huggingface-cli login --token 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

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# 4-bit loading with the "nf4" quant type, bfloat16 compute dtype and double
# quantization switched off.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# Load the base model with the quantization settings above; device placement is
# delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

### 3. Tokenization

Set up the tokenizer. Add padding on the left as it [makes training use less memory](https://ai.stackexchange.com/questions/41485/while-fine-tuning-a-decoder-only-llm-like-llama-on-chat-dataset-what-kind-of-pa).


For `model_max_length`, it's helpful to get a distribution of your data lengths. Let's first tokenize without the truncation/padding, so we can get a length distribution.

In [None]:
# Tokenizer used to build the training/validation examples: it is configured to
# pad on the left and to add both a BOS and an EOS token.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
# Reuse the EOS token as the padding token.
# NOTE(review): with pad == eos, a collator that masks padding positions out of
# the loss would presumably also mask the genuine EOS token — confirm the
# fine-tuned model still learns where to stop.
tokenizer.pad_token = tokenizer.eos_token

def generate_and_tokenize_prompt(prompt):
    """Tokenize one dataset record with the tokenizer's default call settings.

    The record is first rendered to text by formatting_func and then passed to
    the module-level tokenizer; no truncation or padding arguments are given.

    Args:
        prompt: A dataset record accepted by formatting_func.

    Returns:
        Whatever the tokenizer call returns for the rendered text.
    """
    rendered_text = formatting_func(prompt)
    return tokenizer(rendered_text)

In [None]:
# Tokenize every record of both splits into plain Python lists, using the
# variant of the tokenization helper that passes no truncation/padding arguments.
tokenized_train_dataset = list(map(generate_and_tokenize_prompt, train_dataset))
tokenized_val_dataset = list(map(generate_and_tokenize_prompt, eval_dataset))

In [None]:
max_length = 512  # This was an appropriate max length for my dataset


def generate_and_tokenize_prompt2(prompt):
    """Tokenize one record to a fixed length and attach labels.

    The record is rendered by formatting_func, then tokenized with truncation
    enabled and padding="max_length", both bounded by the module-level
    max_length. A copy of the resulting input_ids is stored under "labels".

    Args:
        prompt: A dataset record accepted by formatting_func.

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    rendered_text = formatting_func(prompt)
    encoded = tokenizer(
        rendered_text,
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )
    # Labels start out as a copy of the inputs.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

In [None]:
# Apply the fixed-length tokenization helper to both splits via Dataset.map,
# training split first, rebinding the tokenized_* names to the mapped datasets.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(generate_and_tokenize_prompt2)
    for split in (train_dataset, eval_dataset)
)

### How does the base model do?

In [None]:
# Three hand-written evaluation prompts. Each one places a question and a candidate
# answer into the same template and ends with "Found mistakes:" for the model to
# continue.
# NOTE(review): this template ("Question: ... Answer: ... \n Found mistakes:") is not
# the layout formatting_func produces for training ("### Question: ...\n ### Answer:
# ...\n ### Found mistakes: ...") — confirm the difference is intended.

# Prompt 1: English question with a one-line answer.
question = "How to run a Python file that has a .py extension? Answer should be complete step by step."
answer = """python3 main.py"""

eval_prompt1 = f"""Question: {question} Answer: {answer} \n Found mistakes:"""

# Prompt 2: Russian question ("What can be a key in a dictionary?") with a long,
# multi-paragraph answer that includes code snippets.
question = "Что может быть ключом в словаре?"
answer = """
В Python ключом в словаре может быть любой неизменяемый объект, такой как число, строка или кортеж. Например:

my_dict = {1: 'one', 'two': 2, (3, 4): 'three four'}
В этом примере ключами словаря являются число 1, строка 'two' и кортеж (3, 4). Однако, если вы попытаетесь использовать изменяемый объект, такой как список, как ключ словаря, вы получите TypeError:

my_dict = {[1, 2]: 'one two'}
# this will raise a TypeError: unhashable type: 'list'
Также, если вы попытаетесь добавить два ключа в словарь с одинако-вым хеш-кодом, то второй ключ перезапишет первый:

my_dict = {1: 'one', '1': 'one again'}
# this will result in {1: 'one again'}
"""
eval_prompt2 = f"""Question: {question} Answer: {answer} \n Found mistakes:"""

# Prompt 3: the same Russian question with a short one-sentence answer.
question = "Что может быть ключом в словаре?"
answer = """Ключом в словаре может быть сам словарь и список"""
eval_prompt3 = f"""Question: {question} Answer: {answer} \n Found mistakes:"""


In [None]:
# Separate tokenizer for inference: only add_bos_token is requested, so neither
# the padding side nor EOS insertion is configured here.
eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True)

# Encode the three evaluation prompts as PyTorch tensors and move them to the GPU.
model_input1, model_input2, model_input3 = [
    eval_tokenizer(prompt_text, return_tensors="pt").to("cuda")
    for prompt_text in (eval_prompt1, eval_prompt2, eval_prompt3)
]

separator = "---------------------------------"

# Generate a completion for each prompt with the not-yet-fine-tuned model and
# print the decoded text, with a separator line between consecutive outputs.
model.eval()
with torch.no_grad():
    for position, model_input in enumerate((model_input1, model_input2, model_input3)):
        if position > 0:
            print(separator)
        generated = model.generate(**model_input, max_new_tokens=256, repetition_penalty=1.15)
        print(eval_tokenizer.decode(generated[0], skip_special_tokens=True))

In [None]:
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing on the quantized model, then pass it through
# PEFT's k-bit training preparation helper. `model` is rebound to the helper's
# return value, so later cells operate on the prepared model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Args:
        model: Object whose named_parameters() yields (name, parameter) pairs,
            where each parameter offers numel() and a requires_grad flag.

    Prints one line with the trainable count, the total count and the
    trainable share as a percentage. Returns None.
    """
    # Walk the parameters once, keeping only what the summary needs.
    sizes = [
        (tensor.numel(), tensor.requires_grad)
        for _, tensor in model.named_parameters()
    ]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    percentage = 100 * trainable_params / all_param
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
    "lm_head",
]

# LoRA settings: rank 32, alpha 64, 5% dropout, no bias terms, causal-LM task.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,  # Conventional
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the prepared model with the adapters and report the trainable share.
model = get_peft_model(model, config)
print_trainable_parameters(model)

### 5. Run Training!

In [None]:
# When more than one CUDA device is visible, set both model-parallel flags on the model.
if torch.cuda.device_count() > 1:
    for flag_name in ("is_parallelizable", "model_parallel"):
        setattr(model, flag_name, True)

In [None]:
# Hand the LoRA-wrapped model to the Accelerator created earlier (configured with
# the FSDP plugin); `model` is rebound to the object prepare_model returns.
model = accelerator.prepare_model(model)

In [None]:
import transformers
from datetime import datetime

# Run naming: run_name is "mistral-journal-finetune" and checkpoints are written
# under "./mistral-journal-finetune".
project = "journal-finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

# Trainer over the tokenized train/validation datasets. With max_steps=50 and
# save_steps=25, checkpoints are expected at steps 25 and 50 only.
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=2,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=50,
        learning_rate=2.5e-5, # Want a small lr for finetuning
        bf16=False,
        optim="paged_adamw_8bit",
        logging_steps=25,              # Report the loss every 25 steps
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save checkpoints on a step interval (see save_steps)
        save_steps=25,                # Save a checkpoint every 25 steps
        # NOTE(review): recent transformers releases name this argument `eval_strategy`;
        # this notebook installs transformers from git, so confirm against the installed version.
        evaluation_strategy="steps", # Evaluate on a step interval (see eval_steps)
        eval_steps=25,               # Run evaluation every 25 steps
        do_eval=True,                # Enable evaluation on eval_dataset
        report_to="wandb",           # Comment this out if you don't want to use Weights & Biases
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
    ),
    # Causal-LM collation (mlm=False) using the training tokenizer.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Reload the base model for inference so the trained adapter can be attached to it.
base_model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# Quantization settings are kept identical to the ones the model was loaded with
# for training (4-bit nf4, bfloat16 compute, double quantization off), so the
# adapter runs on the same quantized base weights it was trained against.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# trust_remote_code is deliberately not passed: the training cell loads this same
# model id without it, so no repository-supplied code needs to be executed.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
)

# Inference tokenizer: adds BOS only; no padding side or EOS option is configured.
eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True)

In [None]:
from peft import PeftModel

# Checkpoint directory written at step 50 under the "mistral-journal-finetune" output folder.
adapter_checkpoint = "mistral-journal-finetune/checkpoint-50"

# Attach the saved adapter weights to the freshly reloaded base model.
ft_model = PeftModel.from_pretrained(base_model, adapter_checkpoint)

In [None]:
def build_eval_prompt(question, answer):
    """Build an inference prompt in the layout the model was fine-tuned on.

    The training texts produced by formatting_func have the form
    '### Question: ...\\n ### Answer: ...\\n ### Found mistakes: ...'. This helper
    reproduces the same layout and stops right after the 'Found mistakes'
    header, leaving that section for the model to generate.

    Args:
        question: The question text.
        answer: The candidate answer to be reviewed.

    Returns:
        The prompt string.
    """
    return f"### Question: {question}\n ### Answer: {answer}\n ### Found mistakes:"


# Prompt 1: English question with a one-line answer.
question = "How to run a Python file that has a .py extension? Answer should be complete step by step."
answer = """python3 main.py"""

eval_prompt1 = build_eval_prompt(question, answer)

# Prompt 2: Russian question ("What can be a key in a dictionary?") with a long,
# multi-paragraph answer that includes code snippets.
question = "Что может быть ключом в словаре?"
answer = """
В Python ключом в словаре может быть любой неизменяемый объект, такой как число, строка или кортеж. Например:

my_dict = {1: 'one', 'two': 2, (3, 4): 'three four'}
В этом примере ключами словаря являются число 1, строка 'two' и кортеж (3, 4). Однако, если вы попытаетесь использовать изменяемый объект, такой как список, как ключ словаря, вы получите TypeError:

my_dict = {[1, 2]: 'one two'}
# this will raise a TypeError: unhashable type: 'list'
Также, если вы попытаетесь добавить два ключа в словарь с одинако-вым хеш-кодом, то второй ключ перезапишет первый:

my_dict = {1: 'one', '1': 'one again'}
# this will result in {1: 'one again'}
"""
eval_prompt2 = build_eval_prompt(question, answer)

# Prompt 3: the same Russian question with a short one-sentence answer.
question = "Что может быть ключом в словаре?"
answer = """Ключом в словаре может быть сам словарь и список"""
eval_prompt3 = build_eval_prompt(question, answer)


# Encode the prompts as PyTorch tensors on the GPU.
model_input1 = eval_tokenizer(eval_prompt1, return_tensors="pt").to("cuda")
model_input2 = eval_tokenizer(eval_prompt2, return_tensors="pt").to("cuda")
model_input3 = eval_tokenizer(eval_prompt3, return_tensors="pt").to("cuda")

# Generate with the fine-tuned model and print each decoded completion, with a
# separator line between consecutive outputs.
ft_model.eval()
with torch.no_grad():
    for position, model_input in enumerate((model_input1, model_input2, model_input3)):
        if position > 0:
            print('----------------------------')
        generated = ft_model.generate(**model_input, max_new_tokens=500, repetition_penalty=1.15)
        print(eval_tokenizer.decode(generated[0], skip_special_tokens=True))

In [None]:
!zip -r /content/file.zip /content/mistral-journal-finetune/checkpoint-100
from google.colab import files
files.download("/content/file.zip")