# LoRA Fine-tuning Mistral 7B on UA-GEC dataset

#### Install required libraries

In [1]:
# Fetch the project sources and install the requirements file it ships with.
# `!cd ... && pip ...` runs inside a single throwaway shell, so the notebook's
# own working directory only changes at the `%cd` further down.
!git clone https://github.com/Reennon/ua-gec-lora.git
!cd ua-gec-lora && pip install -q -r requirements.txt
!pwd && ls -a
# Install additional libs
# NOTE(review): transformers, peft and accelerate are installed from the tip of
# their git repositories with no commit pin (only trl and datasets are pinned),
# so a fresh run may resolve to different code than the run recorded below.
# NOTE(review): langchain, pynvml and kaggle_secrets are imported later but not
# installed here; presumably they come from requirements.txt or the base image - confirm.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c
!pip install -q ua_gec
!pip install -q datasets==2.16.0
!pip install -q nltk
!pip install -q -U wandb
!pip install -q toolz
# CD into the project directory
# (needed so that the `src.packages...` imports in the next cell resolve)
%cd ua-gec-lora
# Switch the checkout to the state of the research branch used by this notebook.
!git pull origin "feature/fine-tuning-research"
!git status

Cloning into 'ua-gec-lora'...
remote: Enumerating objects: 125, done.[K
remote: Counting objects: 100% (125/125), done.[K
remote: Compressing objects: 100% (107/107), done.[K
remote: Total 125 (delta 48), reused 71 (delta 13), pack-reused 0[K
Receiving objects: 100% (125/125), 187.30 KiB | 7.49 MiB/s, done.
Resolving deltas: 100% (48/48), done.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cubinlinker, which is not installed.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires ptxcompiler, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
keras-nlp 0.8.1 requires keras-core, which is not installed.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not insta

#### Add imports

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, TaskType, PeftModel, get_peft_model, prepare_model_for_kbit_training
from src.packages.constants.error_constants import ErrorConstants
from src.packages.prompts.instruction_tuning_gec_prompts import InstructionTuningGecPrompts
from ua_gec import Corpus
from langchain.prompts import PromptTemplate
from kaggle_secrets import UserSecretsClient
from difflib import SequenceMatcher
from trl import SFTTrainer
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
from torch.utils.data import Dataset
import time
import re
import torch
import nltk
import wandb
import gc
import toolz

nltk.download('punkt')  # Download the necessary resources for sentence tokenization

from nltk.tokenize import sent_tokenize

2024-03-24 20:14:56.902495: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-24 20:14:56.902601: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-24 20:14:57.028758: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

#### Load HuggingFace and Weights & Biases secrets

In [4]:
# Read the API tokens from the Kaggle secrets store instead of hardcoding them.
# The secret labels ("HUGGINGFACE_TOKEN", "wandb") must exist in the notebook's
# Kaggle secrets; the values are used by the two login cells that follow.
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")
secret_wandb = user_secrets.get_secret("wandb")

#### Login to HuggingFace

In [5]:
# IPython expands `$secret_hf` to the value of the Python variable before the
# shell command runs.
# NOTE(review): this places the token on the command line of a child process;
# consider a programmatic login that does not pass it as a shell argument.
!huggingface-cli login --token $secret_hf

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


#### Login to Weights & Biases and connect to project

In [6]:
# Name of the Weights & Biases project that collects the runs of this notebook.
wandb_project_name = 'UA-GEC LoRA fine tuning mistral 7B'

# Authenticate with the key read from the Kaggle secrets, then open a run.
# `run` stays open until `wandb.finish()` is called after training.
wandb.login(key = secret_wandb)
run = wandb.init(
    project=wandb_project_name, 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mandriankr[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/ua-gec-lora/wandb/run-20240324_201509-joxyske7[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mmajor-snow-10[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/andriankr/UA-GEC%20LoRA%20fine%20tuning%20mistral%207B[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/andriankr/UA-GEC%20LoRA%20fine%20tuning%20mistral%207B/runs/joxyske7[0m


#### Specify model names

In [7]:
# Hub id of the base checkpoint; used to load both the model and the tokenizer.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# Local output directory for the trainer and the saved LoRA adapter.
fine_tuned_model_name = "mistral-7b-ua-gec"

#### Load model

In [8]:
# 4-bit NF4 weight quantization with bfloat16 as the compute dtype and
# double quantization switched off.
# NOTE(review): bfloat16 compute assumes the GPU handles bf16 efficiently - confirm
# for the accelerator this notebook runs on.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)

# Load the quantized base model; the device_map with the empty-string key
# places the whole model on the currently selected CUDA device.
# NOTE(review): `trust_remote_code=True` permits executing code shipped in the
# model repository; presumably not required for this checkpoint - confirm and drop if so.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16, 
    device_map={'':torch.cuda.current_device()},)

# NOTE(review): `pretraining_tp` looks carried over from Llama fine-tuning recipes;
# verify that it has any effect for this architecture.
model.config.pretraining_tp = 1
# Enable gradient checkpointing on the base model before it is prepared for k-bit training.
model.gradient_checkpointing_enable()

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

#### Load tokenizer

In [9]:
# Load the tokenizer that matches the base checkpoint. It starts with left
# padding; the training-configuration cell later switches it to right padding.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side='left',
    trust_remote_code=True)

# Fix padding token for Mistral and Phi-2 models.
# The tokenizer ships without a pad token. The previous value "[PAD]" was never
# added to the vocabulary (no add_special_tokens / embedding resize happens in
# this notebook), so its id could only be resolved through the tokenizer's
# unknown-token fallback. Reuse the existing <unk> token explicitly instead:
# it has a real id, needs no embedding resize, and - unlike reusing EOS as the
# pad token - does not cause the end-of-sequence token to be treated as padding.
tokenizer.pad_token = tokenizer.unk_token

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

#### Load prompt

In [10]:
# Instruction prompt for the GEC task. It has two placeholders:
#   {error_types} - filled with ErrorConstants.ERROR_TYPES
#   {query}       - filled with the (possibly erroneous) Ukrainian source text
# The whole instruction is wrapped in [INST] ... [/INST]; the expected corrected
# text is appended directly after the closing tag when training samples are built.
template = """[INST] Given a text ("ORIGINAL_TEXT") in Ukrainian with potential errors, correct them to fulfill the GEC (Grammar Error Correction) Task.
Consider the provided set of error types ("ERROR_TYPES"):
{error_types}
When you identify an error ("ERROR") in the text, correct it according to the format:
("ERROR") => ("CORRECTION")
The correction should address the error without providing explicit reasoning for the change.
The resulting text ("FIXED_TEXT") should be error-free, maintaining the original information's semantics.
Focus solely on correcting Ukrainian language errors.
Ensure that the corrected text doesn't include original errors, additional text, comments, or parts of these instructions.

ORIGINAL_TEXT: {query}
FIXED_TEXT:
[/INST]"""

# Wrap the raw string so it can be rendered with named variables.
it_prompt = PromptTemplate(
    template=template,
    input_variables=['query', 'error_types']
)

#### Specify the maximum number of sentences kept from each training document

In [11]:
# Upper bound on how many leading sentences of a document's source and target
# text go into one training sample (read by document_to_training_prompt).
max_sentences = 4

#### Load UA-GEC train dataset and remove duplicates

In [12]:
# Read the "gec-only" annotation layer of the UA-GEC train partition and keep
# only the first document encountered for every doc_id.
train_partition = Corpus(partition="train", annotation_layer="gec-only")
corpus_train_list = list(
    toolz.unique(train_partition, key=lambda document: document.doc_id)
)
print(f'Total number of samples in train set: {len(corpus_train_list)}')

Total number of samples in train set: 1706


#### Visualize training prompt

In [13]:
def document_to_training_prompt(document):
    """Render one UA-GEC document as a complete supervised training text.

    The first ``max_sentences`` sentences of ``document.source`` are placed into
    the instruction template, and the first ``max_sentences`` sentences of
    ``document.target`` are appended as the expected answer, followed by the
    tokenizer's EOS token. Nothing is inserted between the rendered
    instruction and the target text.

    Relies on the notebook-level ``max_sentences``, ``it_prompt``,
    ``tokenizer`` and ``ErrorConstants``.

    NOTE(review): source and target are cut to the same sentence *count*
    independently; if a correction merges or splits sentences the two
    excerpts may cover different content - verify on the data.

    Args:
        document: object exposing ``source`` and ``target`` text attributes.

    Returns:
        str: instruction + corrected text + EOS token.
    """
    def _leading_sentences(text):
        # Sentence-split, keep the leading ones, re-join with single spaces.
        return ' '.join(sent_tokenize(text)[:max_sentences])

    instruction = it_prompt.format_prompt(
        query=_leading_sentences(document.source),
        error_types=ErrorConstants.ERROR_TYPES,
    ).to_string()
    completion = _leading_sentences(document.target)

    # The tokenizer is expected to add BOS by itself but not EOS, so EOS is
    # appended here to mark where the answer ends.
    return instruction + completion + tokenizer.eos_token

In [14]:
# Sanity check: render the first training document and print the exact text
# the model will be trained on.
doc = corpus_train_list[0]
print(f'--- Document id: {doc.doc_id}')
# Raw source text of the document (not used further in this cell).
source = doc.source
prompt = document_to_training_prompt(doc)
print(f"\n--- Prompt for training:\n\n{prompt}")

--- Document id: 0000

--- Prompt for training:

[INST] Given a text ("ORIGINAL_TEXT") in Ukrainian with potential errors, correct them to fulfill the GEC (Grammar Error Correction) Task.
Consider the provided set of error types ("ERROR_TYPES"):
['Fluency', 'Grammar', 'Punctuation', 'Spelling']
When you identify an error ("ERROR") in the text, correct it according to the format:
("ERROR") => ("CORRECTION")
The correction should address the error without providing explicit reasoning for the change.
The resulting text ("FIXED_TEXT") should be error-free, maintaining the original information's semantics.
Focus solely on correcting Ukrainian language errors.
Ensure that the corrected text doesn't include original errors, additional text, comments, or parts of these instructions.

ORIGINAL_TEXT: Byte for France або “Мій досвід ведення блогу у Instagram”
Останні 3 місяці мого життя видалися аж занадто насиченими на події та емоції, але ось нарешті у мене з’явилося декілька вільних годин та т

#### Prepare model for training & specify LoRA configurations

In [15]:
# Prepare the quantized base model for k-bit training.
# Note that `model` is rebound here, so this cell is not idempotent on re-run.
model = prepare_model_for_kbit_training(model)
# LoRA adapter settings: rank 8, alpha 16, no bias terms, dropout 0.05.
# Adapters are attached to the four attention projections and to gate_proj.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    bias="none",
    lora_dropout=0.05,  # Conventional
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)

# Wrap the base model with the adapters and report how many parameters train.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 11,534,336 || all params: 7,253,266,432 || trainable%: 0.15902264322061513


#### Set tokenizer max length

In [16]:
# Token budget per sample: UAGECDataset pads/truncates every sample to this
# length and the same value is passed to SFTTrainer as max_seq_length.
tokenizer_max_length = 900

#### Generate dataset from UA-GEC documents

In [17]:
class UAGECDataset(Dataset):
    """Map-style dataset that turns UA-GEC documents into tokenized training prompts.

    Items are produced lazily on access: the document is rendered with the
    notebook-level ``document_to_training_prompt`` helper and then tokenized
    to a fixed length of ``tokenizer_max_length`` tokens.

    NOTE(review): ``prompt`` and ``max_sentences`` are stored on the instance
    but are not read by any method of this class; the rendered text is
    determined entirely by ``document_to_training_prompt`` and the globals it
    uses. Keep them in sync with those globals or wire them through.
    """

    def __init__(
        self,
        generator: list,
        tokenizer: object,
        tokenizer_max_length: int,
        device: str,
        prompt: object,
        max_sentences=None,
        samples: int = None  # None keeps every document
    ):
        """Store the configuration and the de-duplicated documents.

        Args:
            generator: iterable of documents exposing a ``doc_id`` attribute.
            tokenizer: callable tokenizer used by ``_tokenize_text``.
            tokenizer_max_length: length every sample is padded/truncated to.
            device: device the encoded tensors are moved to in ``__getitem__``.
            prompt: prompt template (stored only, see class note).
            max_sentences: sentence limit (stored only, see class note).
            samples: keep only the first ``samples`` documents; ``None`` keeps
                all of them, ``0`` yields an empty dataset.
        """
        self.tokenizer = tokenizer
        self.tokenizer_max_length = tokenizer_max_length
        # Keep the first document seen for each doc_id.
        self.text_data = list(toolz.unique(generator, key=lambda x: x.doc_id))

        # Compare against None rather than truthiness: a plain `if samples:`
        # treated samples=0 like "no limit" and silently kept every document.
        if samples is not None:
            self.text_data = self.text_data[:samples]

        self.max_sentences = max_sentences
        self.device = device
        self.prompt = prompt

    def __len__(self):
        """Return the number of documents kept."""
        return len(self.text_data)

    def __getitem__(self, idx):
        """Render and tokenize the document at position ``idx``.

        Returns:
            dict with the rendered text under ``'prompt'`` and the encoded
            ``'input_ids'`` / ``'attention_mask'`` with the leading batch
            dimension removed.
        """
        sample = self.text_data[idx]

        inputs = document_to_training_prompt(sample)
        # NOTE(review): tensors are moved to `self.device` per item. Trainers
        # normally move whole batches themselves, and device tensors produced
        # inside a dataset do not mix well with DataLoader worker processes -
        # consider returning CPU tensors.
        encodings = self._tokenize_text(
            text=inputs,
        ).to(self.device)

        return {
            'prompt': inputs,
            'input_ids': encodings["input_ids"].squeeze(0),
            'attention_mask': encodings["attention_mask"].squeeze(0),
        }

    def _tokenize_text(self, text: str):
        """Tokenize ``text`` into PyTorch tensors of exactly ``tokenizer_max_length`` tokens.

        Every sample is padded up to the maximum length (and truncated beyond
        it), so all items have the same length regardless of the text.
        """
        return self.tokenizer(
            text,
            max_length=self.tokenizer_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

In [18]:
# Fixed, unshuffled split: the first 1000 documents train the model and the
# following 100 are held out (they become the validation dataset below).
train_list = corpus_train_list[:1000]
test_list = corpus_train_list[1000:1100]

In [19]:
# Both splits share every setting; only the wrapped documents differ.
dataset_kwargs = dict(
    tokenizer=tokenizer,
    tokenizer_max_length=tokenizer_max_length,
    device=device,
    prompt=it_prompt,
    max_sentences=max_sentences,
)
train_dataset = UAGECDataset(generator=train_list, **dataset_kwargs)
val_dataset = UAGECDataset(generator=test_list, **dataset_kwargs)

#### Define training configurations

In [20]:
# Since the model is loaded in 4bit precision, use right-side padding for tokenizer
# The KV cache is switched off for training and re-enabled after the adapter is saved.
# NOTE(review): `tokenizer.padding_side` is mutated in place here (it was loaded with
# left padding), so earlier cells re-run after this one will see right padding.
peft_model.config.use_cache = False
tokenizer.padding_side = 'right'

# Trainer hyperparameters: 5 epochs, per-device batch size 4 without gradient
# accumulation, constant learning rate 2e-4, paged 32-bit AdamW, metrics to W&B.
training_arguments = TrainingArguments(
    output_dir=fine_tuned_model_name,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    learning_rate=2e-4,
    logging_steps=25,
    num_train_epochs=5,
    # NOTE(review): checkpoint saving is disabled ("no") and no evaluation
    # strategy is set, so there is never a checkpoint to prune or to reload:
    # `save_total_limit` and `load_best_model_at_end` look inert as written,
    # and the recorded training log only contains training loss. Confirm intent.
    save_total_limit = 2,
    save_strategy="no",
    load_best_model_at_end=True,
    hub_private_repo=False,
    report_to='wandb',
    optim="paged_adamw_32bit",
    weight_decay=0.001,
    # Mixed-precision training is off for both half-precision formats.
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    # -1 means the run length is governed by num_train_epochs.
    max_steps=-1,
    # NOTE(review): a warmup ratio is configured together with the plain
    # "constant" scheduler, and the logged learning rate is flat for the whole
    # run; if warmup is wanted, "constant_with_warmup" is presumably the
    # scheduler to use - confirm.
    warmup_ratio=0.03,
    # NOTE(review): UAGECDataset pads every sample to the same maximum length,
    # so grouping by length presumably cannot reduce padding here.
    group_by_length=True,
    lr_scheduler_type="constant",
)

# NOTE(review): the base model was already placed on the GPU through device_map
# at load time; this move is presumably redundant.
peft_model = peft_model.to(device)
# Supervised fine-tuning trainer. `dataset_text_field` names the 'prompt' key
# that UAGECDataset items carry; packing of several samples into one sequence is off.
# NOTE(review): `peft_model` is already adapter-wrapped and `peft_config` is passed
# again - verify the trainer does not wrap it a second time.
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    dataset_text_field="prompt",
    tokenizer=tokenizer,
    args=training_arguments,
    max_seq_length=tokenizer_max_length,
    packing=False,

)

#### Free GPU memory and wait until enough is available

In [21]:
# Free GPU memory (in bytes) that must be available before training starts.
min_memory_available = 2 * 1024 ** 3  # 2 GiB

In [22]:
def clear_gpu_memory():
    """Release as much GPU memory as possible back to the driver.

    Garbage collection runs first so that tensors kept alive only by
    unreachable reference cycles are freed into PyTorch's caching allocator;
    ``empty_cache`` can then hand those blocks back to the device. In the
    opposite order, memory freed by the collector would stay cached.
    """
    gc.collect()
    torch.cuda.empty_cache()
    
def wait_until_enough_gpu_memory(min_memory_available, max_retries=10, sleep_time=5):
    """Block until the current GPU reports enough free memory, or give up.

    On every attempt the GPU caches are cleared via ``clear_gpu_memory`` and
    the free memory is read through NVML. Between attempts the function sleeps;
    after the final unsuccessful attempt it raises immediately instead of
    announcing and waiting for a retry that will never happen.

    Args:
        min_memory_available: required free memory in bytes.
        max_retries: maximum number of attempts.
        sleep_time: seconds to wait between two attempts.

    Raises:
        RuntimeError: if the memory requirement is not met within
            ``max_retries`` attempts.
    """
    nvmlInit()
    # NOTE(review): the PyTorch device index is used as the NVML index; the two
    # presumably only coincide when device visibility is not remapped - confirm.
    handle = nvmlDeviceGetHandleByIndex(torch.cuda.current_device())

    for attempt in range(1, max_retries + 1):
        clear_gpu_memory()
        if nvmlDeviceGetMemoryInfo(handle).free >= min_memory_available:
            return
        if attempt < max_retries:
            print(f"Waiting for {min_memory_available} bytes of free GPU memory. Retrying in {sleep_time} seconds...")
            time.sleep(sleep_time)

    raise RuntimeError(f"Failed to acquire {min_memory_available} bytes of free GPU memory after {max_retries} retries.")

In [23]:
# One-off cleanup before training. The wait helper in the next cell also calls
# clear_gpu_memory() on every attempt.
clear_gpu_memory()

In [24]:
# Block until at least `min_memory_available` bytes are free on the GPU;
# raises RuntimeError if that does not happen within the default retries.
wait_until_enough_gpu_memory(min_memory_available)

#### Train model

In [25]:
# Run the fine-tuning loop. The recorded run below took roughly 29,850 seconds
# (train_runtime) for 1250 steps.
trainer.train()



Step,Training Loss
25,0.9315
50,0.735
75,0.7059
100,0.7348
125,0.7265
150,0.6935
175,0.6872
200,0.6904
225,0.7164
250,0.7009


TrainOutput(global_step=1250, training_loss=0.41495089492797854, metrics={'train_runtime': 29853.8976, 'train_samples_per_second': 0.167, 'train_steps_per_second': 0.042, 'total_flos': 1.92299249664e+17, 'train_loss': 0.41495089492797854, 'epoch': 5.0})

In [26]:
# Save the trained model locally into the `fine_tuned_model_name` directory.
trainer.model.save_pretrained(fine_tuned_model_name)
# Close the Weights & Biases run opened at the start of the notebook.
wandb.finish()
# Re-enable the KV cache that was switched off for training.
peft_model.config.use_cache = True

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         train/epoch ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
[34m[1mwandb[0m:   train/global_step ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
[34m[1mwandb[0m:     train/grad_norm ▃▂▁▂▁▂▄▃▃▂▂▂▂▃▂▂▄▃▃▃▄▃▃▄▃▄▆▄▅▅▆▅▃█▅▄▅▄▄▃
[34m[1mwandb[0m: train/learning_rate ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:          train/loss █▆▆▆▆▆▆▆▄▅▅▅▅▅▅▅▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:               total_flos 1.92299249664e+17
[34m[1mwandb[0m:              train/epoch 5.0
[34m[1mwandb[0m:        train/global_step 1250
[34m[1mwandb[0m:          train/grad_norm 0.98272
[34m[1mwandb[0m:      train/learning_rate 0.0002
[34m[1mwandb[0m:               train/loss 0.1562
[34m[1mwandb[0m:               train_loss 0.41495
[34m[1mwandb

#### Push to HuggingFace

In [27]:
# Upload the training output to the Hugging Face Hub. `hub_private_repo=False`
# in the training arguments means the target repository is public.
trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/46.2M [00:00<?, ?B/s]

'https://huggingface.co/andrian-kr/mistral-7b-ua-gec/tree/main/'