In [1]:
# Fetch the project sources (presumably the home of the `src.packages` modules imported below).
!git clone https://github.com/Reennon/ua-gec-lora.git
# `cd` inside a `!` line only applies to that single shell command, hence the `&&` chain.
!cd ua-gec-lora && pip install -r requirements.txt
!pwd && ls -a
# Install additional libs
# NOTE(review): transformers / peft / accelerate are installed from their default git branches
# and several packages below are unpinned, so a later re-run may resolve different versions —
# consider pinning them the way `trl` and `datasets` are pinned.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c
!pip install ua_gec
!pip install datasets==2.16.0
!pip install nltk
!pip install wandb -q -U
# CD into the project directory
# (`%cd` changes the kernel's working directory, so it persists for the following cells.)
%cd ua-gec-lora
# Bring in the research branch this notebook was developed against and show the resulting state.
!git pull origin "feature/fine-tuning-research"
!git status

Cloning into 'ua-gec-lora'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (80/80), done.[K
remote: Total 98 (delta 29), reused 71 (delta 13), pack-reused 0[K
Unpacking objects: 100% (98/98), 124.83 KiB | 1.92 MiB/s, done.
Collecting anyio==3.7.1 (from -r requirements.txt (line 1))
  Downloading anyio-3.7.1-py3-none-any.whl.metadata (4.7 kB)
Collecting dynaconf==3.2.4 (from -r requirements.txt (line 2))
  Downloading dynaconf-3.2.4-py2.py3-none-any.whl.metadata (9.3 kB)
Collecting huggingface-hub==0.19.4 (from -r requirements.txt (line 3))
  Downloading huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Collecting langchain==0.0.329 (from -r requirements.txt (line 4))
  Downloading langchain-0.0.329-py3-none-any.whl.metadata (16 kB)
Collecting langsmith==0.0.56 (from -r requirements.txt (line 5))
  Downloading langsmith-0.0.56-py3-none-any.whl.metadata (10 kB)
Collecting llama_cpp_python==0.2.13 (fr

In [2]:
from transformers import AutoModelForCausalLM, pipeline, Conversation, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, TaskType, PeftModel, get_peft_model, prepare_model_for_kbit_training
from src.packages.constants.error_constants import ErrorConstants
from src.packages.prompts.instruction_tuning_gec_prompts import InstructionTuningGecPrompts
from ua_gec import Corpus
from langchain.prompts import PromptTemplate
from kaggle_secrets import UserSecretsClient
import torch
import nltk
import wandb

nltk.download('punkt')  # Download the necessary resources for sentence tokenization

from nltk.tokenize import sent_tokenize

2024-03-23 14:51:27.947068: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-23 14:51:27.947208: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-23 14:51:28.091516: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

Load HuggingFace and Weights & Biases secrets

In [None]:
# Read the API tokens from the Kaggle secrets store so they are never written into the notebook.
# The secret names ("HUGGINGFACE_TOKEN", "wandb") must exist in the account running this notebook.
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")
secret_wandb = user_secrets.get_secret("wandb")

Log in to Hugging Face

In [6]:
# Authenticate the Hugging Face CLI with the token loaded above (`$secret_hf` is the Python variable).
# NOTE(review): the token is interpolated into a shell command line; a Python-level login
# (e.g. `huggingface_hub.login(token=secret_hf)`) would avoid exposing it to the shell — confirm and consider switching.
!huggingface-cli login --token $secret_hf

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Log in to Weights & Biases and start a run in the project

In [39]:
# Name of the Weights & Biases project that collects the training runs.
wandb_project_name = 'UA-GEC LoRA Instruction-Tuning Mistral 7B'

# Authenticate with the key read from the secrets store, then open a run of type "training".
wandb.login(key = secret_wandb)
run = wandb.init(
    project=wandb_project_name, 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrkovalch[0m ([33mrkovalchuk[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# Hub id of the base checkpoint; also used to load the matching tokenizer.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# Name for the fine-tuned model.
# NOTE(review): the training cell later rebinds this to "mistral-7b-ua-gec" (no user prefix) —
# confirm which value is the intended one.
fine_tuned_model_name = 'rkovalchuk/mistral-7b-ua-gec'

In [10]:
# Load the quantized base model that the LoRA adapters are attached to.
# Previously `base_model` was used here without being defined in any cell, so the
# notebook failed with a NameError on a fresh kernel (Restart & Run All).
# NOTE(review): the exact quantization settings of the original run are not recorded in
# the notebook; 4-bit loading follows the "loaded in 4bit precision" remark in the
# training cell — confirm the values below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Prepare the quantized model for k-bit training before adapters are added.
model = prepare_model_for_kbit_training(base_model)

# LoRA settings: rank-4 adapters on the attention projections and the MLP gate projection.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,
    lora_alpha=16,
    bias="none",
    lora_dropout=0.05,  # Conventional
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
)

# Wrap the prepared model with the adapters and report how many parameters will be trained.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 5,767,168 || all params: 7,247,499,264 || trainable%: 0.07957459242040704


In [11]:
# Instruction prompt in the Mistral `[INST] ... [/INST]` chat format.
# `{error_types}` and `{query}` are the two placeholders filled in per document;
# the corrected text is appended after the closing `[/INST]` when training samples are built.
template = """[INST] Given a text ("ORIGINAL_TEXT") in Ukrainian with potential errors, correct them to fulfill the GEC (Grammar Error Correction) Task, especially tailored for Mistral 7B LLM.
Consider the provided set of error types ("ERROR_TYPES"):
{error_types}
When you identify an error ("ERROR") in the text, correct it according to the format:
("ERROR") => ("CORRECTION")
The correction should address the error without providing explicit reasoning for the change.
The resulting text ("FIXED_TEXT") should be error-free, maintaining the original information's semantics.
Focus solely on correcting Ukrainian language errors.
Ensure that the corrected text doesn't include original errors, additional text, comments, or parts of these instructions.

ORIGINAL_TEXT: {query}
FIXED_TEXT:
[/INST]"""

# LangChain template object used to render the prompt for each document.
it_prompt = PromptTemplate(
    template=template,
    input_variables=['query', 'error_types']
)

In [12]:
# Number of leading sentences kept from each document's source and target text.
max_sentences = 4

In [14]:
# Load the tokenizer that belongs to the base checkpoint. Left padding is requested here;
# the training cell later switches `padding_side` to 'right'.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side='left',
    trust_remote_code=True)
# Alternatives that were tried and left disabled:
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_eos_token = True

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [15]:
# Preview one fully formatted training sample, built from the first training document.
# `instruction_tuning_template` is reused by the next cell to measure the prompt length.
corpus = Corpus(partition="train", annotation_layer="gec-only")
doc = next(iter(corpus))

print("\nPrompt for training:\n")

# Keep only the first `max_sentences` sentences. They are joined with a space:
# sent_tokenize returns the sentences without the whitespace that separated them,
# so joining with "" glued the end of one sentence to the start of the next.
source = " ".join(sent_tokenize(doc.source)[:max_sentences])
target = " ".join(sent_tokenize(doc.target)[:max_sentences])

prompt = it_prompt.format_prompt(
    query=source,
    error_types=ErrorConstants.ERROR_TYPES
).to_string()

# Collapse every run of whitespace (including the template's newlines) into one space.
prompt_with_original_text = ' '.join(prompt.split())
target_text = ' '.join(target.split())

# By default, the Mistral tokenizer only adds <s> (BOS token)
# to the prompt but not </s> (EOS token), make sure to add it at the end of your prompt.
instruction_tuning_template = (
    prompt_with_original_text
    + target_text
    + tokenizer.eos_token
)

print(instruction_tuning_template)


Prompt for training:

[INST] Given a text ("ORIGINAL_TEXT") in Ukrainian with potential errors, correct them to fulfill the GEC (Grammar Error Correction) Task, especially tailored for Mistral 7B LLM. Consider the provided set of error types ("ERROR_TYPES"): ['Fluency', 'Grammar', 'Punctuation', 'Spelling'] When you identify an error ("ERROR") in the text, correct it according to the format: ("ERROR") => ("CORRECTION") The correction should address the error without providing explicit reasoning for the change. The resulting text ("FIXED_TEXT") should be error-free, maintaining the original information's semantics. Focus solely on correcting Ukrainian language errors. Ensure that the corrected text doesn't include original errors, additional text, comments, or parts of these instructions. ORIGINAL_TEXT: Byte for France або “Мій досвід ведення блогу у Instagram” Останні 3 місяці мого життя видалися аж занадто насиченими на події та емоції, але ось нарешті у мене з’явилося декілька вільн

In [17]:
# Token count of the sample rendered in the previous cell (prompt + target + EOS).
prompt_len = len(tokenizer.tokenize(instruction_tuning_template))
# Fixed length every training sample is padded/truncated to.
tokenizer_max_len = 900
# Fraction of extra tokens a corrected text is allowed to add on top of the prompt length.
max_correction_addtional_tokens = 0.1
# Derive the budget from the fraction above instead of repeating it as a literal 1.1,
# so changing the fraction actually changes the budget.
max_new_tokens = int(prompt_len * (1 + max_correction_addtional_tokens))

print(f"""
Prompt len: {prompt_len}
Tokenize max length: {tokenizer_max_len}
Max token difference because of corrections: {max_correction_addtional_tokens}
Max new tokens len (output without input): {max_new_tokens}
""")


Prompt len: 771
Tokenize max length: 900
Max token difference because of corrections: 0.1
Max new tokens len (output without input): 848



In [23]:
# Fix padding token for Mistral and Phi-2 models: the tokenizer has no pad token by default.
# "[PAD]" is not a token of the Mistral vocabulary and was never added to it, so assigning
# that string does not create a real padding token. Reuse the existing unknown token
# explicitly instead, which keeps the vocabulary (and the embedding matrix) unchanged.
# NOTE(review): if a dedicated pad token is wanted, add it with `add_special_tokens` and
# resize the model embeddings accordingly.
tokenizer.pad_token = tokenizer.unk_token

In [18]:
# Put the k-bit-prepared model (the module wrapped by the LoRA adapters) into training mode,
# then display the device it lives on.
model.train()
model.device

device(type='cuda', index=0)

In [None]:
# Turn off the memory-efficient and flash scaled-dot-product-attention CUDA backends.
# NOTE(review): the reason is not recorded in the notebook; presumably a workaround for
# the GPU/driver used for this run — confirm before removing.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [30]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class UAGECDataset(Dataset):
    """Torch dataset that turns UA-GEC documents into instruction-tuning samples.

    Each item is the formatted instruction prompt for a document's source text,
    followed by the corrected target text and the tokenizer's EOS token. It is
    returned both as a string and as padded/truncated token tensors.

    Args:
        generator: Iterable of documents exposing ``source`` and ``target`` text
            attributes. It is materialised into a list, so plain iterators and
            generators work as well as lists.
        device: Device the tokenized tensors are moved to in ``__getitem__``.
        prompt: Prompt object exposing ``format_prompt(query=..., error_types=...)``
            whose result provides ``to_string()``.
        tokenizer: Callable tokenizer with an ``eos_token`` attribute.
        max_sentences: Keep only the first ``max_sentences`` sentences of the
            source and target texts; a falsy value keeps every sentence.
        samples: Keep only the first ``samples`` documents; ``None`` keeps all.
        max_length: Token length every sample is padded/truncated to. When
            ``None``, the notebook-level ``tokenizer_max_len`` is used.
    """

    def __init__(
        self,
        generator: object,
        device: str,
        prompt: object,
        tokenizer: object,
        max_sentences=None,
        samples: int = None,  # if None, all documents are used
        max_length: int = None,  # if None, falls back to the notebook's tokenizer_max_len
    ):
        # Materialise the input: __len__ and __getitem__ need a sized, indexable
        # sequence, which a generator/iterator is not.
        self.text_data = list(generator)

        if samples is not None:
            self.text_data = self.text_data[:samples]

        self.max_sentences = max_sentences
        self.device = device
        self.prompt = prompt
        self.tokenizer = tokenizer
        # Stored on the instance so every tokenizing method uses the same limit
        # (previously `_add_target` read an attribute that was never set).
        self.tokenizer_max_len = (
            max_length if max_length is not None else tokenizer_max_len
        )

    def __len__(self):
        """Return the number of documents in the dataset."""
        return len(self.text_data)

    def __getitem__(self, idx):
        """Build the training sample for document ``idx``.

        Returns:
            dict with the full training text under ``'prompt'`` and the
            ``'input_ids'`` / ``'attention_mask'`` tensors with the batch
            dimension squeezed away.
        """
        sample = self.text_data[idx]

        inputs: str = self._preprocess_text(
            text=sample.source,
            target_text=sample.target
        )
        encodings = self._tokenize_text(
            text=inputs,
        ).to(self.device)

        return {
            'prompt': inputs,
            'input_ids': encodings["input_ids"].squeeze(0),
            'attention_mask': encodings["attention_mask"].squeeze(0),
        }

    def _preprocess_text(self, text: str, target_text: str) -> str:
        """Return ``<prompt with source text><target text><EOS>`` as one string."""
        # Select top n sentences
        text = self._select_sentences(text)
        target_text = self._select_sentences(target_text)
        # Add instructions (prepend prompt) and collapse whitespace in both parts
        text = self._normalize_spaces(text=self._format_prompt(text=text))
        target_text = self._normalize_spaces(text=target_text)
        # Append the target response and the EOS token to the input text
        return text + target_text + self.tokenizer.eos_token

    def _select_sentences(self, text: str) -> str:
        """Keep the first ``max_sentences`` sentences (all if unset), space-joined."""
        sentences = sent_tokenize(text)
        if self.max_sentences:
            sentences = sentences[:self.max_sentences]
        # Join with a space: the tokenizer returns sentences without the whitespace
        # that separated them, so "" would glue neighbouring sentences together.
        return " ".join(sentences)

    def _format_prompt(self, text: str) -> str:
        """Render the instruction prompt for ``text`` with the known error types."""
        return self.prompt.format_prompt(
            query=text,
            error_types=ErrorConstants.ERROR_TYPES
        ).to_string()

    def _tokenize_text(self, text: str):
        """Tokenize ``text`` to exactly ``self.tokenizer_max_len`` tokens (pad/truncate)."""
        return self.tokenizer(
            text,
            max_length=self.tokenizer_max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    def _add_target(self, text: str, target_text: str):
        """Tokenize ``text`` with the dataset's length limit.

        NOTE(review): ``target_text`` is accepted but not used, exactly as in the
        previous implementation, and nothing in this notebook calls the method.
        """
        return self._tokenize_text(text=text)

    def _normalize_spaces(self, text):
        """Collapse every run of whitespace into a single space and strip the ends."""
        return ' '.join(text.split())

In [27]:
# Small development split taken from the training partition.
train_corpus = Corpus(partition="train", annotation_layer="gec-only")
# Materialise the corpus once and slice the same list twice.
train_docs = list(train_corpus)
train_list = train_docs[:50]
# The previous slice `[50:5]` is always empty (stop < start), which left the
# validation set with no documents.
# NOTE(review): five held-out documents is the presumed intent — adjust the size if needed.
test_list = train_docs[50:55]

In [31]:
# Wrap the training and validation document lists in instruction-tuning datasets
# that share the same prompt, tokenizer, device and sentence limit.
train_dataset = UAGECDataset(
    generator=train_list,
    device=device,
    prompt=it_prompt,
    max_sentences=max_sentences,
    tokenizer=tokenizer,
)
val_dataset = UAGECDataset(
    generator=test_list,
    device=device,
    prompt=it_prompt,
    max_sentences=max_sentences,
    tokenizer=tokenizer,
)

In [32]:
# Sanity check: display the first training sample (prompt text plus token tensors).
train_dataset[0]

{'prompt': '[INST] Given a text ("ORIGINAL_TEXT") in Ukrainian with potential errors, correct them to fulfill the GEC (Grammar Error Correction) Task, especially tailored for Mistral 7B LLM. Consider the provided set of error types ("ERROR_TYPES"): [\'Fluency\', \'Grammar\', \'Punctuation\', \'Spelling\'] When you identify an error ("ERROR") in the text, correct it according to the format: ("ERROR") => ("CORRECTION") The correction should address the error without providing explicit reasoning for the change. The resulting text ("FIXED_TEXT") should be error-free, maintaining the original information\'s semantics. Focus solely on correcting Ukrainian language errors. Ensure that the corrected text doesn\'t include original errors, additional text, comments, or parts of these instructions. ORIGINAL_TEXT: Byte for France або “Мій досвід ведення блогу у Instagram” Останні 3 місяці мого життя видалися аж занадто насиченими на події та емоції, але ось нарешті у мене з’явилося декілька вільни

In [33]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Output directory for the run.
# NOTE(review): this rebinds `fine_tuned_model_name`, replacing the
# 'rkovalchuk/mistral-7b-ua-gec' value assigned in an earlier cell — confirm which is intended.
fine_tuned_model_name = "mistral-7b-ua-gec"

# Since the model is loaded in 4bit precision, use right-side padding for tokenizer
# (the tokenizer was created with left padding). `use_cache` is switched off for
# training and turned back on after the model is saved.
peft_model.config.use_cache = False
tokenizer.padding_side = 'right'

# NOTE(review): `save_strategy="no"` is combined with `save_total_limit=2` and
# `load_best_model_at_end=True`, and `warmup_ratio` with a "constant" scheduler;
# presumably these options have no effect in this combination — confirm and simplify.
training_arguments = TrainingArguments(
    output_dir=fine_tuned_model_name,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    learning_rate=2e-4,
    logging_steps=25,
    num_train_epochs=5,
    save_total_limit = 2,
    save_strategy="no",
    load_best_model_at_end=True,
    hub_private_repo=False,
    report_to='wandb',
    optim="paged_adamw_32bit",
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
)
peft_model = peft_model.to(device)
# Supervised fine-tuning on the 'prompt' text field of the datasets built above.
# NOTE(review): `peft_model` is already wrapped with LoRA adapters and `peft_config`
# is passed again here — confirm the trainer does not apply the adapters twice.
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    dataset_text_field="prompt",
    tokenizer=tokenizer,
    args=training_arguments,
    max_seq_length=tokenizer_max_len,
    packing=False,

)

In [34]:
# Display the device the training arguments resolved to.
training_arguments.device

device(type='cuda', index=0)

In [35]:
import gc

def clear_gpu_memory():
    """Free unreferenced Python objects and release PyTorch's cached GPU memory.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are destroyed before the CUDA cache is emptied; with the
    opposite order, memory freed by the collector stays in PyTorch's cache
    until the next call.

    Prints the number of unreachable objects the collector found.
    Returns None.
    """
    collected = gc.collect()
    torch.cuda.empty_cache()
    print(collected)

In [36]:
# Release cached GPU memory before training; the printed number comes from gc.collect().
clear_gpu_memory()

3390


In [37]:
import time
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

def wait_until_enough_gpu_memory(min_memory_available, max_retries=10, sleep_time=5):
    """Block until the current CUDA device reports enough free memory.

    Before every check the GPU memory is cleared via ``clear_gpu_memory``.

    Args:
        min_memory_available: Required amount of free GPU memory, in bytes.
        max_retries: Maximum number of checks before giving up.
        sleep_time: Seconds to wait after each unsuccessful check.

    Raises:
        RuntimeError: If the free memory never reached ``min_memory_available``
            within ``max_retries`` checks.
    """
    nvmlInit()
    gpu_handle = nvmlDeviceGetHandleByIndex(torch.cuda.current_device())

    for _attempt in range(max_retries):
        clear_gpu_memory()
        free_bytes = nvmlDeviceGetMemoryInfo(gpu_handle).free
        if free_bytes >= min_memory_available:
            # Enough memory is available: stop waiting.
            return
        print(f"Waiting for {min_memory_available} bytes of free GPU memory. Retrying in {sleep_time} seconds...")
        time.sleep(sleep_time)

    # Every attempt was used up without reaching the requested amount.
    raise RuntimeError(f"Failed to acquire {min_memory_available} bytes of free GPU memory after {max_retries} retries.")

# Make sure at least 2 GiB of GPU memory is free before training starts.
min_memory_available = 2 * 1024 ** 3  # 2GB, expressed in bytes
clear_gpu_memory()
wait_until_enough_gpu_memory(min_memory_available)

0
0


In [None]:
# Run the fine-tuning loop configured by the trainer cell above.
trainer.train()



Step,Training Loss
25,0.9193


In [None]:
# Save the trained model locally under `fine_tuned_model_name`
# (presumably only the LoRA adapter weights, since the model is PEFT-wrapped — confirm),
# close the W&B run, and re-enable `use_cache`, which was switched off for training.
trainer.model.save_pretrained(fine_tuned_model_name)
wandb.finish()
peft_model.config.use_cache = True

In [None]:
# Upload the trained model to the Hugging Face Hub using the credentials from the login cell.
# NOTE(review): the target repository is presumably derived from `output_dir` — confirm the repo name.
trainer.push_to_hub()