In [1]:
!pip install flash_attn==2.5.8
!pip install torch==2.3.1
!pip install accelerate==0.31.0
!pip install transformers==4.41.2
!pip install datasets
!pip install transformers
!pip install trl
!pip install peft 
!pip install auto-gptq 
!pip install optimum
!pip install xformers
!pip install huggingface_hub
!pip install git+https://github.com/microsoft/LoRA


Collecting git+https://github.com/microsoft/LoRA
  Cloning https://github.com/microsoft/LoRA to /tmp/pip-req-build-3u7jm8wk
  Running command git clone --filter=blob:none --quiet https://github.com/microsoft/LoRA /tmp/pip-req-build-3u7jm8wk
  Resolved https://github.com/microsoft/LoRA to commit 4c0333854cb905966f8cc4e9a74068c1e507c7b7
  Preparing metadata (setup.py) ... done

> **Packages successfully installed**

# Model used: microsoft/Phi-3-mini-4k-instruct
**TODO:** document the bitsandbytes (bnb) NF4 quantization configuration.

In [2]:
# Load the access token from Kaggle's secret store so it is never hard-coded
# in the notebook: the secret named "HF_TOKEN" is read into `hf_token`, which
# the login cell uses to authenticate with Hugging Face.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

In [3]:
# Log in to Hugging Face through the Python API rather than the shell CLI,
# so the secret token is never interpolated into a shell command line.
from huggingface_hub import login

login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
from random import randrange

import torch
from datasets import load_dataset

from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed,
    pipeline
)
from trl import SFTTrainer

2024-07-18 07:54:24.878154: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-18 07:54:24.878211: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-18 07:54:24.879865: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# MODEL_ID is the identifier of the pre-trained model that will be fine-tuned:
# 'Phi-3-mini-4k-instruct' from Microsoft.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# NEW_MODEL_NAME is the name given to the model after fine-tuning
# (here 'sql-xp-phi-3-mini-4k').
NEW_MODEL_NAME = "sql-xp-phi-3-mini-4k"

In [6]:
# Preparing datasets

# DATASET_NAME is the identifier of the text-to-SQL dataset used for fine-tuning.
# `synthetic_text_to_sql_dataset_name` is bound to the same string as an alias.
DATASET_NAME = synthetic_text_to_sql_dataset_name = "gretelai/synthetic_text_to_sql"

# SPLIT names the training split. NOTE: it is not passed to load_dataset below.
SPLIT = "train"

# Load the dataset without a `split=` argument, so the result is indexed by
# split name; the following cells read both dataset['train'] and the "test" split.
dataset = load_dataset(DATASET_NAME)

# Extract relevant fields

def extract_fields_synthetic(example):
    """Project one raw dataset row onto the instruction/input/output schema.

    Reads the ``sql_prompt``, ``sql_context`` and ``sql`` columns of ``example``
    and returns them under the keys ``instruction``, ``input`` and ``output``
    respectively. (An earlier revision of this notebook emitted the same values
    under ``question``/``context``/``sql``.)

    Raises KeyError if one of the three source columns is missing.
    """
    # source column -> output field, in the order the output keys are emitted
    column_to_field = (
        ("sql_prompt", "instruction"),
        ("sql_context", "input"),
        ("sql", "output"),
    )
    return {field: example[column] for column, field in column_to_field}
# Apply the field extraction to the loaded dataset and drop every original
# column (the column list is taken from the 'train' split).
synthetic_extracted_dataset = dataset.map(
    extract_fields_synthetic,
    remove_columns=dataset['train'].column_names,
)



In [7]:
import random  # no longer used for seeding in this cell; kept in case later cells rely on it

# Fixed seed so the shuffled order is identical on every Restart & Run All.
SHUFFLE_SEED = 1234

# Shuffle BEFORE extracting the splits. Previously the train/test variables were
# bound first, so the (double, randomly seeded) shuffle only rebound
# `synthetic_extracted_dataset` and never reached the splits used below.
synthetic_extracted_dataset = synthetic_extracted_dataset.shuffle(seed=SHUFFLE_SEED)

synthetic_extracted_train_dataset = synthetic_extracted_dataset["train"]
synthetic_extracted_test_dataset = synthetic_extracted_dataset["test"]

print(synthetic_extracted_train_dataset)
print(synthetic_extracted_test_dataset)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 100000
})
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 5851
})


In [8]:
# Debug aid: list the variables currently defined in the kernel namespace.
%whos

Variable                             Type                 Data/Info
-------------------------------------------------------------------
AutoModelForCausalLM                 type                 <class 'transformers.mode<...>to.AutoModelForCausalLM'>
AutoTokenizer                        type                 <class 'transformers.mode<...>tion_auto.AutoTokenizer'>
BitsAndBytesConfig                   type                 <class 'transformers.util<...>nfig.BitsAndBytesConfig'>
DATASET_NAME                         str                  gretelai/synthetic_text_to_sql
LoraConfig                           type                 <class 'peft.tuners.lora.config.LoraConfig'>
MODEL_ID                             str                  microsoft/Phi-3-mini-4k-instruct
NEW_MODEL_NAME                       str                  sql-xp-phi-3-mini-4k
PeftModel                            type                 <class 'peft.peft_model.PeftModel'>
SFTTrainer                           type                 <class '

In [9]:
# Fix the random seed (via transformers' set_seed) for reproducibility.
set_seed(1234)

# MAX_SEQ_LENGTH is the maximum sequence length, in tokens, that the model will handle.
MAX_SEQ_LENGTH = 2048

# num_train_epochs is the number of full passes over the training dataset.
num_train_epochs = 1

# license is the license identifier under which the model is distributed (Apache License 2.0).
# NOTE(review): this name shadows Python's interactive `license` built-in.
license = "apache-2.0"

# username is the account name of the person fine-tuning the model.
# NOTE(review): the same value prefixes the Hugging Face repo id in a later cell,
# so this is presumably the Hugging Face username rather than a GitHub one - confirm.
username = "spectrewolf8"

# learning_rate is the learning rate to be used during training.
learning_rate = 1.41e-5

# per_device_train_batch_size is the number of samples per device in each training batch.
per_device_train_batch_size = 4

# gradient_accumulation_steps is the number of batches whose gradients are accumulated
# before each optimizer update (1 = update after every batch).
# Presumably forwarded to the training arguments in a later cell - confirm.
gradient_accumulation_steps = 1

In [10]:
# Choose the compute dtype and the attention implementation from what the
# current GPU can actually run.
#
# - bfloat16 is 16 bits wide like float16 but spends more bits on the exponent,
#   which gives a wider dynamic range (at lower precision); it is used whenever
#   the GPU reports bf16 support, otherwise float16 is used.
# - FlashAttention-2 kernels need a GPU with compute capability 8.0 or newer,
#   so bf16 support alone is not treated as proof that 'flash_attention_2' can
#   run; 'sdpa' is the fallback.
# - CUDA availability is checked first so the cell does not depend on how the
#   bf16 query behaves on a machine without a GPU.
_cuda_available = torch.cuda.is_available()
_bf16_supported = _cuda_available and torch.cuda.is_bf16_supported()
_flash_attn_capable = _bf16_supported and torch.cuda.get_device_capability()[0] >= 8

compute_dtype = torch.bfloat16 if _bf16_supported else torch.float16
attn_implementation = 'flash_attention_2' if _flash_attn_capable else 'sdpa'

print(attn_implementation)
print(compute_dtype)

flash_attention_2
torch.bfloat16


In [11]:
# Load the tokenizer used to prepare the dataset.
#
# AutoTokenizer.from_pretrained loads the tokenizer that belongs to the
# pre-trained model identified by MODEL_ID.
# 'trust_remote_code=True' allows custom code shipped with the model repository
# (if any) to be executed while loading.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Pad on the right-hand side of the input sequences.
tokenizer.padding_side = 'right' # to prevent warnings

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
def create_message_column(row):
    """Turn one instruction/input/output row into a two-turn chat transcript.

    The user turn is the instruction followed by ``"\\n Input: "`` and the input;
    the assistant turn is the expected output. Returns ``{"messages": [...]}``
    so the result can be merged into the row as a new ``messages`` column.
    """
    user_turn = {
        "content": f"{row['instruction']}\n Input: {row['input']}",
        "role": "user",
    }
    assistant_turn = {
        "content": f"{row['output']}",
        "role": "assistant",
    }
    return {"messages": [user_turn, assistant_turn]}

def format_dataset_chatml(row):
    """Render a row's ``messages`` list into one training string.

    Uses the notebook-level ``tokenizer``'s chat template, without tokenizing and
    without appending a generation prompt, and returns the string under ``text``.
    """
    rendered = tokenizer.apply_chat_template(
        row["messages"],
        add_generation_prompt=False,
        tokenize=False,
    )
    return {"text": rendered}

In [13]:
# For each split: first add the chat `messages` column, then render it into
# the `text` column with the tokenizer's chat template.
synthetic_extracted_train_dataset = (
    synthetic_extracted_train_dataset
    .map(create_message_column)
    .map(format_dataset_chatml)
)
synthetic_extracted_test_dataset = (
    synthetic_extracted_test_dataset
    .map(create_message_column)
    .map(format_dataset_chatml)
)

# Show both splits to verify the new columns are present.
print(synthetic_extracted_train_dataset, synthetic_extracted_test_dataset, sep="\n")

Dataset({
    features: ['instruction', 'input', 'output', 'messages', 'text'],
    num_rows: 100000
})
Dataset({
    features: ['instruction', 'input', 'output', 'messages', 'text'],
    num_rows: 5851
})


In [14]:
# 'hf_model_repo' is the identifier ("<username>/<model name>") of the Hugging Face
# repository where the fine-tuned model is to be saved. It is built from `username`
# so the account name is defined in exactly one place.
hf_model_repo = f"{username}/{NEW_MODEL_NAME}"

# Load Model on GPU

# 'device_map' maps model parts to devices. The empty-string key stands for the
# whole model, so {"": 0} loads the entire model on GPU 0.
device_map = {"": 0}

# Bits and Bytes configuration for the model

# 'use_4bit' controls whether 4-bit precision is used for loading the base model.
use_4bit = True

# 'bnb_4bit_compute_dtype' names the data type for computations with the 4-bit base model.
# NOTE(review): the model-loading cell passes `compute_dtype` to BitsAndBytesConfig
# instead of this value, so this setting is currently not what takes effect there.
bnb_4bit_compute_dtype = "bfloat16"

# 'bnb_4bit_quant_type' is the quantization type used for the 4-bit base model ('nf4').
bnb_4bit_quant_type = "nf4"

# 'use_double_quant' controls whether nested (double) quantization is used for the 4-bit base model.
use_double_quant = True

# LoRA configuration for the model

# 'lora_r' is the rank of the LoRA update matrices.
lora_r = 16

# 'lora_alpha' is the alpha parameter for LoRA scaling.
lora_alpha = 16

# 'lora_dropout' is the dropout probability for LoRA layers.
lora_dropout = 0.05

# 'target_modules' lists the names of the modules that LoRA should be applied to.
target_modules = ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]



In [15]:
!pip install bitsandbytes-cuda110 bitsandbytes

  pid, fd = os.forkpty()




In [17]:
# Reload the tokenizer for training and load the 4-bit quantized base model.
#
# Tokenizer:
# - Loaded again for MODEL_ID, this time with 'add_eos_token=True' and 'use_fast=True'
#   (use the fast tokenizer implementation if available); this rebinds the
#   `tokenizer` created in the earlier dataset-preparation cell.
# - The padding token is set to the unknown token, and 'pad_token_id' is set to
#   that token's id.
# - Padding is added on the left.
#   NOTE(review): the earlier tokenizer cell used padding_side = 'right';
#   confirm that 'left' is intended here.
#
# Quantization:
# - 'bnb_config' is built from the 4-bit settings defined in the configuration cell
#   (use_4bit, bnb_4bit_quant_type, use_double_quant) together with 'compute_dtype'.
#   NOTE(review): the 'bnb_4bit_compute_dtype' variable from the configuration cell
#   is not used here; 'compute_dtype' is passed instead.
#
# Model:
# - AutoModelForCausalLM.from_pretrained loads MODEL_ID with 'torch_dtype=compute_dtype',
#   the quantization config above, the 'device_map' placement and the selected
#   'attn_implementation'.
# - 'prepare_model_for_kbit_training' (from peft) then prepares the loaded model
#   for k-bit training; the result is assigned back to 'model'.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, add_eos_token=True, use_fast=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'left'

bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_double_quant,
)

model = AutoModelForCausalLM.from_pretrained(
          MODEL_ID, torch_dtype=compute_dtype, trust_remote_code=True, quantization_config=bnb_config, device_map=device_map,
          attn_implementation=attn_implementation
)

model = prepare_model_for_kbit_training(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]