In [2]:
import torch  # Import the PyTorch library

# Ask PyTorch for the compute capability of the current CUDA device.
# The call yields a pair that is unpacked into its major and minor components.
# Note that this is the device's compute capability, not the CUDA toolkit version.
device_capability = torch.cuda.get_device_capability()
major, minor = device_capability

# Report each component on its own line.
print(f"CUDA Major Version: {major}")
print(f"CUDA Minor Version: {minor}")


CUDA Major Version: 7
CUDA Minor Version: 5


In [3]:
# Unsloth is a specialized optimization library designed for fine-tuning large language models (LLMs) such as Llama-3,
#  Mistral, Phi, and Gemma. It offers significant improvements in efficiency, allowing models to fine-tune up to 2-5 times
# faster while consuming up to 80% less memory. The key advantage of using Unsloth is that it achieves these speed-ups and
# memory reductions without any loss in accuracy.

# Installing the unsloth
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-6thwqddv/unsloth_051c92acfc604513ae812ab3db7e416b
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-6thwqddv/unsloth_051c92acfc604513ae812ab3db7e416b
  Resolved https://github.com/unslothai/unsloth.git to commit 8001d30a8f7c179ff7036eaa2a7552ce620176b6
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading tyro-0.8.5-py3-none-any.whl.metadata (8.2 kB)
Collecting transformers>=4.43.2 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[c

In [4]:
!pip install --no-deps xformers trl peft accelerate bitsandbytes

Collecting xformers
  Downloading xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting trl
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl (20.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m89.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.9.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137

In [3]:
import torch
from trl import SFTTrainer
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
# FastLanguageModel (Unsloth): model wrapper optimized for fast training and low memory usage, making it well suited to fine-tuning large language models.
# SFTTrainer (TRL, "Transformer Reinforcement Learning"): trainer for supervised fine-tuning (SFT) of a language model on a text dataset. Despite the library's name, SFT itself is ordinary supervised training, not reinforcement learning.
# TrainingArguments (Transformers): sets up and configures training parameters such as learning rate, batch size, number of steps or epochs, etc.
# AutoPeftModelForCausalLM (PEFT): loads a causal language model together with its Parameter-Efficient Fine-Tuning adapter. PEFT refers to techniques such as adapter layers, prefix-tuning, or low-rank adaptation (LoRA), which fine-tune large models without modifying all of the model parameters.
# AutoTokenizer (Transformers): tokenizes text into a format suitable for model input, converting text strings into sequences of integers that represent tokens in the model's vocabulary.

In [4]:
# --- Model loading configuration -------------------------------------------

# Longest token sequence the model will be set up to handle
# (RoPE scaling is applied automatically).
max_seq_length = 2048

# Compute dtype. Leave as None to auto-detect; otherwise pick 'float16' for
# Tesla T4 / V100 GPUs or 'bfloat16' for Ampere GPUs.
dtype = None

# Load weights with 4-bit quantization to cut memory usage (False disables it).
load_in_4bit = True


In [5]:
# Settings for loading the pretrained checkpoint; the values come from the
# configuration variables defined earlier in the notebook.
pretrained_kwargs = dict(
    # Checkpoint to load.
    model_name="unsloth/Phi-3-mini-4k-instruct",
    # Maximum sequence length for input sequences.
    max_seq_length=max_seq_length,
    # Data type for model computations; auto-detected when None.
    dtype=dtype,
    # Whether to load with 4-bit quantization to reduce memory usage.
    load_in_4bit=load_in_4bit,
)

# Load the language model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_kwargs)

==((====))==  Unsloth 2024.8: Fast Mistral patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [7]:
# Prompt template for the text-to-SQL task. Its three `{}` placeholders are
# filled positionally with `str.format`, in this order:
#   1. the natural-language description of the desired query,
#   2. the database schema,
#   3. the SQL query that answers the description.
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Your goal is to convert the given text description into an appropriate SQL query using the provided database schema.

### Instruction:
Write an SQL query based on the following description.

### Input:
Text Description: {}
Database Schema: {}

### Response:
SQL Query:
{}"""


In [8]:
# End-of-sequence (EOS) token taken from the tokenizer. It is appended to every
# formatted training example so the model learns where a completed answer stops.
EOS_TOKEN = tokenizer.eos_token


In [12]:
def formatting_prompts_func(examples):
    """Render one dataset record as a complete training prompt.

    Args:
        examples: A single record (mapping) providing the keys ``"question"``
            (text description), ``"input"`` (database schema) and
            ``"sql_query"`` (target SQL).

    Returns:
        str: The module-level ``prompt`` template filled with those three
        values, followed by ``EOS_TOKEN`` so that generation learns to stop
        after the query instead of continuing indefinitely.

    Raises:
        KeyError: If any of the three keys is missing from the record.
    """
    # Collected in the same order as the template's placeholders.
    template_fields = (
        examples["question"],
        examples["input"],
        examples["sql_query"],
    )
    return prompt.format(*template_fields) + EOS_TOKEN


In [17]:
import json

# Load the raw training records from '/content/text_sql.json'.
# The file is opened in a `with` block so the handle is closed as soon as parsing
# finishes, and it is decoded explicitly as UTF-8 (the encoding JSON files use)
# instead of relying on the platform default.
# `data` is expected to be a list of records, one per training example.
with open("/content/text_sql.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Turn every record into a fully formatted prompt string (with EOS_TOKEN appended)
# by passing it through `formatting_prompts_func`.
data_set = [formatting_prompts_func(record) for record in data]


In [16]:
# Display the first formatted training example to sanity-check the prompt layout.
data_set[0]

'Below is an instruction that describes a task, paired with an input that provides further context. Your goal is to convert the given text description into an appropriate SQL query using the provided database schema.\n\n### Instruction:\nWrite an SQL query based on the following description.\n\n### Input:\nText Description: Find all genres available in the bookstore.\nDatabase Schema: \n\ndatabase schema:\n\n  CREATE TABLE bookstore.genres \n( id SERIAL PRIMARY KEY,\n"name" varchar(255) NOT NULL UNIQUE,\n"description" varchar(255) NOT NULL )\n\n  CREATE TABLE bookstore.authors \n( id SERIAL PRIMARY KEY,\n "name" varchar(255) NOT NULL,\n "bio" varchar(500) NOT NULL )\n\n\n  CREATE TABLE bookstore.books\n( id SERIAL PRIMARY KEY,\ntitle varchar(255) NOT NULL,\ndescription varchar(255) NOT NULL,\nISBN char(13) NOT NULL,\ngenre_id INT NOT NULL,\nCONSTRAINT fk_genre\nFOREIGN KEY(genre_id) \nREFERENCES bookstore.genres(id) )\n\n  CREATE TABLE bookstore.books_authors \n( book_id  int REFERENCE

In [21]:
from datasets import Dataset

# Wrap the formatted prompts in a Hugging Face `datasets.Dataset` object.
# `data_set` is a plain list of prompt strings; it becomes the single column "text".
formatted_dataset = Dataset.from_dict({"text": data_set})

In [24]:
# Attach LoRA (Low-Rank Adaptation) adapters to the loaded model through
# FastLanguageModel's PEFT (Parameter-Efficient Fine-Tuning) helper.

# Modules that receive LoRA adapters: the attention projections first,
# followed by the feed-forward (MLP) projections.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

model = FastLanguageModel.get_peft_model(
    # Base model to adapt.
    model,

    # Rank of the LoRA matrices; any value > 0 works, with 8, 16, 32, 64 and 128
    # being the suggested choices. Larger ranks add capacity but cost more resources.
    r=16,

    # Names of the modules where LoRA is applied.
    target_modules=attention_projections + mlp_projections,

    # Scaling factor for the LoRA updates; larger values make them more impactful.
    lora_alpha=16,

    # Dropout applied to the LoRA layers. 0 disables dropout (the optimized setting);
    # non-zero values can help generalization in some scenarios.
    lora_dropout=0,

    # Bias handling for the LoRA layers. "none" adds no extra bias terms
    # (the optimized setting).
    bias="none",

    # Gradient checkpointing strategy for saving memory during training.
    # "unsloth" selects Unsloth's own implementation, described by the library as
    # using about 30% less VRAM and allowing larger batch sizes.
    # NOTE(review): True presumably selects standard checkpointing and False
    # disables it - confirm against the Unsloth documentation.
    use_gradient_checkpointing="unsloth",

    # Seed for random number generation, for reproducibility.
    random_state=3407,

    # Rank-stabilized LoRA is switched off.
    use_rslora=False,

    # LoftQ quantization configuration; None because LoftQ is not used.
    loftq_config=None,
)

# From here on, `model` carries the LoRA adapters and is ready for fine-tuning.


Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [25]:
# Build the supervised fine-tuning trainer in two steps: first the training
# hyperparameters, then the trainer that ties model, tokenizer and data together.

# Mixed precision follows the hardware: bfloat16 where the GPU supports it,
# float16 otherwise.
bf16_supported = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    # Examples per device in each forward/backward pass.
    per_device_train_batch_size=2,
    # Batches whose gradients are accumulated before each optimizer step
    # (2 x 4 gives 8 examples per optimizer step on one device).
    gradient_accumulation_steps=4,
    # Steps over which the learning rate warms up.
    warmup_steps=5,
    # Hard cap on optimizer steps; training stops once it is reached.
    max_steps=60,
    # Learning rate for the optimizer.
    learning_rate=2e-4,
    # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
    fp16=not bf16_supported,
    bf16=bf16_supported,
    # Log training metrics at every step.
    logging_steps=1,
    # 8-bit AdamW, which reduces optimizer memory usage.
    optim="adamw_8bit",
    # Weight decay for regularization.
    weight_decay=0.01,
    # Learning rate decreases linearly over training.
    lr_scheduler_type="linear",
    # Random seed for reproducibility.
    seed=3407,
    # Directory for training outputs and checkpoints.
    output_dir="outputs",
)

trainer = SFTTrainer(
    # Model to fine-tune and the tokenizer that goes with it.
    model=model,
    tokenizer=tokenizer,
    # Training data, and the column that holds the prompt text.
    train_dataset=formatted_dataset,
    dataset_text_field="text",
    # Longer sequences are truncated to this many tokens.
    max_seq_length=max_seq_length,
    # Number of worker processes used when preparing the dataset.
    dataset_num_proc=2,
    # Packing of short sequences is off; True can speed up training on short examples.
    packing=False,
    # Hyperparameters defined above.
    args=training_args,
)


Map (num_proc=2):   0%|          | 0/483 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [29]:
# Report the GPU model, its total memory, and the peak amount of memory that
# PyTorch has reserved so far. The value serves as a baseline taken before training.

# Divisor for converting a byte count to gigabytes (1024 * 1024 * 1024).
BYTES_PER_GIB = 1024 ** 3

# Properties of the GPU at device index 0.
gpu_stats = torch.cuda.get_device_properties(0)

# `max_memory_reserved()` is the peak reserved memory in bytes (a high-water mark,
# not the amount reserved at this instant); shown in GB with 3 decimals.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

# Total device memory, converted and rounded the same way.
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

# GPU name and capacity first, then the reserved-memory baseline.
print(f"GPU = {gpu_stats.name}")
print(f"Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB reserved memory.")


GPU = Tesla T4
Max memory = 14.748 GB.
2.283 GB reserved memory.


In [30]:
# Run fine-tuning with the `trainer` object built earlier in the notebook.
# The trainer was given only a training dataset, so the log reports training loss only.
# The return value is kept so that its `.metrics` can be inspected afterwards.

trainer_stats = trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 483 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 29,884,416


Step,Training Loss
1,0.4746
2,0.4739
3,0.5439
4,0.6304
5,0.508
6,0.3347
7,0.3986
8,0.3005
9,0.3783
10,0.3124


In [31]:
# Display the summary metrics recorded for the training run.
trainer_stats.metrics

{'train_runtime': 636.8694,
 'train_samples_per_second': 0.754,
 'train_steps_per_second': 0.094,
 'total_flos': 9029372464926720.0,
 'train_loss': 0.14600331485271453,
 'epoch': 0.9917355371900827}

**Summary of Metrics:**

Runtime: The training process took around 637 seconds.

Sample Throughput: The model processed roughly 0.754 samples per second.

Step Throughput: The model completed approximately 0.094 training steps per second.

Computational Cost: The training involved about 9.03 × 10^15 floating-point operations (roughly 9.03 quadrillion FLOPs).

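Training Loss: The reported `train_loss` is 0.146. This is the training loss averaged over all 60 steps rather than the loss of the last step,
and it reflects the model's fit on the training set only, since no evaluation set was used.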

Epoch Progress: Training stopped at the configured limit of 60 steps (`max_steps = 60`), which covers about 99.2% of one epoch: 60 steps × 8 examples per step = 480 of the 483 training examples.

In [34]:
# Switch the model into Unsloth's inference mode, which the library advertises
# as giving about 2x faster generation.
FastLanguageModel.for_inference(model)

# Build the inference prompt from the same `prompt` template that was used to
# format the training data. (The template is defined under the name `prompt`;
# the previously referenced `alpaca_prompt` is not defined in this notebook and
# fails with a NameError on a fresh kernel.)
# The placeholders are: text description, database schema, expected SQL.
formatted_prompt = prompt.format(
    "Retrieve the names of all employees who work in the 'Sales' department.",  # Text description of the desired query.
    '''CREATE TABLE employees (
    employee_id INTEGER PRIMARY KEY,
    name VARCHAR(100),
    department_id INTEGER
);

CREATE TABLE departments (
    department_id INTEGER PRIMARY KEY,
    department_name VARCHAR(100)
);''',  # Database schema the query must run against.
    "",  # Response slot left empty so that the model generates the SQL query.
)

# Tokenize the formatted prompt and move the tensors to the GPU.
# 'return_tensors="pt"' returns the tokenized inputs as PyTorch tensors.
inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")

# TextStreamer prints the generated text incrementally while generation runs.
from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer)

# Generate at most 128 new tokens and stream them as they are produced.
# The return value is discarded because the streamer already displays the output.
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)


Below is an instruction that describes a task, paired with an input that provides further context. Your goal is to convert the given text description into an appropriate SQL query using the provided database schema.

### Instruction:
Write an SQL query based on the following description.

### Input:
Text Description: Retrieve the names of all employees who work in the 'Sales' department.
Database Schema: CREATE TABLE employees (
    employee_id INTEGER PRIMARY KEY,
    name VARCHAR(100),
    department_id INTEGER
);

CREATE TABLE departments (
    department_id INTEGER PRIMARY KEY,
    department_name VARCHAR(100)
);

### Response:
SQL Query:
SELECT name FROM employees JOIN departments ON employees.department_id = departments.department_id WHERE department_name = 'Sales';<|endoftext|>


In [35]:
# Save the fine-tuned model and its tokenizer side by side in one directory.
# NOTE(review): because `model` is LoRA-wrapped, this presumably writes only the
# adapter weights rather than a full merged model - confirm before deploying.
save_dir = "finetuned_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('finetuned_model/tokenizer_config.json',
 'finetuned_model/special_tokens_map.json',
 'finetuned_model/tokenizer.model',
 'finetuned_model/added_tokens.json',
 'finetuned_model/tokenizer.json')

In [1]:
!zip -r archive_name.zip . -x "sample_data/*"

  adding: .config/ (stored 0%)
  adding: .config/default_configs.db (deflated 98%)
  adding: .config/.last_survey_prompt.yaml (stored 0%)
  adding: .config/configurations/ (stored 0%)
  adding: .config/configurations/config_default (deflated 15%)
  adding: .config/config_sentinel (stored 0%)
  adding: .config/gce (stored 0%)
  adding: .config/.last_opt_in_prompt.yaml (stored 0%)
  adding: .config/active_config (stored 0%)
  adding: .config/.last_update_check.json (deflated 23%)
  adding: .config/logs/ (stored 0%)
  adding: .config/logs/2024.08.05/ (stored 0%)
  adding: .config/logs/2024.08.05/13.23.28.842671.log (deflated 57%)
  adding: .config/logs/2024.08.05/13.23.06.500078.log (deflated 58%)
  adding: .config/logs/2024.08.05/13.22.45.846144.log (deflated 93%)
  adding: .config/logs/2024.08.05/13.23.17.376172.log (deflated 85%)
  adding: .config/logs/2024.08.05/13.23.29.421130.log (deflated 57%)
  adding: .config/logs/2024.08.05/13.23.18.241910.log (deflated 58%)
