In [1]:
!pip install flash_attn==2.5.8
!pip install torch==2.3.1
!pip install accelerate==0.31.0
!pip install transformers==4.41.2
!pip install datasets
!pip install transformers
!pip install trl
!pip install peft 
!pip install auto-gptq 
!pip install optimum
!pip install xformers
!pip install huggingface_hub
!pip install git+https://github.com/microsoft/LoRA

Collecting flash_attn==2.5.8
  Using cached flash_attn-2.5.8-cp310-cp310-linux_x86_64.whl
Installing collected packages: flash_attn
  Attempting uninstall: flash_attn
    Found existing installation: flash-attn 2.6.1
    Uninstalling flash-attn-2.6.1:
      Successfully uninstalled flash-attn-2.6.1
Successfully installed flash_attn-2.5.8
Collecting git+https://github.com/microsoft/LoRA
  Cloning https://github.com/microsoft/LoRA to /tmp/pip-req-build-uwj8iwpk
  Running command git clone --filter=blob:none --quiet https://github.com/microsoft/LoRA /tmp/pip-req-build-uwj8iwpk
  Resolved https://github.com/microsoft/LoRA to commit 4c0333854cb905966f8cc4e9a74068c1e507c7b7
  Preparing metadata (setup.py) ... [?25ldone
[?25h

> **Packages successfully installed**

# Model used: microsoft/Phi-3-mini-4k-instruct
Todo: bnb NF4 configs ``!Bitsandbytes NF4``

In [2]:
#load tokens
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

In [3]:
#logging into Hugging Face
!huggingface-cli login --token $hf_token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
from random import randrange

import torch
from datasets import load_dataset

from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed,
    pipeline
)
from trl import SFTTrainer

2024-07-18 09:07:32.979898: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-18 09:07:32.979968: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-18 09:07:32.981590: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# MODEL_ID is a string that specifies the identifier of the pre-trained model that will be fine-tuned. 
# In this case, the model is 'Phi-3-mini-4k-instruct' from Microsoft.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# NEW_MODEL_NAME is a string that specifies the name of the new model after fine-tuning.
# Here, the new model will be named 'opus-samantha-phi-3-mini-4k'.
NEW_MODEL_NAME = "sql-xp-phi-3-mini-4k"

In [55]:
# preparing datasets

# DATASET_NAME is a string that specifies the name of the dataset to be used for fine-tuning.
# Replace "replace with your dataset" with the actual name of your dataset.
DATASET_NAME = synthetic_text_to_sql_dataset_name = "gretelai/synthetic_text_to_sql"

# SPLIT specifies the portion of the dataset to be used. In this case, the 'train' split of the dataset will be used.
SPLIT = "train"

# Load the dataset specified by DATASET_NAME using the load_dataset function.
# The 'split="train"' argument specifies that we want to load the training split of the dataset.
dataset = load_dataset(DATASET_NAME)

dataset

# Extract relevant fields

# old
# def extract_fields_synthetic(example):
#     return {
#         "question": example["sql_prompt"],
#         "context": example["sql_context"],
#         "sql": example["sql"]
#     }
# new
def extract_fields_synthetic(example):
    return {
        "instruction": example["sql_prompt"],
        "input": example["sql_context"],
        "output": example["sql"]
    }
synthetic_extracted_dataset = dataset.map(extract_fields_synthetic, remove_columns=dataset['train'].column_names)



In [56]:
import random 

synthetic_extracted_train_dataset = synthetic_extracted_dataset["train"]
synthetic_extracted_test_dataset = synthetic_extracted_dataset["test"]

# Shuffle the dataset
synthetic_extracted_dataset = synthetic_extracted_dataset.shuffle(seed=random.randint(10,99))
synthetic_extracted_dataset = synthetic_extracted_dataset.shuffle(seed=random.randint(10,99))

print(synthetic_extracted_train_dataset)
print(synthetic_extracted_test_dataset)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 100000
})
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 5851
})


In [57]:
%whos

Variable                             Type                      Data/Info
------------------------------------------------------------------------
AutoModelForCausalLM                 type                      <class 'transformers.mode<...>to.AutoModelForCausalLM'>
AutoTokenizer                        type                      <class 'transformers.mode<...>tion_auto.AutoTokenizer'>
BitsAndBytesConfig                   type                      <class 'transformers.util<...>nfig.BitsAndBytesConfig'>
DATASET_NAME                         str                       gretelai/synthetic_text_to_sql
LoraConfig                           type                      <class 'peft.tuners.lora.config.LoraConfig'>
MAX_SEQ_LENGTH                       int                       2048
MODEL_ID                             str                       microsoft/Phi-3-mini-4k-instruct
NEW_MODEL_NAME                       str                       sql-xp-phi-3-mini-4k
PeftModel                            type      

In [58]:
# 'set_seed(1234)' sets the random seed for reproducibility.
set_seed(1234)

# MAX_SEQ_LENGTH is an integer that specifies the maximum length of the sequences that the model will handle.
MAX_SEQ_LENGTH = 2048

# num_train_epochs is an integer that specifies the number of times the training process will go through the entire dataset.
num_train_epochs = 1

# license is a string that specifies the license under which the model is distributed. In this case, it's Apache License 2.0.
license = "apache-2.0"

# username is a string that specifies the GitHub username of the person who is fine-tuning the model.
username = "spectrewolf8"

# learning_rate is a float that specifies the learning rate to be used during training.
learning_rate = 1.41e-5

# per_device_train_batch_size is an integer that specifies the number of samples to work through before updating the internal model parameters.
per_device_train_batch_size = 4

# gradient_accumulation_steps is an integer that specifies the number of steps to accumulate gradients before performing a backward/update pass.
gradient_accumulation_steps = 1

In [59]:
# 'torch' is a library for scientific computing that provides a wide range of functionalities for dealing with tensors, which are multi-dimensional arrays.

# 'torch.cuda.is_bf16_supported()' is a function that checks if BF16 is supported on the current GPU. BF16 is a data type that uses 16 bits, like float16, but allocates more bits to the exponent, which can result in higher precision.

# 'compute_dtype' is a variable that will hold the data type to be used for computations.

# 'attn_implementation' is a variable that will hold the type of attention implementation to be used.

# 'if torch.cuda.is_bf16_supported():' checks if BF16 is supported on the current GPU. If it is, the following block of code is executed.

# 'compute_dtype = torch.bfloat16' sets 'compute_dtype' to 'torch.bfloat16', which is the BF16 data type in PyTorch.

# 'attn_implementation = 'flash_attention_2'' sets 'attn_implementation' to 'flash_attention_2', which is a type of attention implementation.

# 'else:' specifies that the following block of code should be executed if BF16 is not supported on the current GPU.

# 'compute_dtype = torch.float16' sets 'compute_dtype' to 'torch.float16', which is the float16 data type in PyTorch.

# 'attn_implementation = 'sdpa'' sets 'attn_implementation' to 'sdpa', which is a type of attention implementation.

# 'print(attn_implementation)' prints the value of 'attn_implementation', which is the type of attention implementation to be used.

# 'print(compute_dtype)' prints the value of 'compute_dtype', which is the data type to be used for computations.
if torch.cuda.is_bf16_supported():
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'

attn_implementation = 'eager'
print(attn_implementation)
print(compute_dtype)

eager
torch.bfloat16


In [60]:
#load tokenizr to prepare dataset

# 'AutoTokenizer' is a class from the 'transformers' library that provides a generic tokenizer class from which all other tokenizer classes inherit.

# 'from_pretrained' is a method of the 'AutoTokenizer' class that loads a tokenizer from the Hugging Face Model Hub.

# 'tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)' loads the tokenizer associated with 'tokenizer_id' from the Hugging Face Model Hub and assigns it to the variable 'tokenizer'.

# 'tokenizer.padding_side' is a property of the 'tokenizer' object that determines on which side of the input sequences padding should be added. It can be set to either 'left' or 'right'.

# 'tokenizer.padding_side = 'right'' sets 'tokenizer.padding_side' to 'right', which means that padding will be added to the right side of the input sequences. This is done to prevent warnings that can occur when 'tokenizer.padding_side' is set to 'left'.

# Load the tokenizer associated with the pre-trained model specified by MODEL_ID using the AutoTokenizer class.
# The 'trust_remote_code=True' argument allows the execution of code from the model card (if any).
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer.padding_side = 'right' # to prevent warnings

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [61]:
def create_message_column(row):
    messages = []
    user = {
        "content": f"{row['instruction']}\n Input: {row['input']}",
        "role": "user"
    }
    messages.append(user)
    assistant = {
        "content": f"{row['output']}",
        "role": "assistant"
    }
    messages.append(assistant)
    return {"messages": messages}

def format_dataset_chatml(row):
    return {"text": tokenizer.apply_chat_template(row["messages"], add_generation_prompt=False, tokenize=False)}

In [62]:
# Apply create_message_column function
synthetic_extracted_train_dataset = synthetic_extracted_train_dataset.map(create_message_column)
synthetic_extracted_test_dataset = synthetic_extracted_test_dataset.map(create_message_column)

# Format dataset using ChatML
synthetic_extracted_train_dataset = synthetic_extracted_train_dataset.map(format_dataset_chatml)
synthetic_extracted_test_dataset = synthetic_extracted_test_dataset.map(format_dataset_chatml)

# Output the results to verify
print(synthetic_extracted_train_dataset)
print(synthetic_extracted_test_dataset)

Dataset({
    features: ['instruction', 'input', 'output', 'messages', 'text'],
    num_rows: 100000
})
Dataset({
    features: ['instruction', 'input', 'output', 'messages', 'text'],
    num_rows: 5851
})


In [63]:
synthetic_extracted_train_dataset = synthetic_extracted_train_dataset.select(range(10000))
synthetic_extracted_test_dataset = synthetic_extracted_test_dataset.select(range(2500))

In [64]:
# !pip install bitsandbytes-cuda110 bitsandbytes

In [65]:
# 'hf_model_repo' is the identifier for the Hugging Face repository where you want to save the fine-tuned model.
hf_model_repo="spectrewolf8/"+NEW_MODEL_NAME

# Load Model on GPU 

# 'device_map' is a dictionary that maps devices to model parts. In this case, it is set to {"": 0}, which means that the entire model will be loaded on GPU 0.
device_map = {"": 0}

# Bits and Bytes configuration for the model

# 'use_4bit' is a boolean that controls whether 4-bit precision should be used for loading the base model.
use_4bit = True

# 'bnb_4bit_compute_dtype' is the data type that should be used for computations with the 4-bit base model. In this case, it is set to 'bfloat16'.
bnb_4bit_compute_dtype = "bfloat16"

# 'bnb_4bit_quant_type' is the type of quantization that should be used for the 4-bit base model. In this case, it is set to 'nf4'.
bnb_4bit_quant_type = "nf4"

# 'use_double_quant' is a boolean that controls whether nested quantization should be used for the 4-bit base model.
use_double_quant = True

# LoRA configuration for the model

# 'lora_r' is the dimension of the LoRA attention.
lora_r = 16

# 'lora_alpha' is the alpha parameter for LoRA scaling.
lora_alpha = 16

# 'lora_dropout' is the dropout probability for LoRA layers.
lora_dropout = 0.05

# 'target_modules' is a list of the modules that should be targeted by LoRA.
target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]



In [66]:

bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_double_quant,
)

In [67]:
# 'AutoTokenizer' is a class from the Hugging Face Transformers library that provides a tokenizer for a given pre-trained model.

# 'from_pretrained' is a method of the 'AutoTokenizer' class that loads a tokenizer from a pre-trained model.

# 'trust_remote_code=True' is a parameter that allows the execution of remote code when loading the tokenizer.

# 'add_eos_token=True' is a parameter that adds an end-of-sentence token to the tokenizer.

# 'use_fast=True' is a parameter that uses the fast version of the tokenizer, if available.

# 'tokenizer.pad_token = tokenizer.unk_token' sets the padding token of the tokenizer to be the same as the unknown token.

# 'tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)' sets the ID of the padding token to be the same as the ID of the padding token.

# 'tokenizer.padding_side = 'left'' sets the side where padding will be added to be the left side.

# 'BitsAndBytesConfig' is a class that provides a configuration for quantization.

# 'bnb_config' is a variable that holds the configuration for quantization.

# 'AutoModelForCausalLM' is a class from the Hugging Face Transformers library that provides a model for causal language modeling.

# 'from_pretrained' is a method of the 'AutoModelForCausalLM' class that loads a model from a pre-trained model.

# 'torch_dtype=compute_dtype' is a parameter that sets the data type of the model to be the same as 'compute_dtype'.

# 'quantization_config=bnb_config' is a parameter that sets the configuration for quantization to be 'bnb_config'.

# 'device_map=device_map' is a parameter that sets the device map of the model to be 'device_map'.

# 'attn_implementation=attn_implementation' is a parameter that sets the type of attention implementation to be 'attn_implementation'.

# 'prepare_model_for_kbit_training' is a function that prepares a model for k-bit training.

# 'model = prepare_model_for_kbit_training(model)' prepares 'model' for k-bit training and assigns the result back to 'model'.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, add_eos_token=True, use_fast=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'left'

model = AutoModelForCausalLM.from_pretrained(
          MODEL_ID, torch_dtype=compute_dtype, trust_remote_code=True, quantization_config=bnb_config, device_map=device_map,
          attn_implementation=attn_implementation
)

model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [68]:
# 'wandb' is a library for machine learning experiment tracking, dataset versioning, and model management.

# 'import wandb' is a line of code that imports the 'wandb' library.

# 'wandb.login()' is a function that logs you into your Weights & Biases account. If you're not logged in, it will prompt you to enter your API key.

# This block of code is used to initialize Weights & Biases for experiment tracking.

# get wandb token
wandb_token = user_secrets.get_secret("WANDB_TOKEN")

import wandb
wandb.login(key = wandb_token)

run = wandb.init(
    project='Training and tuning Phi-3-mini-4k-instruct for SQL | kaggle-sql-xp-phi-3-mini-4k-instruct.ipynb', 
    job_type="training", 
    anonymous="allow"
)



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇███
train/grad_norm,▆▅▃▃▇▂▂▃▃▅▂▅▂▄▅▁▂▂▃█▂▂▂▃▅
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▆▄▅▇▁▂▃▃▆▂▂▃▄▆▁▂▂▄▅▁▂▃▃▆

0,1
total_flos,3917708208783360.0
train/epoch,1.0
train/global_step,250.0
train/grad_norm,0.3427
train/learning_rate,0.0002
train/loss,0.6112
train_loss,0.52477
train_runtime,753.0591
train_samples_per_second,1.328
train_steps_per_second,0.332


In [69]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="wandb"
)

In [70]:
# peft config
peft_config = LoraConfig(
    lora_alpha = lora_alpha,
    lora_dropout = lora_dropout,
    r = lora_r,
    target_modules=target_modules
)

In [71]:
# 'SFTTrainer' is a class that provides a trainer for fine-tuning a model.

# 'trainer' is a variable that holds the trainer.

# 'model=model' is a parameter that sets the model to be trained to be 'model'.

# 'train_dataset=dataset_chatml['train']' is a parameter that sets the training dataset to be 'dataset_chatml['train']'.

# 'eval_dataset=dataset_chatml['test']' is a parameter that sets the evaluation dataset to be 'dataset_chatml['test']'.

# 'peft_config=peft_config' is a parameter that sets the configuration for the Lora layer to be 'peft_config'.

# 'dataset_text_field="text"' is a parameter that sets the field in the dataset that contains the text to be 'text'.

# 'max_seq_length=512' is a parameter that sets the maximum sequence length for the model to be 512.

# 'tokenizer=tokenizer' is a parameter that sets the tokenizer to be 'tokenizer'.

# 'args=args' is a parameter that sets the training arguments to be 'args'.

# This line of code is used to create a trainer for fine-tuning the model with the specified parameters.
trainer = SFTTrainer(
        model=model,
        train_dataset=synthetic_extracted_train_dataset,
        eval_dataset=synthetic_extracted_test_dataset,
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=512,
        tokenizer=tokenizer,
        args=training_arguments,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]



In [72]:
# !pip install -U flash_attn

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [73]:
# 'trainer.train()' is a method that starts the training of the model. It uses the training dataset, model, and training arguments that were specified when the trainer was created.

# 'trainer.save_model()' is a method that saves the trained model to the local file system. The model will be saved in the output directory that was specified in the training arguments.

# This block of code is used to train the model and then save the trained model to the local file system.
# train
trainer.train()

# save model in local
trainer.save_model()



Step,Training Loss
10,0.7387
20,0.6201
30,0.534
40,0.5912
50,0.6345
60,0.4192
70,0.4727
80,0.5341
90,0.5407
100,0.6016




In [74]:
hf_model_repo= "spectrewolf8/sql-xp-phi-3-mini-4k"
trainer.push_to_hub(hf_model_repo)
trainer.model.push_to_hub(hf_model_repo)
tokenizer.push_to_hub(hf_model_repo)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/35.7M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/35.7M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/spectrewolf8/sql-xp-phi-3-mini-4k/commit/8498ab9b16bb2dd2127299d043fcfc4e896f9260', commit_message='Upload tokenizer', commit_description='', oid='8498ab9b16bb2dd2127299d043fcfc4e896f9260', pr_url=None, pr_revision=None, pr_num=None)

# Model stats

In [82]:
wandb.finish()
model.config.use_cache = True
model.eval()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▆▃▃█▁▂▄▂▁▂▃▅▂▂▅▁▁▆▁▁▂▂▆▂▂▅▁▂▄▁▁▂▃▅▁▃▄▂▁▂
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▆▅█▃▃█▂▃▅▅▇▄▅▇▂▃▇▁▃▃▄▇▃▄▇▁▂▆▁▃▃▅▇▃▄▇▁▂▄

0,1
total_flos,3.938091109810176e+16
train/epoch,1.0
train/global_step,2500.0
train/grad_norm,0.35459
train/learning_rate,0.0002
train/loss,0.5108
train_loss,0.47145
train_runtime,7554.136
train_samples_per_second,1.324
train_steps_per_second,0.331


Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): lora.Linear(
            (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=3072, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=3072, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
    

# Testing model

In [75]:
synthetic_extracted_test_dataset[10]

{'instruction': "How many decentralized applications have been downloaded from the 'Asia-Pacific' region?",
 'input': "CREATE TABLE dapp_ranking (dapp_id INT, dapp_name VARCHAR(50), dapp_category VARCHAR(30), dapp_rating DECIMAL(3,2), dapp_downloads INT, dapp_region VARCHAR(30)); INSERT INTO dapp_ranking (dapp_id, dapp_name, dapp_category, dapp_rating, dapp_downloads, dapp_region) VALUES (1, 'AsiaPacificDapp', 'Social', 4.3, 2000000, 'Asia-Pacific');",
 'output': "SELECT SUM(dapp_downloads) FROM dapp_ranking WHERE dapp_region = 'Asia-Pacific';",
 'messages': [{'content': "How many decentralized applications have been downloaded from the 'Asia-Pacific' region?\n Input: CREATE TABLE dapp_ranking (dapp_id INT, dapp_name VARCHAR(50), dapp_category VARCHAR(30), dapp_rating DECIMAL(3,2), dapp_downloads INT, dapp_region VARCHAR(30)); INSERT INTO dapp_ranking (dapp_id, dapp_name, dapp_category, dapp_rating, dapp_downloads, dapp_region) VALUES (1, 'AsiaPacificDapp', 'Social', 4.3, 2000000, 'Asi

In [76]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [77]:
input_phrase = """
insert 5 values
"""
context_phrase = """
CREATE TABLE tasks (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(100) NOT NULL,
    task_name VARCHAR(100) NOT NULL,
    userid INT NOT NULL,
    date DATE NOT NULL,
    FOREIGN KEY (userid) REFERENCES users(id)
);
"""
prompt = pipe.tokenizer.apply_chat_template([{"role": "user", "content": f"\n {input_phrase} Input:{context_phrase}"}], tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, num_beams=1, temperature=0.3, top_k=50, top_p=0.95, max_time= 180)
print(outputs[0]['generated_text'][len(prompt):].strip())



INSERT INTO tasks (name, task_name, userid, date) VALUES ('John', 'Task 1', 1, '2022-01-01'), ('Jane', 'Task 2', 2, '2022-01-02'), ('Bob', 'Task 3', 3, '2022-01-03'), ('Alice', 'Task 4', 4, '2022-01-04'), ('Charlie', 'Task 5', 5, '2022-01-05');


In [78]:
input_phrases = [
    "insert 5 values",
    "select all records",
    "update record with id 3",
    "delete all records where task_name is 'coding'",
    "add a new column 'status' to the table",
    "find all tasks with userid 2",
    "count the number of tasks per user",
    "list all tasks sorted by date",
    "join tasks with users",
    "find the average number of tasks per user"
]

context_phrases = [
    """
    CREATE TABLE tasks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        task_name VARCHAR(100) NOT NULL,
        userid INT NOT NULL,
        date DATE NOT NULL,
        FOREIGN KEY (userid) REFERENCES users(id)
    );
    """,
    """
    CREATE TABLE tasks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        task_name VARCHAR(100) NOT NULL,
        userid INT NOT NULL,
        date DATE NOT NULL,
        FOREIGN KEY (userid) REFERENCES users(id)
    );
    """,
    """
    CREATE TABLE tasks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        task_name VARCHAR(100) NOT NULL,
        userid INT NOT NULL,
        date DATE NOT NULL,
        FOREIGN KEY (userid) REFERENCES users(id)
    );
    """,
    """
    CREATE TABLE tasks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        task_name VARCHAR(100) NOT NULL,
        userid INT NOT NULL,
        date DATE NOT NULL,
        FOREIGN KEY (userid) REFERENCES users(id)
    );
    """,
    """
    CREATE TABLE tasks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        task_name VARCHAR(100) NOT NULL,
        userid INT NOT NULL,
        date DATE NOT NULL,
        FOREIGN KEY (userid) REFERENCES users(id)
    );
    """,
    """
    CREATE TABLE tasks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        task_name VARCHAR(100) NOT NULL,
        userid INT NOT NULL,
        date DATE NOT NULL,
        FOREIGN KEY (userid) REFERENCES users(id)
    );
    """,
    """
    CREATE TABLE tasks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        task_name VARCHAR(100) NOT NULL,
        userid INT NOT NULL,
        date DATE NOT NULL,
        FOREIGN KEY (userid) REFERENCES users(id)
    );
    """,
    """
    CREATE TABLE tasks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        task_name VARCHAR(100) NOT NULL,
        userid INT NOT NULL,
        date DATE NOT NULL,
        FOREIGN KEY (userid) REFERENCES users(id)
    );
    """,
    """
    CREATE TABLE tasks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        task_name VARCHAR(100) NOT NULL,
        userid INT NOT NULL,
        date DATE NOT NULL,
        FOREIGN KEY (userid) REFERENCES users(id)
    );
    """,
    """
    CREATE TABLE tasks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        task_name VARCHAR(100) NOT NULL,
        userid INT NOT NULL,
        date DATE NOT NULL,
        FOREIGN KEY (userid) REFERENCES users(id)
    );
    """
]

# Apply the chat template to create prompts
prompts = [pipe.tokenizer.apply_chat_template([{"role": "user", "content": f"\n {input_phrase} Input:{context_phrase}"}], tokenize=False, add_generation_prompt=True)
           for input_phrase, context_phrase in zip(input_phrases, context_phrases)]

# Generate SQL queries
outputs = [pipe(prompt, max_new_tokens=256, do_sample=True, num_beams=1, temperature=0.3, top_k=50, top_p=0.95, max_time=180)
           for prompt in prompts]

# Print the results
for i, output in enumerate(outputs):
    generated_text = output[0]['generated_text'][len(prompts[i]):].strip()
    print(f"Prompt {i+1}:")
    print(generated_text)
    print("\n")


Prompt 1:
INSERT INTO tasks (name, task_name, userid, date) VALUES (1, 'Task 1', 1, '2022-01-01'), (2, 'Task 2', 2, '2022-01-02'), (3, 'Task 3', 3, '2022-01-03'), (4, 'Task 4', 4, '2022-01-04'), (5, 'Task 5', 5, '2022-01-05');


Prompt 2:
SELECT * FROM tasks;


Prompt 3:
UPDATE tasks SET task_name = 'Updated Task Name' WHERE id = 3;


Prompt 4:
DELETE FROM tasks WHERE task_name = 'coding';


Prompt 5:
ALTER TABLE tasks ADD COLUMN status VARCHAR(20) NOT NULL DEFAULT 'pending';


Prompt 6:
SELECT * FROM tasks WHERE userid = 2;


Prompt 7:
SELECT userid, COUNT(*) as num_tasks FROM tasks GROUP BY userid;


Prompt 8:
SELECT * FROM tasks ORDER BY date;


Prompt 9:
SELECT t.name, t.task_name, u.name, u.email FROM tasks t JOIN users u ON t.userid = u.id;


Prompt 10:
SELECT AVG(task_count) FROM (SELECT userid, COUNT(*) as task_count FROM tasks GROUP BY userid) as subquery;




# Loading model from hugging face

In [79]:


# 'hf_model_repo' is a variable that holds the repository name for the Hugging Face model.

# This line of code is used to reference the repository name for the Hugging Face model.

# 'hf_model_repo' is a variable that holds the repository name for the Hugging Face model.

# 'username/modelname' is the repository name, where 'username' is the username of the repository owner and 'modelname' is the name of the model.

# This line of code is used to set the repository name for the Hugging Face model.
hf_model_repo= "spectrewolf8/sql-xp-phi-3-mini-4k"
     

# Retrieve the model and tokenizer from the Hub.

# 'device_map' is a variable that holds the mapping of the devices that are used for computation.

# 'compute_dtype' is a variable that holds the data type that is used for computation.

# This line of code is used to return the values of the 'device_map' and 'compute_dtype' variables.
device_map, compute_dtype
     

# This block of code is used to import the necessary libraries, set the seed for reproducibility, and load a pre-trained tokenizer and model.

# 'import torch' is a line of code that imports the PyTorch library, which is a popular open-source machine learning library.

# 'from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed' is a line of code that imports the 'AutoTokenizer', 'AutoModelForCausalLM', and 'set_seed' functions from the Hugging Face Transformers library.

# 'set_seed(1234)' is a line of code that sets the seed for the random number generator to '1234'. This is done to ensure that the results are reproducible.

# 'tokenizer = AutoTokenizer.from_pretrained(hf_model_repo,trust_remote_code=True)' is a line of code that loads a pre-trained tokenizer from the Hugging Face Model Hub. 'hf_model_repo' is the repository name for the model and 'trust_remote_code=True' allows the execution of code from the model file.

# 'model = AutoModelForCausalLM.from_pretrained(hf_model_repo, trust_remote_code=True, torch_dtype=compute_dtype, device_map=device_map)' is a line of code that loads a pre-trained model for causal language modeling from the Hugging Face Model Hub. 'hf_model_repo' is the repository name for the model, 'trust_remote_code=True' allows the execution of code from the model file, 'torch_dtype=compute_dtype' sets the data type for the PyTorch tensors, and 'device_map=device_map' sets the device mapping.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed

set_seed(1234)  # For reproducibility

tokenizer = AutoTokenizer.from_pretrained(hf_model_repo,trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(hf_model_repo, trust_remote_code=True, torch_dtype=compute_dtype, device_map=device_map) # compute "auto" dev_map "cuda"

tokenizer.json:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/35.7M [00:00<?, ?B/s]

In [80]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [83]:
input_phrase = """
insert 5 values
"""
context_phrase = """
CREATE TABLE tasks (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(100) NOT NULL,
    task_name VARCHAR(100) NOT NULL,
    userid INT NOT NULL,
    date DATE NOT NULL,
    FOREIGN KEY (userid) REFERENCES users(id)
);
"""
prompt = pipe.tokenizer.apply_chat_template([{"role": "user", "content": f"\n {input_phrase} Input:{context_phrase}"}], tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, num_beams=1, temperature=0.3, top_k=50, top_p=0.95, max_time= 180)
print(outputs[0]['generated_text'][len(prompt):].strip())

INSERT INTO tasks (name, task_name, userid, date) VALUES 
    ('Alice', 'Task 1', 1, '2022-01-01'),
    ('Bob', 'Task 2', 2, '2022-01-02'),
    ('Charlie', 'Task 3', 3, '2022-01-03'),
    ('David', 'Task 4', 4, '2022-01-04'),
    ('Eve', 'Task 5', 5, '2022-01-05');
