<a href="https://colab.research.google.com/github/Prkhar05/Tiny_Text2SQL/blob/main/end_to_end_text2sql.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**INSTRUCTIONS** <br>
In the "installations" and "imports" sections, install the dependencies and log in to Hugging Face (hf login), then run the "functions" section. For fine-tuning, set the model name in the "model training" section and run that section.

To reload the fine-tuned model (e.g. if Google Colab disconnects), run the "loading trained model" section.

For testing, run the "imports and installations", "dataset loading" and "evaluate function" sub-sections of the "testing" section, then run the "testing" sub-section.

For testing using sql-eval:<br>
Add the line `if toks[idx]==',':idx+=1` to the file "content/test-suite-sql-eval/process_sql.py" between lines 259 and 260, then run the "testing" section.

# installations

In [None]:
!pip install -U -q accelerate bitsandbytes git+https://github.com/huggingface/transformers trl datasets "sqlglot[rs]" peft

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.1/245.1 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m385.2/385.2 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m37.1 MB/s

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# imports

In [None]:
import torch
from datasets import load_dataset,Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftConfig, PeftModel,get_peft_model
from trl import SFTTrainer

# functions

In [None]:
def prepare_model(model_name):
  """Build a QLoRA supervised fine-tuning trainer for ``model_name``.

  Loads the ``small_train`` split of the instruction dataset, loads the base
  model with 4-bit bitsandbytes quantization on GPU 0, and wires both into an
  ``SFTTrainer`` together with a LoRA adapter configuration.

  Args:
    model_name: Hugging Face Hub id (or local path) of the base causal LM.

  Returns:
    A ``(trainer, tokenizer)`` tuple. Training itself is not started here;
    call ``trainer.train()`` on the returned trainer.
  """
  # The instruction dataset to use
  dataset_name = "Cynaptics/Test2Sql_InstructionTuned"

  ################################################################################
  # QLoRA parameters
  ################################################################################

  # LoRA attention dimension
  lora_r = 16

  # Alpha parameter for LoRA scaling
  lora_alpha = 16

  # Dropout probability for LoRA layers
  lora_dropout = 0.1

  ################################################################################
  # bitsandbytes parameters
  ################################################################################

  # Activate 4-bit precision base model loading
  use_4bit = True

  # Compute dtype for 4-bit base models (name of a torch dtype attribute)
  bnb_4bit_compute_dtype = "float16"

  # Quantization type (fp4 or nf4)
  bnb_4bit_quant_type = "nf4"

  # Activate nested quantization for 4-bit base models (double quantization)
  use_nested_quant = False

  ################################################################################
  # TrainingArguments parameters
  ################################################################################

  # Output directory where the model predictions and checkpoints will be stored
  output_dir = "./results"

  # Number of training epochs (ignored while max_steps > 0, see below)
  num_train_epochs = 1

  # Enable fp16/bf16 training (set bf16 to True with an A100)
  fp16 = False
  bf16 = False

  # Batch size per GPU for training
  per_device_train_batch_size = 1

  # Batch size per GPU for evaluation
  per_device_eval_batch_size = 1

  # Number of update steps to accumulate the gradients for
  gradient_accumulation_steps = 4

  # Enable gradient checkpointing
  gradient_checkpointing = False

  # Maximum gradient norm (gradient clipping)
  max_grad_norm = 0.3

  # Initial learning rate (AdamW optimizer)
  learning_rate = 5e-4

  # Weight decay to apply to all layers except bias/LayerNorm weights
  weight_decay = 0.5

  # Optimizer to use
  optim = "paged_adamw_32bit"

  # Learning rate schedule
  lr_scheduler_type = "cosine"

  # Number of training steps (overrides num_train_epochs)
  max_steps = 100

  # Ratio of steps for a linear warmup (from 0 to learning rate)
  warmup_ratio = 0.03

  # Group sequences into batches with same length
  # Saves memory and speeds up training considerably
  group_by_length = True

  # Save checkpoint every X updates steps
  save_steps = 50

  # Log every X updates steps
  logging_steps = 20

  ################################################################################
  # SFT parameters
  ################################################################################

  # Maximum sequence length to use
  max_seq_length = 512

  # Pack multiple short examples in the same input sequence to increase efficiency
  packing = False

  # Load the entire model on the GPU 0
  device_map = {"": 0}

  dataset = load_dataset(dataset_name,split='small_train')

  compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=use_4bit,
      bnb_4bit_quant_type=bnb_4bit_quant_type,
      bnb_4bit_compute_dtype=compute_dtype,
      bnb_4bit_use_double_quant=use_nested_quant,
  )
  model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
  )
  # The generation KV cache is not needed while training.
  model.config.use_cache = False
  model.config.pretraining_tp = 1

  tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
  # Reuse EOS as the padding token and pad on the right for training batches.
  tokenizer.pad_token = tokenizer.eos_token
  tokenizer.padding_side = "right"

  # NOTE(review): the module names below span several architectures; presumably
  # only the names present in the loaded model are adapted — confirm per model.
  peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    target_modules=[
            "q_proj",
            "v_proj",
            "k_proj",
            "out_proj",
            "fc_in",
            "fc_out",
            "wte",
        ],
    bias="none",
    task_type="CAUSAL_LM",
  )

  training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    # push_to_hub = True,
    )

  # The trainer applies the LoRA adapter itself from peft_config, so the model
  # must not be wrapped with get_peft_model() beforehand (that would inject the
  # adapter twice).
  trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="prompt",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
  )

  # print_trainable_parameters() prints its report and returns None, so it is
  # called directly rather than being passed to print().
  if hasattr(trainer.model, "print_trainable_parameters"):
    trainer.model.print_trainable_parameters()

  return trainer,tokenizer

In [None]:
def train_model(trainer,model_name,new_model):
  """Fine-tune, save the LoRA adapter, then merge it into the base model.

  Args:
    trainer: Trainer returned by ``prepare_model``.
    model_name: Hub id (or path) of the base model the adapter was trained on.
    new_model: Directory name for the saved adapter. The merged model is
      saved next to it as ``new_model + "_final"``.

  Returns:
    The model returned by ``merge_and_unload()`` after loading the adapter
    onto a freshly loaded base model.
  """
  trainer.train()
  # Saves the adapter into `new_model`, resolved against the working directory.
  trainer.model.save_pretrained(new_model)

  # Reload a clean base model. quantization_config replaces the deprecated
  # load_in_4bit= keyword and requests the same default 4-bit quantization.
  base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    return_dict=True,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
  )

  # Load the adapter from the same path it was just saved to, instead of a
  # hard-coded /content/ prefix that only matches when the CWD is /content.
  base_model = PeftModel.from_pretrained(base_model, new_model)
  base_model.eval()

  base_model = base_model.merge_and_unload()
  base_model.save_pretrained(new_model+"_final")
  return base_model

# model training

In [None]:
# Base checkpoint to fine-tune. Other checkpoints this cell has been pointed at:
#   "NexaAIDev/Octopus-v2", "mistralai/Mistral-7B-Instruct-v0.2",
#   "PipableAI/pip-sql-1.3b", "google/gemma-2b"
model_name="google/codegemma-2b"
# Directory name used when saving the fine-tuned adapter.
new_model_name="gemma"

In [None]:
# Build the SFT trainer and tokenizer for the selected base model.
trainer,tokenizer=prepare_model(model_name)
# Ask PyTorch to release cached, currently unused CUDA memory before training.
torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 2,506,752 || all params: 2,508,679,168 || trainable%: 0.09992317997356608
paramaters info None


max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run fine-tuning; `model` is the merged model returned by train_model().
model=train_model(trainer,model_name,new_model_name)

Step,Training Loss
20,1.136
40,1.2253
60,1.151
80,0.9745
100,1.2519


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# Upload the merged model to the 'Cynaptics/ft_model' Hub repository.
model.push_to_hub('Cynaptics/ft_model')
# The tokenizer is not uploaded; uncomment to push it to the same repository.
# tokenizer.push_to_hub('Cynaptics/ft_model')

# loading trained model

In [None]:
# Point new_model_name at the merged-model directory ("<name>_final"). The
# suffix is only added once, so re-running this cell does not produce
# "<name>_final_final".
if not new_model_name.endswith("_final"):
    new_model_name=new_model_name+"_final"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# quantization_config replaces the deprecated load_in_4bit= keyword.
model = AutoModelForCausalLM.from_pretrained(
    new_model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# testing

## imports and installations

In [None]:
!pip install -q sqlparse==0.4.2 nltk==3.7 "sqlglot[rs]"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m732.5 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import tqdm

In [None]:
!git clone https://github.com/taoyds/test-suite-sql-eval.git

Cloning into 'test-suite-sql-eval'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 61 (delta 14), reused 11 (delta 10), pack-reused 39[K
Receiving objects: 100% (61/61), 619.45 KiB | 3.05 MiB/s, done.
Resolving deltas: 100% (25/25), done.


In [None]:
import gdown
def download_zip(file_id,output_file):
    """Fetch the Google Drive file ``file_id`` into ``output_file`` quietly."""
    drive_url = f'https://drive.google.com/uc?id={file_id}'
    gdown.download(drive_url, output_file, quiet=True)

In [None]:
download_zip('1mkCx2GOFIqNesD4y8TDAO1yX1QZORP5w','testsuitedatabases.zip')
!unzip -q testsuitedatabases.zip -d database/

In [None]:
!cp -r /content/database/database /content/test-suite-sql-eval

In [None]:
!cd /content/test-suite-sql-eval && mkdir -p out

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## dataset loading

In [None]:
# !pip install -q datasets
# !huggingface-cli login

In [None]:
from datasets import Dataset,load_dataset
# Hub dataset holding the text-to-SQL examples.
dataset_name='Cynaptics/Test2Sql'
# Only the 'sql_eval_test' split is loaded for evaluation.
dataset=load_dataset(dataset_name,split='sql_eval_test')
# Bare expression so the notebook displays the dataset summary.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/882 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/338k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/64.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/78.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/187k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.17M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/154k [00:00<?, ?B/s]

Generating small_validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/489956 [00:00<?, ? examples/s]

Generating small_train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating spider_test split:   0%|          | 0/4840 [00:00<?, ? examples/s]

Generating sql_eval_test split:   0%|          | 0/3509 [00:00<?, ? examples/s]

Dataset({
    features: ['schema', 'query', 'question'],
    num_rows: 3509
})

In [None]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code from the file. Only
# load the classical_test.pkl that ships with the cloned evaluation repo.
with open('/content/test-suite-sql-eval/classical_test.pkl', 'rb') as f:
    data = pickle.load(f)

# One db_id per pickled example. The filter below indexes this list with
# dataset row indices, so both are assumed to be in the same order.
databases=[d['db_id'] for d in data]

# Drop every example that belongs to one of these databases, from both the
# dataset and the parallel `databases` list.
databasestoberemoved=["advising","geography","atis"]
dataset=dataset.filter(lambda example, idx: databases[idx] not in databasestoberemoved, with_indices=True)
databases=[db for db in databases if db not in databasestoberemoved]
# Row indices (after the filtering above) that are left out of the gold file.
bad_list=[21, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 115, 116, 117, 118, 119, 120, 126, 132, 134, 143, 144, 145, 146, 147, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 170, 171, 172, 173, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 316, 317, 318, 319, 320, 325, 814, 815, 816, 817, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 957, 958, 959, 960, 961, 970, 978, 979, 980, 981, 982, 983, 984, 986, 987, 988, 999, 1002, 1006, 1007, 1009, 1010, 1014, 1079, 1080, 1081, 1082, 1083, 1084, 1089, 1092, 1093, 1094, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1119, 1121, 1122, 1123, 1124, 1127, 1128, 1129, 1133, 1134, 1141, 1146, 1147]

# A set gives O(1) membership tests; the kept indices are computed once and
# shared by both output files so they stay line-aligned.
skipped_rows=set(bad_list)
kept_rows=[i for i in range(len(databases)) if i not in skipped_rows]

# Gold file: "<query>\t<db_id>" per line. Opened once in write mode, which
# also truncates any file left over from a previous run.
with open("/content/test-suite-sql-eval/evaluation_examples/test_gold.txt", 'w') as f:
    for i in tqdm.tqdm(kept_rows):
        f.write(f"{dataset[i]['query']}\t{databases[i]}\n")

# Prediction file: one query per line. The gold queries themselves are
# written here.
with open("/content/test-suite-sql-eval/evaluation_examples/test_predict.txt", 'w') as f:
    for i in tqdm.tqdm(kept_rows):
        f.write(f"{dataset[i]['query']}\n")

Filter:   0%|          | 0/3509 [00:00<?, ? examples/s]

100%|██████████| 1148/1148 [00:00<00:00, 7561.45it/s]
100%|██████████| 1148/1148 [00:00<00:00, 9684.76it/s]


In [None]:
def prompt_pip_sql(d):
  """Render example ``d`` (keys 'schema' and 'question') as a prompt ending in an open <sql> tag."""
  schema, question = d['schema'], d['question']
  return f"""<schema>{schema}</schema>
    <question>{question}</question>
    <sql>"""
# One prompt string per row of `dataset`.
ds=[prompt_pip_sql(d) for d in dataset]
# NOTE(review): this rebinds the global bad_list to an empty list, so the
# filter on the next line keeps every prompt. Confirm this is intended, given
# that the gold file is written with a non-empty bad_list.
bad_list=[]
ds = [ds[i] for i in range(len(ds)) if i not in bad_list]
# Bare expression so the notebook displays the number of prompts.
len(ds)

3509

## cmd prompts for sql eval

In [None]:
# !rm -r /content/test-suite-sql-eval/out/out_academic_test.json

In [None]:
# !cd test-suite-sql-eval && python3 evaluate_classical.py --gold=classical_test.pkl --pred=evaluation_examples/academic_gold.txt --subset=academic --out_file=out/out_academic_test.json

In [None]:
# !cd test-suite-sql-eval && python3 evaluate_classical.py --gold=classical_test.pkl --pred=evaluation_examples/classical_test_gold.txt --out_file=out/out_academic_test.json

In [None]:
# !cd test-suite-sql-eval && python3 evaluation.py --gold evaluation_examples/test_gold.txt --pred evaluation_examples/test_predict.txt --etype all --db database --table tables.json --progress_bar_for_each_datapoint

## model loading (to be deleted)

In [None]:
!pip install -U -q accelerate bitsandbytes git+https://github.com/huggingface/transformers trl datasets "sqlglot[rs]" peft

In [None]:
!huggingface-cli login

In [None]:
import torch
from datasets import load_dataset,Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftConfig, PeftModel,get_peft_model
from trl import SFTTrainer

In [None]:
# Checkpoint to evaluate. Other checkpoints this cell has been pointed at:
#   "google/gemma-2b", "PipableAI/pip-sql-1.3b"
model_name="suriya7/Gemma2B-Finetuned-Sql-Generator"
# bitsandbytes options: 4-bit NF4 weights, float16 compute, no nested quantization.
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
# Place the whole model on GPU 0.
device_map = {"": 0}

In [None]:
# Resolve the dtype name ("float16") to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
# Quantization settings assembled from the variables defined in the previous cell.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)
# Load the evaluation checkpoint quantized, placed according to device_map.
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  quantization_config=bnb_config,
  device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Reuse EOS as the padding token; pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

## evaluate function

In [None]:
import sqlparse
import sqlglot
from sqlglot.optimizer import optimize

def _normalize_sql(sql):
    """Lower-case ``sql``, drop newlines/spaces/semicolons and turn \" into '."""
    return sql.lower().replace('\n', '').replace(' ','').replace(';','').replace('"',"'")


def _identifier_key(token_value):
    """Return the sorted, alias-free, lower-cased items of a comma-separated list."""
    import re  # local import: `re` is not imported by the notebook's import cells

    items=[]
    for item in token_value.split(","):
        # Strip the alias on a whitespace-delimited AS keyword only, so names
        # that merely contain "AS" (e.g. LASTNAME) are left intact.
        bare=re.split(r"\s+as\s+", item.strip(), maxsplit=1, flags=re.IGNORECASE)[0]
        items.append(bare.replace("\n","").replace(" ","").lower())
    # Sort after removing aliases so an alias cannot change the ordering.
    return sorted(items)


def evaluate_query(sql1,sql2):
    """Return True when two SQL strings are considered equivalent.

    The queries match if they are equal after whitespace/quote/case
    normalization. Otherwise both are tokenized with sqlparse and compared
    token by token: ordinary tokens case-insensitively, identifier lists
    ignoring item order and ``AS`` aliases.

    Args:
        sql1: First query (e.g. the model prediction).
        sql2: Second query (e.g. the gold query).

    Returns:
        True if the queries match under the rules above, else False. A blank
        or unparseable query never matches a non-blank one.
    """
    if _normalize_sql(sql1)==_normalize_sql(sql2):
        return True

    # An empty generation would make sqlparse.parse(...) return no statements
    # and the [0] lookup below raise IndexError.
    if not sql1.strip() or not sql2.strip():
        return False
    statements1=sqlparse.parse(sql1)
    statements2=sqlparse.parse(sql2)
    if not statements1 or not statements2:
        return False

    sql_parsed1=[t for t in statements1[0].tokens if not t.is_whitespace]
    sql_parsed2=[t for t in statements2[0].tokens if not t.is_whitespace]

    if len(sql_parsed1)!=len(sql_parsed2):
        return False

    for token1,token2 in zip(sql_parsed1,sql_parsed2):
        if isinstance(token1, sqlparse.sql.IdentifierList):
            # Column/table lists may be reordered and aliased differently.
            if _identifier_key(token1.value)!=_identifier_key(token2.value):
                return False
        elif token1.value.lower()!=token2.value.lower():
            return False
    return True

## testing

In [None]:
# import logging
# import warnings
# logging.getLogger("transformers").setLevel(logging.ERROR)
# warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Id of the last token of '</sql>', used as the stop token. Computed once
# instead of on every iteration.
sql_end_id=tokenizer.encode('</sql>')[-1]
correct=0
# Open once in write mode so a re-run starts from an empty file instead of
# appending to the predictions of an earlier run.
with open("results.txt", "w") as f:
  for i, prompt in enumerate(tqdm.tqdm(ds)):
    # Put the inputs on the same device as the model.
    token=tokenizer(prompt,return_tensors='pt').to(model.device)
    output=model.generate(**token,max_new_tokens=300,eos_token_id=[sql_end_id])
    # Decode only the newly generated tokens and cut at the closing tag.
    decode=tokenizer.decode(output[0][len(token.input_ids[0]):],skip_special_tokens=True).split('</sql>')[0]
    # Collapse whitespace so each prediction occupies exactly one line.
    f.write(" ".join(decode.split())+"\n")
    if evaluate_query(decode,dataset[i]['query']):
      correct+=1
print("number of correct answers: ",correct)

In [None]:
!cp -r /content/results.txt /content/test-suite-sql-eval

In [None]:
# !cd test-suite-sql-eval && python3 evaluate_classical.py --gold=classical_test.pkl --pred=results.txt --out_file=out/out_academic_test.json

In [None]:
from datetime import datetime
time=datetime.now()
!cd test-suite-sql-eval && python3 evaluation.py --gold evaluation_examples/test_gold.txt --pred evaluation_examples/results.txt --etype all --db database --table tables.json --progress_bar_for_each_datapoint
current_time=datetime.now()

In [None]:
# total_seconds() covers the whole interval; .seconds would drop whole days.
print("time taken: ",int((current_time-time).total_seconds()))

## debugging

In [None]:
# Generate a completion for the first prompt with `model` as a smoke test.
token=tokenizer(ds[0],return_tensors='pt')
# Stop at the last token id of '</sql>'; at most 300 new tokens.
output=model.generate(**token,max_new_tokens=300,eos_token_id=[tokenizer.encode('</sql>')[-1]])
# Decode only the tokens after the prompt and cut at the closing tag.
decode=tokenizer.decode(output[0][len(token.input_ids[0]):],skip_special_tokens=True).split("</sql>")[0]
print(decode)

In [None]:
# Load a fine-tuned checkpoint from the Hub in 4-bit for comparison.
# NOTE(review): this cell's recorded output reports load_in_4bit as deprecated
# and lists checkpoint weights that were not used — confirm the checkpoint
# loads as intended.
base_model = AutoModelForCausalLM.from_pretrained("Cynaptics/sft-gemma-1.1-2b-it", load_in_4bit=True)
# The tokenizer comes from the google/gemma-2b repository, not from the checkpoint repo.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/3.37G [00:00<?, ?B/s]

Some weights of the model checkpoint at Cynaptics/sft-gemma-1.1-2b-it were not used when initializing GemmaForCausalLM: ['model.layers.0.self_attn.k_proj.base_layer.weight', 'model.layers.0.self_attn.k_proj.base_layer.weight.absmax', 'model.layers.0.self_attn.k_proj.base_layer.weight.quant_map', 'model.layers.0.self_attn.k_proj.base_layer.weight.quant_state.bitsandbytes__nf4', 'model.layers.0.self_attn.k_proj.lora_A.default.weight', 'model.layers.0.self_attn.k_proj.lora_B.default.weight', 'model.layers.0.self_attn.q_proj.base_layer.weight', 'model.layers.0.self_attn.q_proj.base_layer.weight.absmax', 'model.layers.0.self_attn.q_proj.base_layer.weight.quant_map', 'model.layers.0.self_attn.q_proj.base_layer.weight.quant_state.bitsandbytes__nf4', 'model.layers.0.self_attn.q_proj.lora_A.default.weight', 'model.layers.0.self_attn.q_proj.lora_B.default.weight', 'model.layers.0.self_attn.v_proj.base_layer.weight', 'model.layers.0.self_attn.v_proj.base_layer.weight.absmax', 'model.layers.0.self

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [None]:
# Same smoke test as above, but generating with `base_model`.
token=tokenizer(ds[0],return_tensors='pt')
output=base_model.generate(**token,max_new_tokens=300,eos_token_id=[tokenizer.encode('</sql>')[-1]])
# Decode only the tokens after the prompt and cut at the closing tag.
decode=tokenizer.decode(output[0][len(token.input_ids[0]):],skip_special_tokens=True).split("</sql>")[0]
print(decode)



>


In [None]:
# Display the gold query of the first example for comparison with the generation.
dataset[0]['query']

'SELECT JOURNALalias0.HOMEPAGE FROM JOURNAL AS JOURNALalias0 WHERE JOURNALalias0.NAME = "PVLDB" ;'