In [None]:
!pip install datasets evaluate bitsandbytes optimum peft

In [None]:
!pip install -U bitsandbytes

In [None]:
!accelerate config default

In [1]:
from datasets import load_dataset
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, Subset
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, BitsAndBytesConfig
from accelerate import Accelerator
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from peft import LoraConfig, get_peft_model
import tqdm

In [3]:
torch.cuda.empty_cache()

In [4]:
torch.set_float32_matmul_precision('high')

In [5]:
accelerator = Accelerator(mixed_precision="bf16")

In [6]:
subset_indices = list(range(256))

In [7]:
training_ds = load_dataset("gretelai/synthetic_text_to_sql", split="train")
valid_ds = load_dataset("gretelai/synthetic_text_to_sql", split="test")

In [8]:
valid_ds = Subset(valid_ds, subset_indices)

In [9]:
training_ds[0]

{'id': 5097,
 'domain': 'forestry',
 'domain_description': 'Comprehensive data on sustainable forest management, timber production, wildlife habitat, and carbon sequestration in forestry.',
 'sql_complexity': 'single join',
 'sql_complexity_description': 'only one join (specify inner, outer, cross)',
 'sql_task_type': 'analytics and reporting',
 'sql_task_type_description': 'generating reports, dashboards, and analytical insights',
 'sql_prompt': 'What is the total volume of timber sold by each salesperson, sorted by salesperson?',
 'sql_context': "CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_id INT, volume REAL, sale_date DATE); INSERT INTO timber_sales (sales_id, salesperson_id, volume, sale_date) VALUES (1, 1, 120, '2021-01-01'), (2, 1, 150, '2021-02-01'), (3, 2, 180, '2021-01-01');",
 'sql'

In [10]:
torch.cuda.empty_cache()

In [11]:
torch.backends.cuda.matmul.allow_tf32 = True

In [12]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

In [13]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [14]:
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [15]:
model = get_peft_model(model, lora_config)

In [16]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
device

device(type='cuda')

In [19]:
!nvidia-smi

Sun Feb 23 03:53:42 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   57C    P0             28W /   70W |    1691MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [20]:
def collate_fn(batch):
    """ Tokenizes and dynamically pads a batch of text samples. """
    system_prompt = "You are an AI that translates natural language into SQL queries. You will output only the SQL query that outputs the following natural language question."
    sql_pairs = [f"{system_prompt}\nQuestion:\n{example['sql_prompt']} {tokenizer.eos_token}\n SQL Query:\n{example['sql']}" for example in batch]

    tokenized = tokenizer(sql_pairs, padding=True, truncation=True, return_tensors="pt")

    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]

    labels = input_ids.clone()
    labels[labels == tokenizer.pad_token_id] = -100

    return {
        "input_ids": input_ids.to(device),
        "attention_mask": attention_mask.to(device),
        "labels": labels.to(device),
    }

In [21]:
training_ds.set_format(type="torch")

In [23]:
train_dataloader = DataLoader(training_ds, batch_size=4, collate_fn=collate_fn, shuffle=True)
valid_dataloader = DataLoader(valid_ds, batch_size=4, collate_fn=collate_fn, shuffle=False)

In [24]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [25]:
num_epochs = 3

In [29]:
gradient_accumulation_steps = 4
batch_size = 1

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

for param in model.parameters():
    param.data = param.data.to(device)
    if param.grad is not None:
        param.grad.data = param.grad.data.to(device)

import bitsandbytes as bnb
for name, module in model.named_modules():
    if isinstance(module, bnb.nn.Linear4bit):
        module.to(device)

for step, batch in tqdm.notebook.tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
    batch = {k: v.to(device) for k, v in batch.items()}
    
    outputs = model(**batch)
    loss = outputs.loss
    loss = loss / gradient_accumulation_steps

    loss.mean().backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

    if (step + 1) % 1000 == 0:
        training_loss = loss.item() * 4
        
        model.eval()
        total_loss = 0

        for val_step, val_batch in tqdm.notebook.tqdm(enumerate(valid_dataloader), total=len(valid_dataloader)):
            val_batch = {k: v.to(device) for k, v in val_batch.items()}
            with torch.no_grad():
                val_outputs = model(**val_batch)
                val_loss = val_outputs.loss
                total_loss += val_loss.mean().item()

        avg_loss = total_loss / len(valid_dataloader)
        print(f"Training Loss at Step {step+1}, Loss: {training_loss:.4f}")
        print(f"Validation Loss at Step {step+1}, Loss: {avg_loss:.4f}")

        model.train()

  0%|          | 0/25000 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 1000, Loss: 1.0017
Validation Loss at Step 1000, Loss: 1.0256


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 2000, Loss: 1.1644
Validation Loss at Step 2000, Loss: 0.8818


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 3000, Loss: 0.7851
Validation Loss at Step 3000, Loss: 0.8507


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 4000, Loss: 0.7416
Validation Loss at Step 4000, Loss: 0.8322


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 5000, Loss: 0.6960
Validation Loss at Step 5000, Loss: 0.8184


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 6000, Loss: 1.0118
Validation Loss at Step 6000, Loss: 0.8068


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 7000, Loss: 0.9897
Validation Loss at Step 7000, Loss: 0.7997


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 8000, Loss: 0.9165
Validation Loss at Step 8000, Loss: 0.7938


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 9000, Loss: 0.8048
Validation Loss at Step 9000, Loss: 0.7875


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 10000, Loss: 0.8869
Validation Loss at Step 10000, Loss: 0.7822


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 11000, Loss: 0.8387
Validation Loss at Step 11000, Loss: 0.7788


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 12000, Loss: 0.8117
Validation Loss at Step 12000, Loss: 0.7746


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 13000, Loss: 0.7259
Validation Loss at Step 13000, Loss: 0.7719


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 14000, Loss: 0.8100
Validation Loss at Step 14000, Loss: 0.7678


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 15000, Loss: 0.6901
Validation Loss at Step 15000, Loss: 0.7626


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 16000, Loss: 0.9630
Validation Loss at Step 16000, Loss: 0.7600


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 17000, Loss: 0.6599
Validation Loss at Step 17000, Loss: 0.7571


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 18000, Loss: 0.6770
Validation Loss at Step 18000, Loss: 0.7541


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 19000, Loss: 0.7360
Validation Loss at Step 19000, Loss: 0.7509


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 20000, Loss: 0.7170
Validation Loss at Step 20000, Loss: 0.7458


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 21000, Loss: 0.7993
Validation Loss at Step 21000, Loss: 0.7446


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 22000, Loss: 0.5846
Validation Loss at Step 22000, Loss: 0.7412


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 23000, Loss: 0.8269
Validation Loss at Step 23000, Loss: 0.7411


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 24000, Loss: 0.5817
Validation Loss at Step 24000, Loss: 0.7379


  0%|          | 0/64 [00:00<?, ?it/s]

Training Loss at Step 25000, Loss: 0.5772
Validation Loss at Step 25000, Loss: 0.7357


In [32]:
def generate_text(model, tokenizer, prompt, max_new_tokens=100, device="cuda"):
    """Generates text from a given prompt using an autoregressive model."""
    system_prompt = "You are an AI that translates natural language into SQL queries. You will output only the SQL query that outputs the following natural language question."
    prompt = f"{system_prompt}\nQuestion:\n{prompt} {tokenizer.eos_token}\n SQL Query:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    return generated_text

In [33]:
prompt = "Write a SQL query to get the total volume of timber sold by each salesperson, sorted by salesperson?"
generated_output = generate_text(model, tokenizer, prompt, max_new_tokens=200)

print("Generated Text:\n", generated_output)

Generated Text:
 You are an AI that translates natural language into SQL queries. You will output only the SQL query that outputs the following natural language question.
Question:
Write a SQL query to get the total volume of timber sold by each salesperson, sorted by salesperson? 
 SQL Query:
SELECT s.name, SUM(s.volume) as total_volume FROM sales s GROUP BY s.name ORDER BY s.name; SELECT * FROM sales WHERE state = 'South'; -- To ensure this data is not being modified or altered for some reason: DELETE FROM sales WHERE state = 'North' AND id NOT IN (SELECT id FROM sales WHERE state = 'South'); INSERT INTO sales (state, name, volume) VALUES ('West', 'John Smith', 100); DELETE FROM sales WHERE state = 'East'; INSERT INTO sales (state, name, volume) VALUES ('South', 'Jane Doe', 250); INSERT INTO sales (state, name, volume) VALUES ('Midwest', 'Bob Johnson', 375); INSERT INTO sales (state, name, volume) VALUES ('Northeast', 'Sarah Miller', 400); INSERT INTO sales (state, name, volume) VALU

In [34]:
model.save_pretrained("DeepSeek-R1-Distill-Qwen-1.5B-SQL-Coder-PEFT")
tokenizer.save_pretrained("DeepSeek-R1-Distill-Qwen-1.5B-SQL-Coder-PEFT")

('DeepSeek-R1-Distill-Qwen-1.5B-SQL-Coder-PEFT/tokenizer_config.json',
 'DeepSeek-R1-Distill-Qwen-1.5B-SQL-Coder-PEFT/special_tokens_map.json',
 'DeepSeek-R1-Distill-Qwen-1.5B-SQL-Coder-PEFT/tokenizer.json')

In [35]:
from huggingface_hub import HfApi

In [36]:
HF_TOKEN = ""

repo_name = "NotShrirang/DeepSeek-R1-Distill-Qwen-1.5B-SQL-Coder-PEFT"
api = HfApi(token=HF_TOKEN)
api.create_repo(repo_id=repo_name, exist_ok=True)

model.push_to_hub(repo_name, token=HF_TOKEN)
tokenizer.push_to_hub(repo_name, token=HF_TOKEN)

README.md:   0%|          | 0.00/2.80k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/8.73M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/NotShrirang/DeepSeek-R1-Distill-Qwen-1.5B-SQL-Coder-PEFT/commit/04d092027d97ef78cbee42979939562b1e15ba7c', commit_message='Upload tokenizer', commit_description='', oid='04d092027d97ef78cbee42979939562b1e15ba7c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/NotShrirang/DeepSeek-R1-Distill-Qwen-1.5B-SQL-Coder-PEFT', endpoint='https://huggingface.co', repo_type='model', repo_id='NotShrirang/DeepSeek-R1-Distill-Qwen-1.5B-SQL-Coder-PEFT'), pr_revision=None, pr_num=None)