## Load Dataset

In [1]:
import torch
import pandas as pd
import sys
import wandb

import random
random.seed(0)

train = pd.read_parquet('~/ML_Projects/text-sql/data/train/train-00000-of-00001.parquet')
eval = pd.read_parquet('~/ML_Projects/text-sql/data/validation/validation-00000-of-00001.parquet')



In [2]:
train = train[:100]
eval = eval[:60]

In [3]:
def prompt_no_input(row):
    return ("Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{question}\n\n### Response:\n").format_map(row) 

In [4]:
row = train.iloc[0]
print(prompt_no_input(row))

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
How many heads of the departments are older than 56 ?

### Response:



In [5]:
train.columns

Index(['db_id', 'query', 'question', 'query_toks', 'query_toks_no_value',
       'question_toks'],
      dtype='object')

In [6]:
def prompt_no_input(row):
    return ("Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{question}\n\n### Response:\n").format_map(row)

In [7]:
row = train.iloc[0,:]
print(prompt_no_input(row))

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
How many heads of the departments are older than 56 ?

### Response:



In [8]:
def prompt_input(row):
    return ("Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{question}\n\n### Input:\n{db_id}\n\n### Response:\n").format_map(row)

In [9]:
print(prompt_input(row))

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
How many heads of the departments are older than 56 ?

### Input:
department_management

### Response:



In [10]:
def create_alpaca_prompt(row):
    return prompt_no_input(row) if row["db_id"] == "" else prompt_input(row)

In [11]:
print(create_alpaca_prompt(row))

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
How many heads of the departments are older than 56 ?

### Input:
department_management

### Response:



In [12]:
train_prompts = [create_alpaca_prompt(row) for i, row in train.iterrows()]
eval_prompts = [create_alpaca_prompt(row) for i, row in eval.iterrows()]

In [13]:
print(train_prompts[0])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
How many heads of the departments are older than 56 ?

### Input:
department_management

### Response:



In [14]:
for _,row in train.iterrows():
    print(row.query)
    break

SELECT count(*) FROM head WHERE age  >  56


In [15]:
def pad_eos(ds):
    EOS_TOKEN = "</s>"
    return [f"{row.query}{EOS_TOKEN}" for _,row in ds.iterrows()]

In [16]:
train_outputs = pad_eos(train)
eval_outputs = pad_eos(eval)
train_outputs[0]

'SELECT count(*) FROM head WHERE age  >  56</s>'

In [17]:
train_dataset = [{"prompt":s, "output":t, "example": s + t} for s, t in zip(train_prompts, train_outputs)]
eval_dataset = [{"prompt":s, "output":t, "example": s + t} for s, t in zip(eval_prompts, eval_outputs)]

In [18]:
train_dataset[0]

{'prompt': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nHow many heads of the departments are older than 56 ?\n\n### Input:\ndepartment_management\n\n### Response:\n',
 'output': 'SELECT count(*) FROM head WHERE age  >  56</s>',
 'example': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nHow many heads of the departments are older than 56 ?\n\n### Input:\ndepartment_management\n\n### Response:\nSELECT count(*) FROM head WHERE age  >  56</s>'}

In [19]:
print(train_dataset[0]["example"])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
How many heads of the departments are older than 56 ?

### Input:
department_management

### Response:
SELECT count(*) FROM head WHERE age  >  56</s>


## Tokenize the data

In [20]:
import tiktoken

In [21]:
model_id = 'gpt2'
tokenizer = tiktoken.get_encoding(model_id)


In [22]:
[tokenizer.encode(s['example'],allowed_special={'</s>'})for s in train_dataset]

[[21106,
  318,
  281,
  12064,
  326,
  8477,
  257,
  4876,
  11,
  20312,
  351,
  281,
  5128,
  326,
  3769,
  2252,
  4732,
  13,
  19430,
  257,
  2882,
  326,
  20431,
  32543,
  262,
  2581,
  13,
  198,
  198,
  21017,
  46486,
  25,
  198,
  2437,
  867,
  6665,
  286,
  262,
  13346,
  389,
  4697,
  621,
  7265,
  5633,
  198,
  198,
  21017,
  23412,
  25,
  198,
  10378,
  1823,
  62,
  27604,
  198,
  198,
  21017,
  18261,
  25,
  198,
  46506,
  954,
  7,
  28104,
  16034,
  1182,
  33411,
  2479,
  220,
  1875,
  220,
  7265,
  3556,
  82,
  29],
 [21106,
  318,
  281,
  12064,
  326,
  8477,
  257,
  4876,
  11,
  20312,
  351,
  281,
  5128,
  326,
  3769,
  2252,
  4732,
  13,
  19430,
  257,
  2882,
  326,
  20431,
  32543,
  262,
  2581,
  13,
  198,
  198,
  21017,
  46486,
  25,
  198,
  8053,
  262,
  1438,
  11,
  4642,
  1181,
  290,
  2479,
  286,
  262,
  6665,
  286,
  13346,
  6149,
  416,
  2479,
  13,
  198,
  198,
  21017,
  23412,
  25,
  198,
  103

In [23]:
max_sequence_len = 1024

def pack(dataset, max_seq_len=max_sequence_len):
    tkds_ids = [tokenizer.encode(s['example'],allowed_special={'</s>'})for s in train_dataset]
    
    all_token_ids = []
    for tokenized_input in tkds_ids:
        all_token_ids.extend(tokenized_input)# + [tokenizer.eos_token_id])
    
    print(f"Total number of tokens: {len(all_token_ids)}")
    packed_ds = []
    for i in range(0, len(all_token_ids), max_seq_len+1):
        input_ids = all_token_ids[i : i + max_seq_len+1]
        if len(input_ids) == (max_seq_len+1):
            packed_ds.append({"input_ids": input_ids[:-1], "labels": input_ids[1:]})  # this shift is not needed if using the model.loss
    return packed_ds

train_ds_packed = pack(train_dataset)
eval_ds_packed = pack(eval_dataset)
len(train_ds_packed)

Total number of tokens: 9664
Total number of tokens: 9664


9

In [24]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

torch.manual_seed(1)
batch_size = 4  # I have an A100 GPU with 40GB of RAM 😎

train_dataloader = DataLoader(
    train_ds_packed,
    batch_size=batch_size,
    collate_fn=default_data_collator, # we don't need any special collator 😎
)

eval_dataloader = DataLoader(
    eval_ds_packed,
    batch_size=batch_size,
    collate_fn=default_data_collator,
    shuffle=False,
)

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
train_dataloader.dataset

[{'input_ids': [21106,
   318,
   281,
   12064,
   326,
   8477,
   257,
   4876,
   11,
   20312,
   351,
   281,
   5128,
   326,
   3769,
   2252,
   4732,
   13,
   19430,
   257,
   2882,
   326,
   20431,
   32543,
   262,
   2581,
   13,
   198,
   198,
   21017,
   46486,
   25,
   198,
   2437,
   867,
   6665,
   286,
   262,
   13346,
   389,
   4697,
   621,
   7265,
   5633,
   198,
   198,
   21017,
   23412,
   25,
   198,
   10378,
   1823,
   62,
   27604,
   198,
   198,
   21017,
   18261,
   25,
   198,
   46506,
   954,
   7,
   28104,
   16034,
   1182,
   33411,
   2479,
   220,
   1875,
   220,
   7265,
   3556,
   82,
   29,
   21106,
   318,
   281,
   12064,
   326,
   8477,
   257,
   4876,
   11,
   20312,
   351,
   281,
   5128,
   326,
   3769,
   2252,
   4732,
   13,
   19430,
   257,
   2882,
   326,
   20431,
   32543,
   262,
   2581,
   13,
   198,
   198,
   21017,
   46486,
   25,
   198,
   8053,
   262,
   1438,
   11,
   4642,
   1181,
   290

In [27]:
b = next(iter(train_dataloader))
b

{'input_ids': tensor([[21106,   318,   281,  ..., 19430,   257,  2882],
         [20431, 32543,   262,  ...,    30,   198,   198],
         [23412,    25,   198,  ...,  5318,    62,  5589],
         [ 7054,   309,    17,  ..., 32543,   262,  2581]]),
 'labels': tensor([[  318,   281, 12064,  ...,   257,  2882,   326],
         [32543,   262,  2581,  ...,   198,   198, 21017],
         [   25,   198, 43323,  ...,    62,  5589, 15620],
         [  309,    17,  6177,  ...,   262,  2581,    13]])}

In [28]:
from types import SimpleNamespace

gradient_accumulation_steps = 2

config = SimpleNamespace(
    model_id='gpt2',
    dataset_name=train_dataloader,
    precision="bf16",  # faster and better than fp16, requires new GPUs
    n_freeze=1,  # How many layers we don't train, LLama 7B has 32.
    lr=2e-4,
    n_eval_samples=10, # How many samples to generate on validation
    max_seq_len=max_sequence_len, # Lenght of the sequences to pack
    epochs=3,  # we do 3 pasess over the dataset.
    gradient_accumulation_steps=gradient_accumulation_steps,  # evey how many iterations we update the gradients, simulates larger batch sizes
    batch_size=batch_size,  # what my GPU can handle, depends on how many layers are we training  
    log_model=False,  # upload the model to W&B?
    gradient_checkpointing = True,  # saves even more memory
    freeze_embed = True,  # why train this? let's keep them frozen ❄️
    seed=1,
)

config.total_train_steps = config.epochs * len(train_dataloader) // config.gradient_accumulation_steps


In [29]:
import sys
sys.path.append('/Users/jagpreetsingh/ML_Projects/text-sql/src-llm/components')
from gpt_model import GPTModel
from config import GPT_CONFIG_124M

model = GPTModel(GPT_CONFIG_124M)


In [30]:
def param_count(m):
    params = sum([p.numel() for p in m.parameters()])/1_000_000
    trainable_params = sum([p.numel() for p in m.parameters() if p.requires_grad])/1_000_000
    print(f"Total params: {params:.2f}M, Trainable: {trainable_params:.2f}M")
    return params, trainable_params

params, trainable_params = param_count(model)

Total params: 0.60M, Trainable: 0.60M


In [31]:
# freeze layers (disable gradients)
for param in model.parameters(): param.requires_grad = False
for param in model.out_head.parameters(): param.requires_grad = True
for param in model.trf_block[config.n_freeze].parameters(): param.requires_grad = True

In [32]:
# Just freeze embeddings for small memory decrease
if config.freeze_embed:
    model.tok_emb.weight.requires_grad_(False);
    # model.pos_emb.weight.requires_grad_(False);

In [33]:
# # # save more memory
# # if config.gradient_checkpointing:
#     model.gradient_checkpointing_enable()

In [34]:
params, trainable_params = param_count(model)

Total params: 0.60M, Trainable: 0.30M


In [35]:
from transformers import get_cosine_schedule_with_warmup

optim = torch.optim.AdamW(model.parameters(), lr=config.lr, betas=(0.9,0.99), eps=1e-5)
scheduler = get_cosine_schedule_with_warmup(
    optim,
    num_training_steps=config.total_train_steps,
    num_warmup_steps=config.total_train_steps // 10,
)

In [36]:
def loss_fn(x, y):
    "A Flat CrossEntropy" 
    return torch.nn.functional.cross_entropy(x.view(-1, x.shape[-1]), y.view(-1))

In [37]:
from types import SimpleNamespace
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained(config.model_id)
test_config = SimpleNamespace(
    max_new_tokens=100,
    gen_config=gen_config)

In [38]:
context = torch.zeros((1, 1), dtype=torch.long, device='cpu')
context.shape, type(context)
# print(tokenizer.decode(model.generate(context, max_new_tokens=1000)[0].tolist()))

(torch.Size([1, 1]), torch.Tensor)

In [39]:
def generate(prompt, max_new_tokens=test_config.max_new_tokens, gen_config=gen_config):
    tokenized_prompt = torch.tensor(tokenizer.encode(prompt)).view(1, -1)

    
    with torch.inference_mode():
        output = model.generate(tokenized_prompt, 
                            max_new_tokens=max_new_tokens, 
                            # generation_config=gen_config,
                            )
    return tokenizer.decode(output[0].tolist())

In [40]:
prompt = eval_dataset[0]["prompt"]

# prompt = (tokenizer.encode(prompt))
# # reshape the tensor to be 1x1


# my_tensor = torch.tensor(prompt)
# # prompt
# my_tensor = my_tensor.view(1, -1)
# my_tensor.shape

print(generate(prompt,))

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
How many singers do we have?

### Input:
concert_singer

### Response:
 Paint commission Finder RiceCompareathi Sister abolition Woody characterized sermonuably vibrant destabil record scholarship convenience cooperative Threeunit islandancy tirezek complain reignatures artisan 197onicauntu Predator ported legalicas tort Twitch estimation intensity War Serge informant smartphonesanks Telephone'm swallowing menace redpower whereby Phantsym Reasons Appendixastic although flows amendment exposures Income quaint Ge headerspt activitiesenaries Printed Abilities sauces pointsanty tobaccoUTC bankruptcyouk Islandersifle Lara magic decentralRace leasing suddenly Pilgrimrist thoughalleryflag licensee settlement129 hollow untrue documentariesatever calc juven dealer Kelvin


In [41]:
device = "cuda" if torch.cuda.is_available() else "cpu"
class Accuracy:
    "A simple Accuracy function compatible with HF models"
    def __init__(self):
        self.count = 0
        self.tp = 0.
    def update(self, logits, labels):
        logits, labels = logits.argmax(dim=-1).view(-1).cpu(), labels.view(-1).cpu()
        tp = (logits == labels).sum()
        self.count += len(logits)
        self.tp += tp
        return tp / len(logits)
    def compute(self):
        return self.tp / self.count
    

In [42]:
def prompt_table(examples, log=False, table_name="predictions"):
    table = wandb.Table(columns=["prompt", "generation", "concat", "output", "max_new_tokens", "temperature", "top_p"])
    for example in tqdm(examples, leave=False):
        prompt, gpt4_output = example["prompt"], example["output"]
        out = generate(prompt, test_config.max_new_tokens, test_config.gen_config)
        table.add_data(prompt, out, prompt+out, gpt4_output, test_config.max_new_tokens, test_config.gen_config.temperature, test_config.gen_config.top_p)
    if log:
        wandb.log({table_name:table})
    return table

# data loading
def get_batch(dataloader):
    for batch in dataloader:
        xb, yb = batch
        return xb.to(device), yb.to(device)
    
@torch.no_grad()
def validate():
    model.eval();
    eval_acc = Accuracy()
    loss, total_steps = 0., 0
    for step, batch in enumerate(pbar:=tqdm(eval_dataloader, leave=False)):
        pbar.set_description(f"doing validation")
        batch = get_batch(batch)

        total_steps += 1
        with torch.amp.autocast("cpu", dtype=torch.bfloat16):
            out = model(batch)
            loss += loss_fn(out.logits, batch["labels"])  # you could use out.loss and not shift the dataset
        eval_acc.update(out.logits, batch["labels"])
    # we log results at the end
    wandb.log({"eval/loss": loss.item() / total_steps,
               "eval/accuracy": eval_acc.compute()})
    prompt_table(eval_dataset[:config.n_eval_samples], log=True)
    model.train();

In [43]:
# for step, batch in enumerate(train_dataloader):
#     x, y = batch
#     print(x,y)

In [53]:
import tqdm

# Training

acc = Accuracy()

model.train()
train_step = 0
for epoch in range(config.epochs):

        xb, yb = get_batch(train_dataloader)
        with torch.amp.autocast("cpu", dtype=torch.bfloat16):
            out = model(xb, yb)
            loss = loss_fn(out.logits, batch["labels"]) / config.gradient_accumulation_steps  # you could use out.loss and not shift the dataset  
            loss.backward()
        if step%config.gradient_accumulation_steps == 0:
            # we can log the metrics to W&B
            wandb.log({"train/loss": loss.item() * config.gradient_accumulation_steps,
                       "train/accuracy": acc.update(out.logits, batch["labels"]),
                       "train/learning_rate": scheduler.get_last_lr()[0],
                       "train/global_step": train_step})
            optim.step()
            scheduler.step()
            optim.zero_grad(set_to_none=True)
            train_step += 1
validate()    

AttributeError: 'str' object has no attribute 'to'

In [45]:

GPT_CONFIG_124M = {
    "vocab_size" : 50257, # vocabulary size
    "batch_size" : 4, # batch size
    "ctx_len" : 32,     # context length
    "emb_dim" : 6,      # embedding dimesension
    "n_head" : 3,        # number of attention heads
    "n_layers" : 3,      # number of layers
    "learning_rate" : 1e-3, # learning rate
    "drop_rate" : 0.1,    # Dropout rate
    "qkv_bias" : False,  # qkv bias
    "eval_interval" : 100, # evaluation interval
    "max_iters" : 1000,
    "eval_iters" : 200
}

In [52]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
eval_interval = GPT_CONFIG_124M['eval_interval']
eval_iters = GPT_CONFIG_124M['eval_iters']
max_iters = GPT_CONFIG_124M['max_iters']


# data loading
def get_batch(dataloader):
    for batch in dataloader:
        X, Y = batch
        return X.to(device), Y.to(device)
    
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(train_dataloader)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()

    model.train()
    return out

    
for iter in range(max_iters):

        # every once in a while evaluate the loss on train and val sets
        if iter % eval_interval == 0 or iter == max_iters - 1:
            losses = estimate_loss()
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
        xb, yb = get_batch(train_dataloader)

        # evaluate the loss
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    # genarate a sequence of tokens
    # context = torch.zeros((1, 1), dtype=torch.long, device=device)
    # print(tokenizer.decode(model.generate(context, max_new_tokens=1000)[0].tolist()))
    # save the model
    # torch.save(model.state_dict(), 'model.pt')\
    # model.predict("SELECT * FROM users WHERE name = 'John'")
    



AttributeError: 'str' object has no attribute 'to'

In [165]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x176e05f70>