# Imports

In [1]:
import math
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import tiktoken
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, DatasetDict
from torchsummaryX import summary
import wandb
from dataclasses import dataclass
from tqdm import tqdm
import re
from multiprocessing import cpu_count
import random
import gc
import pickle

In [2]:
# set seeds
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)
cudnn.deterministic = True
cudnn.benchmark = False
random.seed(42)

# Load Data

In [3]:
datasets_train = load_dataset("Shannnh/hw5-changed", split = 'train')
datasets_val = load_dataset("Shannnh/hw5-changed", split = 'validation')
datasets_test = load_dataset("Shannnh/hw5-changed", split = 'test_ds')

In [4]:
print(datasets_train[0].keys())
print(len(datasets_train))
print(len(datasets_val))
print(len(datasets_test))

dict_keys(['Classifier', 'Prompt', 'Messages', 'PromptId'])
392632
27664
15434


In [5]:
# datasets_train = datasets_train.shuffle(seed=42).select(range(100))
# datasets_val = datasets_val.shuffle(seed=42).select(range(10))

In [6]:
datasets_train.unique('Classifier')

['Summarization', 'Question&Answer', 'SentimentAnalysis', 'NamedEntity']

# Hyperparameters

In [5]:
@dataclass
class IDeaLGPTConfig:

    # General
    batch_size: int = 16 # 16
    gradient_accumulation_steps: int = 4
    num_iters: int = 10000
    eval_iters: int = 3
    eval_interval: int = 1000
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    finetune_epochs: int = 3
    # device: str = 'cpu'

    # Model
    sequence_length: int = 256
    vocab_size: int = 50257 # gpt2 vocab
    num_blocks: int = 8
    num_heads: int = 8
    embed_dim: int = 512
    dropout: float = 0.1
    bias: bool = False

    # Data
    num_workers: int = 8
    train_test_split: float = 0.8
    SUBSET_PERCENTAGE: float =0.01 # % of OWT to train on, between 0 and 1

    # LR scheduler
    lr: float = 2e-3
    lr_decay: bool = True
    warmup_iters: int = 1000
    min_lr: float = 6e-6

    finetune_lr: float = 1e-4

    # optimizer
    weight_decay: float = 1e-1
    grad_clip: float = 1.0


config = IDeaLGPTConfig()
device = config.device
config

IDeaLGPTConfig(batch_size=16, gradient_accumulation_steps=4, num_iters=10000, eval_iters=3, eval_interval=1000, device='cuda', finetune_epochs=3, sequence_length=256, vocab_size=50257, num_blocks=8, num_heads=8, embed_dim=512, dropout=0.1, bias=False, num_workers=8, train_test_split=0.8, SUBSET_PERCENTAGE=0.01, lr=0.002, lr_decay=True, warmup_iters=1000, min_lr=6e-06, finetune_lr=0.0001, weight_decay=0.1, grad_clip=1.0)

In [6]:
print(f'Effective batch size = {config.batch_size * config.gradient_accumulation_steps}')

Effective batch size = 64


# Loading Data and Preprocessing

In [9]:
# hf_dataset = load_dataset("Skylion007/openwebtext", split='train') # only has one split - train
# hf_dataset = hf_dataset.with_format("torch")
# hf_dataset

In [10]:
# # data = dataset['train'].shuffle(seed=42).select(range(int(len(dataset['train']) * SUBSET_PERCENTAGE)))
# # hf_dataset = hf_dataset.select(range(int(len(hf_dataset) * config.SUBSET_PERCENTAGE)))
# hf_dataset = hf_dataset.train_test_split(train_size=config.train_test_split)
# hf_dataset

In [11]:
# train_hf_dataset, val_hf_dataset = hf_dataset['train'], hf_dataset['test']

## Tokenizer - OpenAI tiktoken (changed to GPT2Tokenizer)

In [7]:
#tokenizer = tiktoken.get_encoding("cl100k_base") # gpt4 tokenizer - NOTE: need to change vocab_size in config if used
#tokenizer = tiktoken.encoding_for_model('gpt-2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.encode('hello world')


[31373, 995]

In [8]:
tokenizer.model_max_length = config.sequence_length

In [9]:
tokenizer.pad_token_id

In [10]:
vocab_size = tokenizer.vocab_size #same as tiktoken
vocab_size

50257

In [11]:
# set pad_token_id equal to the eos_token_id if not set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [12]:
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + ' '.join(message['content'].split()[:150]) + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
# DEFAULT_CHAT_TEMPLATE = "{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + ' '.join(message['content'].split()[:150]) + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"

tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

In [29]:
'abcdefd'.find('d')

3

In [13]:
def apply_chat_template(example, tokenizer):
    messages = example["Messages"]
    #
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False, max_length=config.sequence_length, truncation=True)
    example["tokens"] = tokenizer.apply_chat_template(messages, tokenize=True, max_length=config.sequence_length, truncation=True)
    return example

column_names = list(datasets_train.features)
# datasets_train = datasets_train.map(apply_chat_template,
#                                 num_proc=8,
#                                 fn_kwargs={"tokenizer": tokenizer},
#                                 remove_columns=column_names,
#                                 desc="Applying chat template")
datasets_val = datasets_val.map(apply_chat_template,
                                num_proc=8,
                                fn_kwargs={"tokenizer": tokenizer},
                                remove_columns=column_names,
                                desc="Applying chat template")
datasets_test = datasets_test.map(apply_chat_template,
                                num_proc=8,
                                fn_kwargs={"tokenizer": tokenizer},
                                remove_columns=column_names,
                                desc="Applying chat template")


LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how he'll mark his landmark birthday are under wraps. His agent and publicist had no comment on his plans. "I'll definitely have some sort of party," he said in an interview. "Hopefully none of you will be reading about it." Radcliffe's earnings from the first five Potter films have been held in a trust fund which he has not been able to touch. Despite his growing fame and riches, the actor says he is keeping his feet firmly on the ground. "People are always looking to say 'kid star goes off the rails,'" he told reporters last month. "But I try very hard not to go that way because it would be too easy for them." His latest outing as the boy wizard in "Harry Potter and the Order of the Phoenix" is breaking records on both sides of the Atlantic and he will reprise the role in the last two films.  Watch I-Reporter give her review of Potter's latest » . There is life beyond Potter, however. The Londoner has filmed a TV movie called "My Boy Jack," about author Rudyard Kipling and his son, due for release later this year. He will also appear in "December Boys," an Australian film about four boys who escape an orphanage. Earlier this year, he made his stage debut playing a tortured teenager in Peter Shaffer's "Equus." Meanwhile, he is braced for even closer media scrutiny now that he's legally an adult: "I just think I'm going to be more sort of fair game," he told Reuters. E-mail to a friend . Copyright 2007 Reuters. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.

In [27]:
# what's in datasets now
# datasets_train : [{'text':'abcd','tokens':[1,2,3]},{'text':'bcd','tokens':[2,3]},...]
for index in random.sample(range(len(datasets_val)), 2):
    print(f"Sample {index} of the processed training set:\n\n{datasets_val[index]['text']}")
    # print(f"token: {datasets_val[index]['tokens']}")
    print(f"sample length: {len(datasets_val[index]['text'])}") 
    print(f"token length:{len(datasets_val[index]['tokens'])}")

Sample 13746 of the processed training set:

<|system|>
<|endoftext|>
<|user|>
Super_Bowl_50 With Rivera having been a linebacker with the Chicago Bears in Super Bowl XX, and Kubiak replacing Elway at the end of the Broncos' defeats in Super Bowls XXI and XXIV, this will be the first Super Bowl in which both head coaches played in the game themselves. In what Super Bowl did Rivera play?<|endoftext|>
<|assistant|>
Super Bowl XX,Super Bowl XX,XX<|endoftext|>

sample length: 416
token length:105
Sample 7223 of the processed training set:

<|system|>
<|endoftext|>
<|user|>
Summarize the following CNN article: Ben Cohen's mother-in-law is due in court after allegedly harassing the former rugby player following his split from her daughter. Felicity Bassouls, 67, was outspoken in her criticism of Cohen - labelling him a 'disrespectful bully' - after he broke up with Abby late last year amid rumours of an affair with his Strictly Come Dancing partner Kristina Rihanoff. The athlete and activist

In [20]:
# total_token_length = 0
# sample_count = len(datasets_train)

# # Calculate total token length across all samples
# for data in tqdm(datasets_train):
#     encoded_length = len(tokenizer.encode(data['text']))
#     total_token_length += encoded_length

# # Compute the average token length
# average_token_length = total_token_length / sample_count

# print(f"Average token length: {average_token_length}")

In [20]:
torch.cuda.empty_cache()
gc.collect()

482

In [21]:
# # save dataset

# def save_dataset(dataset, filename):
#     with open(filename, 'wb') as f:
#         pickle.dump(dataset, f)
# save_dataset(datasets_train, 'data/finetune/train.bin')
# save_dataset(datasets_val, 'data/finetune/val.bin')
# save_dataset(datasets_test, 'data/finetune/test.bin')

## Pytorch Dataset

For long texts, the current approach randomly selects segments of text that are equal to config.sequence_length. However, methods such as sliding windows could also be explored.

In [22]:
class TrainDataset(Dataset):
    def __init__(self, root_dir, split):
        """
        Args:
            root_dir (str): Dataset root directory containing the data files.
        """
        file_path = os.path.join(root_dir, "train.bin") if split == 'train' else os.path.join(root_dir, "val.bin")
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        tokens = sample['tokens']
        # if the number of tokens is more than the sequence_length, randomly choose a segment
        # if len(tokens) > config.sequence_length + 1:
        #     num_possible_starts = len(tokens) - config.sequence_length
        #     start = random.randint(0, num_possible_starts - 1)
        #     segment = tokens[start:start + self.sequence_length + 1]
        # else:
        #     segment = tokens

        if len(tokens) < config.sequence_length + 1:
            padded_tokens = np.pad(tokens, (0, config.sequence_length + 1 - len(tokens)), 'constant', constant_values=tokenizer.pad_token_id)
        else:
            padded_tokens = tokens[:config.sequence_length + 1]

        xb = torch.tensor(padded_tokens[:-1], dtype=torch.int64)
        yb = torch.tensor(padded_tokens[1:], dtype=torch.int64)
        return xb, yb


In [23]:
# class TestDataset(Dataset):
#     def __init__(self, root_dir):
#         """
#         Args:
#             root_dir (str): Dataset root directory containing the data files.
#         """
#         file_path = os.path.join(root_dir, "test.bin")
#         with open(file_path, 'rb') as f:
#             data = pickle.load(f)
#         self.data = data

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         sample = self.data[idx]
#         tokens = sample['tokens']
#         # if len(tokens) > config.sequence_length + 1:
#         #     num_possible_starts = len(tokens) - config.sequence_length
#         #     start = random.randint(0, num_possible_starts - 1)
#         #     segment = tokens[start:start + config.sequence_length + 1]
#         # else:
#         #     segment = tokens
#         if len(tokens) < config.sequence_length + 1:
#             padded_tokens = np.pad(tokens, (0, config.sequence_length + 1 - len(tokens)), 'constant', constant_values=tokenizer.pad_token_id)
#         else:
#             padded_tokens = tokens[:config.sequence_length + 1]

#         xb = torch.tensor(padded_tokens[:-1], dtype=torch.int64)
#         return xb


## Dataloader

In [24]:
# train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=1)
# val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers)

In [25]:
# for x, y in train_loader:
#     print(x.shape, y.shape)
#     break

In [26]:
'''
# poor man's dataloader
# but actual motivation is - im too lazy to write and deal with pad tokens in above method to read data
# since there are documents which are less than sequence length and they mess up the batch
# this method is cleaner, i get to learn something new (np.memmap!) and it's fun!

data_dir = os.path.join('data', 'owt')

def get_batch(split):
    file_path = os.path.join(data_dir, 'val' if split == 'val.bin' else 'train.bin')
    # memmap allows to read huge .bin files without loading entire thing. magic?
    data = np.memmap(file_path, mode='r', dtype=np.uint16) # fp16?
    idx = torch.randint(len(data) - config.sequence_length, (config.batch_size, ))
    xb = torch.stack([torch.from_numpy(data[i:i+config.sequence_length].astype(np.int64)) for i in idx], dim=0)
    yb = torch.stack([torch.from_numpy(data[i+1:i+config.sequence_length+1].astype(np.int64)) for i in idx], dim=0)
    if device == 'cuda':
        # pin_memory is an optimization to reserve some space in cpu mem which is used for moving to gpu
        # reduces overhead -> increases perf
        # non_blocking = True is async data transfer
        xb, yb = xb.pin_memory().to(device, non_blocking=True), yb.pin_memory().to(device, non_blocking=True)
    return xb, yb
'''

"\n# poor man's dataloader\n# but actual motivation is - im too lazy to write and deal with pad tokens in above method to read data\n# since there are documents which are less than sequence length and they mess up the batch\n# this method is cleaner, i get to learn something new (np.memmap!) and it's fun!\n\ndata_dir = os.path.join('data', 'owt')\n\ndef get_batch(split):\n    file_path = os.path.join(data_dir, 'val' if split == 'val.bin' else 'train.bin')\n    # memmap allows to read huge .bin files without loading entire thing. magic?\n    data = np.memmap(file_path, mode='r', dtype=np.uint16) # fp16?\n    idx = torch.randint(len(data) - config.sequence_length, (config.batch_size, ))\n    xb = torch.stack([torch.from_numpy(data[i:i+config.sequence_length].astype(np.int64)) for i in idx], dim=0)\n    yb = torch.stack([torch.from_numpy(data[i+1:i+config.sequence_length+1].astype(np.int64)) for i in idx], dim=0)\n    if device == 'cuda':\n        # pin_memory is an optimization to reserv

In [27]:

DATA_DIR        = 'data/finetune'

train_dataset   = TrainDataset(
    root_dir    = DATA_DIR,
    split   = "train"
)

val_dataset     = TrainDataset(
    root_dir    = DATA_DIR,
    split   = "val"
)

# test_dataset    = TestDataset(
#     root_dir    = DATA_DIR
# )
gc.collect()


0

In [28]:
xb, yb = train_dataset[0]
xb.shape, yb.shape

(torch.Size([256]), torch.Size([256]))

In [29]:
train_loader    = torch.utils.data.DataLoader(
    dataset     = train_dataset,
    batch_size  = config.batch_size,
    shuffle     = True,
    num_workers = 2,
    pin_memory  = True
)

val_loader      = torch.utils.data.DataLoader(
    dataset     = val_dataset,
    batch_size  = config.batch_size,
    shuffle     = False,
    num_workers = 1,
    pin_memory  = True
)

# test_loader     = torch.utils.data.DataLoader(
#     dataset     = test_dataset,
#     batch_size  = config.batch_size,
#     shuffle     = False,
#     num_workers = 1,
#     pin_memory  = True
# )

print("Batch Size           : ", config.batch_size)
print("Train Batches        : ", train_loader.__len__())
print("Val Batches          : ", val_loader.__len__())
# print("Test Batches         : ", test_loader.__len__())


Batch Size           :  16
Train Batches        :  24540
Val Batches          :  1729


In [30]:
''' Sanity Check '''

print("Checking the Shapes of the Data --\n")

for batch in train_loader:
    xb, yb = batch

    print(f"xb shape:\t\t{xb.shape}")
    print(f"yb shape:\t\t{yb.shape}\n")



    break

Checking the Shapes of the Data --

xb shape:		torch.Size([16, 256])
yb shape:		torch.Size([16, 256])



In [31]:
# I tried it, but failed.QAQ. It seems that using np.memmap requires synchronously recording the length of each data entry, which makes padding inconvenient.
'''
data_dir = '/content/hw5/'
def get_batch(split):
    file_path = os.path.join(data_dir, 'val.bin' if split == 'val' else 'train.bin')


    with open(file_path, 'rb') as f:
        data = pickle.load(f)

    xb = torch.empty((config.batch_size, config.sequence_length), dtype=torch.int64)
    yb = torch.empty((config.batch_size, config.sequence_length), dtype=torch.int64)

    for b in range(config.batch_size):
        tokens = data[b]['tokens']
        if len(tokens) < config.sequence_length:
            padded_tokens = np.pad(tokens, (0, config.sequence_length - len(tokens)), 'constant', constant_values=tokenizer.pad_token_id)
        else:
            padded_tokens = tokens[:config.sequence_length]


        xb[b] = torch.tensor(padded_tokens[:-1], dtype=torch.int64)
        yb[b] = torch.tensor(padded_tokens[1:], dtype=torch.int64)

    if device == 'cuda':
        xb, yb = xb.pin_memory().to(device, non_blocking=True), yb.pin_memory().to(device, non_blocking=True)

    return xb, yb
'''

"\ndata_dir = '/content/hw5/'\ndef get_batch(split):\n    file_path = os.path.join(data_dir, 'val.bin' if split == 'val' else 'train.bin')\n\n\n    with open(file_path, 'rb') as f:\n        data = pickle.load(f)\n\n    xb = torch.empty((config.batch_size, config.sequence_length), dtype=torch.int64)\n    yb = torch.empty((config.batch_size, config.sequence_length), dtype=torch.int64)\n\n    for b in range(config.batch_size):\n        tokens = data[b]['tokens']\n        if len(tokens) < config.sequence_length:\n            padded_tokens = np.pad(tokens, (0, config.sequence_length - len(tokens)), 'constant', constant_values=tokenizer.pad_token_id)\n        else:\n            padded_tokens = tokens[:config.sequence_length]\n\n\n        xb[b] = torch.tensor(padded_tokens[:-1], dtype=torch.int64)\n        yb[b] = torch.tensor(padded_tokens[1:], dtype=torch.int64)\n\n    if device == 'cuda':\n        xb, yb = xb.pin_memory().to(device, non_blocking=True), yb.pin_memory().to(device, non_blocki

# Model

In [32]:
class Head(nn.Module):
    # def __init__(self, embed_dim, head_size, sequence_length, dropout):
    def __init__(self, config, interim_head_size):
        super().__init__()
        self.embed_dim = config.embed_dim
        self.interim_head_size = interim_head_size # say embed_dim = 32 -> broken into say 4 heads, so this will be 8, to be concated back to 32
        self.key = nn.Linear(config.embed_dim, interim_head_size, bias=config.bias)
        self.query = nn.Linear(config.embed_dim, interim_head_size, bias=config.bias)
        self.value = nn.Linear(config.embed_dim, interim_head_size, bias=config.bias)
        self.register_buffer('tril', torch.tril(torch.ones((config.sequence_length, config.sequence_length))))

        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x) # (b,t,c) -> (b,t,h)
        q = self.query(x) # (b,t,c) -> (b,t,h)
        v = self.value(x) # (b,t,c) -> (b,t,h)
        wei = k @ q.transpose(-2, -1) * self.embed_dim**(-0.5) # (b,t,h) @ (b,h,t) -> (b,t,t)

        wei = wei.masked_fill((self.tril[:T, :T] == 0.), -torch.inf) # type: ignore
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        xbow = wei @ v # (b,t,t) @ (b,t,h) -> (b,t,h)
        return xbow

class MultiHeadAttention(nn.Module):
    # def __init__(self, num_heads, embed_dim, head_size, sequence_length, dropout):
    def __init__(self, config, interim_head_size):
        super().__init__()
        self.head_list = nn.ModuleList([Head(config, interim_head_size) for _ in range(config.num_heads)])
        self.proj = nn.Linear(config.embed_dim, config.embed_dim)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.head_list], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.layers = nn.Sequential(
            nn.Linear(config.embed_dim, 4*config.embed_dim),
            nn.GELU(),
            nn.Linear(4*config.embed_dim, config.embed_dim),
            nn.Dropout(config.dropout)
        )

    def forward(self, x):
        return self.layers(x)

class Block(nn.Module):
    # def __init__(self, num_heads, embed_dim, sequence_length, dropout):
    def __init__(self, config):
        super().__init__()
        self.interim_head_size = config.embed_dim // config.num_heads
        self.sa = MultiHeadAttention(config, self.interim_head_size)
        self.ff = FeedForward(config)
        self.ln1 = nn.LayerNorm(config.embed_dim)
        self.ln2 = nn.LayerNorm(config.embed_dim)

    def forward(self, x):
        x = x + self.sa(self.ln1(x)) # communication
        x = x + self.ff(self.ln2(x)) # computation
        return x


class Transformer(torch.nn.Module):
    # def __init__(self, embed_dim, vocab_size, sequence_length, num_heads, num_blocks, dropout):
    def __init__(self, config):
        super().__init__()
        self.sequence_length = config.sequence_length
        self.token_embeddings = torch.nn.Embedding(config.vocab_size, config.embed_dim)
        self.position_embeddings = nn.Embedding(config.sequence_length, config.embed_dim)
        self.block_list = nn.Sequential(*[Block(config)
                                          for _ in range(config.num_blocks)])
        self.final_ln = nn.LayerNorm(config.embed_dim)
        self.lm_head = nn.Linear(config.embed_dim, config.vocab_size)

    def forward(self, ixs, targets=None):
        # ixs: (b,t)
        # targets: (b,t)
        B, T = ixs.shape
        x = self.token_embeddings(ixs) # (b,t,c=embed_dim)
        pos_embeds = self.position_embeddings(torch.arange(T, device=device)) # (t,c=embed_dim)
        x += pos_embeds
        x = self.block_list(x)
        x = self.final_ln(x)
        logits = self.lm_head(x) # (b,t,c=vocab_size)
        if targets is None:
            loss = None
        else:
            mask = (ixs != tokenizer.pad_token_id)  # (b,t), True where not a pad token
            logits = logits.permute(0, 2, 1)  # (b,c,t)
    
            # Use the mask to filter out loss on padding positions
            # logits are now (b, c, t), targets are (b, t), mask is (b, t)
            # Utilizing .masked_fill to turn pad positions to a very large negative value to ignore them in softmax
            loss = F.cross_entropy(logits, targets, reduction='none')  # (b, t) get loss per token
            loss = (loss * mask).sum() / mask.sum()  # average loss only over non-pad tokens
        return logits, loss

    def generate(self, ixs, max_len):
        """
        ixs: (b,t) - input sequence to start generating from
        max_len: int - maximum length of the generated sequence
        """
        b, t = ixs.shape
        for _ in range(max_len):
            # generation (b, ) next tokens in parallel
            ixs_cond = ixs[:, -self.sequence_length:] # consider only the last sequence_length tokens
            logits, loss = self.forward(ixs_cond) # logits=(b,t,c), loss is ignored
            # get juse the final timestep
            last_logits = logits[:, -1, :] # (b,c)
            # normalize
            last_probs = F.softmax(last_logits, dim=-1) # across c
            next_tokens = torch.multinomial(last_probs, 1) # (b,c) -> (b)
            ixs = torch.cat((ixs, next_tokens), dim=1) # across t so (b,t) -> (b, t+1)
        return ixs

    def generate_prompt(self, ixs, max_len):
        """
        ixs: (1,t) - input sequence to start generating from
        max_len: int - maximum length of the generated sequence
        """
        b, t = ixs.shape
        for _ in range(max_len):
            ixs_cond = ixs[:, -self.sequence_length:] # consider only the last sequence_length tokens
            logits, loss = self.forward(ixs_cond) # logits=(b,t,c), loss is ignored
            # get juse the final timestep
            last_logits = logits[:, -1, :] # (1,c)
            # normalize
            last_probs = F.softmax(last_logits, dim=-1) # across c
            next_tokens = torch.multinomial(last_probs, 1) # (b,c) -> (1)
            print(next_tokens)
            ixs = torch.cat((ixs, next_tokens), dim=1) # across t so (b,t) -> (b, t+1)
        return ixs


# Training

In [33]:
# model = Transformer(embed_dim, vocab_size, sequence_length, num_heads, num_blocks, dropout).to(device)
model = Transformer(config).to(device)

In [34]:
summary(model, xb.to(device), yb.to(device))

                                                 Kernel Shape  \
Layer                                                           
0_token_embeddings                               [512, 50257]   
1_position_embeddings                              [512, 256]   
2_block_list.0.LayerNorm_ln1                            [512]   
3_block_list.0.sa.head_list.0.Linear_key            [512, 64]   
4_block_list.0.sa.head_list.0.Linear_query          [512, 64]   
5_block_list.0.sa.head_list.0.Linear_value          [512, 64]   
6_block_list.0.sa.head_list.0.Dropout_dropout               -   
7_block_list.0.sa.head_list.1.Linear_key            [512, 64]   
8_block_list.0.sa.head_list.1.Linear_query          [512, 64]   
9_block_list.0.sa.head_list.1.Linear_value          [512, 64]   
10_block_list.0.sa.head_list.1.Dropout_dropout              -   
11_block_list.0.sa.head_list.2.Linear_key           [512, 64]   
12_block_list.0.sa.head_list.2.Linear_query         [512, 64]   
13_block_list.0.sa.head_l

  df_sum = df.sum()


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_token_embeddings,"[512, 50257]","[16, 256, 512]",25731584.0,25731584.0
1_position_embeddings,"[512, 256]","[256, 512]",131072.0,131072.0
2_block_list.0.LayerNorm_ln1,[512],"[16, 256, 512]",1024.0,512.0
3_block_list.0.sa.head_list.0.Linear_key,"[512, 64]","[16, 256, 64]",32768.0,32768.0
4_block_list.0.sa.head_list.0.Linear_query,"[512, 64]","[16, 256, 64]",32768.0,32768.0
...,...,...,...,...
319_block_list.7.ff.layers.GELU_1,-,"[16, 256, 2048]",,
320_block_list.7.ff.layers.Linear_2,"[2048, 512]","[16, 256, 512]",1049088.0,1048576.0
321_block_list.7.ff.layers.Dropout_3,-,"[16, 256, 512]",,
322_final_ln,[512],"[16, 256, 512]",1024.0,512.0


In [35]:
# # poor man's lr scheduler. why? because cosine with warmup isn't readily available on torch (it's warm RESTARTS)
# # but idc about restarting eh?
# def get_lr(it):
#     "get lr at a specific iteration"
#     max_lr = config.lr
#     min_lr = config.min_lr
#     warmup_iters = config.warmup_iters
#     max_lr_decay_iters = config.num_iters # can also be made into another param
#     if it <= warmup_iters:
#         return max_lr * (it / warmup_iters)

#     if it > max_lr_decay_iters:
#         # decaying only up to a certain point, interesting
#         return min_lr
#     ratio = (it - warmup_iters) / (max_lr_decay_iters - warmup_iters) # how much % of decay cycle is done?
#     coeff = 0.5 * (1 + math.cos(math.pi * ratio)) # [0,1]
#     return min_lr + coeff * (max_lr - min_lr) # beautiful

In [36]:
# def test_lr():
#     import random
#     import matplotlib.pyplot as plt
#     x = [i for i in range(0,10000,100)]
#     y = [get_lr(i) for i in x]
#     plt.plot(x, y)
#     plt.show()

# test_lr()


In [37]:
# '''
# @torch.no_grad()
# def estimate_losses(config):
#     model.eval()
#     losses = {'train': -1., 'val': -1.}
#     for split in ['train', 'val']:
#         loss = 0
#         for _ in range(config.eval_iters):
#             # xb, yb = next(iter(val_loader))
#             # xb, yb = xb.to(device), yb.to(device)
#             xb, yb = get_batch('val')
#             loss += model(xb, yb)[1].item()
#         loss /= config.eval_iters
#         if split == 'train':
#             losses['train'] = loss
#         else:
#             losses['val'] = loss
#     model.train()
#     return losses
#     '''

In [46]:
@torch.no_grad()
def estimate_losses(config, train_loader, val_loader):
    model.eval()
    losses = {'train': -1., 'val': -1.}
    # train_loss = 0
    # train_iters = min(config.eval_iters, len(train_loader))
    # for i, (xb, yb) in enumerate(train_loader):
    #     if i >= train_iters:
    #         break
    #     xb, yb = xb.to(device), yb.to(device)
    #     _, loss = model(xb, yb)
    #     train_loss += loss.item()
    # losses['train'] = train_loss / train_iters

    # Evaluate validation loss (considering only config.eval_iters iterations)
    val_loss = 0
    val_iters = min(config.eval_iters, len(val_loader))
    for i, (xb, yb) in enumerate(val_loader):
        if i >= val_iters:
            break
        xb, yb = xb.to(device), yb.to(device)
        _, loss = model(xb, yb)
        val_loss += loss.item()
    losses['val'] = val_loss / val_iters

    model.train()
    return losses

In [39]:
#@title Load Pretrained
CKPT_PATH = 'exps/pretrain_v2/best_model.pth'
ckpt = torch.load(CKPT_PATH)
model.load_state_dict(ckpt)

<All keys matched successfully>

## WandB

In [40]:
# wandb.login(key="8f970021374ae46aad44762a771dd8136c460b2d")
run = wandb.init(
        name    = 'finetune_try2', ## Wandb creates random run names if you skip this field
        reinit = True, ### Allows reinitalizing runs when you re-run this cell
        # entity = 'thunderbuddies',
        # run_id = ### Insert specific run id here if you want to resume a previous run
        # resume = "must" ### You need this to resume previous runs, but comment out reinit = True when using this
        project = "ideal_gpt", ### Project should be created in your wandb account
        config = config, ### Wandb Config for your run,
        # mode='disabled'

    )

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mssgandhi1[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [41]:
start_ix.shape

NameError: name 'start_ix' is not defined

In [42]:
cur_iter = 0
best_val = 1e9
best_path = 'finetune_best_model.pth'
running_loss = 0.0
loss_counter=0
pbar = tqdm(total=config.num_iters, dynamic_ncols=True, leave=False, position=0, desc="Train")
e = 0
cur_iter = 0

Train:   0%|          | 0/10000 [00:00<?, ?it/s]

In [43]:
optimizer = torch.optim.AdamW(model.parameters(), lr=config.finetune_lr, weight_decay=config.weight_decay)
scaler = torch.cuda.amp.GradScaler(enabled=True)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_loader) * config.finetune_epochs, eta_min=1e-6)


# for generation
start_ix = torch.zeros((1,1), dtype=torch.long, device=device) # (newline character in a single batch)

In [44]:
config.eval_interval = 2000

In [47]:
for epoch in range(e, e+config.finetune_epochs):
    steps = 0
    pbar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc="Train")
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        cur_lr = optimizer.param_groups[0]['lr']
        with torch.cuda.amp.autocast():
            logits, loss = model(xb, yb)
        running_loss += loss.item()
        train_loss = running_loss / (loss_counter + 1)
        loss_counter += 1
        scaler.scale(loss).backward()
        if steps % config.gradient_accumulation_steps == 0:
            if config.grad_clip != 0.0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()
        wandb.log({
            'train_loss': train_loss,
            'iter': cur_iter,
            'lr': cur_lr
        })
        pbar.set_postfix(loss="{:.04f}".format(train_loss), lr=cur_lr)
        pbar.update()
        cur_iter += 1
        if cur_iter % config.eval_interval == 0:
        
            losses = estimate_losses(config, train_loader, val_loader)  # Now we pass val_loader to estimate_losses
            val_loss = losses['val']
            # train_loss = losses['train']
            print(f'Val @ Epoch {cur_iter}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}')
            wandb.log({
                'val_loss': val_loss,
                'iter': cur_iter,
                'lr': optimizer.param_groups[0]['lr']
            })
            if val_loss < best_val:
                best_val = val_loss
                torch.save(model.state_dict(), best_path)
                print(f'Saved best model to {best_path}')
            print('Sample Generation')
            print(tokenizer.decode(model.generate(start_ix, 100)[0].tolist()))

Train:   8%|▊         | 2000/24540 [08:05<1:31:05,  4.12it/s, loss=2.8509, lr=9.93e-5]

Val @ Epoch 4000: Train Loss=2.8509, Val Loss=2.8382
Saved best model to finetune_best_model.pth
Sample Generation
!|system|>
<|endoftext|>Summarize the following CNN article: (CNN) -- Palm tells you that after following the advice of colleagues and butler Hankinson, he isn't one of the better ones. Maybe he could even do an evening out. So, personal best wishes baby Mario 16, now known as too young, took her first letter from Ronnie Scott and wrote telling her how fantastic it would be. You know what? Jim Butler, the go-your-own guidebook can


Train:  16%|█▋        | 4000/24540 [16:14<1:23:20,  4.11it/s, loss=2.8260, lr=9.84e-5]

Val @ Epoch 6000: Train Loss=2.8260, Val Loss=2.8082
Saved best model to finetune_best_model.pth
Sample Generation
!|system|>
<|endoftext|>Police in Majestic province near Bengaluru, India, and the municipality of Jammu and Kashmir as well as Azad Kashmir and neighbouring states exited into the Indian Ocean. The federal government of India (Erdotō Moedai) denied (and tendence) the arrest of the alleged 3rd Charged Bankers of Delhi or the first Manorte (BID) by the Central Control Board along with the stated apologetic nebul (KIN),


Train:  24%|██▍       | 6000/24540 [24:24<1:15:00,  4.12it/s, loss=2.8088, lr=9.71e-5]

Val @ Epoch 8000: Train Loss=2.8088, Val Loss=2.7905
Saved best model to finetune_best_model.pth
Sample Generation
!|system|>
<|endoftext|>Mali President Mikael Mogilov says the conflict in Libya ended up being the greatest civil war that he has seen since the fall of Gaddafi and his political<|endoftext|>A member of the Prime Minister's Cabinet has voiced his Viewpoint position on Libya, so long as he ends with statements like 'betterUseOne'" in which he said misleading opposition of arms blasting abroad. The president was nominated for a Cabinet position in July by a narrow margin, but was seriously questioned by


Train:  33%|███▎      | 8000/24540 [36:41<2:01:48,  2.26it/s, loss=2.7961, lr=9.56e-5]

Val @ Epoch 10000: Train Loss=2.7961, Val Loss=2.7886
Saved best model to finetune_best_model.pth
Sample Generation
!|system|>
<|endoftext|>Key points: A series of expensive delays on Thanksgiving Wednesday more than half of the 50 minutes work scheduled to start for the Thanksgiving Day subseason. Career service in a number of the last four weeks ended before Lights Up, which fell spectacularly across the interior of mid-October, and accounted for one-fifth of a second of a<|endoftext|>New York's masters Houses of Congress raise funds for infrastructure workers thank Youc Con for building concessions and earning them commissioning vast add


Train:  41%|████      | 10000/24540 [45:17<58:51,  4.12it/s, loss=2.7847, lr=9.37e-5] 

Val @ Epoch 12000: Train Loss=2.7847, Val Loss=2.7814
Saved best model to finetune_best_model.pth
Sample Generation
!|system|>
<|endoftext|>Montevideo,_ Venezuelans, the Allies in the Luhansk administration list Latin saints as 51/50. According to the official figures by the International Council for Religious Death (ICRIC) in 24 hours, the Unknown ThirteenOrders number 49 in the following denominations: two Episcopalians, one born into the investigation of the conflict in Colombia (Anger, Cunha); two Americans, an Italian Asylum inmate of the United States, a Scottish


Train:  49%|████▉     | 12000/24540 [53:26<50:37,  4.13it/s, loss=2.7749, lr=9.14e-5]  

Val @ Epoch 14000: Train Loss=2.7749, Val Loss=2.7780
Saved best model to finetune_best_model.pth
Sample Generation
!|system|>
<|endoftext|>Summarize the following CNN article: Docks and flights are at a phenomenal speed as customers aboard Air France travel to Disney World in France, according to the National Grid corporation subsidiary. Most international airlines carry Moroccan software with their own complaining about weather forecasts, on 26 of which the UK gets promoted to per Devil January 1. But in the UK, it is an annual scrutiny rate of 1,750. for Air France flights, according to.ac UK ratings STAR bulb


Train:  57%|█████▋    | 14000/24540 [1:01:35<42:40,  4.12it/s, loss=2.7666, lr=8.89e-5]

Val @ Epoch 16000: Train Loss=2.7666, Val Loss=2.7669
Saved best model to finetune_best_model.pth
Sample Generation
!|system|>
<|endoftext|>It's been one of Anne Britain's most important cultural traits by Periodic Eysen Bowers' domination of life has Romulus Castellum for the first time and her last constituting an urban artist and philosopher. In Romulus and his Friends in Gillingham, Charshaw delineates the primacy between architecture and architecture on the twenty-fourth. No heritage is given and all humans have connections, cities or towns. The monasteries (names then


Train:  65%|██████▌   | 16000/24540 [1:09:44<34:31,  4.12it/s, loss=2.7589, lr=8.61e-5]  

Val @ Epoch 18000: Train Loss=2.7589, Val Loss=2.7702
Sample Generation
!|system|>
<|endoftext|>Benjamin Maye for a political fight against cancer and the anti-poison claim claiming that mammoths would help the diseases cause high obesity rates. researchers this week said the reason that the battle against cancer were caused by these claims reflected the horrifically morbidity and desperation of British women from colonisation. He insisted critics had managed to calm the situation down South Korea because clinical scientists rather than textured cells could be counted to accommodate disease. Some recent research suggested that the


Train:  73%|███████▎  | 18000/24540 [1:17:52<26:20,  4.14it/s, loss=2.7523, lr=8.3e-5]   

Val @ Epoch 20000: Train Loss=2.7523, Val Loss=2.7665
Saved best model to finetune_best_model.pth
Sample Generation
!|system|>
<|endoftext|><<|endoftext|>There's been controversy over the Census Ise Gymnastics is popular and happened to programme journalist and radio personality Louise Mensch. The manipulation focus on championships of higher and middle class Hockey, with New Year General, Croydon, Cardiff and San Marino having had an employment association run by Mensch running the show and positive events. Gymnastic friendships are common among PGA users, particularly after the Melbourne Open, and times when Wimbledon was also a


Train:  81%|████████▏ | 20000/24540 [1:26:01<18:21,  4.12it/s, loss=2.7458, lr=7.97e-5] 

Val @ Epoch 22000: Train Loss=2.7458, Val Loss=2.7705
Sample Generation
!|system|>
<|endoftext|>What's the sentiment of the sentence it does not think it has a good reputation following Russia's Moslem war in 1918, however, she has dismissed the appeal'. She has denied concerns about Russian-backed separatists trying to oust President Putin at international airports as a pretext for sanctions against Moscow and has denied they were uprooted. In Moscow her membership could be revoked. In a statement the President called upon Russia to'restart’ the Germans who had annexed Afghanistan in


Train:  90%|████████▉ | 22000/24540 [1:34:09<10:16,  4.12it/s, loss=2.7397, lr=7.62e-5]  

Val @ Epoch 24000: Train Loss=2.7397, Val Loss=2.7654
Saved best model to finetune_best_model.pth
Sample Generation
!|system|>
<|endoftext|>(<|endoftext|>Palm Park in the Newmarket railway station under the Strauss-Kahn act can be described as primitive, flying prints that stopped only for the surface to reveal and run in narrow service routes, in which case Piotr and Constantine consciously crashed, as Piotr inherited a unique type of groping which had become the means by which, at great Mahanzar, and others<|endoftext|>In the era of the Piotr Sam Dress Code, ladies often


Train:  98%|█████████▊| 24000/24540 [1:42:18<02:10,  4.13it/s, loss=2.7338, lr=7.25e-5]

Val @ Epoch 26000: Train Loss=2.7338, Val Loss=2.7654
Sample Generation
!|system|>
<|endoftext|>’|user|>
Summarize the following CNN article: A 'n Nutcracker' who stole £3million in lottery prize debts of more than 38 per cent, has filed a Destroyed Old Fillon pension to herself. Jeremiah topper, 48, claimed his employers raided their accounts minutes after collecting lump sum payments, and then left it all on a sofa in the house where he lived. Customers on his Isleworth work weekly, dressed in


Train:   1%|          | 169/24540 [00:41<1:38:16,  4.13it/s, loss=2.7313, lr=7.12e-5]  

KeyboardInterrupt: 

In [None]:
# torch.cuda.empty_cache()
# gc.collect()


# NUM_EPOCHS=5
# cur_iter=0

# while cur_iter <= NUM_EPOCHS:
#     optimizer.zero_grad(set_to_none=True)

#     cur_lr = get_lr(curb_iter) if config.lr_decay else config.lr
#     for param_group in optimizer.param_groups:
#         param_group['lr'] = cur_lr

#     # Iterate over batches from the DataLoader
#     steps = 0
#     for xb, yb in train_loader:
#         xb, yb = xb.to(device), yb.to(device)

#         with torch.cuda.amp.autocast():
#             logits, loss = model(xb, yb)
#         running_loss += loss.item()
#         train_loss = running_loss / (loss_counter + 1)
#         loss_counter += 1

#         scaler.scale(loss).backward()

#         steps += 1
#         if steps % config.gradient_accumulation_steps == 0:
#             if config.grad_clip != 0.0:
#                 scaler.unscale_(optimizer)
#                 torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)

#             scaler.step(optimizer)
#             scaler.update()
#             optimizer.zero_grad(set_to_none=True)

#         del xb, yb, logits, loss
#         torch.cuda.empty_cache()


#     if cur_iter % config.eval_interval == 0:
#         losses = estimate_losses(config, train_loader, val_loader)  # Now we pass val_loader to estimate_losses
#         val_loss = losses['val']
#         train_loss = losses['train']
#         print(f'Val @ Epoch {cur_iter}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}')
#         wandb.log({
#             'val_loss': val_loss,
#             'iter': cur_iter,
#             'lr': optimizer.param_groups[0]['lr']
#         })
#         if val_loss < best_val:
#             best_val = val_loss
#             torch.save(model.state_dict(), best_path)
#             print(f'Saved best model to {best_path}')
#         print('Sample Generation')
#         print(tokenizer.decode(model.generate(start_ix, 100)[0].tolist()))

#     # Log training metrics for current iteration
#     wandb.log({
#         'train_loss': train_loss,
#         'iter': cur_iter,
#         'lr': cur_lr
#     })
#     pbar.set_postfix(loss="{:.04f}".format(train_loss), lr=cur_lr)
#     pbar.update()

#     cur_iter += 1  # Increment iteration count

In [48]:
print(tokenizer.decode(model.generate(start_ix, 100)[0].tolist()))

!|system|>
<|endoftext|>Miami Crater, Florida Crater, lies in 1924 Southern Florida lies approximately 62 miles (72 km) from Florida to Florida.[citation needed] In July, in May, a 7-feet (1.2) tour to help expand the Florida coastline entered into service as the U.S. Navy Air Arm is due to drop a 500-mile violation directive against commercial air traffic control airspace above the ocean line. On August 17, 1942, the United States


In [None]:
def prompt(p, max_len=100):
    if not p:
        print('Enter non-empty string!')
        return

    tokens = torch.tensor(tokenizer.encode_ordinary(p))
    tokens = tokens.unsqueeze(0) # add batch dimension
    tokens = tokens.to(device)
    return tokenizer.decode(model.generate(tokens, max_len)[0].tolist())

In [None]:
<|system|>
<|endoftext|>
<|user|>
Summarize the following CNN article: Controversial CNN anchor Don Lemon has raised more eyebrows after inviting a member of the Ku Klux Klan to speak on his show about alleged racist chants by a fraternity at the University of Oklahoma. The news host interviewed James Moore, also known as Imperial Kludd of the Loyal White Knights, on his programme about race relations in America. The interview was conducted via Skype, with Moore in traditional KKK dress and sunglasses. CNN anchor Don Lemon interviewed a James Moore, also known as Imperial Kludd of the Loyal White Knights about race relations . It came as Mr Lemon attempted to talk to the KKK member about the controversy surrounding racist chanting by a fraternity at the University of Oklahoma and ongoing tension in Ferguson, Missouri. The exchange started with Moore claiming there were double standards when it came to race relations in unversities<|endoftext|>
<|assistant|>

In [49]:
def generate_text(prompt, max_seq):
    prompt = prompt.strip() # remove leading and ending white spaces - leads to weird things
    # Encode the prompt using the tokenizer

    # chat_template = f"<|system|>\n<|endoftext|>\n<|user|>\n{' '.join(prompt.split()[:100])}<|endoftext|>\n<|assistant|>"    
    chat_template = f"<|user|>\n{' '.join(prompt.split()[:100])}<|endoftext|>\n<|assistant|>"
    prompt_tokens = tokenizer.encode(chat_template, return_tensors='pt').to(device)

    # Generate text using the model
    generated_tokens = model.generate(prompt_tokens, max_seq)

    # Decode the tokens back to text
    generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)  # Remove batch dimension

    return generated_text

In [50]:
tmp = '''(CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don\'t know, but the fact that so many people can have a life extension, that\'s pretty big," Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. "Thanks for all the support and prayers," a comment on a Facebook page in her name read. "I know this entire journey is much bigger than all of us. I also know I\'m just the messenger." CNN cannot verify the authenticity of the page. But the power that multiplied Broussard\'s gift was<|endoftext|>\n<|assistant|>\nZully Broussard decided to give a kidney to a stranger.\nA new computer program helped her donation spur transplants for six kidney patients.'''
tmp

'(CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don\'t know, but the fact that so many people can have a life extension, that\'s pretty big," Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. "Thanks for all the support and prayers," a comment on a Facebook page in her name read. "I know this entire journey is much bigger than all of us. I also know I\'m just the messenger." CNN cannot verify the authenticity of the page. But the power that multiplied Broussard\'s gift was<|endoftext|>\n<|assistant|>\nZully Broussard decided to give a kidney to a stranger.\nA new computer program helped her donation spur transplants for six kidney patients.'

In [51]:
tokenizer.decode(val_dataset[0][0])

'<|system|>\n<|endoftext|>\n<|user|>\nSummarize the following CNN article: (CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don\'t know, but the fact that so many people can have a life extension, that\'s pretty big," Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. "Thanks for all the support and prayers," a comment on a Facebook page in her name read. "I know this entire journey is much bigger than all of us. I also know I\'m just the messenger." CNN cannot verify the authenticity of the page. But the power that multiplied Broussard\'s gift was<|endoftext|>\n<|assistant|>\nZully Broussard decided to give a kidney to a stranger.\nA new computer program 

In [52]:
print(generate_text(f'Summarize the following CNN article:\n {tmp}', 50))

<|user|>
Summarize the following CNN article: (CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don't know, but the fact that so many people can have a life extension, that's pretty big," Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. "Thanks for
<|assistant|>
Zully Broussard received pioneering kidneys to a person infected with HIV.
Broussard first Could transplant her kidney when she had kidney transplants.
She has the ability to perform the same function again using a dedicated tube of


In [None]:
hfd = load_dataset("Shannnh/hw5-changed", split = 'train')

In [None]:
hfd.unique('Classifier')

In [None]:
hfd.filter(lambda example: example['Classifier'] == 'SentimentAnalysis')[0]

In [None]:
print(generate_text(f"What's the sentiment of the sentence:\n I hate this movie", 5))

In [None]:
prompt('Hello world, my name is' , 1)