In [1]:
# Helper functions
import transformers
import torch
import tqdm
import numpy as np

# CUDA gradient accumulation
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()

model = transformers.GPT2LMHeadModel.from_pretrained("gpt2_torch_openwebtext-10k")
# model = transformers.GPT2LMHeadModel(transformers.GPT2Config())
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2_torch_openwebtext-10k")
DEVICE = "cuda"

def break_text_to_pieces(text_path: str, tokenizer: transformers.PreTrainedTokenizer, block_len: int = 512) -> list[str]:
    """Read a file and convert it to tokenized blocks, edding <|endoftext|> to each block"""
    with open(text_path) as f:
        text = f.read()
    chunk_len0 = block_len - 1  # Leave space for a TOKEN_ENDOFTEXT
    tokens = tokenizer.encode(text)
    blocks = []
    pos = 0
    while pos < len(tokens):
        chunk = tokens[pos: pos + chunk_len0]
        chunk.append(TOKEN_ENDOFTEXT)
        blocks.append(chunk)
        pos += chunk_len0

    if len(blocks[-1]) < block_len:
        del blocks[-1]

    #for block in blocks:
    #    print(len(block))

    return blocks

def train_val_split(data: list[str], ratio: float):
    n = len(data)
    assert n >= 2
    n_val = max(1, int(n * ratio))
    return data[n_val:], data[:n_val]

def prepare_dsets(text_path: str, tokenizer: transformers.PreTrainedTokenizer, block_len: int):
    """Read the text, prepare the datasets """
    data = break_text_to_pieces(text_path, tokenizer, block_len)
    data_train, data_val = train_val_split(data, 0.2)
    return MyDset(data_train), MyDset(data_val)

# Custom Dataset
class MyDset(torch.utils.data.Dataset):
    def __init__(self, data: list[list[int]]):
        self.data = []
        for d in data:
            input_ids = torch.tensor(d, dtype=torch.int64)
            attention_mask = torch.ones(len(d), dtype=torch.int64)
            self.data.append({'input_ids': input_ids,
                'attention_mask': attention_mask, 'labels':input_ids})

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int):
        return self.data[idx]

## How one epoch of training happens
def train_one(model, loader, optimizer):
    model.train()
    losses = []
    for batch in tqdm.tqdm(loader):
        optimizer.zero_grad()
        with autocast():
            for k, v in batch.items():
                batch[k] = v.to(DEVICE)
            out = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
            loss = out['loss']
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        losses.append(loss.item())

    return np.mean(losses)



## How one epoch of validation happens 
def val_one(model: torch.nn.Module, loader: torch.utils.data.DataLoader):
    """Standard PyTorch eval, one epoch"""
    model.eval()
    losses = []
    for batch in tqdm.tqdm(loader):
        for k, v in batch.items():
            batch[k] = v.to(DEVICE)
        with torch.no_grad():
            out = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        # loss, logits, past_key_values
        loss = out['loss']
        losses.append(loss.item())

    return np.mean(losses)

  return torch._C._cuda_getDeviceCount() > 0


In [None]:
# Load model and tokenizer


TOKEN_ENDOFTEXT = 50256  # <|endoftext|>
BLOCK_LEN = 1024
TEXT_CORPUS = "dialogs_processed.txt"
model.to(DEVICE)
# Create datasets and loader
dset_train, dset_val = prepare_dsets(TEXT_CORPUS, tokenizer, BLOCK_LEN)
loader_train = torch.utils.data.DataLoader(dset_train, batch_size=1)
loader_val = torch.utils.data.DataLoader(dset_val, batch_size=1)

# Optimizer, device
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training loop
for i_epoch in range(1):
    loss_train = train_one(model, loader_train, optimizer)
    loss_val = val_one(model, loader_val)
    print(f'{i_epoch} : loss_train={loss_train}, loss_val={loss_val}')


# Save the model if needed
if True:
    model.save_pretrained('./gpt2_torch_conversational_openwebtext-10k/')
    tokenizer.save_pretrained('./gpt2_torch_conversational_openwebtext-10k/')



In [2]:
# Now our model is trained, try the generation
model = transformers.GPT2LMHeadModel.from_pretrained("gpt2_torch_conversational_openwebtext-10k")
history = "You are Bot. User will ask you questions. Please do your best to respond to the user helpfully. Keep your response inside one line, following \"Bot: \"\n\n"
model.to(DEVICE)
while True:
    text = input("User: ")
    history += "User: " + text
    batch = tokenizer([history], return_tensors='pt')
    for k, v in batch.items():
        batch[k] = v.to(DEVICE)
    out = model.generate(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], max_length=1000, temperature=1.8, top_k=30, do_sample=True )
    out = tokenizer.batch_decode(out.cpu())[0]
    input_length = len(history)
    history += "\nBot: " + out[input_length:input_length+100].split("User: ")[0] + "\n" #only save 100 characters of context from the bot
    print(out[input_length:].split("User: ")[0])
    

RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [None]:
print(out)

You are Bot. User will ask you questions. Please do your best to respond to the user helpfully.

User: Hi buddy
Bot: , i have a good job.
Bot: well, i'm not a doctor.

User: You can be a doctor
Bot: 
Bot: are you doing so?

User: Well, no. I am a software developer. But you can be whatever you want to be.
Bot: Bot: i like to do that.

User: I bet. Do you like music?
Bot: Bot: what's the matter?

User: Nothing, just asking if you like music.
Bot: Bot: well, that's not a nice job.

User: I'm sorry.
Bot: Bot: you're not a doctor.

User: That much is clear. You're not a doctor either.
Bot: Bot: you'll have to walk out over the world.

User: No, you.
Bot: you'll have to walk out over the world.
Bot: you need to do that.

User: No. You.
Bot: you need to do that.
Bot: you need to walk out over the world.

User: Ok fine, maybe I will walk out over the world. Are you happy now?
Bot: Bot: i want to do that.

User: Then do so!
Bot: Bot: then do you know what?

User: What?
Bot: Bot: i know i was a doctor from the beginning.
