In [1]:
import torch
from torch.utils.data import IterableDataset

class ConstantLengthDataset(IterableDataset):
    """Stream of fixed-length token sequences packed from a text dataset.

    Examples are pulled from ``dataset``, buffered until at least
    ``seq_length * chars_per_token * num_of_sequences`` characters have been
    collected, tokenized in a single call, joined with the tokenizer's EOS id
    as a separator and cut into ``seq_length``-sized tensors. A trailing chunk
    shorter than ``seq_length`` is dropped.

    When the underlying dataset is exhausted it is restarted, so iteration is
    endless for any dataset that contains text. Iteration stops only when a
    complete pass over the dataset contributes no characters at all (empty
    dataset or only empty ``"content"`` strings), because the buffer could
    then never fill.

    Args:
        tokenizer: Callable as ``tokenizer(list_of_str, truncation=False)``
            returning a mapping with an ``"input_ids"`` list of lists; must
            expose ``eos_token_id``.
        dataset: Re-iterable collection of mappings with a ``"content"`` string.
        seq_length: Number of token ids in every yielded tensor.
        num_of_sequences: Used only to size the character buffer.
        chars_per_token: Used only to size the character buffer.

    Yields:
        ``torch.Tensor`` of ``seq_length`` token ids.
    """

    def __init__(self, tokenizer, dataset, seq_length=1024,
                 num_of_sequences=1024, chars_per_token=3.6):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        # Character budget that must be buffered before each tokenizer call.
        self.input_characters = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = iter(self.dataset)
        # Characters contributed since `iterator` was (re)created. A full pass
        # that adds nothing means the buffer can never fill, so we stop instead
        # of spinning forever.
        chars_this_pass = 0
        while True:
            buffer, buffer_len = [], 0
            while buffer_len < self.input_characters:
                try:
                    text = next(iterator)["content"]
                except StopIteration:
                    if chars_this_pass == 0:
                        return
                    # Dataset exhausted: start another pass over it.
                    iterator = iter(self.dataset)
                    chars_this_pass = 0
                    continue
                buffer.append(text)
                buffer_len += len(text)
                chars_this_pass += len(text)

            # Concatenate all examples, separating them with the EOS token id.
            all_token_ids = []
            tokenized_inputs = self.tokenizer(buffer, truncation=False)
            for tokenized_input in tokenized_inputs["input_ids"]:
                all_token_ids.extend(tokenized_input+[self.concat_token_id])

            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i+self.seq_length]
                # Only full-length chunks are emitted.
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)

def eval_ds_fun(eval_ds):
    """Tokenize a batch of validation examples to fixed-length id lists.

    The ``"content"`` entries of ``eval_ds`` are passed to the module-level
    ``tokenizer`` with truncation and ``padding="max_length"`` at 1024 tokens.

    Returns:
        dict with one key, ``"eval_input_ids"``, holding the tokenizer's
        ``"input_ids"``.
    """
    encoded = tokenizer(
        eval_ds["content"],
        truncation=True,
        max_length=1024,
        padding="max_length",
    )
    return {"eval_input_ids": encoded["input_ids"]}

In [2]:
from argparse import Namespace

# Run configuration. Trailing comments give the value used for the small model.
# NOTE(review): num_warmup_steps (750) is larger than max_train_steps (20);
# confirm this is intended for this scaled-down run.
config = dict(
    epochs=1,
    train_batch_size=2,  # 12
    valid_batch_size=8,  # 12
    weight_decay=0.1,
    shuffle_buffer=1000,
    learning_rate=2e-4,  # 5e-4
    lr_scheduler_type="cosine",
    num_warmup_steps=750,  # 2000
    gradient_accumulation_steps=16,  # 1
    max_train_steps=20,  # 150000
    max_eval_steps=-1,
    seq_length=1024,
    seed=1,
    save_checkpoint_steps=50000,  # 15000
)

# Attribute-style view of the same settings (args.seed, args.seq_length, ...).
args = Namespace(**config)

In [3]:
# Echo the assembled run configuration.
print(args)

Namespace(epochs=1, train_batch_size=2, valid_batch_size=8, weight_decay=0.1, shuffle_buffer=1000, learning_rate=0.0002, lr_scheduler_type='cosine', num_warmup_steps=750, gradient_accumulation_steps=16, max_train_steps=20, max_eval_steps=-1, seq_length=1024, seed=1, save_checkpoint_steps=50000)


In [4]:
import wandb
import logging
import datasets
import transformers
from accelerate import Accelerator
from torch.utils.tensorboard import SummaryWriter

# Shared Accelerator instance: later cells read its process info
# (process_index, is_main_process, state) and call prepare/gather/backward on it.
accelerator = Accelerator()

def setup_logging(project_name):
    """Configure Python logging plus the experiment trackers for this process.

    Every process logs to ``log/debug_<process_index>.log`` and to the stream
    handler. The main process additionally starts a wandb run, creates a
    TensorBoard ``SummaryWriter`` with the hyper-parameters from the
    module-level ``args``, and raises library verbosity; other processes are
    limited to errors.

    Args:
        project_name: wandb project to log into.

    Returns:
        ``(logger, tb_writer, run_name)``; ``tb_writer`` is ``None`` and
        ``run_name`` is ``''`` on non-main processes.
    """
    import os  # local import: `os` is only imported by a later notebook cell

    logger = logging.getLogger(__name__)
    # FileHandler opens its file immediately and does not create missing parent
    # directories, so make sure "log/" exists before building the handler.
    os.makedirs("log", exist_ok=True)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, handlers=[
                    logging.FileHandler(f"log/debug_{accelerator.process_index}.log"),
                    logging.StreamHandler()]
    )
    if accelerator.is_main_process: # We only want to set up logging once
        wandb.init(project=project_name, config=args, name="this_is", id="random",
                   resume="auto")
        run_name = wandb.run.name
        tb_writer = SummaryWriter()
        tb_writer.add_hparams(vars(args), {'0': 0})
        logger.setLevel(logging.INFO)
        datasets.utils.logging.set_verbosity_debug()
        transformers.utils.logging.set_verbosity_info()
    else:
        tb_writer = None
        run_name = ''
        logger.setLevel(logging.ERROR)
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    return logger, tb_writer, run_name

2023-01-14 18:05:36.732230: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-14 18:05:36.957494: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-01-14 18:05:37.402349: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-14 18:05:37.402400: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [5]:
from torch.utils.data.dataloader import DataLoader

def create_dataloaders(dataset_name, max_eval_samples=200):
    """Build the training and evaluation dataloaders.

    The training split ``<dataset_name>-train`` is streamed, shuffled with
    ``args.shuffle_buffer`` / ``args.seed`` and packed into fixed-length
    sequences by ``ConstantLengthDataset``. The validation split
    ``<dataset_name>-valid`` is loaded eagerly, tokenized with ``eval_ds_fun``
    and turned into a tensor of input ids.

    Args:
        dataset_name: Hub dataset prefix; ``-train`` / ``-valid`` is appended.
        max_eval_samples: Number of leading validation rows to evaluate on.
            Defaults to 200, the cap previously hard-coded here; pass ``None``
            to use the whole validation split.

    Returns:
        ``(train_dataloader, eval_dataloader)``.
    """
    train_data = load_dataset(dataset_name+'-train', split="train",
                              streaming=True)
    train_data = train_data.shuffle(buffer_size=args.shuffle_buffer,
                                    seed=args.seed)
    train_dataset = ConstantLengthDataset(tokenizer, train_data,
                                          seq_length=args.seq_length)
    train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size)

    valid_data = load_dataset(dataset_name+'-valid', split="validation",
                              streaming=False)
    valid_data = valid_data.map(eval_ds_fun, batched=True, batch_size=128,
                    remove_columns=['repo_name', 'path', 'copies', 'size', 'content', 'license'])
    # Slice rows first so only the requested rows are materialized, rather than
    # pulling the whole column into memory and slicing it afterwards.
    eval_input_ids = valid_data[:max_eval_samples]["eval_input_ids"]
    # evaluate() weights each batch loss by args.valid_batch_size, so the
    # dataloader must use that same batch size.
    eval_dataloader = DataLoader(torch.tensor(eval_input_ids),
                                 batch_size=args.valid_batch_size)

    return train_dataloader, eval_dataloader


In [6]:
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    """Split the model's parameters into weight-decayed and exempt groups.

    A parameter is exempt from weight decay when its name contains any of the
    substrings in ``no_decay``; all others use ``args.weight_decay``.

    NOTE(review): the model loaded in this notebook is a GPT2LMHeadModel —
    verify that "LayerNorm.weight" actually matches its layer-norm parameter
    names.

    Args:
        model: Object exposing ``named_parameters()`` yielding (name, param).
        no_decay: Name substrings marking parameters that get no weight decay.

    Returns:
        Two optimizer parameter-group dicts: decayed first, exempt second.
    """
    decayed, exempt = [], []
    for name, param in model.named_parameters():
        is_exempt = any(marker in name for marker in no_decay)
        target = exempt if is_exempt else decayed
        target.append(param)

    decayed_group = {"params": decayed, "weight_decay": args.weight_decay}
    exempt_group = {"params": exempt, "weight_decay": 0.0}
    return [decayed_group, exempt_group]

In [7]:
from torch.optim import AdamW
from datasets import load_dataset
from transformers import set_seed
from transformers import get_scheduler
from huggingface_hub import Repository
from transformers import AutoTokenizer, AutoModelForCausalLM

# Seed with the configured value before anything else is created.
set_seed(args.seed)
project_name = "codeparrot-small"
model_to_be_trained = "codeparrot-small2"

# Samples consumed per dataloader step across all processes
# (gradient accumulation is not factored in here).
samples_per_step = accelerator.state.num_processes * args.train_batch_size

#Logging
logger, tb_writer, run_name = setup_logging(project_name)
logger.info(accelerator.state)

#Load model and tokenizer
# if accelerator.is_main_process:
#     hf_repo = Repository("./Training_files/git_files/",
#                          clone_from="susnato/codeparrot-training-from-scratch")
model = AutoModelForCausalLM.from_pretrained(f"susnato/{model_to_be_trained}")
model.gradient_checkpointing_enable()
tokenizer = AutoTokenizer.from_pretrained("susnato/codeparrot")
# Register a pad token: eval_ds_fun tokenizes with padding="max_length".
tokenizer.add_special_tokens({'pad_token' : '<|endoftext|>'})

#Load Dataset and DataLoader
# Must run after `tokenizer` is defined: create_dataloaders and eval_ds_fun
# read it as a global.
train_dl, eval_dl = create_dataloaders("transformersbook/codeparrot")

#Prepare the optimizer and learning rate scheduler
optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                          num_warmup_steps=args.num_warmup_steps,
                          num_training_steps=args.max_train_steps)

def get_lr():
    """Return the current learning rate of the optimizer's first parameter group."""
    return optimizer.param_groups[0]["lr"]

# NOTE(review): lr_scheduler is not passed to accelerator.prepare — confirm
# that this is intended.
model, optimizer, train_dl, eval_dl = accelerator.prepare(model, optimizer, train_dl, eval_dl)


[34m[1mwandb[0m: Currently logged in as: [33msusnato[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668484783319098, max=1.0…

01/14/2023 18:05:40 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Mixed precision type: no

loading configuration file config.json from cache at /home/susnato/.cache/huggingface/hub/models--susnato--codeparrot-small2/snapshots/98c2bd764fddd5d81402fb3bf55ae51454b4f639/config.json
Model config GPT2Config {
  "_name_or_path": "susnato/codeparrot-small2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary

Downloading:   0%|          | 0.00/457M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /home/susnato/.cache/huggingface/hub/models--susnato--codeparrot-small2/snapshots/98c2bd764fddd5d81402fb3bf55ae51454b4f639/pytorch_model.bin
All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the model checkpoint at susnato/codeparrot-small2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.
loading file vocab.json from cache at /home/susnato/.cache/huggingface/hub/models--susnato--codeparrot/snapshots/58e55c7b745f851839fd00898005d2415aa4e975/vocab.json
loading file merges.txt from cache at /home/susnato/.cache/huggingface/hub/models--susnato--codeparrot/snapshots/58e55c7b745f851839fd00898005d2415aa4e975/merges.txt
loading file tokenizer.json from cache at /home/susnato/.cache/huggingface/hub/models--susnato--codeparrot/snapshots/58e55c7b745f851839

In [8]:
def evaluate():
    """Compute mean loss and perplexity of ``model`` over ``eval_dl``.

    The model is put in eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, so calling this in the
    middle of training does not leave the model in eval mode.

    Each batch loss is repeated ``args.valid_batch_size`` times and gathered
    through ``accelerator`` before averaging. When ``args.max_eval_steps`` is
    positive the loop stops once the zero-based batch index reaches it.

    Returns:
        ``(loss, perplexity)`` as Python floats.
    """
    # Remember the mode we were called in so it can be restored on exit.
    was_training = model.training
    model.eval()
    losses = []
    try:
        for step, batch in enumerate(eval_dl):
            with torch.no_grad():
                opts = model(batch, labels=batch)
                loss = opts.loss.repeat(args.valid_batch_size)
                losses.append(accelerator.gather(loss))
                if args.max_eval_steps > 0 and step >= args.max_eval_steps:
                    break
    finally:
        model.train(was_training)
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = torch.tensor(float("inf"))
    return loss.item(), perplexity.item()

def log_metrics(epoch, step, metrics):
    """Log ``metrics`` for ``step`` to the logger, wandb and TensorBoard.

    The epoch is added to ``metrics`` in place under the key ``"Epoch"``.
    Every process writes the logger line; only the main process reports to
    wandb and to ``tb_writer``.
    """
    metrics["Epoch"] = epoch
    logger.info(f"Step {step} : {metrics}")
    if not accelerator.is_main_process:
        return
    wandb.log(metrics)
    for tag, value in metrics.items():
        tb_writer.add_scalar(tag, value, step)

In [9]:
import os
import time
import pickle

#Train Model
# Put the model in training mode before run_epochs (defined below) is called.
model.train()

def run_epochs(epochs, state_path="/home/susnato/PycharmProjects/Transformers_From_Scratch"
                                  "/Training_files/epochs_steps.pickle"):
    """Train for ``epochs`` further epochs, resuming from a saved progress file.

    Progress (``curr_epoch`` and ``completed_steps``) is pickled to
    ``state_path`` after every dataloader step and after every epoch. If the
    file exists on entry, training continues from it: a non-zero
    ``completed_steps`` means the recorded epoch was interrupted, so that
    epoch is run again with the step counter restored.

    Within an epoch the loop stops once ``completed_steps`` reaches
    ``args.max_train_steps``. The optimizer and scheduler step every
    ``args.gradient_accumulation_steps`` dataloader steps. Every
    ``args.save_checkpoint_steps`` dataloader steps, and at the end of each
    epoch, the model is evaluated, saved locally and pushed to the Hub.

    Args:
        epochs: Number of epochs to run in this call.
        state_path: Location of the pickled progress file. Defaults to the
            path this notebook has always used.
    """

    def _write_progress(epoch, steps_done):
        # Persist the resume point read back at the top of run_epochs.
        log_dict = {"curr_epoch":epoch, "completed_steps":steps_done}
        with open(state_path, "wb") as file:
            pickle.dump(obj=log_dict, file=file)

    def _save_and_push(commit_message):
        # Save the unwrapped model locally, then upload it to the Hub.
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(f"Training_files/models/{model_to_be_trained}")
        unwrapped_model.push_to_hub(repo_id="susnato/codeparrot-small2",
                                    commit_message=commit_message)

    if os.path.isfile(state_path):
        # Only load progress files written by this notebook: unpickling
        # untrusted data can execute arbitrary code.
        with open(state_path, "rb") as file:
            epoch_step_file = pickle.load(file)
        completed_steps = epoch_step_file["completed_steps"]
        if completed_steps==0:
            curr_epoch = epoch_step_file["curr_epoch"]
        else:
            # The recorded epoch did not finish; step back so it runs again.
            curr_epoch = epoch_step_file["curr_epoch"]-1
        print(f"Prev epoch/steps File found !, got :: epoch - {curr_epoch} steps -"
              f" {completed_steps}")
    else:
        curr_epoch = 0
        completed_steps = 0

    for epoch in range(1+curr_epoch, curr_epoch+epochs+1):
        print(f"""####################################################################################
                                    Starting Epoch - {epoch}
            ####################################################################################
                """)
        # evaluate() at the end of the previous epoch puts the model in eval
        # mode; switch back so every epoch really trains in training mode.
        model.train()
        # Defined up front so the end-of-epoch logging works even if the
        # dataloader yields nothing.
        step = 0
        for step, batch in enumerate(train_dl, start=1):
            if completed_steps >= args.max_train_steps:
                break
            loss = model(batch, labels=batch).loss
            log_metrics(epoch, step, {"lr":get_lr(), "samples":step*samples_per_step,
                               "steps":completed_steps, "loss/train":loss.item()})
            # Scale so the accumulated gradient is an average over micro-batches.
            loss = loss/args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                completed_steps += 1
            if step % args.save_checkpoint_steps == 0:
                logger.info('Evaluating and saving model checkpoint')
                eval_loss, perplexity = evaluate()
                # Training continues after this checkpoint: leave eval mode.
                model.train()
                log_metrics(epoch, step, {"loss/eval":eval_loss, "perplexity":perplexity})
                accelerator.wait_for_everyone()
                _save_and_push(f"Trained for Epoch - {epoch}, save_checkpoint_steps reached!")
            #Log Epoch No and completed_steps
            _write_progress(epoch, completed_steps)


        logger.info("Evaluating and Saving model after training")

        #Saving
        # Reset so the next epoch gets its own max_train_steps budget and the
        # progress file marks this epoch as finished.
        completed_steps = 0
        _save_and_push(f"Trained for Epoch - {epoch}")

        eval_loss, perplexity = evaluate()
        log_metrics(epoch, step, {"loss/eval":eval_loss, "perplexity":perplexity})
        accelerator.wait_for_everyone()

        _write_progress(epoch, completed_steps)


In [23]:
import keyboard
# Scratch cell: busy-wait until the "a" key is reported as pressed, then print
# a message and stop.
# NOTE: the recorded output of this cell is an ImportError — the `keyboard`
# library requires root on Linux.
while True:
    if keyboard.is_pressed("a"):
        print("You pressed 'a'.")
        break

ImportError: You must be root to use this library on linux.

In [22]:
# Scratch cell: ord("q") is a non-zero int and therefore always truthy, so this
# loop exits on its first iteration without reading any key press.
while True:
    if ord("q"):
        break

In [10]:
# Train for 5 epochs (continues from the saved progress file when one exists).
run_epochs(5)

Prev epoch/steps File found !, got :: epoch - 7 steps - 20
####################################################################################
                                    Starting Epoch - 8
            ####################################################################################
                


Token indices sequence length is longer than the specified maximum sequence length for this model (2626 > 1024). Running this sequence through the model will result in indexing errors
01/14/2023 18:07:15 - INFO - __main__ - Evaluating and Saving model after training
01/14/2023 18:07:27 - INFO - __main__ - Step 1 : {'loss/eval': 7.925716876983643, 'perplexity': 2767.547607421875, 'Epoch': 8}
Configuration saved in Training_files/models/codeparrot-small2/config.json
Model weights saved in Training_files/models/codeparrot-small2/pytorch_model.bin
Configuration saved in /tmp/tmpmq36wvbc/config.json
Model weights saved in /tmp/tmpmq36wvbc/pytorch_model.bin
Uploading the following files to susnato/codeparrot-small2: config.json,pytorch_model.bin


####################################################################################
                                    Starting Epoch - 9
            ####################################################################################
                


01/14/2023 18:07:37 - INFO - __main__ - Step 1 : {'lr': 0.0, 'samples': 2, 'steps': 0, 'loss/train': 7.354959964752197, 'Epoch': 9}
01/14/2023 18:07:37 - INFO - __main__ - Step 2 : {'lr': 0.0, 'samples': 4, 'steps': 0, 'loss/train': 7.419513702392578, 'Epoch': 9}
01/14/2023 18:07:38 - INFO - __main__ - Step 3 : {'lr': 0.0, 'samples': 6, 'steps': 0, 'loss/train': 7.648837089538574, 'Epoch': 9}
01/14/2023 18:07:38 - INFO - __main__ - Step 4 : {'lr': 0.0, 'samples': 8, 'steps': 0, 'loss/train': 7.377267837524414, 'Epoch': 9}
01/14/2023 18:07:38 - INFO - __main__ - Step 5 : {'lr': 0.0, 'samples': 10, 'steps': 0, 'loss/train': 7.237504959106445, 'Epoch': 9}
01/14/2023 18:07:39 - INFO - __main__ - Step 6 : {'lr': 0.0, 'samples': 12, 'steps': 0, 'loss/train': 7.551029205322266, 'Epoch': 9}
01/14/2023 18:07:39 - INFO - __main__ - Step 7 : {'lr': 0.0, 'samples': 14, 'steps': 0, 'loss/train': 7.976627349853516, 'Epoch': 9}
01/14/2023 18:07:39 - INFO - __main__ - Step 8 : {'lr': 0.0, 'samples': 1

####################################################################################
                                    Starting Epoch - 10
            ####################################################################################
                


01/14/2023 18:10:59 - INFO - __main__ - Step 1 : {'lr': 5.333333333333334e-06, 'samples': 2, 'steps': 0, 'loss/train': 7.29647970199585, 'Epoch': 10}
01/14/2023 18:10:59 - INFO - __main__ - Step 2 : {'lr': 5.333333333333334e-06, 'samples': 4, 'steps': 0, 'loss/train': 7.377992630004883, 'Epoch': 10}
01/14/2023 18:11:00 - INFO - __main__ - Step 3 : {'lr': 5.333333333333334e-06, 'samples': 6, 'steps': 0, 'loss/train': 7.587770462036133, 'Epoch': 10}
01/14/2023 18:11:00 - INFO - __main__ - Step 4 : {'lr': 5.333333333333334e-06, 'samples': 8, 'steps': 0, 'loss/train': 7.324524402618408, 'Epoch': 10}
01/14/2023 18:11:00 - INFO - __main__ - Step 5 : {'lr': 5.333333333333334e-06, 'samples': 10, 'steps': 0, 'loss/train': 7.182759761810303, 'Epoch': 10}
01/14/2023 18:11:01 - INFO - __main__ - Step 6 : {'lr': 5.333333333333334e-06, 'samples': 12, 'steps': 0, 'loss/train': 7.5177812576293945, 'Epoch': 10}
01/14/2023 18:11:01 - INFO - __main__ - Step 7 : {'lr': 5.333333333333334e-06, 'samples': 14

KeyboardInterrupt: 

In [None]:
# run_epochs(5)