In [None]:
# Ensure that the following packages are installed
# !pip install transformers datasets tqdm torch transformers[torch]

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, default_data_collator 
from datasets import load_dataset
import torch, gc, os, sys, copy

# Disable WANDB
os.environ["WANDB_MODE"] = "disabled"

# Lets clear out items and GC
tokenizer = None
eval_dataset = None
model = None
trainer = None

# Clear everything!
gc.collect()
torch.cuda.empty_cache()

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('codeparrot/codeparrot', use_fast=True)
tokenizer.pad_token = tokenizer.bos_token
# Load 'codeparrot/codeparrot-clean-valid' Dataset
eval_dataset = load_dataset('codeparrot/codeparrot-clean-valid')["train"]

# # Reduce top level dataset, to speed up debug iteration
# eval_dataset = eval_dataset.select(range(0, 200, 1))

# Tokenize content to input_ids, attention_mask, and labels
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["content"], truncation=True, max_length=1024, 
        padding='max_length', return_attention_mask=True
    ) 

    # Shift the input_ids one position to the right to create the labels.
    tokenized["labels"] = copy.deepcopy(tokenized["input_ids"][1:1024])
    tokenized["input_ids"] = tokenized["input_ids"][:1023]
    tokenized["attention_mask"] = tokenized["attention_mask"][:1023]

    # Tokenized output
    return tokenized

# Tokenize the content datasets
eval_dataset = eval_dataset.map(tokenize_function, batched=False, remove_columns=["content"])
eval_dataset.set_format(type='torch', columns=["labels",'input_ids', 'attention_mask'])

# Compute metrics
def compute_metrics(eval_pred):
    # No gradient
    with torch.no_grad():
        # Logits and labels batches
        logits_batch, labels_batch = eval_pred

        # Batch size
        batch_size = logits_batch.shape[0]
        # print("BATCH_SIZE??", batch_size)

        # Total loss
        total_loss = 0

        # Looks like i gotten logits, and labels in batchs of X
        # I have to compute the loss for each sample, and then average it
        for i in range(batch_size):
            # Get logits and labels for sample
            logits = torch.tensor(logits_batch[i])
            labels = torch.tensor(labels_batch[i])

            # Compute loss
            loss = torch.nn.functional.cross_entropy(logits, labels, reduction="mean")

            # Add to total loss
            total_loss += loss

        # Average loss
        loss = total_loss / batch_size

        # Clear everything!
        gc.collect()
        torch.cuda.empty_cache()

        # Print loss
        return {'loss': loss}

# Reduce it to 10 items (for debugging)
eval_dataset_10 = eval_dataset.select(range(0, 10, 1))
eval_dataset_20 = eval_dataset.select(range(10, 30, 1))
eval_dataset_100 = eval_dataset.select(range(30, 130, 1))

# Setup the model, use eval mode, and move it to GPU
model = AutoModelForCausalLM.from_pretrained('codeparrot/codeparrot').to("cuda")
model.eval()

# # Define training arguments
# training_args = TrainingArguments(
#     per_device_train_batch_size=2,  # batch size per device during training
#     per_device_eval_batch_size=2,   # batch size for evaluation
#     output_dir='./results',          # output directory
#     num_train_epochs=3,              # total number of training epochs
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir='./logs',            # directory for storing logs
# )

# Initializing Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,

    # THESE THINGS DO NOT WORK????
    # args=training_args,
)

# Because somehow the eval batching is happening for the full datasample i pass in
# we need to do some form of manual batching ???
def evaluate_dataset(in_dataset, batch_size=10):
    # Lets get the dataset size
    dataset_size = len(in_dataset)

    # Lets get the number of batches
    num_batches = dataset_size // batch_size

    # the total loss
    total_loss = 0

    # Lets iterate over the batches
    for i in range(num_batches):
        # Get the start and end index
        start_index = i * batch_size
        end_index = (i + 1) * batch_size

        # Get the batch
        batch = in_dataset.select(range(start_index, end_index, 1))

        # Perform evaluation
        batch_res = trainer.evaluate(batch, metric_key_prefix="eval")
        batch_loss = float(batch_res["eval_loss"])
        total_loss += batch_loss
        avg_loss = total_loss / (i + 1)

        # Print loss
        print(f"Batch {i} - batch loss: {batch_loss} - avg loss: {avg_loss}   (start: {start_index}, end: {end_index})")

    # Garbage collection, to keep vram managable
    gc.collect()
    torch.cuda.empty_cache()

    # Return the average loss
    return total_loss / num_batches

# # Perform validation for first 10
# print("=== 10 objects ===")
# output_10 = trainer.evaluate(eval_dataset_10)
# print("> Output object ...")
# print(output_10)

# gc.collect()
# torch.cuda.empty_cache()

# Perform validation for first 20
# print("=== 20 objects ===")
# output_20 = evaluate_dataset(eval_dataset_20)
# print("> Output object ...")
# print(output_20)

# # Perform validation for first 100
# print("=== 100 objects ===")
# output_100 = trainer.evaluate(eval_dataset_100)
# print("> Output object ...")
# print(output_100)

# gc.collect()
# torch.cuda.empty_cache()

# gc.collect()

# Perform validation for all objects
print(f"=== All objects {len(eval_dataset)} ===")
output = evaluate_dataset(eval_dataset)
print("> Final result ...")
print("loss = ", output)

  from .autonotebook import tqdm as notebook_tqdm


[2023-07-29 10:00:47,497] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Found cached dataset json (/home/picocreator/.cache/huggingface/datasets/codeparrot___json/codeparrot--codeparrot-clean-valid-826c6fd8b27e5523/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
100%|██████████| 1/1 [00:00<00:00, 410.68it/s]
Loading cached processed dataset at /home/picocreator/.cache/huggingface/datasets/codeparrot___json/codeparrot--codeparrot-clean-valid-826c6fd8b27e5523/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-fc62de51fe8a734f.arrow
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


=== All objects 61373 ===


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Batch 0 - batch loss: 2.597266912460327 - avg loss: 2.597266912460327   (start: 0, end: 10)
Batch 1 - batch loss: 2.320502758026123 - avg loss: 2.458884835243225   (start: 10, end: 20)
Batch 2 - batch loss: 2.331798553466797 - avg loss: 2.416522741317749   (start: 20, end: 30)
Batch 3 - batch loss: 2.1776061058044434 - avg loss: 2.3567935824394226   (start: 30, end: 40)
Batch 4 - batch loss: 2.3772802352905273 - avg loss: 2.3608909130096434   (start: 40, end: 50)
Batch 5 - batch loss: 2.246580123901367 - avg loss: 2.3418391148249307   (start: 50, end: 60)
Batch 6 - batch loss: 2.3905882835388184 - avg loss: 2.3488032817840576   (start: 60, end: 70)
Batch 7 - batch loss: 2.9608523845672607 - avg loss: 2.425309419631958   (start: 70, end: 80)
Batch 8 - batch loss: 2.7365193367004395 - avg loss: 2.459888299306234   (start: 80, end: 90)
Batch 9 - batch loss: 2.8640105724334717 - avg loss: 2.5003005266189575   (start: 90, end: 100)
Batch 10 - batch loss: 1.7566168308258057 - avg loss: 2.432