In [6]:
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from pathlib import Path
from tqdm import tqdm
import torch
import json
import gc
import os


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Base checkpoint on the Hugging Face Hub; `path_to_model` (when set) overrides
# it as the source of the model weights.
checkpoint = "Salesforce/codet5p-220m-bimodal"
path_to_model = None

# Input data and the root directory for experiment outputs.
path_to_dataset = "../datasets/intellij-train-dataset.jsonl"
path_to_save = "../experiments/"

In [7]:
# The tokenizer is always loaded from the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# Load weights from `path_to_model` when one is configured, else from the
# base checkpoint, then move the model onto the selected device.
model_source = path_to_model or checkpoint
model = AutoModel.from_pretrained(model_source, trust_remote_code=True)
model = model.to(device)

In [8]:
# Training-loop settings for this run.
epochs = 1
batch_size = 1

# Directory that receives this run's checkpoints and metadata.
path_to_save = "../experiments/model_0"

In [9]:
# Create the output directory (and any missing parents); a no-op if it exists.
os.makedirs(path_to_save, exist_ok=True)

In [10]:
# Upper bound on the number of training rows used in this run.
max_train_rows = 1000

full_train_split = load_dataset("json", data_files=path_to_dataset)["train"]

# Clamp the subset size: selecting indices past the end of the split raises
# IndexError when the JSONL file holds fewer rows than the cap.
n_train_rows = min(max_train_rows, full_train_split.num_rows)
train_dataset = full_train_split.select(range(n_train_rows)).with_format("torch")

# NOTE(review): shuffle=False keeps the sample order fixed across epochs;
# confirm that this is intended for training.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [11]:
# Collect unreachable Python objects first so that any CUDA tensors they hold
# are released back to PyTorch's caching allocator, and only then ask the
# allocator to return its unoccupied cached blocks to the device.
n_collected = gc.collect()
torch.cuda.empty_cache()

# Display the number of unreachable objects found by the collector.
n_collected

289

In [15]:
# Fine-tune `model` on `train_dataloader` for `epochs` epochs, saving a
# checkpoint under `path_to_save` at the end of every epoch.
#
# NOTE(review): the recorded run of this cell failed with CUDA OOM on a 4 GiB
# GPU. fp32 weights + gradients + AdamW moment buffers for a ~220M-parameter
# model presumably need about 3.5 GiB before activations, so this setup may
# not fit on such a card without a lighter optimizer or reduced precision.
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(epochs):
    # A tqdm bar closes itself once its iterable is exhausted, so reusing a
    # single bar would leave every epoch after the first without progress
    # updates. Build a fresh bar for each epoch instead.
    pba = tqdm(train_dataloader)

    for step, batch in enumerate(pba):
        optimizer.zero_grad()

        # Build a new dict instead of mutating the batch yielded by the loader.
        # Dropping axis 1 assumes each tensor arrives as (batch, 1, seq_len)
        # — TODO confirm against the dataset schema.
        inputs = {k: v.squeeze(1).to(device) for k, v in batch.items()}

        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        pba.set_description_str(f"Epoch: {epoch} Step: {step} Loss: {loss.item():.4f}")

    # One checkpoint directory per epoch, suffixed with the epoch index.
    model.save_pretrained(os.path.join(path_to_save, "codet5p-220m-bimodal-" + str(epoch)))


  0%|          | 0/1000 [00:16<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 0 has a total capacty of 4.00 GiB of which 0 bytes is free. Of the allocated memory 3.09 GiB is allocated by PyTorch, and 325.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Collect the run configuration and a dataset summary; the result is meant to
# be serialised to JSON, so every value must be a plain JSON-compatible type.
info = {
    "path_to_model": path_to_model,
    "checkpoint": checkpoint,
    "dataset": {
        "rows": train_dataset.num_rows,
        # `features` is a datasets.Features mapping whose values are feature
        # objects that json.dump cannot encode; to_dict() converts them into
        # plain dictionaries and strings.
        "features": train_dataset.features.to_dict(),
    },
    "batch_size": batch_size,
    "epochs": epochs
}

In [None]:
# Write the collected run metadata next to the saved checkpoints.
train_info_path = os.path.join(path_to_save, "train_info.json")
with open(train_info_path, "w") as info_file:
    json.dump(info, info_file)