In [1]:
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import torch
import gc

checkpoint = "Salesforce/codet5p-220m-bimodal"
path_to_model = ''
path_to_dataset = ''
output_dir = ''
device = "cuda"  if torch.cuda.is_available() else "cpu"

In [2]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(
    path_to_model if not path_to_model == '' else checkpoint,
    trust_remote_code=True).to(device)

In [3]:
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [7]:
dataset = load_dataset("json", data_files=path_to_dataset)["train"].with_format("torch")
dataloader = DataLoader(dataset, batch_size=4)

In [8]:
torch.cuda.empty_cache()
gc.collect()

95

In [9]:
predictions = {
    "input_ids": [],
    "labels": [],
    "pred_ids": []
}

pba = tqdm(dataloader)
for batch in pba:
    for k, v in batch.items():
        batch[k] = v.squeeze(1).to(device)

    with torch.no_grad():
        outputs = model(**batch)

    predictions["input_ids"].extend(batch["input_ids"].cpu().numpy())
    predictions["labels"].extend(batch["labels"].cpu().numpy())
    predictions["pred_ids"].extend(outputs.logits.argmax(-1).cpu().numpy())
    pba.set_description(f"Loss: {outputs.loss.item():.4f}")

Loss: 1.3814: 100%|██████████| 1250/1250 [02:51<00:00,  7.28it/s]


In [10]:
def decode_predictions(predictions):
    decoded_preds = {
        "input_code": [],
        "labels": [],
        "prediction": []
    }
    for i in range(len(predictions["input_ids"])):
        decoded_preds["input_code"].append(tokenizer.decode(predictions["input_ids"][i], skip_special_tokens=True))
        labels = torch.LongTensor(predictions["labels"][i])
        labels = labels[labels != -100]
        decoded_preds["labels"].append(tokenizer.decode(labels, skip_special_tokens=True))
        decoded_preds["prediction"].append(tokenizer.decode(predictions["pred_ids"][i], skip_special_tokens=True))

    return decoded_preds

In [11]:
predictions = decode_predictions(predictions)

In [12]:
df = pd.DataFrame(predictions, columns=["input_code", "labels", "prediction"])

In [13]:
df.to_json(output_dir + "/predictions.jsonl", orient="records", lines=True)