In [None]:
from transformers import AutoModel, AutoTokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import torch
import gc

# ---------------------------------------------------------------------------
# Run configuration: edit these values before executing the notebook.
# ---------------------------------------------------------------------------

# Hub identifier used for the tokenizer, and for the model weights whenever
# `path_to_model` is left empty.
checkpoint = "Salesforce/codet5p-220m"

# Selects which model class is instantiated later on:
# 'code-summarization' or 'mask-prediction'.
task = "mask-prediction"

# Filesystem locations (raw strings, so backslash-separated paths can be pasted as-is).
path_to_model = r""    # path to model
path_to_dataset = r""  # path to dataset
output_dir = r""       # path to output directory

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# The tokenizer always comes from the hub checkpoint, even when the model
# weights are read from a local path.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# Pick the model class for the configured task; any task other than
# 'mask-prediction' goes through AutoModel.
# NOTE(review): the later cells read `outputs.logits` / `outputs.loss`, so the
# class resolved by AutoModel must provide them - confirm for your checkpoint.
if task == 'mask-prediction':
    modeling = T5ForConditionalGeneration
else:
    modeling = AutoModel

print(f'Task: {task}')

# A non-empty `path_to_model` takes precedence over the hub checkpoint.
load_from_path = bool(path_to_model)
model = modeling.from_pretrained(
    path_to_model if load_from_path else checkpoint,
    trust_remote_code=True,
).to(device)
print("Loaded model from path" if load_from_path else "Loaded model from checkpoint")

In [None]:
# Create the output directory (and any missing parents) up front; an already
# existing directory is not an error.
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Load the JSON file(s) at `path_to_dataset` and keep the "train" split.
raw_splits = load_dataset("json", data_files=path_to_dataset)

# Have the dataset hand out torch tensors so the DataLoader can batch them.
dataset = raw_splits["train"].with_format("torch")

# Batches of four examples, in dataset order.
dataloader = DataLoader(dataset, batch_size=4)

In [None]:
# Free memory before running inference.
# Collect garbage first: tensors that are only kept alive by reference cycles
# still occupy their blocks until the collector runs, and `empty_cache()` can
# only hand back blocks that are no longer occupied.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Per-example accumulators, filled one batch at a time.
predictions = {key: [] for key in ("input_ids", "labels", "pred_ids")}

progress_bar = tqdm(dataloader)
for batch in progress_bar:
    # squeeze(1) drops axis 1 when it has size one; then move to the target device.
    # NOTE(review): assumes each stored field carries a leading singleton
    # dimension per example - confirm against the dataset.
    batch = {name: tensor.squeeze(1).to(device) for name, tensor in batch.items()}

    # Forward pass only; no gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    # Highest-scoring token id at every position of the logits.
    pred_ids = outputs.logits.argmax(-1)

    # Extending with a 2-D array appends one row (one example) per element.
    for key, tensor in (
        ("input_ids", batch["input_ids"]),
        ("labels", batch["labels"]),
        ("pred_ids", pred_ids),
    ):
        predictions[key].extend(tensor.cpu().numpy())

    progress_bar.set_description(f"Loss: {outputs.loss.item():.4f}")

In [None]:
def decode_predictions(predictions, ignore_index=-100):
    """Decode collected token ids into text, one entry per example.

    Uses the notebook-level ``tokenizer`` for decoding.

    Args:
        predictions: dict with parallel per-example sequences of token ids
            under the keys "input_ids", "labels" and "pred_ids".
        ignore_index: label value that marks positions without a target
            (default -100). Those positions are removed from the labels and,
            when the prediction is aligned with the labels, from the
            prediction as well.

    Returns:
        dict with the keys "input_code", "labels" and "prediction", each
        holding a list of decoded strings in the same order as the input.
    """
    decoded_preds = {
        "input_code": [],
        "labels": [],
        "prediction": []
    }

    examples = zip(
        predictions["input_ids"],
        predictions["labels"],
        predictions["pred_ids"],
    )
    for input_ids, label_ids, pred_ids in examples:
        # as_tensor accepts numpy arrays, lists and tensors of any integer dtype.
        label_ids = torch.as_tensor(label_ids, dtype=torch.long)
        pred_ids = torch.as_tensor(pred_ids, dtype=torch.long)
        has_target = label_ids != ignore_index

        # Predictions at ignored label positions have no target to compare
        # against, so drop them too. Only possible when the prediction has
        # one entry per label position; otherwise keep it untouched.
        if pred_ids.shape == label_ids.shape:
            pred_ids = pred_ids[has_target]

        decoded_preds["input_code"].append(
            tokenizer.decode(input_ids, skip_special_tokens=True))
        decoded_preds["labels"].append(
            tokenizer.decode(label_ids[has_target], skip_special_tokens=True))
        decoded_preds["prediction"].append(
            tokenizer.decode(pred_ids, skip_special_tokens=True))

    return decoded_preds

In [None]:
# Replace the raw token-id lists with their decoded text.
# NOTE(review): this rebinds `predictions`, so running this cell a second time
# without re-running the inference cell fails (the "input_ids" key is gone).
predictions = decode_predictions(predictions)

In [None]:
# One row per example; `columns` fixes the column order of the table.
df = pd.DataFrame(predictions, columns=["input_code", "labels", "prediction"])

In [None]:
# Write one JSON object per row (JSON Lines).
# The path is built with pathlib rather than string concatenation: with an
# empty `output_dir`, `output_dir + "/predictions.jsonl"` would point at the
# filesystem root, whereas Path("") resolves to the current directory - the
# same directory the mkdir cell prepares.
df.to_json(Path(output_dir) / "predictions.jsonl", orient="records", lines=True)