In [14]:
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import torch
import gc

checkpoint = "Salesforce/codet5p-220m-bimodal"
path_to_model = None
path_to_dataset = r'../datasets/intellij-val-dataset.jsonl'
output_dir = r'../experiments/model_0'
device = "cuda"  if torch.cuda.is_available() else "cpu"

In [2]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(
    path_to_model if path_to_model else checkpoint,
    trust_remote_code=True).to(device)

In [15]:
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [7]:
eval_dataset = load_dataset("json", data_files=path_to_dataset)["train"].select(range(10)).with_format("torch")
eval_dataloader = DataLoader(eval_dataset, batch_size=2)

In [8]:
torch.cuda.empty_cache()
gc.collect()

722

In [9]:
predictions = {
    "input_ids": [],
    "labels": [],
    "pred_ids": []
}
model.eval()

pba = tqdm(eval_dataloader)
for batch in pba:
    for k, v in batch.items():
        batch[k] = v.squeeze(1).to(device)

    with torch.no_grad():
        outputs = model(**batch)

    predictions["input_ids"].extend(batch["input_ids"].cpu().numpy())
    predictions["labels"].extend(batch["labels"].cpu().numpy())
    predictions["pred_ids"].extend(outputs.logits.argmax(-1).cpu().numpy())
    pba.set_description(f"Loss: {outputs.loss.item():.4f}")

Loss: 10.1101: 100%|██████████| 5/5 [00:12<00:00,  2.41s/it]


In [10]:
def decode_predictions(predictions):
    decoded_preds = {
        "input_code": [],
        "labels": [],
        "prediction": []
    }
    for i in range(len(predictions["input_ids"])):
        decoded_preds["input_code"].append(tokenizer.decode(predictions["input_ids"][i]))
        labels = torch.LongTensor(predictions["labels"][i])
        labels = labels[labels != -100]
        decoded_preds["labels"].append(tokenizer.decode(labels))
        decoded_preds["prediction"].append(tokenizer.decode(predictions["pred_ids"][i]))

    return decoded_preds

In [11]:
predictions = decode_predictions(predictions)

In [12]:
df = pd.DataFrame(predictions, columns=["input_code", "labels", "prediction"])

In [16]:
df.to_json(output_dir + "/predictions.jsonl", orient="records", lines=True)