In [None]:
!pip install transformers
!pip install tokenizers

In [None]:
import os
import logging
import torch
import numpy as np
from sklearn.metrics import f1_score
from tqdm import tqdm, trange
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataset import Dataset
from torch.nn.utils.rnn import pad_sequence
from typing import Dict, List, Tuple
from torch.utils.tensorboard.writer import SummaryWriter
from transformers.data.metrics import acc_and_f1, simple_accuracy

In [None]:
# Fetch the dataset archive into ./ds.npy (loaded with np.load further down).
# -nc asks wget not to clobber an existing file, -O sets the output filename.
!wget -nc -O ds.npy https://ibm.box.com/shared/static/o9x8cglra6bpvngt537nlo7pe7ctjrk6.npy

In [None]:
# --- Run configuration -------------------------------------------------------
logging.basicConfig(level=logging.INFO)

# TensorBoard writer shared by the training loop and the evaluation routine.
tb_writer = SummaryWriter()

# When True, the training loop runs a periodic evaluation pass.
EVALUATE = True

# Class names; a label's position in this list is its integer class index.
LANGUAGES = ["tensorflow", "pytorch"]

# Tokenizer and classifier are both loaded from the same pretrained checkpoint;
# the classification head is sized to the number of labels above.
PRETRAINED_NAME = "huggingface/CodeBERTa-small-v1"
tokenizer = RobertaTokenizer.from_pretrained(PRETRAINED_NAME)
model = RobertaForSequenceClassification.from_pretrained(
    PRETRAINED_NAME,
    num_labels=len(LANGUAGES),
)

In [None]:
class CodeSearchNetDataset(Dataset):
    """In-memory dataset of ``(token_ids, label_index)`` pairs.

    Each input record is indexed as ``record[0]`` (source text) and
    ``record[1]`` (label name). Records whose text is the empty string are
    skipped. Text is tokenized eagerly with the module-level ``tokenizer`` and
    the label name is mapped to its position in ``LANGUAGES``.

    Raises:
        ValueError: if a record's label is not present in ``LANGUAGES``
            (raised by ``list.index``).
    """

    # Tokenized examples: (input ids, index of the label in LANGUAGES).
    examples: List[Tuple[List[int], int]]

    def __init__(self, data):
        """Tokenize every non-empty record of ``data`` up front."""
        self.examples = []

        for code in tqdm(data):
            if code[0] == '':
                continue
            label_idx = LANGUAGES.index(code[1])
            # Request truncation explicitly: `max_length` on its own only
            # truncates through a deprecated implicit default (with a warning).
            token_ids = tokenizer.encode(code[0], max_length=512, truncation=True)
            self.examples.append((token_ids, label_idx))

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the ``(token_ids, label_index)`` pair at position ``i``."""
        # Sequences are returned unpadded; padding happens at the batch level.
        return self.examples[i]

In [None]:
# NOTE(review): allow_pickle=True unpickles the downloaded file, which can run
# arbitrary code; only use it with a source you trust.
ds = np.load('ds.npy', allow_pickle=True)

# First 88% of the records go to training, the remainder to testing.
# NOTE(review): the split is positional with no shuffling; confirm that ds.npy
# is not ordered by label.
split_at = int(len(ds) * .88)
dstrain, dstest = ds[:split_at], ds[split_at:]

In [None]:
# Tokenize both splits eagerly (train first, then test).
train_dataset, test_dataset = [
    CodeSearchNetDataset(split) for split in (dstrain, dstest)
]

In [None]:
def collate(examples, padding_value=1):
    """Batch ``(token_ids, label)`` pairs into padded tensors.

    Args:
        examples: sequence of ``(token_ids, label)`` pairs, where ``token_ids``
            is a list of ints and ``label`` is an int.
        padding_value: id used to right-pad shorter sequences to the longest
            one in the batch. Defaults to 1, the value this notebook has always
            used (presumably the tokenizer's pad token id; verify against
            ``tokenizer.pad_token_id``).

    Returns:
        ``(input_ids, labels)``: a ``(batch, longest_sequence)`` int64 tensor
        and a ``(batch,)`` int64 tensor.

    NOTE(review): no attention mask is produced and the visible callers pass
    only ``input_ids`` to the model, so padded positions are presumably
    attended to; confirm whether that is intended.
    """
    sequences = [torch.tensor(tokens, dtype=torch.long) for tokens, _ in examples]
    input_ids = pad_sequence(sequences, batch_first=True, padding_value=padding_value)
    labels = torch.tensor([label for _, label in examples], dtype=torch.long)
    return input_ids, labels

# Shuffled training batches; each batch is padded on the fly by `collate`.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    collate_fn=collate,
)

# Pull one batch up front; kept for inspection, not used by the visible cells.
batch = next(iter(train_dataloader))

model.to("cuda")
model.train()

# Freeze the RoBERTa encoder so that only the final layer is trained.
model.roberta.requires_grad_(False)

print("num params:", model.num_parameters())
print("num trainable params:", model.num_parameters(only_trainable=True))

In [None]:
def evaluate(best):
    """Score the model on ``test_dataset`` and checkpoint it when accuracy improves.

    Makes one pass over ``test_dataset`` in batches of 128, prints the mean
    batch loss, the accuracy and the macro-F1, and logs all three to
    TensorBoard at the module-level ``global_step``. The model is left in eval
    mode; the caller is responsible for switching back to train mode.

    When the accuracy beats ``best`` the model is saved to
    ``./best<accuracy rounded to 2 decimals>``. The unrounded accuracy is kept
    as the new best: comparing against a rounded value made a later, slightly
    different accuracy map to the same directory name (and crash on an
    existing directory), or skip a genuinely better checkpoint when the
    rounding went up.

    Args:
        best: highest accuracy seen so far (0 before the first evaluation).

    Returns:
        The updated best accuracy.

    Raises:
        ValueError: if ``test_dataset`` is empty.
    """
    if len(test_dataset) == 0:
        raise ValueError("test_dataset is empty; nothing to evaluate")

    model.eval()
    # Follow the model's own device instead of hard-coding "cuda".
    device = next(model.parameters()).device

    eval_loss = 0.0
    nb_eval_steps = 0
    # Collect per-batch arrays and concatenate once, rather than re-copying
    # the accumulated arrays on every batch with np.append.
    pred_chunks = []
    label_chunks = []

    eval_dataloader = DataLoader(test_dataset, batch_size=128, collate_fn=collate)
    for input_ids, labels in tqdm(eval_dataloader, desc="Eval"):
        with torch.no_grad():
            outputs = model(input_ids=input_ids.to(device), labels=labels.to(device))
        # With labels supplied, the first two outputs are the loss and the logits.
        loss, logits = outputs[0], outputs[1]
        eval_loss += loss.mean().item()
        nb_eval_steps += 1
        pred_chunks.append(logits.argmax(dim=1).cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    preds = np.concatenate(pred_chunks, axis=0)
    out_label_ids = np.concatenate(label_chunks, axis=0)

    eval_loss = eval_loss / nb_eval_steps
    acc = float(simple_accuracy(preds, out_label_ids))
    f1 = f1_score(y_true=out_label_ids, y_pred=preds, average="macro")
    print("=== Eval: loss ===", eval_loss)
    print("=== Eval: acc. ===", acc)
    print("=== Eval: f1 ===", f1)

    if acc > best:
        best = acc
        checkpoint_dir = './best' + str(round(acc, 2))
        # exist_ok: two accuracies can round to the same directory name, in
        # which case the better model overwrites the earlier checkpoint.
        os.makedirs(checkpoint_dir, exist_ok=True)
        print("=== Saving model ===")
        model.save_pretrained(checkpoint_dir)

    tb_writer.add_scalars("eval", {"loss": eval_loss, "acc": acc, "f1": f1}, global_step)

    return best


### Training loop: 4 epochs, with an evaluation pass every 50 optimizer steps
### when EVALUATE is set, and one final evaluation at the end.

global_step = 0  # optimizer steps taken so far; evaluate() reads it for logging
best = 0  # best-accuracy value handed back by evaluate()
train_iterator = trange(0, 4, desc="Epoch")
optimizer = torch.optim.AdamW(model.parameters())
for _ in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    for step, (input_ids, labels) in enumerate(epoch_iterator):
        optimizer.zero_grad()

        # The first element of the model output is the loss when labels are given.
        loss = model(
            input_ids=input_ids.to("cuda"),
            labels=labels.to("cuda"),
        )[0]
        loss.backward()
        tb_writer.add_scalar("training_loss", loss.item(), global_step)
        optimizer.step()
        global_step += 1

        if not (EVALUATE and global_step % 50 == 0):
            continue
        best = evaluate(best)
        # evaluate() leaves the model in eval mode; switch back before training on.
        model.train()


evaluate(best)