<a href="https://colab.research.google.com/github/VerebicsPeter/CodeSim/blob/main/model/model_finetuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Similarity with Fine-tuning

## Dependencies

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pprint as pp
# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.amp import autocast, GradScaler
from torch.utils.data import DataLoader, Dataset, random_split
# Hugging Face Transformers (for CodeBERT etc.)
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
# Libraries for logging
from tqdm.auto import tqdm

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print('Device:', device)
if device.type == "cuda":
    # Report the name of the GPU that is currently selected.
    print(torch.cuda.get_device_name(torch.cuda.current_device()))

Device: cuda
Tesla T4


## Dataset Access

In [None]:
from google.colab import userdata

# Direct-download link to the paired dataset on Google Drive. The Drive file ID
# is looked up under the 'codenetSamplePaired' key of Colab's `userdata`.
paired_dataset_url = "https://drive.google.com/uc?export=download&id={}".format(
    userdata.get('codenetSamplePaired')
)
#paired_dataset_url = "https://drive.google.com/uc?export=download&id=1I3mbw4CIMij44vUrmf9XULZyg5lXHKwn"  # old dataset
#paired_dataset_url = "https://drive.google.com/uc?export=download&id=1Mdp7FX4YBgv3i0ga69e8gxcbDvSIngDM"  # new dataset

## Dataset

In [None]:
# Column names applied to the paired-dataset CSV, in file order:
#   pid    - CodeNet problem ID
#   sid_1  - CodeNet solution ID of 'src_1'
#   sid_2  - CodeNet solution ID of 'src_2'
#   src_1  - CodeNet solution code of 'sid_1'
#   src_2  - CodeNet solution code of 'sid_2'
#   label  - Label indicating if 'src_1' and 'src_2' both solve 'pid'
COLUMNS = ['pid', 'sid_1', 'sid_2', 'src_1', 'src_2', 'label']

In [None]:
# Load the paired samples and keep only the two code columns and the label
# for a quick visual check of the data.
df = (
    pd.read_csv(paired_dataset_url, header=0, names=COLUMNS)
    .drop(columns=['pid', 'sid_1', 'sid_2'])
)
df.head()

Unnamed: 0,src_1,src_2,label
0,##C - Step(TLE)\nN = int(input())\nA = list(ma...,"N=int(input())\nA=list(map(int,input().split()...",1
1,S = input()\nINF = 10 ** 9 + 7\nlength = len(S...,s = input()\nn = len(s)\n\nMOD = 10**9+7\ndp =...,1
2,import sys\nfrom bisect import bisect_right as...,'''\n研究室PCでの解答\n'''\nimport math\n#import nump...,1
3,"numb=input()\ninputs = list(map(int,input().sp...","N = int(input())\na = list(map(int, input().sp...",1
4,"import sys, re\nfrom math import ceil, sqrt, h...","from itertools import permutations\nn,m=map(in...",1


In [None]:
# Code datasets (for labeled and unlabeled code snippets)

class CodePairDataset(Dataset):
    """Labeled pairs of source-code snippets that are tokenized on access.

    Each item is an ``(encoding, label)`` tuple: ``encoding`` is the tokenizer
    output for the pair ``(code_a, code_b)`` with the leading batch dimension
    removed, and ``label`` is the stored label of that pair.
    """

    def __init__(self, codes_a, codes_b, labels, tokenizer):
        """Store the parallel sequences and the tokenizer.

        Args:
            codes_a: First snippet of every pair.
            codes_b: Second snippet of every pair.
            labels: One label per pair.
            tokenizer: Callable that tokenizes a text pair and exposes a
                ``model_max_length`` attribute.

        Raises:
            AssertionError: If the three sequences differ in length.
        """
        super().__init__()
        assert len(codes_a) == len(codes_b) == len(labels), "Length MUST match!"
        self.codes_a = codes_a
        self.codes_b = codes_b
        self.labels = labels
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return ``(encoding, label)``."""
        code_a = self.codes_a[idx]
        code_b = self.codes_b[idx]
        label = self.labels[idx]
        # Encode the sequences for sequence pair classification
        # ([CLS], code_a tokens , [SEP], code_b tokens, [SEP])
        encoding = self.tokenizer(
            code_a, code_b,
            padding='max_length',  # Pad to max_length
            max_length=self.tokenizer.model_max_length,
            truncation=True,       # Truncate to max_length
            return_tensors='pt'    # Return torch.Tensor objects
        )
        # Remove the batch dimension that `return_tensors='pt'` adds
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}

        return encoding, label

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.labels)

    @classmethod
    def from_csv_data(cls, path: str, tokenizer, max_char_count: int = 4096):
        """Build a dataset from a CSV file laid out according to ``COLUMNS``.

        Args:
            path: Path or URL handed to ``pandas.read_csv``.
            tokenizer: Tokenizer forwarded to the constructor.
            max_char_count: Pairs are kept only when the combined character
                count of both snippets is strictly below this value.

        Returns:
            A new instance holding the pairs that passed the length filter.
        """
        df = pd.read_csv(path, header=0, names=COLUMNS)
        df = df.drop(columns=['pid', 'sid_1', 'sid_2'])

        # Filter out pairs whose combined length reaches `max_char_count` characters.
        # The vectorized comparison also works on an empty frame, and a missing
        # snippet yields NaN, which compares False and is therefore dropped.
        total_chars = df['src_1'].str.len() + df['src_2'].str.len()
        df = df[total_chars < max_char_count]
        print('filtered sequences:', df.shape)

        srcs_x = df['src_1'].to_list()
        srcs_y = df['src_2'].to_list()
        labels = df['label'].to_list()
        return cls(srcs_x, srcs_y, labels, tokenizer)

## Model

In [None]:
# Model for finetuning

class CodeSimilarityClassifier(nn.Module):
    """Single-logit classification head on top of a BERT-style encoder.

    The encoder's pooled output is passed through dropout and a linear layer
    that produces one logit per input sequence.
    """

    def __init__(self,
        bert,  # BERT based model instance
        freeze_bert=False,
        dropout_rate=0.2,
    ):
        """
        Args:
            bert: Encoder module exposing ``config.hidden_size``; it is called
                with the tokenizer encoding unpacked as keyword arguments.
            freeze_bert: When True, gradients are disabled for every encoder
                parameter so only the linear head is trained.
            dropout_rate: Dropout probability applied to the pooled output.
        """
        super().__init__()
        self.bert = bert
        self.drop = nn.Dropout(dropout_rate)
        self.cls = nn.Linear(self.bert.config.hidden_size, 1)
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, bert_input) -> torch.Tensor:
        """Return the logits computed from the encoder's pooled output.

        Args:
            bert_input: Mapping of keyword arguments for the encoder.

        Returns:
            The output of the linear head (last dimension has size 1).
        """
        # No device context is used here: the computation runs wherever the
        # module's parameters and the inputs already live, so the class does
        # not depend on a notebook-level `device` variable.
        bert_output = self.bert(**bert_input)
        pooled = getattr(bert_output, "pooler_output", None)
        if pooled is None:
            # Encoder has no pooling head: use the hidden state of the first token.
            pooled = bert_output.last_hidden_state[:, 0]
        pooled = self.drop(pooled)
        logits = self.cls(pooled)
        return logits

## Training

In [None]:
def train_one_epoch(
    model, device,
    loss_func,
    scaler,
    optimizer,
    scheduler,
    dataloader,
    iters_to_accumulate,
    # for logging
    print_every=10
):
    """Run one training pass over ``dataloader`` with gradient accumulation.

    Args:
        model: Module called with the encoding dict of each batch.
        device: Device the batch tensors are moved to.
        loss_func: Callable ``loss_func(logits, float_labels)``.
        scaler: Gradient scaler providing ``scale``, ``step`` and ``update``.
        optimizer: Optimizer stepped once per accumulation window.
        scheduler: Learning-rate scheduler stepped together with the optimizer.
        dataloader: Iterable with a length that yields ``(encoding, labels)``.
        iters_to_accumulate: Number of batches whose gradients are summed
            before an optimizer step.
        print_every: Log the running mean loss every this many batches;
            values below 1 are treated as 1.

    Returns:
        Mean per-batch loss over the epoch (not divided by
        ``iters_to_accumulate``), or ``nan`` if no batch was produced.
    """
    model.train()

    # A caller computing e.g. `len(loader) // 5` may pass 0; avoid modulo by zero.
    print_every = max(1, int(print_every))
    num_iter = len(dataloader)

    # Gradients of an incomplete accumulation window at the end of the
    # previous epoch must not leak into this epoch's first update.
    optimizer.zero_grad()

    running_loss = 0.0  # reset after every log line
    epoch_loss = 0.0    # summed over the whole epoch
    num_seen = 0
    for step, (encoding, labels) in enumerate(tqdm(dataloader)):
        # Move the batch to the target device
        encoding = {k: v.to(device) for k, v in encoding.items()}
        labels = labels.to(device)

        # Enables autocasting for the forward pass (model + loss)
        with autocast('cuda'):
            # Obtaining the logits from the model
            logits = model(encoding)
            # Computing loss
            loss = loss_func(logits.squeeze(-1), labels.float())

        # Backpropagating the gradients.
        # The loss is divided by the window size so the accumulated gradient is
        # an average, then scaled (backward() creates scaled gradients).
        scaler.scale(loss / iters_to_accumulate).backward()

        if (step + 1) % iters_to_accumulate == 0:
            # Optimization step
            # scaler.step() first unscales the gradients of the optimizer's assigned params.
            # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
            # otherwise, optimizer.step() is skipped.
            scaler.step(optimizer)
            # Updates the scale for next iteration.
            scaler.update()
            # Adjust the learning rate based on the number of iterations.
            scheduler.step()
            # Clear gradients
            optimizer.zero_grad()

        batch_loss = loss.item()
        running_loss += batch_loss
        epoch_loss += batch_loss
        num_seen += 1

        # Print training loss information
        if (step + 1) % print_every == 0:
            print(
                f"Iteration {step+1}/{num_iter} complete. " +
                f"Loss: {running_loss / print_every}"
            )
            running_loss = 0.0

    return epoch_loss / num_seen if num_seen else float("nan")


def evaluate_loss(
    model, device,
    loss_func,
    dataloader
):
    """Compute the mean per-batch loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: Module called with the encoding dict of each batch.
        device: Device the batch tensors are moved to.
        loss_func: Callable ``loss_func(logits, float_labels)``.
        dataloader: Iterable yielding ``(encoding, labels)`` batches.

    Returns:
        The average of the per-batch losses, or ``nan`` when the loader
        yields no batches (instead of dividing by zero).
    """
    model.eval()

    total_loss, num_batches = 0.0, 0
    with torch.no_grad():
        for encoding, labels in tqdm(dataloader):
            # Move the batch to the target device without mutating the
            # dict handed out by the loader.
            encoding = {k: v.to(device) for k, v in encoding.items()}
            labels = labels.to(device)
            logits = model(encoding)
            total_loss += loss_func(logits.squeeze(-1), labels.float()).item()
            num_batches += 1

    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches


def train_bert(
    model,
    loss_func,
    optimizer,
    scheduler,
    train_loader, valid_loader,
    epochs,
    iters_to_accumulate
):
    """Fine-tune ``model`` for ``epochs`` epochs, validating after each one.

    Every epoch runs ``train_one_epoch`` on ``train_loader`` and then
    ``evaluate_loss`` on ``valid_loader``; the best validation loss seen so
    far is tracked and reported. Batches are moved to the notebook-level
    ``device``.

    Args:
        model: Model to train.
        loss_func: Loss callable forwarded to the training/evaluation helpers.
        optimizer: Optimizer forwarded to ``train_one_epoch``.
        scheduler: Learning-rate scheduler forwarded to ``train_one_epoch``.
        train_loader: Loader of training batches.
        valid_loader: Loader of validation batches.
        epochs: Number of epochs to run.
        iters_to_accumulate: Gradient-accumulation window size.
    """
    # `np.Inf` was removed in NumPy 2.0; the built-in float works everywhere.
    best_loss = float("inf")

    scaler = GradScaler('cuda')

    # Print the training loss about 5 times per epoch; never less often than
    # every batch, so short loaders do not produce a zero interval.
    print_every = max(1, len(train_loader) // 5)

    for epoch in range(epochs):
        # Training pass
        train_one_epoch(
            model, device, loss_func,
            scaler,  # grad scaler
            optimizer,
            scheduler,
            train_loader,
            iters_to_accumulate,
            print_every=print_every
        )
        # Compute validation loss
        valid_loss = evaluate_loss(
            model, device, loss_func,
            valid_loader
        )
        print(f"Epoch {epoch+1} complete! Validation Loss: {valid_loss}")

        if valid_loss < best_loss:
            print(f"Best validation loss improved from {best_loss} to {valid_loss}")
            best_loss = valid_loss
            # TODO: save the model

    torch.cuda.empty_cache()

In [None]:
# Hyperparameters for fine-tuning

# Pretrained checkpoint name
bert_name = "neulab/codebert-python"

"""
other checkpoints:
- "microsoft/codebert-base"
- "huggingface/CodeBERTa-small-v1"
"""

# Number of epochs and batch size
epochs, bs = 4, 20
# Learning rate and weight decay
lr, wd = 1e-5, 1e-5
# Gradients are accumulated over `iters_to_accumulate` batches, giving an
# effective batch size of bs * iters_to_accumulate.
# A value of 1 gives the usual batch size.
iters_to_accumulate = 2

In [None]:
# Dataset

bert_tokenizer = AutoTokenizer.from_pretrained(bert_name)

dataset = CodePairDataset.from_csv_data(paired_dataset_url, bert_tokenizer)

# 80/20 train/validation split. The split uses a seeded generator so the same
# samples land in each subset on every run.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
valid_size = len(dataset) - train_size
train_data, valid_data = random_split(
    dataset, [train_size, valid_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

SHUFFLE = True
train_loader = DataLoader(train_data, batch_size=bs, shuffle=SHUFFLE)
# The validation loader is only used to average a loss, so shuffling is not needed.
valid_loader = DataLoader(valid_data, batch_size=bs, shuffle=False)

In [None]:
# Instantiate the encoder and the classifier

# Pretrained encoder, moved to the selected device
bert_model = AutoModel.from_pretrained(bert_name)
bert_model = bert_model.to(device)

# Classifier wrapping the encoder; moving it also places the new linear head
model = CodeSimilarityClassifier(bert_model)
model = model.to(device)

In [None]:
# Optimizer and Scheduler

optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)

# The number of steps for the warmup phase.
num_warmup_steps = 0
# The total number of scheduler steps. With gradient accumulation the training
# loop steps the optimizer/scheduler once per complete window of
# `iters_to_accumulate` batches, i.e. len(train_loader) // iters_to_accumulate
# times per epoch, so the floor division has to be applied per epoch.
num_training_steps = epochs * (len(train_loader) // iters_to_accumulate)

scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

In [None]:
# Launch fine-tuning

# Binary cross-entropy applied to the raw logit produced by the classifier
loss_func = nn.BCEWithLogitsLoss()

train_bert(
    model=model,
    loss_func=loss_func,
    optimizer=optimizer,
    scheduler=scheduler,
    train_loader=train_loader,
    valid_loader=valid_loader,
    epochs=epochs,
    iters_to_accumulate=iters_to_accumulate,
)