<a href="https://colab.research.google.com/github/VerebicsPeter/CodeSim/blob/main/model/model_dev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Similarity with Contrastive Learning

## Dependencies

In [None]:
# for data augmentation
!pip install python-minifier

Collecting python-minifier
  Downloading python_minifier-2.9.0-py2.py3-none-any.whl.metadata (6.1 kB)
Downloading python_minifier-2.9.0-py2.py3-none-any.whl (46 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.7/46.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-minifier
Successfully installed python-minifier-2.9.0


In [None]:
!pip install pytorch-metric-learning

Collecting pytorch-metric-learning
  Downloading pytorch_metric_learning-2.6.0-py3-none-any.whl.metadata (17 kB)
Downloading pytorch_metric_learning-2.6.0-py3-none-any.whl (119 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.3/119.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch-metric-learning
Successfully installed pytorch-metric-learning-2.6.0


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from torch.utils.tensorboard import SummaryWriter
from pytorch_metric_learning import losses
# Hugging Face Transformers (CodeBERT etc.)
from transformers import AutoTokenizer, AutoModel
# Libraries for augmenting data
import python_minifier
# Libraries for logging
from tqdm.auto import tqdm
from typing import Iterable

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print('Device:', device)
if device.type == "cuda":
    # Report which GPU model the runtime was assigned.
    print(torch.cuda.get_device_name(torch.cuda.current_device()))

Device: cuda
Tesla T4


## Dataset Access

In [None]:
from google.colab import userdata

# Google Drive direct-download URL; the file ids are looked up through Colab's
# `userdata` store under the keys below instead of being written in the notebook.
_DRIVE_DOWNLOAD_URL = "https://drive.google.com/uc?export=download&id={}"
labeled_dataset_url = _DRIVE_DOWNLOAD_URL.format(userdata.get('labeledDataset'))
unlabeled_dataset_url = _DRIVE_DOWNLOAD_URL.format(userdata.get('unlabeledDataset'))

## Dataset and Data Augmentation

In [None]:
# Code datasets (for labeled and unlabeled code snippets)


def numeric_labels(labels) -> torch.Tensor:
    """Transform string labels to int labels for the NTXent loss function.

    Labels ending in ``'1'`` are treated as positives: each distinct positive
    label is mapped to one id (ids are assigned in sorted label order, starting
    at 0), so samples that share such a label share an id. Every other label is
    treated as a negative and gets an id of its own, numbered consecutively
    after the largest positive id, so that it never matches another sample.

    Args:
        labels: Iterable of string labels (for example a list or a pandas
            Series).

    Returns:
        A 1-D ``torch.long`` tensor holding one id per input label. An empty
        input yields an empty tensor.
    """
    labels = list(labels)
    positive_ids = {
        label: i
        for i, label in enumerate(sorted({lbl for lbl in labels if lbl.endswith('1')}))
    }
    # -1 marks "negative, id not assigned yet".
    int_labels = torch.tensor(
        [positive_ids.get(label, -1) for label in labels], dtype=torch.long
    )
    is_negative = int_labels == -1
    # Positive ids occupy 0..len(positive_ids)-1, so negatives start right after.
    first_free = len(positive_ids)
    n_negative = int(is_negative.sum())
    int_labels[is_negative] = torch.arange(
        first_free, first_free + n_negative, dtype=torch.long
    )
    return int_labels

# TODO: tokenize snippets
class LabeledCodeDataset(Dataset):
    """Tokenized code snippets paired with numeric labels.

    All snippets are tokenized once in the constructor. Snippets that do not
    fit into the tokenizer's ``model_max_length`` are dropped together with
    their labels. The tensors are moved to the notebook-level ``device``.
    """

    def __init__(self, tokenizer, code_snippets, labels):
        """Tokenize `code_snippets` and convert `labels` to numeric ids.

        Args:
            tokenizer: Callable tokenizer exposing ``model_max_length``.
            code_snippets: Source code strings, one per sample.
            labels: String labels, one per snippet (see ``numeric_labels``).
        """
        code_snippets = list(code_snippets)
        labels = list(labels)
        assert len(code_snippets) == len(labels)
        MAX_LEN = tokenizer.model_max_length
        # Pad/truncate to one token MORE than the model accepts: a snippet that
        # fills all MAX_LEN + 1 positions is thereby known to be too long.
        inputs = tokenizer(
            code_snippets,
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN + 1,
            return_tensors="pt",
        )
        # Keep only the snippets that don't exceed the model max length
        input_mask = inputs["attention_mask"].sum(dim=1) <= MAX_LEN
        # Drop the extra (now always padding) position and move to the device
        self.inputs = {
            k: v[input_mask, :MAX_LEN].to(device) for k, v in inputs.items()
        }
        # Filter the labels with the same mask so they stay aligned with the inputs
        self.labels = numeric_labels(labels)[input_mask].to(device)

    def __getitem__(self, idx):
        # `self.inputs` maps field name -> batched tensor, so index every field
        item = {k: v[idx] for k, v in self.inputs.items()}
        label = self.labels[idx]
        return item, label

    def __len__(self):
        return self.labels.shape[0]

    @classmethod
    def from_csv_data(cls, path: str, tokenizer, sample_size=0):
        """Build the dataset from a CSV with `source` and `label` columns.

        Args:
            path: Path or URL handed to ``pandas.read_csv``.
            tokenizer: Tokenizer forwarded to the constructor.
            sample_size: If non-zero, number of rows to sample at random.
        """
        df = pd.read_csv(path)
        print(df.shape)
        if sample_size:
            print('sampling dataframe...')
            df = df.sample(sample_size, ignore_index=True)
            print(df.shape)
        code = df['source']
        lbls = df['label']
        # The tokenizer is given plain Python lists rather than pandas Series
        return cls(tokenizer, code.to_list(), lbls.to_list())


class UnlabeledCodeDataset(Dataset):
    """Pairs of tokenized (reference, augmented) code snippets.

    Both versions of every snippet are tokenized once in the constructor. A
    pair is kept only if BOTH versions fit into the tokenizer's
    ``model_max_length``. The tensors are moved to the notebook-level
    ``device``.
    """

    def __init__(
        self,
        # Quoted on purpose: the notebook imports only `AutoTokenizer` and
        # `AutoModel` from `transformers`, so an unquoted `transformers.…`
        # annotation raises NameError when the class is defined.
        tokenizer: "transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerFast",
        ref_codes: Iterable[str],
        aug_codes: Iterable[str],
    ):
        """Tokenize both code lists and keep the pairs that fit the model.

        Args:
            tokenizer: Callable tokenizer exposing ``model_max_length``.
            ref_codes: Original code snippets.
            aug_codes: Augmented snippets, aligned index-by-index with
                `ref_codes`.
        """
        # Materialize first: `len()` is not defined for arbitrary iterables
        ref_codes = list(ref_codes)
        aug_codes = list(aug_codes)
        assert len(ref_codes) == len(aug_codes)

        MAX_LEN = tokenizer.model_max_length

        def encode(codes):
            # Pad/truncate to one token MORE than the model accepts: a snippet
            # that fills all MAX_LEN + 1 positions is known to be too long.
            return tokenizer(
                codes,
                truncation=True,
                padding="max_length",
                max_length=MAX_LEN + 1,
                return_tensors="pt",
            )

        ref_encodings = encode(ref_codes)
        aug_encodings = encode(aug_codes)

        # Find valid pairs that don't exceed model max length
        ref_mask = ref_encodings["attention_mask"].sum(dim=1) <= MAX_LEN
        aug_mask = aug_encodings["attention_mask"].sum(dim=1) <= MAX_LEN
        valid_mask = ref_mask & aug_mask

        # Keep only the valid pairs, drop the extra (now always padding)
        # position from EVERY returned field, and move tensors to the device.
        self.ref_inputs = {
            k: v[valid_mask, :MAX_LEN].to(device) for k, v in ref_encodings.items()
        }
        self.aug_inputs = {
            k: v[valid_mask, :MAX_LEN].to(device) for k, v in aug_encodings.items()
        }

    def __getitem__(self, idx):
        # Return both reference and augmented code inputs for a given index
        ref_input = {k: v[idx] for k, v in self.ref_inputs.items()}
        aug_input = {k: v[idx] for k, v in self.aug_inputs.items()}
        return ref_input, aug_input

    def __len__(self):
        return self.ref_inputs["input_ids"].shape[0]

    @classmethod
    def from_csv_data(cls, path: str, tokenizer, aug_func,
                      sample_size=0):
        """Build the dataset from a CSV with a `source` column.

        Args:
            path: Path or URL handed to ``pandas.read_csv``.
            tokenizer: Tokenizer forwarded to the constructor.
            aug_func: Function applied to every snippet to produce its
                augmented counterpart.
            sample_size: If non-zero, number of rows to sample at random.
        """
        df = pd.read_csv(path)
        print(df.shape)
        if sample_size:
            print('sampling dataframe...')
            df = df.sample(sample_size, ignore_index=True)
            print(df.shape)
        ref_codes = df['source']
        aug_codes = df['source'].apply(aug_func)
        return cls(tokenizer, ref_codes.to_list(), aug_codes.to_list())

In [None]:
def minify(code: str) -> str:
    """Return `code` minified, or `code` unchanged when minification fails.

    This is a best-effort augmentation: any exception raised while minifying
    is logged at DEBUG level and the original source is returned, so a single
    bad snippet does not abort the augmentation of a whole dataset.

    Args:
        code: Python source code.

    Returns:
        The minified source, or `code` itself if it could not be minified.
    """
    try:
        return python_minifier.minify(code)
    except Exception as error:
        # Deliberately broad: whatever goes wrong, fall back to the input.
        import logging

        logging.getLogger(__name__).debug("Error while minifying: %s", error)
        return code

## Model

In [None]:
# Model

# Training-mode switch. It is read when deciding whether to wrap the NTXent
# loss in `losses.SelfSupervisedLoss` and when choosing which dataset class
# to build.
IS_SELF_SUPERVISED = True


def embedding_pipeline(transformer):
    """Wrap `transformer` in a function that embeds already-tokenized inputs.

    The returned function takes a dict of keyword inputs and returns whatever
    `transformer(**inputs)` returns.
    """
    def embed(inputs: dict):
        # Run the transformer with the notebook-level `device` as the default
        # device for any tensor created during the call.
        with torch.device(device):
            outputs = transformer(**inputs)
        return outputs

    return embed


class CodeSimilarityModel(nn.Module):
    """Maps code embeddings into a small space used for similarity learning.

    Data flow: embedding pipeline (only for tokenized inputs) -> fc1 -> fc2
    -> MLP projection head. ReLU and dropout follow every linear layer except
    the last one.

    Args:
        embedding_pipeline: Callable that takes a dict of tokenized inputs and
            returns an object with a ``pooler_output`` tensor of shape
            ``(batch, in_feat)``. Only used when ``forward`` receives
            tokenized inputs; it is not trained.
        in_feat: Width of the embeddings (depends on the embedding pipeline).
        fc_hidden_size: Width of the hidden layer between fc1 and fc2.
        mlp_sizes: Layer widths of the projection head; the first entry is
            also the output width of fc2. Must not be empty.
        out_feat: Width of the final output.
        dropout_rate: Dropout probability.
    """

    def __init__(self,
        embedding_pipeline,
        in_feat=768,  # depends on the embedding pipeline
        fc_hidden_size=512,
        mlp_sizes=(256, 128, 64),
        out_feat=16,
        dropout_rate=0.2,
    ):
        super().__init__()
        mlp_sizes = tuple(mlp_sizes)
        if not mlp_sizes:
            raise ValueError("mlp_sizes must contain at least one layer size")
        self.embedding_pipeline = embedding_pipeline
        self.in_feat = in_feat
        # Non linearity
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(dropout_rate)
        # Fully connected layers that bring the embedding width down to the
        # input width of the projection head
        self.fc1 = nn.Linear(in_feat, fc_hidden_size)
        self.fc2 = nn.Linear(fc_hidden_size, mlp_sizes[0])
        # MLP 'projection head': Linear -> ReLU -> Dropout per consecutive pair
        # of sizes, closed by a plain Linear layer to `out_feat`
        mlp_layers = []
        for n_in, n_out in zip(mlp_sizes, mlp_sizes[1:]):
            mlp_layers.extend([nn.Linear(n_in, n_out), self.relu, self.drop])
        mlp_layers.append(nn.Linear(mlp_sizes[-1], out_feat))
        self.mlp = nn.Sequential(*mlp_layers)

    def _embed(self, inputs) -> torch.Tensor:
        """Run the embedding pipeline on tokenized inputs, without gradients."""
        # The pipeline is a plain callable, not a registered sub-module, so its
        # weights are not part of `self.parameters()`; tracking gradients
        # through it would only cost memory.
        with torch.no_grad():
            outputs = self.embedding_pipeline(inputs)
        pooled = getattr(outputs, "pooler_output", None)
        if pooled is None:
            raise ValueError("the embedding pipeline returned no `pooler_output`")
        return pooled

    def forward(self, x: "torch.Tensor | dict") -> torch.Tensor:
        """Project a batch to `out_feat` dimensions.

        Args:
            x: Either a tensor of precomputed embeddings with `in_feat`
                features in the last dimension, or a dict of tokenized inputs
                that is first passed through the embedding pipeline.

        Returns:
            Tensor whose last dimension has size `out_feat`.
        """
        # No device context is needed here: no tensors are created, and the
        # layers run on whatever device their input and weights live on.
        if not isinstance(x, torch.Tensor):
            x = self._embed(x)
        # pass through linear layers
        x = self.drop(self.relu(self.fc1(x)))
        x = self.drop(self.relu(self.fc2(x)))
        return self.mlp(x)

## NTXent Loss Function

In [None]:
# Contrastive NTXent loss with temperature 0.5
ntxent_loss = losses.NTXentLoss(temperature=0.5)
if IS_SELF_SUPERVISED:
    # In self-supervised mode the loss is called with (reference, augmented)
    # embedding pairs instead of (embeddings, labels), so wrap it.
    ntxent_loss = losses.SelfSupervisedLoss(ntxent_loss)

## Training

In [None]:
# Create the dataset
SAMPLE_SIZE = 25_000

# Both dataset classes tokenize their snippets when they are constructed, so
# the tokenizer has to exist before this cell runs. Keep this checkpoint equal
# to the one used for the transformer in the model-setup cell.
pretrained_model = "neulab/codebert-python"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

if IS_SELF_SUPERVISED:
    # Self-supervised training consumes (reference, augmented) pairs, which is
    # what the unlabeled dataset yields; minification is the augmentation.
    dataset = UnlabeledCodeDataset.from_csv_data(
        path=unlabeled_dataset_url,
        tokenizer=tokenizer,
        aug_func=minify,
        sample_size=SAMPLE_SIZE,
    )
else:
    # Supervised training consumes (inputs, labels) items.
    dataset = LabeledCodeDataset.from_csv_data(
        path=labeled_dataset_url,
        tokenizer=tokenizer,
        sample_size=SAMPLE_SIZE,
    )

(34806, 3)
sampling dataframe...
(25000, 3)
augmenting dataframe...
(40048, 3)


In [None]:
# Split the data: 80% of the samples for training, the remainder for validation
tsize = int(len(dataset) * 0.8)
vsize = len(dataset) - tsize
training_data, validation_data = random_split(dataset, lengths=[tsize, vsize])

In [None]:
# Create the data loaders (both use the same batch size and shuffle setting)
BATCH_SIZE = 20  # NOTE: Bigger batch size generally leads to better results in contrastive learning
SHUFFLE = True
training_loader = DataLoader(
    training_data,
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
)
validation_loader = DataLoader(
    validation_data,
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
)

In [None]:
# Training loop for the NTXEnt loss function

def compute_loss(batched_data, model, loss_func):
    """Computes the loss value for a batch of data.

    Args:
        batched_data: A `(ref_input, aug_input)` pair when `loss_func` is a
            `SelfSupervisedLoss`, otherwise an `(inputs, labels)` pair.
        model: Callable that maps inputs to embeddings.
        loss_func: Loss called as `loss_func(ref_emb, aug_emb)` in the
            self-supervised case and as `loss_func(embeddings, labels)`
            otherwise.

    Returns:
        The value returned by `loss_func`.
    """
    # Imported here instead of reading the notebook-level name `losses`: the
    # training cell assigns the loss history to `losses`, which hides the
    # module and would break this check when training is run a second time.
    from pytorch_metric_learning.losses import SelfSupervisedLoss

    if isinstance(loss_func, SelfSupervisedLoss):
        ref_input, aug_input = batched_data
        ref_emb = model(ref_input)
        aug_emb = model(aug_input)
        return loss_func(ref_emb, aug_emb)

    inputs, labels = batched_data
    embeddings = model(inputs)
    return loss_func(embeddings, labels)


def train_epoch(
    model: CodeSimilarityModel,
    loader: DataLoader,
    loss_func,
    optimizer,
    epochs: int                  = 0,     # number of epochs so far (for logging),
    writer: SummaryWriter | None = None,  # for logging loss values,
):
    """Trains the model for one epoch.

    The running average of the loss is printed (and written to `writer`, if
    given) every `C_BATCHES` batches and once more after the final batch.

    Args:
        model: Model to train.
        loader: Loader that yields the training batches.
        loss_func: Loss function, forwarded to `compute_loss`.
        optimizer: Optimizer that updates the model's weights.
        epochs: Number of epochs completed so far; only used to compute the
            global step of the logged values.
        writer: Optional TensorBoard writer for the loss values.

    Returns:
        The average loss over all batches of the epoch, or NaN if the loader
        yields no batches.
    """
    C_BATCHES = 50  # Number of batches over which the logged loss is averaged
    n_batches = len(loader)
    if n_batches == 0:
        # Nothing to train on; avoid dividing by zero below
        return float("nan")

    model.train()  # Set the model to training mode
    sum_loss = 0.0  # Loss accumulated per EPOCH
    acc_loss = 0.0  # Loss accumulated since the last log entry
    acc_count = 0   # Number of batches accumulated in `acc_loss`
    with tqdm(total=n_batches) as progress_bar:
        for i, data in enumerate(loader):
            optimizer.zero_grad()
            loss = compute_loss(data, model, loss_func)
            # Adjust the weights
            loss.backward()
            optimizer.step()
            # Increase the loss accumulators
            loss_val = loss.item()
            sum_loss += loss_val
            acc_loss += loss_val
            acc_count += 1
            progress_bar.update(1)
            # Log after every full window and after the (possibly shorter) last one
            if acc_count == C_BATCHES or i == n_batches - 1:
                last_loss = acc_loss / acc_count
                print('',f'Batch: {i + 1}/{n_batches}, Loss: {last_loss}')  # use a log file for this
                if writer is not None:
                    writer.add_scalar("loss/train", last_loss, epochs * n_batches + i + 1)
                acc_loss, acc_count = 0.0, 0
    # Return the average loss in the epoch
    return sum_loss / n_batches


def validate(
    model: CodeSimilarityModel,
    loader: DataLoader,
    loss_func,
):
    """Validates the model for one epoch.

    Args:
        model: Model to evaluate; it is switched to evaluation mode.
        loader: Loader that yields the validation batches.
        loss_func: Loss function, forwarded to `compute_loss`.

    Returns:
        The average loss over all validation batches, or NaN if the loader
        yields no batches.
    """
    n_batches = len(loader)
    if n_batches == 0:
        # Nothing to validate on; avoid dividing by zero below
        return float("nan")

    model.eval()  # Set the model to evaluation mode
    sum_loss = 0.0
    # No weights are updated here, so gradients are not needed
    with torch.no_grad():
        for data in loader:
            loss = compute_loss(data, model, loss_func)
            sum_loss += loss.item()
    return sum_loss / n_batches


def train(
    model: CodeSimilarityModel,
    dataloaders,
    loss_func,
    optimizer, scheduler,
    epochs: int = 5,
):
    """Run the full training: train and validate once per epoch.

    `dataloaders` is a `(training_loader, validation_loader)` pair. After each
    epoch the scheduler (if any) is stepped and both average losses are
    printed. Returns the per-epoch training and validation losses as two lists.
    """
    # TensorBoard logging is switched off; assign a SummaryWriter to enable it
    writer = None
    train_loader, valid_loader = dataloaders
    tLosses = []
    vLosses = []
    for epoch in range(epochs):
        print(f'EPOCH {epoch + 1}/{epochs}')
        # One pass over the training data, then one over the validation data
        avg_tLoss = train_epoch(
            model, train_loader, loss_func, optimizer, epoch, writer
        )
        avg_vLoss = validate(model, valid_loader, loss_func)
        # The learning-rate schedule advances once per epoch
        if scheduler is not None:
            scheduler.step()
        # Report and record the losses of this epoch
        print(f"EPOCH {epoch + 1}/{epochs}, AVG loss: {avg_tLoss}, AVG validation loss: {avg_vLoss}")
        tLosses.append(avg_tLoss)
        vLosses.append(avg_vLoss)
    if writer is not None:
        writer.close()
    return tLosses, vLosses

In [None]:
# Create embedding pipeline, model, tokenizer, and optimizer

# Other checkpoints:
# - "microsoft/codebert-base"
# - "huggingface/CodeBERTa-small-v1"

pretrained_model="neulab/codebert-python"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
transformer = AutoModel.from_pretrained(pretrained_model).to(device)
transformer.eval()
# `embedding_pipeline` takes only the transformer. The result is stored under
# a different name so that the factory function is not overwritten and this
# cell can be run again.
pipeline = embedding_pipeline(transformer)
model = CodeSimilarityModel(pipeline).to(device)
# Only the parameters of `model` are handed to the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-5)
# Multiply the learning rate by 0.1 after every 4 scheduler steps
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1)

In [None]:
# Train the model
epochs = 4
loss_func = ntxent_loss
# NOTE(review): the result is bound to `losses`, which from here on hides the
# imported `pytorch_metric_learning.losses` module of the same name. The plot
# cell reads this name, so rename both together if the module is needed again.
losses = train(
    model=model,
    dataloaders=(training_loader, validation_loader),
    loss_func=loss_func,
    optimizer=optimizer,
    scheduler=lr_scheduler,
    epochs=epochs,
)

In [None]:
# Plot the per-epoch loss curves: index 0 holds the training losses and
# index 1 the validation losses.
plt.plot(losses[0], label='training loss')
plt.plot(losses[1], label='validation loss')
plt.legend()
plt.show()

## Notes

### Training results with different parameters

| Checkpoint |`T`| Model Structure | Epochs | Training Loss | Validation Loss |
| ---------- | - | --------------- | ------ | ------------- | --------------- |
| CodeBERTa Small v1 | 0.07 | TFM → mean pool → MLP w/ batchnorm | - | ~1.6 | - |
| CodeBERTa Small v1 | 0.07 | TFM → lin1 → lin2 → MLP w/ batchnorm → max pool | - | ~1.4 | - |
| CodeBERTa Small v1 | 0.50 | (frozen) TFM's pooler output → layernorm → lin1 → lin2 → MLP  | 6 |  ~1.5 | ~1.35 |

`T` is the temperature hyperparameter of the NTXent loss function.

### Data TODOs
- ❎ - Throw away code snippets that are too long
- ❎ - A lot of code snippets mined from GitHub can't be minified; filter the unlabeled code dataset!
- ❎ - Pre calculate data augmentations

### Model TODOs
- ❎ - Try training with transformer's `pooler_output`