# Code Similarity with Contrastive Learning

## Dependencies

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from torch.utils.tensorboard import SummaryWriter
from pytorch_metric_learning import losses
# Transformers (for CodeBERT etc.)
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.cuda.get_device_name(torch.cuda.current_device()))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Device:', device)

NVIDIA GeForce GTX 1070
Device: cuda


## Classic Contrastive Loss

In [3]:
def contrastive_loss(x1, x2, label: float, margin: float):
    """
    The label indicates whether the pair is negative.
        - 1.0 means positive
        - 0.0 means negative
    
    The loss is is calculated like this:
        `dist ** 2` if the label is positive `min(margin - dist)` otherwise
    """
    with torch.device(device):
        dist = F.pairwise_distance(x1, x2)
        loss = label * torch.pow(dist, 2) + (1 - label) * torch.pow(torch.clamp(margin - dist, min=0.0), 2)
        return loss.mean()

## Model

In [4]:
# Model

class CodeSimilarityModel(nn.Module):
    def __init__(self,
        pretrained_model="microsoft/codebert-base",
        inputs_size=768,
        hidden_size=(128, 64),
        output_size=16,
        dropout_rate=0.2,
    ):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.transformer = AutoModel.from_pretrained(pretrained_model)
        self.transformer.to(device)
        self.mlp = nn.Sequential(
            nn.Linear(inputs_size, 
                      hidden_size[0]),
            nn.BatchNorm1d(
                      hidden_size[0]),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size[0],
                      hidden_size[1]),
            nn.BatchNorm1d(
                      hidden_size[1]),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size[1], output_size),
        )
        # NOTE:
        # The base value for hidden_size is the size of size of the mean pooled output of the transformer.
    
    
    def _tokenize(self, code: str):
        return self.tokenizer(code, return_tensors='pt', truncation=True,
                              padding='max_length', max_length=256)
        # NOTE:
        # Max length>256 crashes GTX 1070 with a batch size of 20.
    
    def _embedding(self, code: str) -> torch.Tensor:
        with torch.device(device):
            # Tokenize code snippet
            inputs = self._tokenize(code).to(device)
            # Get the transformer output
            output = self.transformer(**inputs)
            # Pool and squeeze the transformer output
            output = output.last_hidden_state.mean(dim=1)
            output = output.squeeze()
            return output
    
    def embeddings(self, *codes: str) -> tuple[torch.Tensor]:
        """Embed a code snippets with `self.transformer`."""
        with torch.device(device):
            return tuple(map(self._embedding, codes))
        
    def forward(self, *embeddings: torch.Tensor) -> tuple[torch.Tensor]:
        with torch.device(device):
            return tuple(map(self.mlp, embeddings))

In [5]:
# Training dataset

class CodePairDataset(Dataset):
    def __init__(self, code_pairs, labels):
        self.code_pairs = code_pairs
        self.labels = labels

    def __getitem__(self, idx):
        code1, code2 = self.code_pairs[idx]
        label = self.labels[idx]
        return code1, code2, torch.tensor(label, dtype=torch.float).to(device)
    
    def __len__(self):
        return len(self.code_pairs)
    
    @classmethod
    def from_csv_data(cls, path: str = "training.csv", sample_size=0):
        df = pd.read_csv(path)
        if sample_size: df = df.sample(sample_size)
        code_pairs = df.apply(lambda row: (row['src_x'], row['src_y']), axis=1).tolist()
        labels = df['label'].tolist()
        return cls(code_pairs, labels)

# TODO: Code dataset (non-paired)

In [6]:
# Training

def train_epoch(
    model: CodeSimilarityModel, margin: float,
    loader: DataLoader,
    optimizer,
    epochs: int                  = 0,  # number of epochs so far (for logging)
    writer: SummaryWriter | None = None  # for logging loss values
):
    """Trains the model for one epoch."""
    N_BATCHES = len(loader)
    sum_loss = 0  # Loss accumulated per EPOCH
    cum_loss = 0  # Loss accumulated per last 25 batches
    for i, data in enumerate(loader):
        optimizer.zero_grad()
        code1, code2, labels = data
        emb1, emb2 = model.embeddings(code1, code2)  # transformer's embeddings for both code snippets
        emb1, emb2 = model(emb1, emb2)  # MLP's embeddings for both code snippets
        # Compute the loss
        loss = contrastive_loss(emb1, emb2, labels, margin)
        loss.backward()
        # Adjust the weights
        optimizer.step()
        # Increase loss accumulator
        cum_loss += loss.item()
        # Calculate and log loss
        C_BATCHES = 50  # number of batches over which the logged loss is cumulated
        if i % C_BATCHES == C_BATCHES - 1:
            last_loss = cum_loss / C_BATCHES
            print('',f'Batch: {i + 1}/{N_BATCHES}, Loss: {last_loss}')
            sum_loss += cum_loss
            cum_loss = 0
            if writer is not None:  # Log the average loss over the last  batches
                writer.add_scalar("loss/train", last_loss, epochs * N_BATCHES + i + 1)
    # Return the average loss in the epoch
    avg_loss = sum_loss / N_BATCHES
    return avg_loss


def validate(
    model: CodeSimilarityModel, margin: float,
    loader: DataLoader,
):
    """Validates the model for one epoch."""
    with torch.no_grad():
        sum_loss = 0
        for code1, code2, labels in loader:
            emb1, emb2 = model.embeddings(code1, code2)
            emb1, emb2 = model(emb1, emb2)
            loss = contrastive_loss(emb1, emb2, labels, margin)
            sum_loss += loss.item()
        avg_loss = sum_loss / len(loader)
        return avg_loss


def train(
    model: CodeSimilarityModel, 
    margin: float,
    dataloaders,
    optimizer,
    epochs: int = 5,
):
    tLosses, vLosses = [], []
    writer = SummaryWriter()
    training_loader, validation_loader = dataloaders
    model.train()
    for ind_epoch in range(epochs):
        print(f'EPOCH {ind_epoch + 1}/{epochs}')
        # Train then validate
        avg_tLoss = train_epoch(model, margin, training_loader, optimizer, ind_epoch, writer)
        avg_vLoss = validate(model, margin, validation_loader)
        # Log the losses
        print(f"EPOCH {ind_epoch + 1}/{epochs}, AVG loss: {avg_tLoss}, AVG validation loss: {avg_vLoss}")
        tLosses.append(avg_tLoss)
        vLosses.append(avg_vLoss)
    writer.close()
    return tLosses, vLosses

In [7]:
# Define model, tokenizer, and optimizer
model = CodeSimilarityModel(pretrained_model='huggingface/CodeBERTa-small-v1').to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

dataset = CodePairDataset.from_csv_data()
training_data, validation_data = random_split(dataset, [8000, 2000])
training_loader = DataLoader(training_data, batch_size=20, shuffle=True)
validation_loader = DataLoader(validation_data, batch_size=20, shuffle=True)

# Train the model
epochs = 10; margin = 12.0
losses = train(model, margin, (training_loader, validation_loader), optimizer, epochs)

EPOCH 1/15
 Batch: 50/400, Loss: 46.336321716308596
 Batch: 100/400, Loss: 42.10408687591553
 Batch: 150/400, Loss: 41.18445495605469
 Batch: 200/400, Loss: 40.688289108276365
 Batch: 250/400, Loss: 38.6696675491333
 Batch: 300/400, Loss: 39.839705352783206
 Batch: 350/400, Loss: 39.31956039428711
 Batch: 400/400, Loss: 38.28210605621338
EPOCH 1/15, AVG loss: 40.80302400112152, AVG validation loss: 38.36096862792969
EPOCH 2/15
 Batch: 50/400, Loss: 37.76735858917236
 Batch: 100/400, Loss: 38.08632247924805
 Batch: 150/400, Loss: 38.20794364929199
 Batch: 200/400, Loss: 37.86470523834228
 Batch: 250/400, Loss: 38.42124011993408
 Batch: 300/400, Loss: 38.912745323181156
 Batch: 350/400, Loss: 38.46441650390625
 Batch: 400/400, Loss: 37.82049560546875
EPOCH 2/15, AVG loss: 38.19315343856812, AVG validation loss: 38.930386219024655
EPOCH 3/15
 Batch: 50/400, Loss: 38.414871520996094
 Batch: 100/400, Loss: 37.99174503326416
 Batch: 150/400, Loss: 38.36829166412353
 Batch: 200/400, Loss: 38.

KeyboardInterrupt: 

In [None]:
plt.plot(losses[0])
plt.plot(losses[1])
plt.legend(['training loss', 'validation loss'])
plt.show()

In [None]:
# TODO: LR scheduler
# NOTE: CodeBERT tokenization and embedding is really slow on a single GPU like GTX 1070
# NOTE: a batch size of 50 seems to crash 16 gigs of RAM (when running on CPU, using CodeBERT)

In [None]:
# Evaluation
# TODO: eval function, eval dataset

In [None]:
# NTXent Loss
"""
from pytorch_metric_learning import losses
ntxent_loss = losses.NTXentLoss(temperature=0.07)
"""

# in the training method
"""
...
embeddings = torch.cat((embeddings1, embeddings2))
# embeddings shape: 2 * batch_size , output_size
labels = new_labels(labels)  # calculate labels based on PID and status
loss = ntxent_loss(embeddings, labels)
...
"""

# NOTE: 
# With NTXent loss the positive and negative pairs can also be defined using
# a sequence of non-paired solutions.
# The labels for this sequence of solutions have to be defined in a way that all
# solutions with the same PID and 'accepted' status have the same label `l` but
# all other solutions with different PID or status have different labels to `l`.