In [1]:
import argparse
import logging
import random

import numpy as np
import pandas as pd
import torch

from datasets import load_dataset
from info_nce import InfoNCE

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import RobertaModel, RobertaConfig, RobertaTokenizer


class CustomDataset(TensorDataset):

    def __init__(self, dataframe):
        self.tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
        self.doc = dataframe.doc
        self.code = dataframe.code
        # self.targets = dataframe.labels
        self.max_len = 512

    def __len__(self):
        assert len(self.doc) == len(self.code)
        return len(self.doc)

    def __getitem__(self, index):
        doc = str(self.doc[index])
        doc = " ".join(doc.split())
        
        code = str(self.code[index])
        doc_inputs = self.tokenizer.encode_plus(
            doc,
            add_special_tokens=False,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_token_type_ids=False
        )
        doc_ids = doc_inputs['input_ids']
        doc_mask = doc_inputs['attention_mask']
        
        code_inputs = self.tokenizer.encode_plus(
            code,
            add_special_tokens=False,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_token_type_ids=False
        )
        
        code_ids = code_inputs['input_ids']
        code_mask = code_inputs['attention_mask']

        return {
            'doc_ids': torch.tensor(doc_ids, dtype=torch.long),
            'doc_mask': torch.tensor(doc_mask, dtype=torch.long),
            'code_ids': torch.tensor(code_ids, dtype=torch.long),
            'code_mask': torch.tensor(code_mask, dtype=torch.long),
        }

In [3]:
model = RobertaModel.from_pretrained('microsoft/codebert-base')
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)
model.to(torch.device("cuda"))

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [4]:
train_params = {'batch_size': 7,
                'shuffle': False,
                'num_workers': 0
                }

test_params = {'batch_size': 7,
               'shuffle': False,
               'num_workers': 0
               }

In [5]:
code_search_dataset = load_dataset('code_search_net', 'ruby')

# train_data
train_data = code_search_dataset['train']

function_code = train_data['func_code_string']
function_documentation = train_data['func_documentation_string']

train_df =pd.DataFrame()
train_df['doc'] = function_documentation
train_df['code'] = function_code

# test_data
test_data = code_search_dataset['test']

function_code_test = test_data['func_code_string']
function_documentation_test = test_data['func_documentation_string']

test_df =pd.DataFrame()
test_df['doc'] = function_documentation_test
test_df['code'] = function_code_test

In [6]:
train_size = 0.8
train_dataset = train_df.sample(frac=train_size, random_state=200)
valid_dataset = train_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
test_dataset = test_df.reset_index(drop=True)

In [7]:
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VAL Dataset: {}".format(valid_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

TRAIN Dataset: (39033, 2)
VAL Dataset: (9758, 2)
TEST Dataset: (2279, 2)


In [8]:
training_set = CustomDataset(train_dataset)
loss_formulation = InfoNCE(negative_mode='unpaired')

In [22]:
for epoch in range(1, 1+1):
    #model.train()
    
    losses = []
    
    #shuffle the data
    #random.shuffle(training_set)
    batch_size = train_params['batch_size']
    train_dataloader = DataLoader(training_set, batch_size=7, shuffle=True)
    
    for idx, batch in enumerate(train_dataloader):
        
        if idx > 2:
            break
        
        if len(batch) <= 1:
            continue
        
        # query = doc
        query_id = batch['doc_ids'][0].to(torch.device("cuda")).unsqueeze(0)
        query_mask = batch['doc_mask'][0].to(torch.device("cuda")).unsqueeze(0)
        inputs = {'input_ids': query_id, 'attention_mask': query_mask}
        query = model(**inputs)[1]  # using pooled values
        #keys = code
        code_list = [(batch['code_ids'][i].unsqueeze(0).to(torch.device("cuda")), batch['code_mask'][i].unsqueeze(0).to(torch.device("cuda"))) for i in range(1, batch_size)]
        
        positive_code_key = code_list.pop(0)
        inputs = {'input_ids': positive_code_key[0], 'attention_mask': positive_code_key[1]}
        positive_code_key = model(**inputs)[1] # using pooled values
        
        negative_keys = []
        
        
        for code, mask in code_list:
            inputs = {'input_ids': code, 'attention_mask': mask}
            negative_key = model(**inputs)[1] # using pooled values
        
            negative_keys.append(negative_key.clone().detach())
    
        negative_keys_reshaped = torch.cat(negative_keys, dim=0)    
        
        loss = loss_formulation(query, positive_code_key, negative_keys_reshaped)
        loss.backward()
        
        losses.append(loss)
        num_of_accumulation_steps = 10
        
        if (idx+1) % num_of_accumulation_steps == 0:
            optimizer.zero_grad()
            optimizer.step()
        print(loss)
        
    

tensor(1.9226, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8411, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7655, device='cuda:0', grad_fn=<NllLossBackward0>)


In [None]:
test2 = training_set[2:3]