### Setup

In [1]:
# !git clone https://github.com/graykode/commit-autosuggestions.git
#%cd commit-autosuggestions
# !pip install commit
# !pip install -r requirements.txt

In [1]:
# Utility stuff
import os
import easydict 
import whatthepatch

# Deep learning stuff
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers import (RobertaConfig, RobertaTokenizer)
from tqdm import tqdm
from commit.model import Seq2Seq
from commit.utils import convert_examples_to_features
from commit.model.diff_roberta import RobertaModel

# Constants
MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}

2022-11-05 18:10:23.781804: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-05 18:10:23.971012: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-05 18:10:24.896110: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /app/lib
2022-11-05 18:10:24.896174: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared ob

### Download graykode's fine-tuned CodeBERT decoder weights

In [3]:
# ADD_MODEL='1YrkwfM-0VBCJaa9NYaXUQPODdGPsmQY4'
# DIFF_MODEL='1--gcVVix92_Fp75A-mWH0pJS0ahlni5m'

# !pip install gdown \
#     && mkdir -p weight/added \
#     && mkdir -p weight/diff \
#     && gdown "https://drive.google.com/uc?id=$ADD_MODEL" -O weight/added/pytorch_model.bin \
#     && gdown "https://drive.google.com/uc?id=$DIFF_MODEL" -O weight/diff/pytorch_model.bin

### Running predictions on a single example

In [4]:
class Example:
    """One parsed change set handed to the feature converter.

    Stores the example index, the added and deleted content, and the
    target (the notebook passes ``target=None`` at prediction time).
    """

    def __init__(self, idx, added, deleted, target):
        # Plain attribute storage; no validation or copying is performed.
        self.idx, self.target = idx, target
        self.added, self.deleted = added, deleted

def get_model(model_class, config, tokenizer, mode):
    """Build a Seq2Seq model and load the fine-tuned checkpoint for ``mode``.

    Reads ``load_model_path``, ``beam_size`` and ``max_target_length`` from
    the module-level ``args``.

    :param model_class: encoder class, instantiated as ``model_class(config=config)``.
    :param config: model config; its ``hidden_size`` and ``num_attention_heads``
        size the decoder layer.
    :param tokenizer: its ``cls_token_id`` / ``sep_token_id`` become the
        start / end ids of the Seq2Seq model.
    :param mode: sub-directory of ``args.load_model_path`` that contains
        ``pytorch_model.bin`` (this notebook uses ``'added'`` and ``'diff'``).
    :returns: the Seq2Seq model with the checkpoint applied.
    :raises ValueError: if ``args.load_model_path`` is empty.
    :raises FileNotFoundError: if the checkpoint file does not exist.
    """
    # Validate with real exceptions (asserts vanish under ``python -O``) and do
    # it before building the model so a missing checkpoint fails fast.
    if not args.load_model_path:
        raise ValueError("args.load_model_path must point at the weight directory")
    weight_path = os.path.join(args.load_model_path, mode, 'pytorch_model.bin')
    print("model path: ", weight_path)
    if not os.path.exists(weight_path):
        raise FileNotFoundError(f"checkpoint not found: {weight_path}")

    encoder = model_class(config=config)
    decoder_layer = nn.TransformerDecoderLayer(
        d_model=config.hidden_size, nhead=config.num_attention_heads
    )
    decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
    model = Seq2Seq(encoder=encoder, decoder=decoder, config=config,
            beam_size=args.beam_size, max_length=args.max_target_length,
            sos_id=tokenizer.cls_token_id, eos_id=tokenizer.sep_token_id)

    # SECURITY: torch.load unpickles the file, which can execute arbitrary
    # code. Only load checkpoints obtained from a source you trust.
    state_dict = torch.load(weight_path, map_location=torch.device('cpu'))
    # strict=False: keys that do not match between checkpoint and model are
    # ignored rather than raising.
    model.load_state_dict(state_dict, strict=False)
    return model

def get_features(examples):
    """Convert examples into a TensorDataset of model inputs.

    The examples are passed through ``convert_examples_to_features`` (test
    stage) using the tokenizer and settings stored on the module-level
    ``args``. Each feature's ``source_ids``, ``source_mask`` and ``patch_ids``
    are cut to ``args.max_source_length`` and stacked into long tensors.

    :param examples: sequence of examples accepted by
        ``convert_examples_to_features``.
    :returns: ``TensorDataset(source_ids, source_mask, patch_ids)``.
    """
    converted = convert_examples_to_features(examples, args.tokenizer, args, stage='test')

    def _as_long_tensor(field):
        # One row per feature, truncated to the configured source length.
        rows = [getattr(item, field)[:args.max_source_length] for item in converted]
        return torch.tensor(rows, dtype=torch.long)

    source_ids = _as_long_tensor('source_ids')
    source_mask = _as_long_tensor('source_mask')
    patch_ids = _as_long_tensor('patch_ids')
    return TensorDataset(source_ids, source_mask, patch_ids)

def inference(model, data):
    """
    Generate one decoded message per example in ``data``.

    Uses ``args.device`` and ``args.tokenizer`` from the module-level ``args``.

    :model: callable as ``model(source_ids=..., source_mask=..., patch_ids=...)``;
        for each example only the first returned sequence (``pred[0]``) is decoded.
    :data: A torch.utils.data.dataset.TensorDataset object yielding
        ``(source_ids, source_mask, patch_ids)``.
    :returns: list of decoded strings, in dataset order; ``[]`` for empty data.
    """
    # DataLoader rejects batch_size=0, so an empty dataset has to short-circuit.
    if len(data) == 0:
        return []

    # The whole dataset is processed as a single batch, in its original order.
    eval_sampler = SequentialSampler(data)
    eval_dataloader = DataLoader(data, sampler=eval_sampler, batch_size=len(data))

    model.eval()
    messages = []
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
            source_ids, source_mask, patch_ids = (t.to(args.device) for t in batch)
            preds = model(source_ids=source_ids, source_mask=source_mask, patch_ids=patch_ids)
            for pred in preds:
                token_ids = list(pred[0].cpu().numpy())
                # Cut the sequence at the first id 0.
                # NOTE(review): presumably 0 is the padding id emitted by the
                # Seq2Seq beam search - confirm against commit.model.
                if 0 in token_ids:
                    token_ids = token_ids[:token_ids.index(0)]
                messages.append(
                    args.tokenizer.decode(token_ids, clean_up_tokenization_spaces=False)
                )
    return messages

# Notebook-wide settings, exposed as attributes through EasyDict.
args = easydict.EasyDict(dict(
    load_model_path='weight/',                 # root directory of the checkpoints
    model_type='roberta',                      # key into MODEL_CLASSES
    config_name='microsoft/codebert-base',     # passed to config from_pretrained
    tokenizer_name='microsoft/codebert-base',  # passed to tokenizer from_pretrained
    max_source_length=512,                     # truncation length for model inputs
    max_target_length=128,                     # max_length given to Seq2Seq
    beam_size=10,                              # beam_size given to Seq2Seq
    do_lower_case=False,
    # Use CUDA when available, otherwise fall back to the CPU.
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
))

print("Is GPU available?  ", torch.cuda.is_available())

Is GPU available?   False


Building the PL-NL (programming language to natural language) model from the fine-tuned weights

In [5]:
# Look up the (config, encoder, tokenizer) classes registered for the model type.
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.config_name)
args.tokenizer = tokenizer_class.from_pretrained(
    args.tokenizer_name,
    do_lower_case=args.do_lower_case,
)

# Build one model per checkpoint directory ('added' and 'diff') and move each
# to the configured device.
args.added_model = get_model(model_class, config, args.tokenizer, 'added').to(args.device)
args.diff_model = get_model(model_class, config, args.tokenizer, 'diff').to(args.device)

model path:  weight/added/pytorch_model.bin
model path:  weight/diff/pytorch_model.bin


Running prediction

In [6]:
# Sample patch: a brand-new file (old side is /dev/null), so it contains added
# lines only.
# NOTE(review): the hunk header announces 9 new lines but only 5 are listed.
diffmessage = """
diff --git a/newfile.py b/newfile.py
new file mode 100644
index 0000000..cbb72b8
--- /dev/null
+++ b/newfile.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+
+import numpy as np
+def multiply_vectors(v1, v2):
+    return np.dot(v1, v2)
"""

# Sample patch: an existing file with both deleted and added lines.
diffmessage2 = """
diff --git a/test.py b/test.py
index d13f441..1b1b82a 100644
--- a/test.py
+++ b/test.py
@@ -1,6 +1,3 @@

-import torch
-import argparse
-import numpy
-import sklearn
+import matplotlib.pyplot as plt
 def add(a, b):
     return a + b
"""

# Sample patch: a brand-new file, added lines only.
diffmessage3 = """
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..d13f441
--- /dev/null
+++ b/test.py
@@ -0,0 +1,6 @@
+
+import torch
+import argparse
+
+def add(a, b):
+    return a + b
"""

# Sample patch touching two new files (newfile.py and ngrok.conf).
# The ngrok authtoken is a placeholder: never commit real credentials, not
# even inside sample data.
diffmessage4 = """
diff --git a/newfile.py b/newfile.py
new file mode 100644
index 0000000..8724a42
--- /dev/null
+++ b/newfile.py
@@ -0,0 +1,5 @@
+#!usr/bin/env python3
+
+# Gets the url
+def get_url(domain, path):
+    return domain + "/" + path
diff --git a/ngrok.conf b/ngrok.conf
new file mode 100644
index 0000000..8e50d0f
--- /dev/null
+++ b/ngrok.conf
@@ -0,0 +1,11 @@
+
+authtoken: <REDACTED_NGROK_AUTHTOKEN>
+region: jp
+console_ui: False
+tunnels:
+  input:
+    addr: 5000
+    proto: http    
+  output:
+    addr: 5000
+    proto: http
"""

In [7]:
# Parse the git diff with whatthepatch package
# Retrieve changes (added and deleted lines of code)
# Parse the git diff with the whatthepatch package and print one generated
# commit message per parsed file diff.
for idx, example in enumerate(whatthepatch.parse_patch(diffmessage)):
    if not example.changes:
        print(f"no changes in {idx}")
        continue

    # A change with no old line number is an addition; one with no new line
    # number is a deletion. Empty lines are ignored in both cases.
    added = [
        change.line for change in example.changes
        if change.old is None and change.new is not None and change.line != ""
    ]
    deleted = [
        change.line for change in example.changes
        if change.old is not None and change.new is None and change.line != ""
    ]
    isadded, isdeleted = bool(added), bool(deleted)

    # Nothing but context or blank lines: there is no content to describe, so
    # skip instead of running a model on empty input.
    if not isadded and not isdeleted:
        print(f"no non-empty added or deleted lines in {idx}")
        continue

    # Tokenization
    added_tokens = args.tokenizer.tokenize(" ".join(added))
    deleted_tokens = args.tokenizer.tokenize(" ".join(deleted))
    print(added_tokens)

    # Pure additions go to the 'added' model; any patch that deletes code goes
    # to the 'diff' model.
    model = args.added_model if isadded and not isdeleted else args.diff_model

    # Create a numerical vector representation
    testsample = [Example(idx, added_tokens, deleted_tokens, target=None)]
    sampledata = get_features(testsample)

    # Generate a commit message
    message = inference(model=model, data=sampledata)
    print("Autogenerated commit message: \n", message)

['#', '!/', 'usr', '/', 'bin', '/', 'env', 'Ġpython', '3', 'Ġimport', 'Ġn', 'umpy', 'Ġas', 'Ġnp', 'Ġdef', 'Ġmultiply', '_', 've', 'ctors', '(', 'v', '1', ',', 'Ġv', '2', '):', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġnp', '.', 'dot', '(', 'v', '1', ',', 'Ġv', '2', ')']


100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.39s/it]

Autogenerated commit message: 
 ['Multiply two vectors .']



