In [1]:
# Install requiered packages
!pip install unidiff bugswarm-client



In [2]:
import json
import os
from collections import defaultdict
from getpass import getpass
from typing import List

import requests
from bugswarm.common.rest_api.database_api import DatabaseAPI
from tqdm import tqdm
from unidiff import PatchSet

# Shared HTTP session, reused for every diff download.
session = requests.Session()

# Never commit API tokens: read the BugSwarm token from the BUGSWARM_TOKEN
# environment variable and fall back to an interactive prompt, so the secret
# stays out of the notebook and its outputs.
# NOTE(review): a token that was ever committed in this notebook should be
# treated as leaked and rotated.
bugswarmapi = DatabaseAPI(
    token=os.environ.get("BUGSWARM_TOKEN") or getpass("BugSwarm API token: ")
)

In [3]:
def extract_added_removed_code(hunk):
    """
    Split the lines of a diff hunk into additions and removals.

    Every item of ``hunk`` is expected to expose ``is_added``, ``is_removed``
    and ``value``. Returns a tuple ``(added_lines, removed_lines)``; each list
    holds the whitespace-stripped ``value`` of the matching lines, in the
    order they appear in the hunk.
    """
    # Materialise once so the hunk is only iterated a single time.
    entries = list(hunk)
    additions = [entry.value.strip() for entry in entries if entry.is_added]
    removals = [entry.value.strip() for entry in entries if entry.is_removed]
    return additions, removals


def get_diff(repo: str, commit_sha: str):
    """
    Download the diff of a GitHub commit and collect its changed lines per file.

    Args:
        repo: repository part of the GitHub URL, e.g. "owner/name".
        commit_sha: SHA of the commit whose ``.diff`` page is fetched.

    Returns:
        A dict keyed by file path. Each value is a dict with the keys
        "added_code" and "removed_code", both lists of stripped lines gathered
        from *all* hunks of that file (empty lists when the file has no hunks).

    Note:
        The HTTP status of the response is not checked; whatever text comes
        back is handed to the diff parser.
    """
    files = {}
    req = session.get(f"https://github.com/{repo}/commit/{commit_sha}.diff")
    for patched_file in PatchSet(req.text):
        # A newly added file has "/dev/null" as its source; use the target
        # path in that case so the key is a real file name.
        raw_path = patched_file.source_file
        if raw_path == "/dev/null":
            raw_path = patched_file.target_file
        filepath = raw_path[2:]  # remove the "a/" or "b/" prefix

        # Start from fresh lists for every file and accumulate across hunks,
        # so earlier hunks are kept and nothing leaks in from the previous file.
        added_code, removed_code = [], []
        for hunk in patched_file:
            hunk_added, hunk_removed = extract_added_removed_code(hunk)
            added_code.extend(hunk_added)
            removed_code.extend(hunk_removed)

        files[filepath] = {
            "added_code": added_code,
            "removed_code": removed_code
        }
    return files

# Try get_diff on one known commit and pretty-print the resulting dict as JSON.
print(
    json.dumps(
        get_diff(
            repo="gwtbootstrap3/gwtbootstrap3",
            commit_sha="c07f968e099d963eed195c7608487c8515393657",
        ),
        indent=4,
    )
)

{
    ".travis.yml": {
        "added_code": [
            "- openjdk8"
        ],
        "removed_code": [
            "- oraclejdk8"
        ]
    },
    "README.md": {
        "added_code": [
            "### Final Release",
            "* 1.0.0 - Released on 26 August 2019.",
            "* Based on Bootstrap v3.4.0",
            "* [Demo](http://gwtbootstrap3.github.io/gwtbootstrap3-demo/) - The GWTBootstrap3 1.0.0 Demo.",
            "* [API Docs](http://gwtbootstrap3.github.io/gwtbootstrap3-demo/apidocs) - The GWTBootstrap3 1.0.0 API Javadoc."
        ],
        "removed_code": [
            "### Current Release",
            "* 0.9.4 - Released on 21 February 2017.",
            "* Based on Bootstrap v3.3.7",
            "* [Demo](http://gwtbootstrap3.github.io/gwtbootstrap3-demo/) - The GWTBootstrap3 0.9.4 Demo.",
            "* [API Docs](http://gwtbootstrap3.github.io/gwtbootstrap3-demo/apidocs) - The GWTBootstrap3 0.9.4 API Javadoc.",
            "### Current Snapshot",
  

In [None]:
api_filter = '{"reproduce_successes":{"$gt":0},"lang":{"$in":["Java","Python"]}}'
size_limit = 10  # maximum number of artifacts to process

# Keep the filtered result instead of discarding it and fetching the artifact
# list a second time.
artifacts = bugswarmapi.filter_artifacts(api_filter)

ids, labels, msgs, codes = [], [], [], []

# Slice before handing the artifacts to tqdm: exactly `size_limit` artifacts
# are processed and the progress bar knows its total.
for artifact in tqdm(list(artifacts)[:size_limit]):
    # Each artifact contributes two samples: the failed job's commit (label 0)
    # and the passed job's commit (label 1).
    for job, label in [("failed_job", 0), ("passed_job", 1)]:
        trigger_sha = artifact[job]["trigger_sha"]
        ids.append(trigger_sha)
        labels.append(label)
        # Placeholder text built from the SHA, not the real commit message.
        msgs.append(f"Commit msg for {trigger_sha}")
        # One entry per changed file: the added/removed code dict from get_diff.
        codes.append(list(get_diff(artifact["repo"], trigger_sha).values()))

In [None]:
# Display the collected commit SHAs, labels, placeholder messages and per-file diffs.
ids, labels, msgs, codes

## Use CC2Vec

In [None]:
import sys

# Put the relative "CC2Vec" directory on the module search path (only once),
# presumably so the jit_* modules imported below can be found there.
CC2VEC_DIR = "CC2Vec"
if CC2VEC_DIR not in sys.path:
    sys.path.append(CC2VEC_DIR)

import numpy as np
import pickle
from jit_padding import padding_message, clean_and_reformat_code, padding_commit_code, mapping_dict_msg, mapping_dict_code, convert_msg_to_label
from jit_cc2ftr_extracted import extracted_cc2ftr
from dataclasses import dataclass

In [None]:
def mapping_dict_code(pad_code, dict_code):
    """
    Replace every token of the padded commit code with its id from ``dict_code``.

    This definition shadows the ``mapping_dict_code`` imported from ``jit_padding``.

    Args:
        pad_code: iterable of commits; each commit is an iterable of files,
            each file an iterable of lines, each line a string of tokens
            separated by single spaces.
        dict_code: dict mapping lower-cased tokens to ids. Tokens whose
            lower-cased form is not a key are mapped to ``dict_code['<NULL>']``.

    Returns:
        ``np.ndarray`` built from the nested id lists, indexed as
        (commit, file, line, token) when the input is uniformly padded.
    """
    def encode_line(line):
        # Lookup is case-insensitive: tokens are lower-cased before the lookup.
        token_ids = []
        for token in line.split(' '):
            key = token.lower()
            token_ids.append(dict_code[key] if key in dict_code else dict_code['<NULL>'])
        return token_ids

    return np.array([
        [[encode_line(line) for line in file] for file in commit]
        for commit in pad_code
    ])

In [None]:
# Taken from jit_cc2ftr.py

# parser.add_argument('--msg_length', type=int, default=256, help='the length of the commit message')
# parser.add_argument('--code_file', type=int, default=2, help='the number of files in commit code')
# parser.add_argument('--code_line', type=int, default=10, help='the number of LOC in each hunk of commit code')
# parser.add_argument('--code_length', type=int, default=64, help='the length of each LOC of commit code')

@dataclass
class Params:
    """
    Structure similar to params generated by argparse.

    The attributes are annotated so that ``@dataclass`` treats them as real
    fields: ``Params()`` keeps the defaults below and individual values can be
    overridden by keyword, e.g. ``Params(batch_size=8)``.
    """
    # Path of the pickled dictionary file, relative to the working directory.
    dict_file: str = "data+model/data/jit/qt_dict.pkl"
    # The length of the commit message.
    msg_length: int = 256
    # The number of files in commit code.
    code_file: int = 2
    # The number of LOC in each hunk of commit code.
    code_line: int = 10
    # The length of each LOC of commit code.
    code_length: int = 64
    # NOTE(review): presumably the batch size read by extracted_cc2ftr - confirm.
    batch_size: int = 1

# Build the parameter object with the default values defined on Params.
params = Params()

# Load the pickled dictionary file and unpack it into the message and code dictionaries.
# NOTE(review): pickle.load can execute arbitrary code - only load dictionary
# files that come from a trusted source.
with open(params.dict_file, 'rb') as fd:
    dictionary = pickle.load(fd)   
dict_msg, dict_code = dictionary  

# Pad the messages and the added/removed code using the limits from params.
# NOTE(review): the exact padding/cleaning semantics live in jit_padding - confirm there.
pad_msg = padding_message(data=msgs, max_length=params.msg_length)
added_code, removed_code = clean_and_reformat_code(codes)
pad_added_code = padding_commit_code(data=added_code, max_file=params.code_file, max_line=params.code_line, max_length=params.code_length)
pad_removed_code = padding_commit_code(data=removed_code, max_file=params.code_file, max_line=params.code_line, max_length=params.code_length)

# Map the padded data through the dictionaries. mapping_dict_code here is the
# version defined in this notebook, which shadows the one imported from jit_padding.
# Note that pad_msg is rebound to its mapped form before convert_msg_to_label uses it.
pad_msg = mapping_dict_msg(pad_msg=pad_msg, dict_msg=dict_msg)
pad_added_code = mapping_dict_code(pad_code=pad_added_code, dict_code=dict_code)
pad_removed_code = mapping_dict_code(pad_code=pad_removed_code, dict_code=dict_code)
pad_msg_labels = convert_msg_to_label(pad_msg=pad_msg, dict_msg=dict_msg)

# Bundle the inputs in the tuple order passed to extracted_cc2ftr and run it.
data = (pad_added_code, pad_removed_code, pad_msg_labels, dict_msg, dict_code)
extracted_cc2ftr(data=data, params=params)