# RegMiner Empirical Studies

This file contains the script that was used for the empirical study of the RegMiner. To run the script locally, ```git``` needs to be present on the machine. This is required to get the commit history of the different repository. In addition, the relevant repositories should also be downloaded from GitHub. A compilation of the repository for the 537 bugs can be found [here](?).

All of these dependencies can been fulfilled by this Colab session and the code can run as it is.

## Imports

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import os, contextlib, re, subprocess
from datetime import datetime
from difflib import Differ, SequenceMatcher

## Helper functions

In [2]:
# Reference from https://stackoverflow.com/questions/431684/equivalent-of-shell-cd-command-to-change-the-working-directory
@contextlib.contextmanager
def change_dir(directory):
    current_dir= os.getcwd()
    try: 
        os.chdir(directory)
        yield
    finally: os.chdir(current_dir)

### Basic retrieval of data

In [3]:
def get_log(directory, commit):
    with change_dir(directory):
        log = os.popen(f"git log --pretty=oneline {commit} -1").read()
        return log

def get_num_commits(directory, commit):
    with change_dir(directory):
        return int(os.popen(f"git rev-list --count {commit}").read())

def get_author_date(directory, commit):
    with change_dir(directory):
        raw_time = os.popen(f"git show {commit} -s --format=%ad").read().strip()
        cleaned_time = datetime.strptime(raw_time, "%a %b %d %H:%M:%S %Y %z")
        return cleaned_time

def diff_in_commits(directory, new_commit, old_commit):
    return abs(get_num_commits(directory, new_commit) - get_num_commits(directory, old_commit))

def diff_in_time(directory, new_commit, old_commit):
    time_difference = get_author_date(directory, new_commit) - get_author_date(directory, old_commit)
    return abs(time_difference).total_seconds() / (24 * 60 * 60) # get in terms of days

### Advance retrieval of data

In [35]:
def statistics(directory, new_commit, old_commit=None, file=None):
    """
    If an old_commit is not specified, by default, the commit will be compared with the previous commit
    If a file is specified, it will compare the differences between the two files.
    Else, it will compare all files across two directories and return
    (number_files_modified, lines_added, lines_deleted)
    """
    if old_commit is None: old_commit = f"{new_commit}~1"
    with change_dir(directory):
        if file is None:
            stats = os.popen(f"git diff --shortstat {old_commit} {new_commit}").read()
        else:
            stats = os.popen(f"git diff --shortstat {old_commit} {new_commit} -- {file}").read()
        group = re.search(r"\d+", stats)
        files = 0 if group is None else int(group[0])
        group = re.search(r"\d+(?= insertion)", stats)
        insertions = 0 if group is None else int(group[0])
        group = re.search(r"\d+(?= deletion)", stats)
        deletions = 0 if group is None else int(group[0])
        return (files, insertions, deletions)

def files_changed(directory, new_commit, old_commit=None):
    """
    Return all files that are different between two commits.
    By default, if old_commit is not defined, compare with the previous commit.
    """
    if old_commit is None: old_commit = f"{new_commit}~1"
    with change_dir(directory):
        results = os.popen(f"git diff --raw --minimal {old_commit} {new_commit}").readlines()
        files = [re.search(r"[\S]*\.\w*", result) for result in results]
        files = [file[0] for file in files if file is not None]
        return files

def hunks_changed(directory, new_commit, old_commit=None):
    """
    Return all hunks changed as a dictionary with the following details.
    It is possible for method_name to be none if no appropriate method name is found.
    """
    if old_commit is None: old_commit = f"{new_commit}~1"
    files = files_changed(directory, new_commit, old_commit)
    hunks = []
    with change_dir(directory):
        for file in files:
            output = subprocess.run(f"git diff -U1000 {old_commit} {new_commit} -- {file}",
                                    shell=True, capture_output=True).stdout.decode(errors="ignore")
            outputs = output.splitlines()
            outputs = [output for output in outputs if len(output.strip()) > 0]
            was_hunk = False
            for i, output in enumerate(outputs):
                if not re.search(r"^[-+](?![-+])", output):
                    was_hunk = False
                    continue
                if not was_hunk:
                    method_name = None
                    index = i
                    while index >= 0:
                        target = outputs[i]
                        if re.search(r"(private|protected|static|public)", target):
                            method_name = re.search(r"\w+\s*(?=[({])", target)
                        if method_name:
                            method_name = method_name[0].strip()
                            break
                        index -= 1
                    hunk_details = {"FileName": file,
                                    "MethodName": method_name,
                                    "IndexStart": i,
                                    "IndexEnd": i,
                                    "Hunk": [],
                                    "+": [],
                                    "-": []}
                    hunks.append(hunk_details)
                    prev_hunk = hunk_details
                    was_hunk = True
                else:
                    prev_hunk = hunks[-1]
                    prev_hunk["IndexEnd"] = i
                    prev_hunk["Hunk"].append(output)
                    prev_hunk[output[0]].append(output[1:].strip())
        return hunks

### Scoring systems

In [33]:
def compare_similarity(hunk1, hunk2):
    min_len = min(len(hunk1), len(hunk2)) 
    if min_len < 3: # we do not bother with too small changes
        return 0

    d = Differ()
    hunk1 = [line + "\n" for line in hunk1]
    hunk2 = [line + "\n" for line in hunk2]
    result = list(d.compare(hunk1, hunk2))
    match_count, similar_count = 0, 0
    for line in result:
        label = line[0]
        if label == "+" or label == "-": continue
        if label == "?" and re.search(r"\^", line) is not None:
            similar_count += 1
        else:
            match_count += 1
    return (match_count + (similar_count/2)) / min_len

def get_refactor_scores(hunks):
    scores = []
    for hunk in hunks:
        add = hunk["+"]
        max_score = 0
        hunk_ext = re.search(r"\.w+", hunk["FileName"])
        if hunk_ext is not None: hunk_ext = hunk_ext[0]
        for h in hunks:
            h_ext = re.search(r"\.w+", h["FileName"])
            if h_ext is not None: h_ext = h_ext[0]
            if h == hunk or hunk_ext != h_ext: continue
            delete = h["-"]
            max_score = max(compare_similarity(add, delete), max_score)
            scores.append(max_score)
    return scores

def revert_index(hunks1, hunks2):
    revert_indexes = []
    for hunk1 in hunks1:
        for hunk2 in hunks2:
            if hunk1["FileName"] != hunk2["FileName"]:
                continue
            revert_count = 0
            for line in hunk1["+"]:
                if line in hunk2["-"]:
                    revert_count += 1
            for line in hunk1["-"]:
                if line in hunk2["+"]:
                    revert_count += 1
            lines_changed = len(hunk1["Hunk"])
            revert_index = revert_count / lines_changed if lines_changed > 0 else 0
            revert_indexes.append(revert_index)
    return sum(revert_indexes)/len(revert_indexes) if len(revert_indexes) > 0 else 0

def distances_to_fix(hunks1, hunks2):
    distances = []
    for h1 in hunks1:
        for h2 in hunks2:
            file1 = h1["FileName"]
            file2 = h2["FileName"]
            method1 = h1["MethodName"]
            method2 = h2["MethodName"]
            hs1, he1 = h1["IndexStart"], h1["IndexEnd"]
            hs2, he2 = h2["IndexStart"], h2["IndexEnd"]
            if file1 != file2 and SequenceMatcher(None, file1, file2).ratio() < 0.8:
                distances.append(3)
            elif method1 != method2 and (method1 is None or method2 is None or SequenceMatcher(None, method1, method2).ratio() < 0.8):
                distances.append(2)
            elif he1 < hs2 or he2 < hs2:
                distances.append(1)
            else:
                distances.append(0)
    return (min(distances), max(distances), 0 if len(distances) == 0 else sum(distances)/len(distances))

### Advance boolean functions

In [6]:
def is_bug_fix(directory, commit):
    log = get_log(directory, commit).lower()
    return re.search(r"(issue|fix|revert|bug)\w*", log) is not None

def is_refactor(directory, commit, scores):
    log = get_log(directory, commit).lower()
    if re.search(r"refactor", log):
        return True
    if len(scores) == 0 or max(scores) < 0.7:
        return False
    return True

def is_feature_enhancement(hunks, scores):
    total_add, total_delete = 0, 0
    for score, hunk in zip(scores, hunks):
        if score >= 0.7:
            continue # this is a refactored hunk
        add_len = len(hunk["+"])
        del_len = len(hunk["-"])
        if add_len > 10 and (del_len == 0 or add_len/del_len > 3):
            return True
        total_add += add_len
        total_delete += del_len
    return True if total_add > 20 and (total_delete == 0 or total_add / total_delete > 3) else False

## Experiments

First, we will retrieve the data from the relevant websites. A copy of the data can be found [here](https://www.dropbox.com/s/0aoi3ewxtr2xfvx/empirical_studies.tar.xz?dl=1).

In [7]:
#!wget -nv -O data.tar.xz "https://www.dropbox.com/s/0aoi3ewxtr2xfvx/empirical_studies.tar.xz?dl=1" && tar -xf data.tar.xz && rm data.tar.xz

In [8]:
os.chdir("empirical_studies")

In [9]:
regressions = pd.read_csv("regressions.csv")
regressions

Unnamed: 0,Project_Name,BFC_Commit,BIC_Commit,Working_Commit
0,app-maven-plugin,6ff4ae8fbe8c8b37b75a1893c48c7ad2b23552bc,8effcb6b0400f2023505542d0e8a2b01877a9577,53580a805b2acbde6fb968cbadd8a6763c55034b
1,app-maven-plugin,f1f709ea31c57c6378806e608b342fe0249c39d3,8effcb6b0400f2023505542d0e8a2b01877a9577,53580a805b2acbde6fb968cbadd8a6763c55034b
2,aviator,1895817d540479ea079ea433a0cf2aff53b5fbaa,e335bf1b5e17ebc5e549b9cecc4e73c243d4db3b,0ca42184fa0b21338296fd15d71d6e3aee3a80db
3,aviator,af3739881c0ec0c250f75fe9fb6a530e1ba068db,121881f2052a71e824f9de415cfdd457a1c513b9,62229076b9759c91ebecee9c8a2a83621c5b5ba3
4,aviator,afc9c4648510ce4279d470a4bf9aad356418d01c,e335bf1b5e17ebc5e549b9cecc4e73c243d4db3b,0ca42184fa0b21338296fd15d71d6e3aee3a80db
...,...,...,...,...
532,verdict,17dd7ac9bcf687e81e353bc24bc2ea9520e45db1,b6ebb975cacd197691ca487055372fa9447d28da,1248c63d7c8773ceb5ffd92a4dd1486c3f893710
533,verdict,6e11c2bdacdc986db619692cd204898a975e454f,6e775e54370dbd1a0a9393057695af3190131a25,d1ba53707e0147e3932a0d409b0081c91fb7fe06
534,zip4j,13c170672da595561163804dc62451dc21bfc870,d5c5b413a2996bceb65db4adfd353030baf21d94,7be1b0620bf9dbb4c023a1682698ac5436b41fad
535,zip4j,3f15884e338fd2490bb9ee710cf0828aca55d285,c158768c2880615bae983789690b5713e8de6794,e849b62bb94b506de8e76cdc7f0164d7fc145608


In [None]:
try:
    results = pd.read_csv("Intermediate_Result.csv")
except FileNotFoundError:
    results = pd.DataFrame(columns=["Project", "BFC", "BIC", "Commits_Between", "Time_Between",
                                    "BIC_Files_Edited", "BIC_Lines_Added", "BIC_Lines_Deleted",
                                    "BFC_Files_Edited", "BFC_Lines_Added", "BFC_Lines_Deleted",
                                    "BIC_Num_Hunks", "BFC_Num_Hunks",
                                    "Min_Distance_Fix", "Max_Distance_Fix", "Mean_Distance_Fix",
                                    "BIC_Is_Bug_Fix", "BIC_Is_Refactor", "BIC_Is_Feature_Enhancement",
                                    "BFC_Is_Refactor", "BFC_Is_Feature_Enhancement", "Revert_Index"])
print(f"Number of past records: {len(results)}")

for i, row in tqdm(regressions.iterrows(), total=len(regressions)):
    if i < len(results):
        continue
    result = {}
    directory = f"repos/{row.Project_Name}"
    result["Project"] = row.Project_Name
    result["BFC"] = row.BFC_Commit
    result["BIC"] = row.BIC_Commit
    
    # Retrieve some basic statistics first
    result["Commits_Between"] = diff_in_commits(directory, row.BFC_Commit, row.BIC_Commit)
    result["Time_Between"] = diff_in_time(directory, row.BFC_Commit, row.BIC_Commit)
    result["BIC_Files_Edited"], result["BIC_Lines_Added"], result["BIC_Lines_Deleted"] =\
        statistics(directory, row.BIC_Commit, row.Working_Commit)
    result["BFC_Files_Edited"], result["BFC_Lines_Added"], result["BFC_Lines_Deleted"] =\
        statistics(directory, row.BFC_Commit)
    
    # Retrieve advance statistics
    bic_hunks = hunks_changed(directory, row.BIC_Commit, row.Working_Commit)
    bic_scores = get_refactor_scores(bic_hunks)
    bfc_hunks = hunks_changed(directory, row.BFC_Commit)
    bfc_scores = get_refactor_scores(bfc_hunks)
    result["BIC_Num_Hunks"] = len(bic_hunks)
    result["BFC_Num_Hunks"] = len(bfc_hunks)
    result["Min_Distance_Fix"], result["Max_Distance_Fix"], result["Mean_Distance_Fix"] =\
        distances_to_fix(bic_hunks, bfc_hunks)
    result["BIC_Is_Bug_Fix"] = is_bug_fix(directory, row.BIC_Commit)
    result["BIC_Is_Refactor"] = is_refactor(directory, row.BIC_Commit, bic_scores)
    result["BIC_Is_Feature_Enhancement"] = is_feature_enhancement(bic_hunks, bic_scores)
    result["BFC_Is_Refactor"] = is_refactor(directory, row.BFC_Commit, bfc_scores)
    result["BFC_Is_Feature_Enhancement"] = is_feature_enhancement(bfc_hunks, bfc_scores)
    result["Revert_Index"] = revert_index(bfc_hunks, bic_hunks)

    results = results.append(result, ignore_index=True)
    results.to_csv("Intermediate_Result.csv", index=False)

Number of past records: 440




In [14]:
row

Project_Name                                      HikariCP
BFC_Commit        5ea5688db367f880141e2b330e2ea7538aabc1e9
BIC_Commit        b5967fc5a14b9a733be43a4771175eace8a856bd
Working_Commit    1eb35356c5a72aa97e24e4a35f889ae945843a82
Name: 347, dtype: object

In [31]:
files = files_changed(directory, row.BFC_Commit, f"{row.BFC_Commit}~1")
for file in files:
    print(file)
    output = subprocess.run(f"git diff -U1000 {row.BFC_Commit}~1 {row.BFC_Commit} -- {file}",
                            shell=True, capture_output=True).stdout.decode(errors="Ignore")
    print(output)

hikaricp-common/src/main/java/com/zaxxer/hikari/pool/BaseHikariPool.java

hikaricp-common/src/test/java/com/zaxxer/hikari/TestConnectionTimeoutRetry.java

