## Read Data Files

In [None]:
import re

def generate_dataset(files):
    """Parse annotated patch-list files into a list of dict data points.

    A data point is recognised as two consecutive lines:

    * a subject line that starts with a space or a tab, and
    * a label line whose first non-blank character is ``.`` and which
      contains one of the annotations ``(b:...):(s:...)``,
      ``(p:...):(s:...)``, ``(c:...)``, ``(misc->...)`` or ``(f)``.

    Subject lines that are not followed by a label line, and label lines
    that match none of the annotation patterns (e.g. entries with several
    sub types or consequences), are skipped.

    Args:
        files: iterable of paths of the text files to parse.

    Returns:
        A list of dicts, in file order.  Every dict has a ``'subject'`` key
        (the stripped subject line) plus whichever of ``'type'``,
        ``'sub_type'``, ``'sub_sub_type'``, ``'cons_type'`` and
        ``'sub_cons_type'`` are named groups of the pattern that matched.
        An optional group that did not take part in the match is stored
        as ``None``.
    """
    attributes = ('type', 'sub_type', 'sub_sub_type',
                  'cons_type', 'sub_cons_type')

    # Raw strings keep ``\(`` / ``\)`` as regex escapes instead of relying on
    # Python passing unknown string escapes through (a SyntaxWarning on
    # recent interpreters).
    #
    # NOTE(review): inside a character class ``(->`` is the *range* from '('
    # to '>', so ``[^(->)]`` also rejects digits and punctuation such as
    # ',', '.', '/' and ':'.  The patterns are kept exactly as they were so
    # the set of recognised annotations does not change.
    re_patch_bp = re.compile(r"""\((?P<type>[bp]):
                                (?P<sub_type>[^(->)]*)
                                (->(?P<sub_sub_type>[^(->)\)]*))?
                                (->[^(->)\)]*)*\):
                                \(s:(?P<cons_type>[^(->)]*)
                                (->(?P<sub_cons_type>[^\)]*))?\)""", re.VERBOSE)
    re_patch_c = re.compile(r"""\((?P<type>c):
                                (?P<sub_type>[^(->)]*)
                                (->(?P<sub_sub_type>[^(->)\)]*))?\)""", re.VERBOSE)
    re_patch_misc = re.compile(r"""\((?P<type>misc)->
                                    (?P<sub_type>[^\)]*)\)""", re.VERBOSE)
    re_patch_f = re.compile(r"\((?P<type>f)\)")
    # Tried in this order; the first pattern that matches wins.
    patterns = (re_patch_bp, re_patch_c, re_patch_misc, re_patch_f)

    dataset = []
    for fname in files:
        with open(fname, 'r') as f:
            lines = f.readlines()

        ptr = 0
        # Lines are examined in (subject, label) pairs, so stop one line
        # before the end of the file.
        while ptr <= len(lines) - 2:
            line = lines[ptr]
            next_line = lines[ptr + 1]

            # Not an indented subject line, or not followed by a label line:
            # advance by one line.  (An explicit test rather than ``assert``,
            # which would be stripped under ``python -O``.)
            if not line.startswith((' ', '\t')) or \
                    not next_line.lstrip().startswith('.'):
                ptr += 1
                continue

            m = None
            for pattern in patterns:
                m = pattern.search(next_line)
                if m is not None:
                    break

            if m is None:
                # skip lines with multiple sub_types and consequences
                ptr += 1
                continue

            dp = {'subject': line.strip()}
            gd = m.groupdict()
            for attr in attributes:
                if attr in gd:
                    dp[attr] = gd[attr]
            dataset.append(dp)
            # Both the subject line and its label line have been consumed.
            ptr += 2

    return dataset

# File systems to load; each one's annotated patch list is read from
# ../data/fs-patch/<fs>-patch.
fss = ['ext3', 'ext4', 'btrfs', 'xfs', 'jfs', 'reiserfs']
datasets = {fs : generate_dataset(['../data/fs-patch/%s-patch' % fs]) for fs in fss}

# Per-file-system summary followed by the overall patch count.
total = 0
for fs in fss:
    num_patches = len(datasets[fs])
    num_bugs = sum(1 for dp in datasets[fs] if dp['type'] == 'b')
    total += num_patches
    # Scale to 0-100 so the value matches the "Percentage" label, and avoid
    # a ZeroDivisionError when a patch list produced no data points.
    bug_percentage = 100.0 * num_bugs / num_patches if num_patches else 0.0
    print('------ %s -------' % fs)
    print('Number of patches:', num_patches)
    print('Percentage of bugs: %.2f' % bug_percentage)
    print()
print('Number of patches in total:', total)

## Supplement Data With Info from Linux Repo

In [None]:
from git import Repo
# Open the local kernel checkout and list every commit in the revision range.
r = Repo('../repos/linux-kernel')
commits = list(r.iter_commits(rev='efd375d7ab44f68d97e6fb7582bb2af9f6d7f9f0..v3.0'))
print('Number of commits to scan: ', len(commits))

# Index the commits by subject, i.e. the stripped first line of the commit
# message.  When several commits share a subject, the one visited last
# overwrites the earlier entries.
subject_to_sha = {}
for commit in commits:
    subject = commit.message.partition('\n')[0].strip()
    subject_to_sha[subject] = commit.binsha

In [None]:
from git import Commit
from graphs.patch_parser import PatchParser

# Attach the commit message and diff statistics to every data point whose
# subject matches a scanned commit; print the subjects that have no match.
count = 0
parser = PatchParser()
print('-----------------------------------------------')
print('------- Patches not matched to a commit -------')
print('-----------------------------------------------')
for fs in fss:
    for dp in datasets[fs]:
        if dp['subject'] not in subject_to_sha:
            print(dp['subject'])
            continue

        c = Commit(r, subject_to_sha[dp['subject']])
        # Diff against the first parent only.
        # NOTE(review): R=True presumably swaps the two sides so the patch
        # reads parent -> commit; confirm against the GitPython docs.
        diff_index = c.diff(c.parents[0], create_patch=True, R=True)
        num_adds, num_dels = 0, 0
        for diff in diff_index:
            additions, deletions = parser.parse(diff.diff.decode('utf-8', 'replace'))
            # Each addition contributes a[1]; each deletion contributes
            # d[1] - d[0] + 1 (looks like an inclusive line range -- confirm
            # against PatchParser).
            num_adds = sum((a[1] for a in additions), num_adds)
            num_dels = sum((d[1] - d[0] + 1 for d in deletions), num_dels)

        dp.update(message=c.message,
                  num_files=len(diff_index),
                  num_adds=num_adds,
                  num_dels=num_dels)
        count += 1
print('Number of patches matched: ', count)

## Correct Misspelled Bug Consequence Type

In [None]:
# Normalise the misspelled consequence type 'corrpution' on bug ('b')
# data points; data points of every other type are left untouched.
for fs in fss:
    for dp in datasets[fs]:
        if dp['type'] != 'b':
            continue
        if dp['cons_type'] == 'corrpution':
            dp['cons_type'] = 'corruption'

## Save Data for Future Convenience

In [None]:
import pickle
# Save all datasets to a single pickle file for later reuse.  The ``with``
# block guarantees the file is flushed and closed once the dump finishes.
with open("../data/fs-patch/fs_datasets.pickle", 'wb') as pickle_file:
    # Protocol 1 is what the previous positional ``True`` evaluated to, so
    # the bytes written are unchanged.
    pickle.dump(datasets, pickle_file, 1)