## Loading dataset

In [1]:
%%javascript
// Set the output area's auto-scroll threshold to 9999.
// NOTE(review): presumably this keeps long cell outputs fully expanded instead
// of collapsing them into a scroll box — confirm against the notebook frontend in use.
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [2]:
import warnings
# Silence every warning for the rest of the notebook.
warnings.simplefilter('ignore')

import pickle
# NOTE: unpickling can execute arbitrary code; only load this file if it comes
# from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("../data/fs-patch/fs_datasets.pickle", 'rb') as dataset_file:
    datasets = pickle.load(dataset_file)

## Training classifiers

In [None]:
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.metrics
import sklearn.feature_extraction
from sklearn.svm import SVC
import functools

def binary_bug(dp):
    """Label a data point 1 when its 'type' is 'b', otherwise 0."""
    return 1 if dp['type'] == 'b' else 0
    
def multi_patch_type(dp):
    """Use the data point's 'type' value itself as the class label."""
    patch_type = dp['type']
    return patch_type

def multi_bug_cons(dp):
    """Use the data point's 'cons_type' value as the class label."""
    cons_type = dp['cons_type']
    return cons_type

def only_bug_filter(dp):
    """Return True only for data points whose 'type' is 'b'."""
    patch_type = dp['type']
    return patch_type == 'b'
        
# Classifier factories: calling one with no arguments returns a fresh,
# unfitted estimator (keyword arguments can still be overridden at call time).
rf = functools.partial(sklearn.ensemble.RandomForestClassifier, n_estimators=300)
# probability=True is what makes SVC expose predict_proba, which the LIME cells
# further down call on the fitted pipeline. It makes fitting slower but leaves
# predict()/score() results unchanged.
svm = functools.partial(SVC, kernel='linear', probability=True)

class _LeaveOneOutResults(dict):
    """Dictionary of per-dataset artifacts with a one-line repr.

    A notebook cell that ends with a bare call would otherwise print every
    text and feature matrix stored in the returned dictionary.
    """

    def __repr__(self):
        return '<leave-one-dataset-out results for %s; keys: %s>' % (
            list(self.get('classifiers', {})), list(self))


def _collect_examples(datasets, names, text_feature, label_func, dp_filter):
    """Return (texts, targets) for the kept data points of the named datasets."""
    texts = []
    targets = []
    for name in names:
        for dp in datasets[name]:
            # Data points lacking the requested text field are skipped.
            if text_feature in dp and dp_filter(dp):
                texts.append(dp[text_feature])
                targets.append(label_func(dp))
    return texts, targets


def _naive_fix_accuracy(texts, targets):
    """Accuracy of predicting label 1 iff the text contains 'fix' (any case)."""
    if not targets:
        return float('nan')
    # 'fix' as a substring already covers 'fixes' and 'fixed'.
    predictions = [1 if 'fix' in text.lower() else 0 for text in texts]
    hits = sum(1 for pred, target in zip(predictions, targets) if pred == target)
    return float(hits) / len(targets)


def leave_one_dataset_out(datasets, text_feature, label_func, init_clf,
                          ngram_range=(1, 1), extra_features=None,
                          dp_filter=lambda dp: True, init_vectorizer=None):
    """Train and evaluate one classifier per held-out dataset.

    For every dataset name, a vectorizer and a classifier are fitted on the
    data points of all *other* datasets and scored on the held-out one. The
    accuracy is printed next to a naive baseline that predicts label 1
    whenever the text contains 'fix'; that baseline is only meaningful when
    label_func produces 0/1 labels.

    Args:
        datasets: A dictionary, keys are dataset names, each dataset
            is a list of data points (also dictionaries).
        text_feature: A string, can be either 'message' or 'subject'.
            Data points without this key are ignored.
        label_func: A function, takes a data point as input and
            return its target label.
        init_clf: A function taking no arguments that returns a classifier
            supporting the 'fit' and 'score' methods.
        ngram_range: Passed to the default TfidfVectorizer.
        extra_features: A list of strings. Currently unused; accepted so
            existing callers keep working.
        dp_filter: A function returning True for data points to keep.
        init_vectorizer: Optional function taking no arguments that returns
            an object with 'fit_transform' and 'transform'. Defaults to a
            TfidfVectorizer built with ngram_range.

    Returns:
        A dictionary whose values are dictionaries keyed by held-out
        dataset name: 'train_texts', 'train_targets', 'train_vectors',
        'test_texts', 'test_targets', 'test_vectors', 'vectorizers',
        'classifiers', 'accuracy' and 'naive_accuracy'.

    Raises:
        AssertionError: If text_feature is neither 'message' nor 'subject'.
    """
    assert text_feature in ('message', 'subject'), \
        "text_feature must be 'message' or 'subject'"

    if init_vectorizer is None:
        init_vectorizer = functools.partial(
            sklearn.feature_extraction.text.TfidfVectorizer,
            ngram_range=ngram_range)

    # Take the dataset names from the argument rather than from a notebook
    # global, so the function also works on a fresh kernel.
    fss = list(datasets)

    results = _LeaveOneOutResults()
    for key in ('train_texts', 'train_targets', 'train_vectors',
                'test_texts', 'test_targets', 'test_vectors',
                'vectorizers', 'classifiers', 'accuracy', 'naive_accuracy'):
        results[key] = {}

    for fs in fss:
        other_names = [fs2 for fs2 in fss if fs2 != fs]
        train_texts, train_targets = _collect_examples(
            datasets, other_names, text_feature, label_func, dp_filter)
        test_texts, test_targets = _collect_examples(
            datasets, [fs], text_feature, label_func, dp_filter)

        # The vocabulary is learned from the training datasets only.
        vectorizer = init_vectorizer()
        train_vectors = vectorizer.fit_transform(train_texts)
        test_vectors = vectorizer.transform(test_texts)

        clf = init_clf()
        clf.fit(train_vectors, train_targets)
        accuracy = clf.score(test_vectors, test_targets)
        naive_accuracy = _naive_fix_accuracy(test_texts, test_targets)

        print('----- Test Accuracy for %s -----' % fs)
        print('Classifier: %.3f' % accuracy)
        print('Naive: %.3f' % naive_accuracy)

        results['train_texts'][fs] = train_texts
        results['train_targets'][fs] = train_targets
        results['train_vectors'][fs] = train_vectors
        results['test_texts'][fs] = test_texts
        results['test_targets'][fs] = test_targets
        results['test_vectors'][fs] = test_vectors
        results['vectorizers'][fs] = vectorizer
        results['classifiers'][fs] = clf
        results['accuracy'][fs] = accuracy
        results['naive_accuracy'][fs] = naive_accuracy

    return results

In [None]:
# Binary labels (binary_bug) from patch subjects, linear SVM.
leave_one_dataset_out(datasets, text_feature='subject', label_func=binary_bug, init_clf=svm)

In [None]:
# Binary labels (binary_bug) from full messages, unigrams + bigrams, linear SVM.
leave_one_dataset_out(datasets, text_feature='message', label_func=binary_bug,
                      init_clf=svm, ngram_range=(1, 2))

In [None]:
# Multi-class labels (the raw 'type' value) from patch subjects, linear SVM.
leave_one_dataset_out(datasets, text_feature='subject', label_func=multi_patch_type, init_clf=svm)

In [None]:
# 'cons_type' labels from patch subjects, restricted to data points of type 'b'.
leave_one_dataset_out(datasets, text_feature='subject', label_func=multi_bug_cons,
                      init_clf=svm, dp_filter=only_bug_filter)

## Explaining with lime

In [None]:
# Chain the vectorizer and classifier stored under the 'ext3' key into one
# pipeline and print the class probabilities it assigns to the second test text.
# NOTE(review): `vectorizers`, `classifiers` and `test_texts` are not assigned at
# notebook scope in any visible cell — confirm where they are expected to come
# from before running this on a fresh kernel.
from sklearn.pipeline import make_pipeline
c = make_pipeline(vectorizers['ext3'], classifiers['ext3'])
print(c.predict_proba([test_texts['ext3'][1]]))

In [None]:
from lime.lime_text import LimeTextExplainer
# Display names by label index: 0 -> 'not-bug', 1 -> 'bug'. explain_pred
# indexes this list with the integer target label.
class_names = ['not-bug', 'bug']
# Shared explainer instance used by explain_pred.
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
import pprint
pp = pprint.PrettyPrinter()

def explain_pred(idx, fs):
    """Print a LIME explanation for one test text.

    Args:
        idx: Position of the text inside test_texts[fs].
        fs: Key selecting the vectorizer, classifier and test data to use.
    """
    pipeline = make_pipeline(vectorizers[fs], classifiers[fs])
    text = test_texts[fs][idx]
    # The explainer receives the lower-cased text; the probability printed
    # below is computed on the text as stored.
    explanation = explainer.explain_instance(text.lower(), pipeline.predict_proba, num_features=8)
    print('Patch id: %d' % idx)
    bug_probability = pipeline.predict_proba([text])[0, 1]
    print('Probability(bug) =', bug_probability)
    true_label = test_targets[fs][idx]
    print('True class: %s' % class_names[true_label])
    print('Text: %s' % text)
    pp.pprint(explanation.as_list())
    # explanation.show_in_notebook(text=True)

In [None]:
# Print the 20 highest-ranked (importance, term) pairs for the 'ext3' model.
# NOTE(review): this needs a fitted classifier that exposes
# feature_importances_ — confirm which factory produced classifiers['ext3'].
vocab = vectorizers['ext3'].vocabulary_
# Invert the term -> column mapping so names[j] is the term for column j.
names = [None] * len(vocab)
for term, column in vocab.items():
    names[column] = term
importances = ['%.4f' % x for x in classifiers['ext3'].feature_importances_]
# Pairs are ordered by the formatted string, ties broken by term, descending.
ranked = sorted(zip(importances, names), reverse=True)
pp.pprint(ranked[:20])

In [None]:
from sklearn.pipeline import make_pipeline
fs = 'ext3'
c = make_pipeline(vectorizers[fs], classifiers[fs])
# Explain every test text labelled 1 that the model scores above 0.5 for
# class 1 even though the text does not contain 'fix'.
for i, text in enumerate(test_texts[fs]):
    if 'fix' in text.lower():
        continue
    if not c.predict_proba([text])[0, 1] > 0.5:
        continue
    if test_targets[fs][i] == 1:
        explain_pred(i, fs)

In [None]:
# Examples whose text contains the keyword 'fix'.
for patch_idx in (22, 24):
    explain_pred(patch_idx, 'ext3')

In [None]:
# Examples where the keyword 'fix' is absent.
for patch_idx in (23, 25):
    explain_pred(patch_idx, 'ext3')

In [None]:
# Interesting cases.
for patch_idx in (5, 26):
    explain_pred(patch_idx, 'ext3')

In [None]:
# Print the hash of every commit reachable from v2.6.12 whose subject line
# contains 'Linux 2.6.0'.
# NOTE(review): `r` is not defined in any visible cell; presumably a repository
# object created elsewhere — confirm.
for commit in list(r.iter_commits(rev='v2.6.12')):
    subject_line = commit.message.split('\n', 1)[0]
    if 'Linux 2.6.0' in subject_line:
        print(commit.hexsha)

In [None]:
b'\xd0\x9f\xe4\nG\xb5\xe8G6\x88(\x95e\xac\xd4\xb2Q\r\xd8b'
# [PATCH] ext3: fix determination of inode journalling mode