## Loading dataset

In [None]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

In [None]:
import warnings
warnings.simplefilter('ignore')

import pickle
# NOTE: unpickling can execute arbitrary code; only load this file when it
# comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("../data/fs-patch/fs_datasets.pickle", 'rb') as dataset_file:
    datasets = pickle.load(dataset_file)
# File-system names referenced by later cells (presumably the keys of
# `datasets` -- verify against the pickle contents).
fss = ['ext3', 'ext4', 'btrfs', 'xfs', 'jfs', 'reiserfs']

## Training classifiers

In [None]:
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.metrics
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import functools

def binary_bug(dp):
    """Binary label for a data point: 1 when its 'type' is 'b', otherwise 0."""
    return 1 if dp['type'] == 'b' else 0
    
def multi_patch_type(dp):
    """Multi-class label for a data point: the value stored under 'type'."""
    patch_type = dp['type']
    return patch_type

def multi_bug_cons(dp):
    """Multi-class label for a data point: the value stored under 'cons_type'."""
    consequence = dp['cons_type']
    return consequence

def only_bug_filter(dp):
    """Data-point filter: True only for points whose 'type' is 'b'."""
    is_bug = dp['type'] == 'b'
    return is_bug
        
# Zero-argument factory for a 300-tree random forest; usable wherever a
# classifier constructor (the `init_clf` argument) is expected.
rf = functools.partial(sklearn.ensemble.RandomForestClassifier, n_estimators=300)

class Experiment():
    """Train and evaluate text classifiers in a leave-one-dataset-out fashion.

    For every dataset name, a classifier is trained on the data points of
    all *other* datasets and scored on the held-out one. Intermediate
    artefacts (texts, targets, vectors, vectorizers, feature labels and
    classifiers) are stored in dictionaries keyed by the held-out dataset
    name and are reset at the start of every run.
    """

    def __init__(self, datasets):
        """
        Args:
            datasets: A dictionary, keys are dataset names, each dataset
                is a list of data points (also dictionaries).
        """
        self.datasets = datasets
        self.fss = list(datasets.keys())
        # Create the (empty) result containers right away so the attributes
        # exist even before the first experiment has been run.
        self._clean()

    def _clean(self):
        """Reset every per-dataset result container to an empty dict."""
        self.train_texts = {}
        self.train_extras = {}
        self.train_targets = {}
        self.train_vectors = {}

        self.test_texts = {}
        self.test_extras = {}
        self.test_targets = {}
        self.test_vectors = {}

        self.vectorizers = {}
        self.feature_labels = {}
        self.classifiers = {}

    @staticmethod
    def _naive_fix_predictions(texts):
        """Baseline prediction: 1 if 'fix' occurs in the text (any case), else 0."""
        return [1 if 'fix' in text.lower() else 0 for text in texts]

    def leave_one_dataset_out(self, text_feature, label_func, init_clf,
                              ngram_range=(1, 1), extra_features=None,
                              dp_filter=lambda dp: True):
        """Perform experiment in a leave-one-out style

        For each dataset, fits a TF-IDF vectorizer and a fresh classifier on
        all other datasets, then prints the classifier's accuracy on the
        held-out dataset next to a naive "contains 'fix'" baseline. Results
        of any previous run stored on this object are discarded first.

        Args:
            text_feature: A string, can be either 'message' or 'subject'.
            label_func: A function, takes a data point as input and
                return its target label.
            init_clf: A function, return a classifier which supports
                'fit' and 'score' method
            ngram_range: A (min_n, max_n) tuple passed to TfidfVectorizer.
            extra_features: A list of strings naming data point keys whose
                values are standardized and appended to the TF-IDF
                features. None or an empty list means text features only.
            dp_filter: A function deciding which data points to keep; only
                points for which it returns a true value are used.

        Raises:
            ValueError: If text_feature is not 'message' or 'subject'.
        """
        if text_feature not in ('message', 'subject'):
            raise ValueError(
                "text_feature must be 'message' or 'subject', got %r"
                % (text_feature,))

        use_extras = bool(extra_features)

        def gather_data(fs, text_list, target_list, extra_list):
            """Helper function to gather data from a single dataset"""
            for dp in self.datasets[fs]:
                # Points lacking the text feature are silently skipped.
                if text_feature in dp and dp_filter(dp):
                    text_list.append(dp[text_feature])
                    target_list.append(label_func(dp))
                    if use_extras:
                        extra_list.append([dp[f] for f in extra_features])

        self._clean()

        for fs in self.fss:

            self.train_texts[fs], self.train_targets[fs], \
                self.train_extras[fs] = [], [], []
            self.test_texts[fs], self.test_targets[fs], \
                self.test_extras[fs] = [], [], []

            # Train on every dataset except the held-out one.
            for fs2 in self.fss:
                if fs2 != fs:
                    gather_data(fs2, self.train_texts[fs],
                                self.train_targets[fs], self.train_extras[fs])

            gather_data(fs, self.test_texts[fs], self.test_targets[fs],
                        self.test_extras[fs])

            # vectorize texts (vocabulary is learned from training data only)
            vectorizer = TfidfVectorizer(ngram_range=ngram_range)
            self.vectorizers[fs] = vectorizer
            self.train_vectors[fs] = vectorizer.fit_transform(self.train_texts[fs])
            self.test_vectors[fs] = vectorizer.transform(self.test_texts[fs])

            # Feature names ordered by column index of the TF-IDF matrix.
            vocab = vectorizer.vocabulary_
            self.feature_labels[fs] = sorted(vocab, key=vocab.get)

            if use_extras:
                self.train_extras[fs] = np.array(self.train_extras[fs])
                self.test_extras[fs] = np.array(self.test_extras[fs])

                # scale to have zero mean and unit variance
                scaler = preprocessing.StandardScaler()
                self.train_extras[fs] = scaler.fit_transform(self.train_extras[fs])
                self.test_extras[fs] = scaler.transform(self.test_extras[fs])

                # toarray() yields a plain ndarray; todense() would give an
                # np.matrix, which recent scikit-learn versions refuse.
                self.train_vectors[fs] = np.hstack(
                    (self.train_vectors[fs].toarray(), self.train_extras[fs]))
                self.test_vectors[fs] = np.hstack(
                    (self.test_vectors[fs].toarray(), self.test_extras[fs]))

                self.feature_labels[fs].extend(extra_features)

            self.classifiers[fs] = init_clf()
            self.classifiers[fs].fit(self.train_vectors[fs],
                                     self.train_targets[fs])

            print('----- Test Accuracy for %s -----' % fs)
            print('Classifier: %.3f' % self.classifiers[fs].score(
                self.test_vectors[fs], self.test_targets[fs]))

            # The baseline emits 0/1 labels, so it is only comparable with
            # label functions that produce 0/1 targets.
            pred2 = self._naive_fix_predictions(self.test_texts[fs])
            print('Naive: %.3f' % sklearn.metrics.accuracy_score(
                self.test_targets[fs], pred2))
            
# Shared experiment runner. Every leave_one_dataset_out call below resets the
# state stored on `exp` first, so the inspection cells further down only see
# the results of whichever run was executed most recently.
exp = Experiment(datasets)

In [None]:
# binary classification of bug patch, using linear SVM with extra features
exp.leave_one_dataset_out('message', binary_bug, LinearSVC, 
                          extra_features=['num_files', 'num_adds', 'num_dels'])

In [None]:
# binary classification of bug patch, using linear SVM (without extra features)
exp.leave_one_dataset_out('message', binary_bug, LinearSVC)

In [None]:
# binary classification of bug patch, using linear SVM with 2-gram
exp.leave_one_dataset_out('message', binary_bug, LinearSVC, ngram_range=(1, 2))

In [None]:
# binary classification of bug patch, using linear SVM with 2-gram and extra features
exp.leave_one_dataset_out('message', binary_bug, LinearSVC, ngram_range=(1, 2), 
                          extra_features=['num_files', 'num_adds', 'num_dels'])

In [None]:
# multi-class classification of patch type, using linear SVM
exp.leave_one_dataset_out('message', multi_patch_type, LinearSVC)

In [None]:
# multi-class classification of patch type, using linear SVM with 2-gram
exp.leave_one_dataset_out('message', multi_patch_type, LinearSVC, ngram_range=(1, 2))

In [None]:
# multi-class classification of patch type, using linear SVM with 2-gram and extra features
exp.leave_one_dataset_out('message', multi_patch_type, LinearSVC, ngram_range=(1, 2),
                         extra_features=['num_files', 'num_adds', 'num_dels'])

In [None]:
# multi-class classification of bug consequences, using linear SVM
# (only_bug_filter restricts both training and test data to type 'b' points)
exp.leave_one_dataset_out('message', multi_bug_cons, LinearSVC, dp_filter=only_bug_filter)

In [None]:
# multi-class classification of bug consequences, using linear SVM with 2-gram
exp.leave_one_dataset_out('message', multi_bug_cons, LinearSVC, dp_filter=only_bug_filter, ngram_range=(1, 2))

In [None]:
# multi-class classification of bug consequences, using linear SVM with 2-gram and extra features
exp.leave_one_dataset_out('message', multi_bug_cons, LinearSVC, dp_filter=only_bug_filter, ngram_range=(1, 2),
                         extra_features=['num_files', 'num_adds', 'num_dels'])

In [None]:
# multi-class classification of bug consequences, using random forest
# (`rf` builds a RandomForestClassifier with 300 trees)
exp.leave_one_dataset_out('message', multi_bug_cons, rf, dp_filter=only_bug_filter)

## Explaining Random Forest

In [None]:
import pprint
pp = pprint.PrettyPrinter()

In [None]:
# explaining random forest classifier
def rank_features_by_importance(exp, top_n=20):
    """Print the most important features of each trained classifier.

    Args:
        exp: An object exposing `classifiers` and `feature_labels`, both
            dictionaries keyed by file system name (e.g. an Experiment
            after a run). Feature labels must be in the order the
            classifier sees the features.
        top_n: An integer, number of top features to print per file system.

    For every file system a header line is printed, followed by a list of
    (importance formatted to 4 decimals, feature label) pairs in
    descending order. If there are no classifiers, or any of them lacks
    `feature_importances_`, a single explanatory line is printed instead.
    """
    # Local import keeps the function independent of notebook-level state.
    import pprint

    classifiers = exp.classifiers
    if not classifiers or not all(
            hasattr(clf, 'feature_importances_')
            for clf in classifiers.values()):
        print("classifiers don't have attribute feature_importances_")
        return

    # Iterate over the classifiers that actually exist rather than over a
    # separately maintained list of file system names.
    for fs, clf in classifiers.items():
        print('------- important features for %s -------' % fs)
        truncated_importances = ['%.4f' % x for x in clf.feature_importances_]
        ranked = sorted(zip(truncated_importances, exp.feature_labels[fs]),
                        reverse=True)
        pprint.pprint(ranked[:top_n])

In [None]:
# Show the top 20 features for the classifiers currently stored on `exp`,
# i.e. those trained by the most recently executed experiment cell.
rank_features_by_importance(exp, top_n=20)

## Explaining SVM

In [None]:
# explaining linear SVM classifier
def rank_features_by_weight(classifiers, classes, feature_labels,
                            top_n=20, individual_class=False):
    """Rank features by the absolute value of associated primal weight

    Args:
        classifiers: A dictionary, with key being the file system name
            and value being the corresponding classifier.
        classes: A list of class labels, in the classifier's class order.
        feature_labels: A dictionary of list of feature names, each key
            is a file system name, features are in the order as classifier
            sees them.
        top_n: An integer, specifies number of top features to print.
        individual_class: If True, print one ranking per row of `coef_`;
            otherwise rank by the sum of absolute weights over all rows.

    Rankings are printed in ascending order of absolute weight, so the
    most influential feature comes last. If there are no classifiers, or
    any of them lacks `coef_`, a single explanatory line is printed.
    """
    # Local import keeps the function independent of notebook-level state.
    import pprint

    if not classifiers or not all(
            hasattr(clf, 'coef_') for clf in classifiers.values()):
        print("classifiers don't have attribute coef_")
        return

    def top_labels(weights, labels):
        """Return the labels of the top_n largest-|weight| features."""
        order = np.argsort(np.absolute(weights))
        # Explicit start index: a plain [-top_n:] would return everything
        # for top_n == 0.
        start = max(len(order) - top_n, 0)
        return np.asarray(labels)[order[start:]]

    for fs, clf in classifiers.items():
        print('------- %s -------' % fs)
        coef = np.atleast_2d(clf.coef_)
        if individual_class:
            # A two-class linear model has a single weight row, which
            # belongs to the last (positive) class; aligning labels from
            # the end avoids indexing rows that do not exist.
            row_classes = list(classes)[-coef.shape[0]:]
            for cl, row in zip(row_classes, coef):
                print('\t------ %s -------' % cl)
                pprint.pprint(top_labels(row, feature_labels[fs]))
            print()
        else:
            total = np.absolute(coef).sum(axis=0)
            pprint.pprint(top_labels(total, feature_labels[fs]))

In [None]:
# Aggregate view: rank features by absolute weight summed over classes.
# Class labels are taken from the training targets of the 'ext3' fold.
rank_features_by_weight(exp.classifiers, np.unique(exp.train_targets['ext3']), 
                        exp.feature_labels, top_n=20)

In [None]:
# Per-class view: one ranking for each class label.
rank_features_by_weight(exp.classifiers, np.unique(exp.train_targets['ext3']), 
                        exp.feature_labels, top_n=20, individual_class=True)

## LIME

In [None]:
from sklearn.pipeline import make_pipeline
# The fitted vectorizers, classifiers and test texts live on the experiment
# object, not in module-level variables.
# NOTE: predict_proba requires the most recent experiment run to have used a
# classifier that provides it.
c = make_pipeline(exp.vectorizers['ext3'], exp.classifiers['ext3'])
print(c.predict_proba([exp.test_texts['ext3'][1]]))

In [None]:
from lime.lime_text import LimeTextExplainer
# Display names indexed by label value: 0 -> 'not-bug', 1 -> 'bug'
# (the 0/1 labels produced by binary_bug).
class_names = ['not-bug', 'bug']
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
def explain_pred(idx, fs, experiment=None):
    """Print a LIME explanation for one held-out patch.

    Args:
        idx: An integer, position of the patch within the test texts of `fs`.
        fs: A string, name of the held-out file system.
        experiment: Object holding `vectorizers`, `classifiers`,
            `test_texts` and `test_targets` keyed by file system name.
            Defaults to the notebook-level `exp`.

    The classifier for `fs` must provide `predict_proba`, and the test
    targets must be valid indices into `class_names` (0/1 labels).
    """
    # Fall back to the notebook's experiment object; the local name differs
    # from `exp` on purpose so the global is not shadowed.
    experiment = exp if experiment is None else experiment

    text = experiment.test_texts[fs][idx]
    pipeline = make_pipeline(experiment.vectorizers[fs],
                             experiment.classifiers[fs])
    explanation = explainer.explain_instance(
        text.lower(), pipeline.predict_proba, num_features=8)

    print('Patch id: %d' % idx)
    print('Probability(bug) =', pipeline.predict_proba([text])[0, 1])
    print('True class: %s' % class_names[experiment.test_targets[fs][idx]])
    print('Text: %s' % text)
    pp.pprint(explanation.as_list())
    # explanation.show_in_notebook(text=True)

In [None]:
from sklearn.pipeline import make_pipeline
# Explain every correctly detected bug patch (label 1, predicted probability
# above 0.5) whose text does not contain the keyword 'fix'.
fs = 'ext3'
# The fitted artefacts live on the experiment object, not in module-level
# variables.
c = make_pipeline(exp.vectorizers[fs], exp.classifiers[fs])
for i, (text, target) in enumerate(zip(exp.test_texts[fs], exp.test_targets[fs])):
    # Cheap checks first; the model is only queried for remaining candidates.
    if (target == 1
        and 'fix' not in text.lower()
        and c.predict_proba([text])[0, 1] > 0.5):
        explain_pred(i, fs)

In [None]:
# keyword 'fix'
# (indices are positions within the 'ext3' test texts; presumably hand-picked
# examples -- TODO confirm they still match the current dataset)
for i in [22, 24]:
    explain_pred(i, 'ext3')

In [None]:
# when keyword 'fix' is absent
for i in [23, 25]:
    explain_pred(i, 'ext3')

In [None]:
# interesting case
for i in [5, 26]:
    explain_pred(i, 'ext3')

In [None]:
# NOTE(review): `r` is not defined anywhere in the visible notebook --
# presumably a GitPython Repo of the kernel tree; confirm and add its setup.
# Prints the hash of each commit returned for rev 'v2.6.12' whose first
# message line contains 'Linux 2.6.0'.
for commit in list(r.iter_commits(rev='v2.6.12')):
    if 'Linux 2.6.0' in commit.message.split('\n', 1)[0]:
        print(commit.hexsha)