## Loading dataset

In [None]:
%%javascript
// Set the notebook output area's auto-scroll threshold to 9999
// (presumably so that long cell outputs are not collapsed into a scroll box).
IPython.OutputArea.auto_scroll_threshold = 9999;

In [None]:
import warnings
# Silence every warning for the rest of the session.
warnings.simplefilter('ignore')

import pickle
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('data/fs-patch.pickle', 'rb') as dataset_file:
    datasets = pickle.load(dataset_file)
# File system names; used elsewhere in this notebook as keys into `datasets`.
fss = ['ext3', 'ext4', 'btrfs', 'xfs', 'jfs', 'reiserfs']

import spacy
# spaCy pipeline used by `lemmatize`.
nlp = spacy.load('en')


## Training classifiers

In [None]:
import array
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.metrics
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone, is_classifier
from sklearn.preprocessing import StandardScaler, label_binarize, binarize, normalize, LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
import functools
import nltk
from nltk.stem.porter import PorterStemmer
from scipy.stats import norm
from scipy.sparse import csr_matrix, csc_matrix
from joblib import Parallel, delayed

# Shared Porter stemmer instance used by `stem`.
stemmer = PorterStemmer()
# Factory for a random forest with 300 trees; call `rf()` to build a classifier.
rf = functools.partial(sklearn.ensemble.RandomForestClassifier, n_estimators=300)

def binary_bug(dp):
    """Label a data point 1 when its 'type' field is 'b', otherwise 0."""
    is_bug = dp['type'] == 'b'
    return 1 if is_bug else 0
    
def multi_patch_type(dp):
    """Use the data point's 'type' field as its multi-class label."""
    patch_type = dp['type']
    return patch_type

def multi_bug_cons(dp):
    """Use the data point's 'cons_type' field as its multi-class label."""
    consequence = dp['cons_type']
    return consequence

def only_bug_filter(dp):
    """Return True only for data points whose 'type' field equals 'b'."""
    patch_type = dp['type']
    return patch_type == 'b'
        
def lemmatize(text):
    """Return the lemmas of `text`, skipping punctuation.

    The text is parsed with the module-level spaCy pipeline `nlp`; tokens
    whose part-of-speech tag is 'PUNCT' are dropped.
    """
    return [tok.lemma_ for tok in nlp(text) if tok.pos_ != 'PUNCT']

def stem(text):
    """Return the stem of every token in `text`.

    Tokens come from `nltk.word_tokenize`; each one is stemmed with the
    module-level `stemmer`.
    """
    return [stemmer.stem(word) for word in nltk.word_tokenize(text)]

def bns(tprs, fprs):
    """Compute the Bi-Normal Separation (BNS) score of every feature.

    For each feature the score is ``|ppf(tpr) - ppf(fpr)|`` where ``ppf`` is
    the inverse CDF of the standard normal distribution (``norm.ppf``).

    Args:
        tprs: A row vector of shape (1, num_features) holding true positive
            rates (an ndarray or ``np.matrix``).
        fprs: A row vector of shape (1, num_features) holding false positive
            rates.

    Returns:
        A numpy array of shape (num_features,).
    """
    # np.asarray strips the np.matrix subclass so that indexing row 0
    # yields a plain 1-D vector; atleast_2d also tolerates 1-D input.
    tpr_row = np.atleast_2d(np.asarray(tprs, dtype=float))[0]
    fpr_row = np.atleast_2d(np.asarray(fprs, dtype=float))[0]
    # norm.ppf is element-wise, so the whole row is scored in one call.
    return np.abs(norm.ppf(tpr_row) - norm.ppf(fpr_row))

def tfbns(tfs, bns):
    """Scale every column of a term-frequency matrix by its BNS score.

    Args:
        tfs: A sparse matrix of shape (num_samples, num_features)
            in csr format
        bns: A numpy array of shape (num_features,)

    Returns:
        A csr matrix with the same shape as `tfs` in which each stored entry
        (i, j) equals ``tfs[i, j] * bns[j]``.
    """
    coo = tfs.tocoo()
    weights = np.asarray(bns, dtype=float).ravel()
    # Look up the weight of each stored entry's column and scale all stored
    # values at once instead of looping over them in Python.
    scaled = coo.data * weights[coo.col]
    return csr_matrix((scaled, (coo.row, coo.col)), shape=tfs.shape)

class TextTransformer(BaseEstimator, TransformerMixin):
    """Weight a term-count matrix with either TF-IDF or TF-BNS.

    Args:
        use_bns: If True, raw term frequencies are multiplied by per-feature
            BNS scores learned in `fit` and the rows are then normalized.
            If False, a default `TfidfTransformer` is used.

    Attributes set by `fit`:
        tf_trans: The fitted `TfidfTransformer`.
        bns_values: Per-feature BNS scores (only when `use_bns` is True).
    """

    def __init__(self, use_bns):
        # Only store the parameter here; the sub-estimator is built in `fit`
        # so that it always reflects the current value of `use_bns`
        # (e.g. after `set_params`).
        self.use_bns = use_bns

    def fit(self, X, y=None):
        """Fit the term-frequency transformer and, if requested, BNS scores.

        Args:
            X: Term-count matrix of shape (num_samples, num_features).
            y: Binary targets where non-zero marks the positive class.
                Required when `use_bns` is True.

        Returns:
            self

        Raises:
            ValueError: If `use_bns` is True and `y` is None.
        """
        if self.use_bns:
            self.tf_trans = TfidfTransformer(use_idf=False, norm=None)
        else:
            self.tf_trans = TfidfTransformer()
        self.tf_trans.fit(X)

        if self.use_bns:
            if y is None:
                raise ValueError('y is required when use_bns is True')
            # Accept lists as well as arrays; `labels == 0` below needs an
            # ndarray to compare element-wise.
            labels = np.asarray(y).ravel()
            binary_counts = binarize(X)

            pos_rows = np.flatnonzero(labels)
            neg_rows = np.flatnonzero(labels == 0)

            # Number of positive / negative documents containing each term,
            # forced into the (1, num_features) row shape that `bns` expects.
            tps = np.asarray(binary_counts[pos_rows].sum(axis=0)).reshape(1, -1)
            fps = np.asarray(binary_counts[neg_rows].sum(axis=0)).reshape(1, -1)

            # max(..., 1) avoids a division by zero (and NaN scores) when
            # one of the two classes is absent from the training data.
            tprs = np.clip(tps / max(pos_rows.size, 1), 0.0005, 0.9995)
            fprs = np.clip(fps / max(neg_rows.size, 1), 0.0005, 0.9995)

            self.bns_values = bns(tprs, fprs)

        return self

    def transform(self, counts):
        """Return the weighted representation of a term-count matrix."""
        tfs = self.tf_trans.transform(counts)
        if self.use_bns:
            return normalize(tfbns(tfs, self.bns_values))
        else:
            return tfs
        
class FeatureLabelExtractor(BaseEstimator, TransformerMixin):
    """Collect features and labels from the data points of several datasets.

    Cannot be used in Pipeline: `transform` returns a ``(features, labels)``
    pair rather than a single feature container.

    Args:
        datasets: A dictionary mapping a dataset name to a list of data
            points (dictionaries).
        text_feature: Key of the text field to read from each data point.
        label_func: A function mapping a data point to its target label.
        dp_filter: A function returning True for data points to keep.
    """

    def __init__(self, datasets, text_feature, label_func, dp_filter):
        self.datasets = datasets
        self.text_feature = text_feature
        self.label_func = label_func
        self.dp_filter = dp_filter

    def fit(self, X, y=None):
        """No-op; nothing is learned from the data."""
        return self

    def transform(self, fs_list):
        """Extract features and labels for the given datasets.

        Args:
            fs_list: Names of the datasets (keys of `datasets`) to read,
                in the order their data points should appear.

        Returns:
            A tuple ``(features, labels)``. ``features['text']`` is a list of
            texts, ``features['frc']`` is a float array of shape
            (num_samples, 3) with columns num_files, num_adds, num_dels, and
            ``labels`` is a list of target labels.
        """
        # Single pass: the filter is evaluated once per data point.
        selected = [dp
                    for fs in fs_list
                    for dp in self.datasets[fs]
                    if self.dp_filter(dp)]

        frc_rows = [[dp['num_files'], dp['num_adds'], dp['num_dels']]
                    for dp in selected]
        features = {
            'text': [dp[self.text_feature] for dp in selected],
            # reshape keeps the (0, 3) shape when nothing was selected.
            'frc': np.array(frc_rows, dtype=float).reshape(-1, 3),
        }
        labels = [self.label_func(dp) for dp in selected]
        return features, labels

class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks one entry out of a dict-like feature container.

    Args:
        key: The key to look up in the container passed to `transform`.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """No-op; there is nothing to learn."""
        return self

    def transform(self, data_dict):
        """Return the entry of `data_dict` stored under `self.key`."""
        selected = data_dict[self.key]
        return selected

def _fit_binary(estimator, use_bns,
                use_text, use_frc, k, X, y):
    """Build and fit one binary classification pipeline.

    Args:
        estimator: The classifier to use; it is cloned, not modified.
        use_bns: Passed to `TextTransformer` for the text branch.
        use_text: Whether to include the text branch (reads ``X['count']``).
        use_frc: Whether to include the frc branch (reads ``X['frc']``,
            standardized with `StandardScaler`).
        k: If truthy, keep only the top `k` features ranked by mutual
            information before the classifier.
        X: Dict-like feature container.
        y: Binary targets.

    Returns:
        The fitted pipeline.

    Raises:
        ValueError: If both `use_text` and `use_frc` are false.
    """
    if not (use_text or use_frc):
        # Without this check the feature step would be undefined below.
        raise ValueError('at least one of use_text and use_frc must be True')

    estimator = clone(estimator)

    branches = []
    if use_text:
        branches.append(make_pipeline(
            ItemSelector(key='count'), TextTransformer(use_bns=use_bns)))
    if use_frc:
        branches.append(make_pipeline(
            ItemSelector(key='frc'), StandardScaler()))

    # A union is only needed when both branches are present; text columns
    # come first, followed by the frc columns.
    union = make_union(*branches) if len(branches) > 1 else branches[0]

    steps = [union]
    if k:
        steps.append(SelectKBest(mutual_info_classif, k=k))
    steps.append(estimator)
    pipeline = make_pipeline(*steps)

    pipeline.fit(X, y)
    return pipeline

def _predict_binary(pipeline, X):
    """Make predictions using a single binary estimator"""
    try:
        score = np.ravel(pipeline.decision_function(X))
    except (AttributeError, NotImplementedError):
        score = pipeline.predict_proba(X)[:, 1]
    return score
    

class BNSClassifier(BaseEstimator, ClassifierMixin):
    """One-vs-rest classifier that fits one pipeline per binarized label column.

    Each pipeline is built by `_fit_binary`, so feature weighting (and BNS
    scores, if enabled) is learned separately for every column.

    Args:
        estimator: Base classifier; cloned for every column.
        use_bns: Whether the text branch uses TF-BNS instead of TF-IDF.
        use_text: Whether to use the text features (``X['count']``).
        use_frc: Whether to use the frc features (``X['frc']``).
        k: Number of features to keep via mutual information, or a falsy
            value to skip feature selection.
        n_jobs: Number of parallel jobs passed to `joblib.Parallel`.

    Attributes set by `fit`:
        label_binarizer: The fitted `LabelBinarizer`.
        classes: The class labels seen during fitting.
        pipelines: One fitted pipeline per binarized label column.
    """

    def __init__(self, estimator, use_bns, 
                 use_text, use_frc, k, n_jobs=1):
        # At least one feature source is needed to build a pipeline.
        assert(use_text or use_frc)
        self.estimator = estimator
        self.n_jobs = n_jobs
        self.use_bns = use_bns
        self.use_text = use_text
        self.use_frc = use_frc
        self.k = k

    def fit(self, X, y):
        """Fit one binary pipeline per label column.

        Args:
            X: Dict-like feature container with 'count' and/or 'frc' entries.
            y: Target labels.

        Returns:
            self
        """
        self.label_binarizer = LabelBinarizer(sparse_output=True)
        Y = self.label_binarizer.fit_transform(y)
        Y = Y.tocsc()
        self.classes = self.label_binarizer.classes_
        # Dense 0/1 target vector for each column of the indicator matrix.
        columns = (col.toarray().ravel() for col in Y.T)

        self.pipelines = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_binary)(
                self.estimator, self.use_bns, 
                self.use_text, self.use_frc, self.k, X, column)
            for column in columns)

        # Returning self follows the scikit-learn estimator contract and
        # allows call chaining such as ``clf.fit(X, y).predict(X)``.
        return self

    def predict(self, X):
        """Predict labels for the samples in `X`.

        Args:
            X: Dict-like feature container with the same entries used in `fit`.

        Returns:
            An array of predicted labels.
        """
        # Decision-function scores are thresholded at 0, probabilities at 0.5.
        if(hasattr(self.pipelines[0], 'decision_function') and
              is_classifier(self.pipelines[0])):
            thresh = 0
        else:
            thresh = 0.5

        # Fall back to the frc matrix when no count matrix was supplied.
        size_key = 'count' if 'count' in X else 'frc'
        n_samples = X[size_key].shape[0]
        if self.label_binarizer.y_type_ == 'multiclass':
            # Pick, for every sample, the class whose pipeline scores highest.
            maxima = np.empty(n_samples, dtype=float)
            maxima.fill(-np.inf)
            argmaxima = np.zeros(n_samples, dtype=int)
            for i, p in enumerate(self.pipelines):
                pred = _predict_binary(p, X)
                np.maximum(maxima, pred, out=maxima)
                argmaxima[maxima == pred] = i
            return self.classes[np.array(argmaxima.T)]
        else:
            # Build a sparse indicator matrix column by column and let the
            # label binarizer map it back to the original labels.
            indices = array.array('i')
            indptr = array.array('i', [0])
            for p in self.pipelines:
                indices.extend(np.where(_predict_binary(p, X) > thresh)[0])
                indptr.append(len(indices))
            data = np.ones(len(indices), dtype=int)
            indicator = csc_matrix((data, indices, indptr),
                shape=(n_samples, len(self.pipelines)))
            return self.label_binarizer.inverse_transform(indicator)


class Experiment():
    """Leave-one-file-system-out evaluation of patch classifiers.

    Attributes filled by `run` (all keyed by the held-out file system name):
        classifiers: The fitted `BNSClassifier`.
        feature_labels: Names of the feature columns fed to the pipelines,
            before any top-k feature selection.
        train_targets: The training labels used for that fold.
    """

    def __init__(self, datasets, fss):
        """
        Args:
            datasets: A dictionary, keys are dataset names, each dataset
                is a list of data points (also dictionaries).
            fss: A list of file system names.
        """
        self.datasets = datasets
        self.fss = fss
        # Make the result containers available even before the first run.
        self._clean()

    def _clean(self):
        """Reset the results of any previous run."""
        self.feature_labels = {}
        self.classifiers = {}
        self.train_targets = {}

    def run(self, label_func, estimator, ngram_range=(1, 1),
            text_feature='message', use_text=True, use_frc=False,
            dp_filter=lambda dp: True, tokenizer=None, 
            max_features=None, min_df=1, use_bns=False, k=None,
            n_jobs=1):
        """Perform experiment in a leave-one-out style

        Each file system is held out in turn: a classifier is trained on the
        data of all other file systems, its accuracy on the held-out one is
        printed, and the results are stored on the instance.

        Args:
            label_func: A function, takes a data point as input and
                return its target label.
            estimator: A function, return a classifier which supports
                'fit' and 'predict' method
            ngram_range: A tuple of two integers, specify what range of 
                ngram to use
            text_feature: A string, the key of the text field to read from
                each data point (e.g. 'message' or 'subject'). The text is
                always vectorized; set use_text to False to keep it out of
                the classifier.
            use_text: A boolean flag, whether to use text feature
            use_frc: A boolean flag, whether to use frc
            dp_filter: A function returning True for the data points to
                keep; all others are excluded.
            tokenizer: A function takes a string and return a list of tokens.
            max_features: An int or None. If not None, only consider
                top max_features ordered by term frequency across the corpus.
            min_df: An int, ignore terms when building vocabulary if their
                document frequency is strictly lower than this threshold.
            use_bns: A boolean flag, use BNS if True, otherwise use IDF
            k: An int, number of top features to keep during feature
                selection. 
            n_jobs: An int, number of parallel jobs used to fit the
                per-class pipelines.
        """
        self._clean()

        fle = FeatureLabelExtractor(
            self.datasets, text_feature, label_func, dp_filter)

        for fs in self.fss:
            ofs_list = [ofs for ofs in self.fss if ofs != fs]

            train_X, train_y = fle.transform(ofs_list)
            test_X, test_y = fle.transform([fs])

            cv = CountVectorizer(tokenizer=tokenizer,
                                 ngram_range=ngram_range,
                                 max_features=max_features,
                                 min_df=min_df)

            # The vocabulary is learned on the training folds only.
            train_X['count'] = cv.fit_transform(train_X['text'])
            test_X['count'] = cv.transform(test_X['text'])

            # Column names in the order the pipelines concatenate them:
            # text terms (by column index) first, then the frc columns.
            feature_labels = []
            if use_text:
                vocabulary = cv.vocabulary_
                feature_labels.extend(sorted(vocabulary, key=vocabulary.get))
            if use_frc:
                feature_labels.extend(['num_files', 'num_adds', 'num_dels'])

            clf = BNSClassifier(estimator(), 
                                use_bns=use_bns,
                                use_text=use_text,
                                use_frc=use_frc,
                                k=k,
                                n_jobs=n_jobs)
            clf.fit(train_X, train_y)

            print('----- Test Accuracy for %s -----' % fs)
            print('Classifier: %.3f' % clf.score(test_X, test_y))

            self.classifiers[fs] = clf
            self.feature_labels[fs] = feature_labels
            self.train_targets[fs] = train_y
            
# Experiment instance shared by the `exp.run(...)` cells below.
exp = Experiment(datasets, fss)

## Binary Classification of Bug Fix Patch

In [None]:
# only using frc
exp.run(binary_bug, LinearSVC, use_text=False, use_frc=True)

In [None]:
# only using text (TF * IDF)
exp.run(binary_bug, LinearSVC)

In [None]:
# only using text (TF * BNS)
exp.run(binary_bug, LinearSVC, use_bns=True)

In [None]:
# using both text (TF * IDF) and frc
exp.run(binary_bug, LinearSVC, use_frc=True)

In [None]:
# using both text (TF * BNS) and frc
exp.run(binary_bug, LinearSVC, use_bns=True, use_frc=True)

In [None]:
# using both text (TF * IDF) and frc
# drop terms whose document frequency is lower than min_df
exp.run(binary_bug, LinearSVC, min_df=3, use_frc=True)

In [None]:
# using both text (TF * BNS) and frc
# drop terms whose document frequency is lower than min_df
exp.run(binary_bug, LinearSVC, use_bns=True, min_df=3, use_frc=True)

In [None]:
# using both stemmed text (TF * BNS) and frc
exp.run(binary_bug, LinearSVC, use_bns=True, use_frc=True, tokenizer=stem)

In [None]:
# using both lemmatized text (TF * BNS) and frc
exp.run(binary_bug, LinearSVC, use_bns=True, use_frc=True, tokenizer=lemmatize)

In [None]:
# using both text (TF * BNS) and frc
# select top k features by mutual information
exp.run(binary_bug, LinearSVC, use_bns=True, use_frc=True, k=5000)

## Multi-Class Classification of Patch Type

In [None]:
# only using frc
exp.run(multi_patch_type, LinearSVC, use_text=False, use_frc=True)

In [None]:
# only using text (TF * IDF)
exp.run(multi_patch_type, LinearSVC)

In [None]:
# only using text (TF * BNS)
exp.run(multi_patch_type, LinearSVC, use_bns=True)

In [None]:
# using both text (TF * IDF) and frc
exp.run(multi_patch_type, LinearSVC, use_frc=True)

In [None]:
# using both text (TF * BNS) and frc
exp.run(multi_patch_type, LinearSVC, use_bns=True, use_frc=True)

In [None]:
# using both text (TF * BNS) and frc
# drop terms whose document frequency is lower than min_df
exp.run(multi_patch_type, LinearSVC, use_bns=True, min_df=3, use_frc=True)

In [None]:
# using both text (TF * BNS) and frc
# select top k features by mutual information
exp.run(multi_patch_type, LinearSVC, use_bns=True, use_frc=True, k=5000)

## Multi-Class Classification of Bug Consequences

In [None]:
# only using text (TF * IDF)
exp.run(multi_bug_cons, LinearSVC, dp_filter=only_bug_filter)

In [None]:
# only using text (TF * BNS)
exp.run(multi_bug_cons, LinearSVC, dp_filter=only_bug_filter, use_bns=True)

In [None]:
# using both text (TF * IDF) and frc
exp.run(multi_bug_cons, LinearSVC, dp_filter=only_bug_filter, use_frc=True)

In [None]:
# using both text (TF * BNS) and frc
exp.run(multi_bug_cons, LinearSVC, dp_filter=only_bug_filter, 
        use_bns=True, use_frc=True)

In [None]:
# using both bigram text (TF * BNS) and frc
exp.run(multi_bug_cons, LinearSVC, dp_filter=only_bug_filter, 
        use_bns=True, use_frc=True, ngram_range=(1, 2))

In [None]:
# only using text (TF * IDF)
# select top k features by mutual information
exp.run(multi_bug_cons, LinearSVC, dp_filter=only_bug_filter, k=5000)

## Explaining Random Forest

In [None]:
import pprint
pp = pprint.PrettyPrinter()

In [None]:
# explaining random forest classifier
def rank_features_by_importance(exp, top_n=20):
    """Print the most important features of every classifier of an experiment.

    Args:
        exp: An object with a `classifiers` dictionary (file system name ->
            classifier exposing `feature_importances_`) and a
            `feature_labels` dictionary (file system name -> list of feature
            names in the same order as the importances).
        top_n: An integer, number of top features to print per file system.
    """
    classifiers = exp.classifiers
    # Check every classifier (not just a hard-coded one) and also cope with
    # an experiment that has not produced any classifier yet.
    if not classifiers or not all(
            hasattr(clf, 'feature_importances_')
            for clf in classifiers.values()):
        print("classifiers don't have attribute feature_importances_")
        return

    for fs, clf in classifiers.items():
        print('------- important features for %s -------' % fs)
        # Importances are formatted to 4 decimals before sorting, so ties at
        # that precision are broken by feature label.
        truncated_importances = ['%.4f' % x for x in clf.feature_importances_]
        ranked = sorted(zip(truncated_importances, exp.feature_labels[fs]),
                        reverse=True)
        pprint.pprint(ranked[:top_n])

In [None]:
rank_features_by_importance(exp, top_n=20)

## Explaining SVM

In [None]:
# explaining linear SVM classifier
def rank_features_by_weight(classifiers, classes, feature_labels, 
                            top_n=20, individual_class=False):
    """Rank features by the absolute value of associated primal weight

    For every file system the `top_n` features with the largest absolute
    weight are printed in ascending order of weight (strongest last).

    Args:
        classifiers: A dictionary, with key being the file system name
            and value being the corresponding classifier.
        classes: A list of class labels.
        feature_labels: A dictionary of list of feature names, each key
            is a file system name, features are in the order as classifier
            sees them.
        top_n: An integer, specifies number of top features to print.
        individual_class: A boolean value. If True, features are ranked
            separately for each row of `coef_`; otherwise they are ranked by
            the sum of absolute weights across all rows.
    """
    # Covers the empty dictionary as well as classifiers without weights.
    if not classifiers or not all(
            hasattr(clf, 'coef_') for clf in classifiers.values()):
        print("classifiers don't have attribute coef_")
        return

    for fs, clf in classifiers.items():
        print('------- %s -------' % fs)
        coef = clf.coef_
        labels = np.array(feature_labels[fs])
        if individual_class:
            for i, class_coef in enumerate(coef):
                print('\t------ %s ------' % classes[i])
                order = np.argsort(np.absolute(class_coef))
                pprint.pprint(labels[order][-top_n:])
            print()
        else:
            # Sum of absolute weights over all rows, one value per feature.
            order = np.argsort(np.absolute(coef).sum(axis=0))
            pprint.pprint(labels[order][-top_n:])

In [None]:
rank_features_by_weight(exp.classifiers, np.unique(exp.train_targets['ext3']), 
                        exp.feature_labels, top_n=20)

In [None]:
rank_features_by_weight(exp.classifiers, np.unique(exp.train_targets['ext3']), 
                        exp.feature_labels, top_n=20, individual_class=True)

## LIME

In [None]:
from sklearn.pipeline import make_pipeline
c = make_pipeline(vectorizers['ext3'], classifiers['ext3'])
print(c.predict_proba([test_texts['ext3'][1]]))

In [None]:
from lime.lime_text import LimeTextExplainer
class_names = ['not-bug', 'bug']
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
def explain_pred(idx, fs):
    """Print a LIME explanation for one test patch of file system `fs`.

    Relies on the notebook-level `vectorizers`, `classifiers`, `test_texts`,
    `test_targets`, `explainer`, `class_names` and `pp` objects.

    Args:
        idx: Index of the patch within ``test_texts[fs]``.
        fs: File system name.
    """
    text = test_texts[fs][idx]
    model = make_pipeline(vectorizers[fs], classifiers[fs])
    explanation = explainer.explain_instance(
        text.lower(), model.predict_proba, num_features=8)
    print('Patch id: %d' % idx)
    bug_probability = model.predict_proba([text])[0,1]
    print('Probability(bug) =', bug_probability)
    true_class = class_names[test_targets[fs][idx]]
    print('True class: %s' % true_class)
    print('Text: %s' % text)
    pp.pprint(explanation.as_list())
    # exp.show_in_notebook(text=True)

In [None]:
from sklearn.pipeline import make_pipeline
fs = 'ext3'
c = make_pipeline(vectorizers[fs], classifiers[fs])
# Explain every test patch of `fs` whose text does not contain 'fix',
# whose predicted probability for class index 1 exceeds 0.5,
# and whose true target is 1.
for i in range(len(test_texts[fs])):
    if ('fix' not in test_texts[fs][i].lower() 
        and c.predict_proba([test_texts[fs][i]])[0,1] > 0.5
        and test_targets[fs][i] == 1):
        explain_pred(i, fs)

In [None]:
# keyword 'fix'
for i in [22, 24]:
    explain_pred(i, 'ext3')

In [None]:
# when keyword 'fix' is absent
for i in [23, 25]:
    explain_pred(i, 'ext3')

In [None]:
# interesting case
for i in [5, 26]:
    explain_pred(i, 'ext3')