## Loading dataset

In [None]:
import warnings

import numpy as np
from sklearn.svm import LinearSVC

import classifier
from classifier import binary_bug, multi_patch_type, limited_patch_type, only_bug_filter, stem, lemmatize

warnings.simplefilter('ignore')

In [None]:
import pickle
# NOTE: unpickling can execute arbitrary code; only load dataset files from a
# trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('../data/fs-patch.pickle', 'rb') as dataset_file:
    datasets = pickle.load(dataset_file)

In [None]:
# File systems covered by the experiments; the same names are used as
# dictionary keys in later cells (e.g. exp.classifiers['ext3']).
fss = ['ext3', 'ext4', 'btrfs', 'xfs', 'jfs', 'reiserfs']
# Experiment object shared by every exp.run(...) cell below.
exp = classifier.Classifier(datasets, fss)

## Binary Classification of Bug Fix Patch

In [None]:
# FRC features only; text features are switched off (use_text=False)
exp.run(binary_bug, LinearSVC, use_text=False, use_frc=True)

In [None]:
# text features only (TF * IDF); no optional flags are passed
exp.run(binary_bug, LinearSVC)

In [None]:
# text features only (TF * BNS), enabled with use_bns=True
exp.run(binary_bug, LinearSVC, use_bns=True)

In [None]:
# text (TF * IDF) combined with FRC features (use_frc=True)
exp.run(binary_bug, LinearSVC, use_frc=True)

In [None]:
# text (TF * BNS) combined with FRC features
exp.run(binary_bug, LinearSVC, use_bns=True, use_frc=True)

In [None]:
# text (TF * IDF) combined with FRC features
# min_df=3: drop terms with frequency lower than 3
exp.run(binary_bug, LinearSVC, min_df=3, use_frc=True)

In [None]:
# text (TF * BNS) combined with FRC features
# min_df=3: drop terms with frequency lower than 3
exp.run(binary_bug, LinearSVC, use_bns=True, min_df=3, use_frc=True)

In [None]:
# stemmed text (TF * BNS) combined with FRC features;
# classifier.stem is passed as the tokenizer
exp.run(binary_bug, LinearSVC, use_bns=True, use_frc=True, tokenizer=stem)

In [None]:
# lemmatized text (TF * BNS) combined with FRC features;
# classifier.lemmatize is passed as the tokenizer
exp.run(binary_bug, LinearSVC, use_bns=True, use_frc=True, tokenizer=lemmatize)

In [None]:
# text (TF * BNS) combined with FRC features
# k=5000: select the top 5000 features by mutual information
exp.run(binary_bug, LinearSVC, use_bns=True, use_frc=True, k=5000)

## Multi-Class Classification of Patch Type

In [None]:
# FRC features only; text features are switched off (use_text=False)
exp.run(multi_patch_type, LinearSVC, use_text=False, use_frc=True)

In [None]:
# text features only (TF * IDF); no optional flags are passed
exp.run(multi_patch_type, LinearSVC)

In [None]:
# text features only (TF * BNS), enabled with use_bns=True
exp.run(multi_patch_type, LinearSVC, use_bns=True)

In [None]:
# text (TF * IDF) combined with FRC features (use_frc=True)
exp.run(multi_patch_type, LinearSVC, use_frc=True)

In [None]:
# text (TF * BNS) combined with FRC features
exp.run(multi_patch_type, LinearSVC, use_bns=True, use_frc=True)

In [None]:
# text (TF * BNS) combined with FRC features
# min_df=3: drop terms with frequency lower than 3
exp.run(multi_patch_type, LinearSVC, use_bns=True, min_df=3, use_frc=True)

In [None]:
# text (TF * BNS) combined with FRC features
# k=5000: select the top 5000 features by mutual information
exp.run(multi_patch_type, LinearSVC, use_bns=True, use_frc=True, k=5000)

## Multi-Class Classification of Bug Consequences

In [None]:
# only using text (TF * IDF); only_bug_filter is applied as dp_filter
# multi_bug_cons is not among the names imported from classifier, so it is
# referenced through the module to avoid a NameError.
exp.run(classifier.multi_bug_cons, LinearSVC, dp_filter=only_bug_filter)

In [None]:
# only using text (TF * BNS); only_bug_filter is applied as dp_filter
# multi_bug_cons is not among the names imported from classifier, so it is
# referenced through the module to avoid a NameError.
exp.run(classifier.multi_bug_cons, LinearSVC, dp_filter=only_bug_filter,
        use_bns=True)

In [None]:
# using both text (TF * IDF) and frc; only_bug_filter is applied as dp_filter
# multi_bug_cons is not among the names imported from classifier, so it is
# referenced through the module to avoid a NameError.
exp.run(classifier.multi_bug_cons, LinearSVC, dp_filter=only_bug_filter,
        use_frc=True)

In [None]:
# using both text (TF * BNS) and frc; only_bug_filter is applied as dp_filter
# multi_bug_cons is not among the names imported from classifier, so it is
# referenced through the module to avoid a NameError.
exp.run(classifier.multi_bug_cons, LinearSVC, dp_filter=only_bug_filter,
        use_bns=True, use_frc=True)

In [None]:
# using both n-gram text (TF * BNS) and frc
# ngram_range=(1, 2): presumably unigrams plus bigrams, as in scikit-learn
# vectorizers -- confirm against classifier.Classifier.run
# multi_bug_cons is not among the names imported from classifier, so it is
# referenced through the module to avoid a NameError.
exp.run(classifier.multi_bug_cons, LinearSVC, dp_filter=only_bug_filter,
        use_bns=True, use_frc=True, ngram_range=(1, 2))

In [None]:
# only using text (TF * IDF); only_bug_filter is applied as dp_filter
# k=5000: select the top 5000 features by mutual information
# multi_bug_cons is not among the names imported from classifier, so it is
# referenced through the module to avoid a NameError.
exp.run(classifier.multi_bug_cons, LinearSVC, dp_filter=only_bug_filter,
        k=5000)

## Explaining Random Forest

In [None]:
import pprint
# Shared pretty-printer instance used by the reporting helpers in later cells.
pp = pprint.PrettyPrinter()

In [None]:
# explaining random forest classifier
def rank_features_by_importance(exp, top_n=20):
    """Print the most important features of every trained classifier.

    For each file system in ``exp.classifiers`` the features are ranked by
    the classifier's ``feature_importances_`` values (highest first) and the
    ``top_n`` best are pretty-printed as ``(importance, feature_label)``
    tuples, with the importance formatted to four decimal places.

    If any classifier does not expose ``feature_importances_`` nothing is
    ranked and a single notice is printed instead.

    Args:
        exp: An object with ``classifiers`` and ``feature_labels``
            attributes, both dictionaries keyed by file system name.
            ``feature_labels[fs]`` lists feature names in the order the
            classifier sees them.
        top_n: An integer, specifies number of top features to print.
    """
    classifiers = exp.classifiers
    # hasattr (unlike a dir() lookup) actually evaluates the attribute, so an
    # attribute that raises AttributeError on access counts as missing.
    if not all(hasattr(clf, 'feature_importances_')
               for clf in classifiers.values()):
        print("classifiers don't have attribute feature_importances_")
        return

    for fs, clf in classifiers.items():
        print('------- important features for %s -------' % fs)
        # Sort on the numeric importance and format afterwards, so the order
        # never depends on how the formatted strings compare.
        ranked = sorted(zip(clf.feature_importances_, exp.feature_labels[fs]),
                        reverse=True)
        pprint.pprint([('%.4f' % importance, label)
                       for importance, label in ranked[:top_n]])

In [None]:
# Print the 20 top-ranked features per file system for the most recent run;
# a notice is printed instead when the classifiers lack feature importances.
rank_features_by_importance(exp, top_n=20)

## Explaining SVM

In [None]:
# explaining linear SVM classifier
def _top_feature_labels(labels, weights, top_n):
    """Return the labels of the ``top_n`` largest weights, strongest last."""
    order = np.argsort(weights)
    # Use an explicit start index: ``[-top_n:]`` would return every label
    # when top_n is 0.
    start = max(len(order) - top_n, 0)
    return labels[order[start:]]


def rank_features_by_weight(classifiers, classes, feature_labels,
                            top_n=20, individual_class=False):
    """Rank features by the absolute value of associated primal weight

    For every file system the names of the ``top_n`` highest-ranked features
    are pretty-printed in ascending order of weight magnitude (the strongest
    feature comes last). If any classifier has no ``coef_`` attribute,
    nothing is ranked and a single notice is printed instead.

    Args:
        classifiers: A dictionary, with key being the file system name
            and value being the corresponding classifier.
        classes: A list of class labels.
        feature_labels: A dictionary of list of feature names, each key
            is a file system name, features are in the order as classifier
            sees them.
        top_n: An integer, specifies number of top features to print.
        individual_class: A boolean value. If True, a separate ranking is
            printed for every row of ``coef_`` (one per class). If False,
            features are ranked by the sum of absolute weights across all
            rows. In binary tasks ``coef_`` has a single row, so both modes
            produce the same ranking.
    """
    if not all(hasattr(clf, 'coef_') for clf in classifiers.values()):
        print("classifiers don't have attribute coef_")
        return

    for fs, clf in classifiers.items():
        print('------- %s -------' % fs)
        abs_coef = np.absolute(clf.coef_)
        labels = np.array(feature_labels[fs])
        if individual_class:
            # A binary linear model stores one weight row, and that row
            # belongs to the positive class, i.e. the second class label.
            single_binary_row = abs_coef.shape[0] == 1 and len(classes) == 2
            for i, row in enumerate(abs_coef):
                class_label = classes[1] if single_binary_row else classes[i]
                print('\t------ %s ------' % class_label)
                pprint.pprint(_top_feature_labels(labels, row, top_n))
            print()
        else:
            pprint.pprint(
                _top_feature_labels(labels, abs_coef.sum(axis=0), top_n))

In [None]:
# Rank features by weight summed over all classes (individual_class defaults
# to False). The class labels are taken from the ext3 training targets and
# are passed on unchanged for every file system.
rank_features_by_weight(exp.classifiers, np.unique(exp.train_targets['ext3']), 
                        exp.feature_labels, top_n=20)

In [None]:
# Same ranking, but printed separately for each class
# (individual_class=True). The class labels are taken from the ext3 training
# targets and are passed on unchanged for every file system.
rank_features_by_weight(exp.classifiers, np.unique(exp.train_targets['ext3']), 
                        exp.feature_labels, top_n=20, individual_class=True)

## LIME

In [None]:
from sklearn.pipeline import make_pipeline
# NOTE(review): `vectorizers`, `classifiers` and `test_texts` are not defined
# in any visible cell -- presumably per-file-system dictionaries taken from
# the experiment object; confirm where they come from.
# Chain the ext3 vectorizer and classifier so raw text can be scored directly.
c = make_pipeline(vectorizers['ext3'], classifiers['ext3'])
# NOTE(review): this needs a classifier that implements predict_proba;
# LinearSVC, used in the cells above, does not -- confirm the estimator.
print(c.predict_proba([test_texts['ext3'][1]]))

In [None]:
from lime.lime_text import LimeTextExplainer
# Class names indexed by target value: 0 -> 'not-bug', 1 -> 'bug'.
class_names = ['not-bug', 'bug']
# Explainer shared by explain_pred() for every file system.
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
def explain_pred(idx, fs):
    """Print a LIME explanation for one test patch of a file system.

    Builds a vectorizer + classifier pipeline for ``fs``, explains the
    lower-cased patch text with the shared ``explainer`` (8 features) and
    prints the patch id, the predicted bug probability, the true class, the
    original text and the explanation as a list.

    Args:
        idx: An integer, index of the patch within the test set of ``fs``.
        fs: A string, the file system name used as dictionary key.
    """
    pipeline = make_pipeline(vectorizers[fs], classifiers[fs])
    text = test_texts[fs][idx]
    # The explainer receives the lower-cased text; the probability printed
    # below is computed on the text as stored.
    explanation = explainer.explain_instance(
        text.lower(), pipeline.predict_proba, num_features=8)

    print('Patch id: %d' % idx)
    bug_probability = pipeline.predict_proba([text])[0, 1]
    print('Probability(bug) =', bug_probability)
    true_class = class_names[test_targets[fs][idx]]
    print('True class: %s' % true_class)
    print('Text: %s' % text)
    pp.pprint(explanation.as_list())

In [None]:
from sklearn.pipeline import make_pipeline
# Explain every ext3 test patch that is a bug, is predicted as a bug
# (probability above 0.5) and does not mention the keyword 'fix'.
fs = 'ext3'
c = make_pipeline(vectorizers[fs], classifiers[fs])
for i in range(len(test_texts[fs])):
    # Checks run cheapest-first, in the same order as a short-circuit `and`.
    if 'fix' in test_texts[fs][i].lower():
        continue
    if not c.predict_proba([test_texts[fs][i]])[0, 1] > 0.5:
        continue
    if test_targets[fs][i] == 1:
        explain_pred(i, fs)

In [None]:
# Examples where the keyword 'fix' is present
# (patch indices presumably picked by hand -- TODO confirm)
for i in [22, 24]:
    explain_pred(i, 'ext3')

In [None]:
# Examples where the keyword 'fix' is absent
# (patch indices presumably picked by hand -- TODO confirm)
for i in [23, 25]:
    explain_pred(i, 'ext3')

In [None]:
# Interesting cases singled out for closer inspection
# (patch indices presumably picked by hand -- TODO confirm)
for i in [5, 26]:
    explain_pred(i, 'ext3')