In [None]:
import sys
# Extend the module search path before importing `classifier` below
# (presumably the module lives in ../lib — confirm against the repo layout).
sys.path.append('../lib')
import classifier
from classifier import binary_bug, multi_patch_type, limited_patch_type, stem, lemmatize
from sklearn.svm import LinearSVC

import warnings
# Ignore every warning for the rest of the session.
# NOTE(review): this also hides warnings raised by library code during the
# experiments below; narrow the filter if those warnings matter.
warnings.simplefilter('ignore')

In [None]:
import pickle

# Load the pickled JIRA issue datasets used by every experiment below.
# The file is opened in a `with` block so the handle is closed as soon as
# unpickling finishes (the bare open() previously leaked it).
# NOTE: unpickling can execute arbitrary code — only load a trusted file.
with open('./data-jira/jira-issues.pickle', 'rb') as issues_file:
    datasets = pickle.load(issues_file)

In [None]:
# Projects handed to the Classifier together with the loaded datasets.
project_set = [
    'ace',
    'activemq',
    'aurora',
    'beam',
    'cassandra',
    'couchdb',
    'hbase',
    'hive',
    'incubator-systemml',
    'maven',
    'spark',
    'zookeeper',
]
# Single experiment runner reused by every cell below.
exp = classifier.Classifier(datasets, project_set)

# 1. Binary Classification of Bug Fix Patch

## 1.1 Use TF-IDF

In [None]:
# Binary bug classification, LinearSVC, TF * IDF on the title only.
exp.run(
    binary_bug,
    LinearSVC,
    text_feature='title',
)

In [None]:
# Binary bug classification, LinearSVC, TF * IDF on the description only.
exp.run(
    binary_bug,
    LinearSVC,
    text_feature='description',
)

In [None]:
# Binary bug classification, LinearSVC, TF * IDF on the comment text only.
exp.run(
    binary_bug,
    LinearSVC,
    text_feature='comment',
)

In [None]:
# Binary bug classification, LinearSVC, TF * IDF on title + description.
exp.run(
    binary_bug,
    LinearSVC,
    text_feature='title',
    use_description=True,
)

In [None]:
# Binary bug classification, LinearSVC, TF * IDF on title + description + comment.
exp.run(
    binary_bug,
    LinearSVC,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * IDF on title + description + comment, dropping terms whose
# frequency is lower than min_df (3 here).
exp.run(
    binary_bug,
    LinearSVC,
    min_df=3,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * IDF on the title only, tokenized with the `stem` tokenizer.
exp.run(
    binary_bug,
    LinearSVC,
    tokenizer=stem,
    text_feature='title',
)

In [None]:
# TF * IDF on title + description + comment, tokenized with the `stem` tokenizer.
exp.run(
    binary_bug,
    LinearSVC,
    tokenizer=stem,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * IDF on the title only, tokenized with the `lemmatize` tokenizer.
exp.run(
    binary_bug,
    LinearSVC,
    tokenizer=lemmatize,
    text_feature='title',
)

In [None]:
# TF * IDF on title + description + comment, tokenized with the `lemmatize` tokenizer.
exp.run(
    binary_bug,
    LinearSVC,
    tokenizer=lemmatize,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * IDF on the title only; keep the top k (5000) features
# ranked by mutual information.
exp.run(
    binary_bug,
    LinearSVC,
    k=5000,
    text_feature='title',
)

In [None]:
# TF * IDF on title + description + comment; keep the top k (5000)
# features ranked by mutual information.
exp.run(
    binary_bug,
    LinearSVC,
    k=5000,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * IDF on the stemmed title, combined with feature selection at k=3000.
exp.run(
    binary_bug,
    LinearSVC,
    tokenizer=stem,
    k=3000,
    text_feature='title',
)

In [None]:
# TF * IDF on the stemmed title, combined with feature selection at k=5000.
exp.run(
    binary_bug,
    LinearSVC,
    tokenizer=stem,
    k=5000,
    text_feature='title',
)

## 1.2 Use BNS instead of TF-IDF

In [None]:
# TF * BNS weighting. No text_feature is passed here, so whatever default
# Classifier.run defines applies.
# NOTE(review): every other cell passes text_feature explicitly — confirm
# the default is the intended text source.
exp.run(
    binary_bug,
    LinearSVC,
    use_bns=True,
)

In [None]:
# TF * BNS on title + description + comment.
exp.run(
    binary_bug,
    LinearSVC,
    use_bns=True,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * BNS on title + description + comment, dropping terms whose
# frequency is lower than min_df (3 here).
exp.run(
    binary_bug,
    LinearSVC,
    use_bns=True,
    min_df=3,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * BNS on title + description + comment, tokenized with the `stem` tokenizer.
exp.run(
    binary_bug,
    LinearSVC,
    use_bns=True,
    tokenizer=stem,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * BNS on title + description + comment, tokenized with the `lemmatize` tokenizer.
exp.run(
    binary_bug,
    LinearSVC,
    use_bns=True,
    tokenizer=lemmatize,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * BNS on title + description + comment; keep the top k (5000)
# features ranked by mutual information.
exp.run(
    binary_bug,
    LinearSVC,
    use_bns=True,
    k=5000,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

# 2. Multi-Class Classification of Patch Type

## 2.1 Merged-Type Classification

In [None]:
# Merged patch-type classification, LinearSVC, TF * IDF on the title only.
exp.run(
    limited_patch_type,
    LinearSVC,
    text_feature='title',
)

In [None]:
# Merged patch-type classification, TF * IDF on title + description.
exp.run(
    limited_patch_type,
    LinearSVC,
    text_feature='title',
    use_description=True,
)

In [None]:
# Merged patch-type classification, TF * IDF on title + description + comment.
exp.run(
    limited_patch_type,
    LinearSVC,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# Merged patch-type classification on the stemmed title,
# combined with feature selection at k=3000.
exp.run(
    limited_patch_type,
    LinearSVC,
    tokenizer=stem,
    k=3000,
    text_feature='title',
)

## 2.2 Non-Merged-Type Classification

In [None]:
# Non-merged patch-type classification, LinearSVC, TF * IDF on the title only.
exp.run(
    multi_patch_type,
    LinearSVC,
    text_feature='title',
)

In [None]:
# Non-merged patch-type classification, TF * IDF on the description only.
exp.run(
    multi_patch_type,
    LinearSVC,
    text_feature='description',
)

In [None]:
# Non-merged patch-type classification, TF * IDF on the comment text only.
exp.run(
    multi_patch_type,
    LinearSVC,
    text_feature='comment',
)

In [None]:
# Non-merged patch-type classification, TF * IDF on title + description.
exp.run(
    multi_patch_type,
    LinearSVC,
    text_feature='title',
    use_description=True,
)

In [None]:
# Non-merged patch-type classification, TF * IDF on title + description + comment.
exp.run(
    multi_patch_type,
    LinearSVC,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * IDF on title + description + comment, tokenized with the `stem` tokenizer.
exp.run(
    multi_patch_type,
    LinearSVC,
    tokenizer=stem,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * IDF on title + description + comment, dropping terms whose
# frequency is lower than min_df (3 here).
exp.run(
    multi_patch_type,
    LinearSVC,
    min_df=3,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * IDF on title + description + comment; keep the top k (5000)
# features ranked by mutual information.
exp.run(
    multi_patch_type,
    LinearSVC,
    k=5000,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * IDF on the stemmed title only (no description/comment),
# combined with feature selection at k=3000.
exp.run(
    multi_patch_type,
    LinearSVC,
    tokenizer=stem,
    k=3000,
    text_feature='title',
)

In [None]:
# Non-merged patch-type classification, TF * BNS on title + description + comment.
exp.run(
    multi_patch_type,
    LinearSVC,
    use_bns=True,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * BNS on title + description + comment, dropping terms whose
# frequency is lower than min_df (3 here).
exp.run(
    multi_patch_type,
    LinearSVC,
    use_bns=True,
    min_df=3,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# TF * BNS on title + description + comment; keep the top k (5000)
# features ranked by mutual information.
exp.run(
    multi_patch_type,
    LinearSVC,
    use_bns=True,
    k=5000,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

# 3. Random Forest

In [None]:
import pprint
from sklearn.ensemble import RandomForestClassifier
# Pretty-printer instance. NOTE(review): `pp` is not referenced by any of the
# cells visible in this notebook — confirm it is still needed.
pp = pprint.PrettyPrinter()

## 3.1 Binary Classification

In [None]:
# Binary bug classification with RandomForestClassifier on the title,
# num_estimators=100.
exp.run(
    binary_bug,
    RandomForestClassifier,
    text_feature='title',
    use_rf=True,
    num_estimators=100,
)

In [None]:
# Binary bug classification with RandomForestClassifier on the title,
# num_estimators=300.
exp.run(
    binary_bug,
    RandomForestClassifier,
    text_feature='title',
    use_rf=True,
    num_estimators=300,
)

In [None]:
# Binary bug classification with RandomForestClassifier on the title,
# num_estimators=500.
exp.run(
    binary_bug,
    RandomForestClassifier,
    text_feature='title',
    use_rf=True,
    num_estimators=500,
)

In [None]:
# Binary bug classification with RandomForestClassifier on the description only
# (num_estimators left at whatever Classifier.run defaults to).
exp.run(
    binary_bug,
    RandomForestClassifier,
    text_feature='description',
    use_rf=True,
)

In [None]:
# Binary bug classification with RandomForestClassifier on the comment text only.
exp.run(
    binary_bug,
    RandomForestClassifier,
    text_feature='comment',
    use_rf=True,
)

In [None]:
# Binary bug classification with RandomForestClassifier on title + description.
exp.run(
    binary_bug,
    RandomForestClassifier,
    text_feature='title',
    use_rf=True,
    use_description=True,
)

In [None]:
# Binary bug classification with RandomForestClassifier on
# title + description + comment.
exp.run(
    binary_bug,
    RandomForestClassifier,
    text_feature='title',
    use_rf=True,
    use_description=True,
    use_comment=True,
)

In [None]:
# Binary bug classification with RandomForestClassifier on the stemmed title,
# with k=3000.
exp.run(
    binary_bug,
    RandomForestClassifier,
    tokenizer=stem,
    k=3000,
    text_feature='title',
    use_rf=True,
)

## 3.2 Merged Type Classification

In [None]:
# Merged patch-type classification with RandomForestClassifier on the title only.
exp.run(
    limited_patch_type,
    RandomForestClassifier,
    text_feature='title',
    use_rf=True,
)

In [None]:
# Merged patch-type classification with RandomForestClassifier on title + description.
exp.run(
    limited_patch_type,
    RandomForestClassifier,
    text_feature='title',
    use_rf=True,
    use_description=True,
)

In [None]:
# Merged patch-type classification with RandomForestClassifier on
# title + description + comment.
exp.run(
    limited_patch_type,
    RandomForestClassifier,
    text_feature='title',
    use_rf=True,
    use_description=True,
    use_comment=True,
)

In [None]:
# Merged patch-type classification with RandomForestClassifier on the
# stemmed title, with k=3000.
exp.run(
    limited_patch_type,
    RandomForestClassifier,
    tokenizer=stem,
    k=3000,
    text_feature='title',
    use_rf=True,
)

## 3.3 Non-Merged Type Classification

In [None]:
# Non-merged patch-type classification with RandomForestClassifier on the title only.
exp.run(
    multi_patch_type,
    RandomForestClassifier,
    text_feature='title',
    use_rf=True,
)

In [None]:
# Non-merged patch-type classification with RandomForestClassifier on
# title + description.
exp.run(
    multi_patch_type,
    RandomForestClassifier,
    text_feature='title',
    use_rf=True,
    use_description=True,
)

In [None]:
# Non-merged patch-type classification with RandomForestClassifier on
# title + description + comment.
exp.run(
    multi_patch_type,
    RandomForestClassifier,
    text_feature='title',
    use_rf=True,
    use_description=True,
    use_comment=True,
)

In [None]:
# Non-merged patch-type classification with RandomForestClassifier on the
# stemmed title, with k=3000.
exp.run(
    multi_patch_type,
    RandomForestClassifier,
    tokenizer=stem,
    k=3000,
    text_feature='title',
    use_rf=True,
)

# 4. KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Binary bug classification with KNeighborsClassifier on the title only.
exp.run(
    binary_bug,
    KNeighborsClassifier,
    text_feature='title',
)

In [None]:
# Binary bug classification with KNeighborsClassifier on the description only.
exp.run(
    binary_bug,
    KNeighborsClassifier,
    text_feature='description',
)

In [None]:
# Binary bug classification with KNeighborsClassifier on the comment text only.
exp.run(
    binary_bug,
    KNeighborsClassifier,
    text_feature='comment',
)

In [None]:
# Binary bug classification with KNeighborsClassifier on
# title + description + comment.
exp.run(
    binary_bug,
    KNeighborsClassifier,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

In [None]:
# Non-merged patch-type classification with KNeighborsClassifier on
# title + description + comment.
exp.run(
    multi_patch_type,
    KNeighborsClassifier,
    text_feature='title',
    use_description=True,
    use_comment=True,
)

# 5. SVM 

In [None]:
from sklearn import svm

In [None]:
# Binary bug classification with svm.SVC on the title, svm_type='linear'
# (presumably the SVC kernel — confirm in classifier.Classifier.run).
exp.run(
    binary_bug,
    svm.SVC,
    text_feature='title',
    use_svm=True,
    svm_type='linear',
)

In [None]:
# Binary bug classification with svm.SVC on the title, svm_type='poly'
# (presumably the SVC kernel — confirm in classifier.Classifier.run).
exp.run(
    binary_bug,
    svm.SVC,
    text_feature='title',
    use_svm=True,
    svm_type='poly',
)

In [None]:
# Binary bug classification with svm.SVC on the title, svm_type='rbf'
# (presumably the SVC kernel — confirm in classifier.Classifier.run).
exp.run(
    binary_bug,
    svm.SVC,
    text_feature='title',
    use_svm=True,
    svm_type='rbf',
)