In [20]:
from collections import defaultdict
import numpy as np,os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, power_transform, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.utils import shuffle
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, \
        confusion_matrix, matthews_corrcoef, precision_recall_curve, auc
from imblearn.ensemble import BalancedRandomForestClassifier
import seaborn as sns,matplotlib.pyplot as plt

from library.utils import evaluate, read_data

In [16]:
def oob_eval(clf,X,y_noisy,y_real):
    scores = defaultdict(list)
    for _ in range(100):
        train_id = np.random.choice(range(len(X)),size=len(X))
        clf = clf.fit(X[train_id],y_noisy[train_id])
        test_mask = np.ones_like(y_noisy,dtype='bool')
        test_mask[train_id] = 0
        probs = clf.predict_proba(X[test_mask])
        labels = np.argmax(probs,axis=1)
        assert probs.shape==(test_mask.sum(),2) 
        assert labels.shape==(test_mask.sum(),)
        scores['pr'] = precision_score(y_real[test_mask],labels)
        scores['rec'] = recall_score(y_real[test_mask],labels)
        scores['f1'] = f1_score(y_real[test_mask],labels)
        scores['roc'] = roc_auc_score(y_real[test_mask],probs[:,1])
#         scores['mathew'] = matthews_corrcoef(y_real[test_mask],labels)
#         prec, rec, _ = precision_recall_curve(y_real[test_mask],probs[:,1])
#         scores['APRC'] = auc(rec,prec)
    for k in scores:
        scores[k] = np.array(scores[k]).mean()
        print(k,scores[k])
    return scores

In [17]:
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]
DATASETS = shuffle(DATASETS)

In [21]:
rf = RandomForestClassifier(n_estimators=50,n_jobs=-1,random_state=42)
lr = LogisticRegression(max_iter=1000,random_state=42)

In [22]:
%%time
res = []
for d in DATASETS[:5]:
    X,y_noisy,y_real = read_data(d)
    print(d)
    tn, fp, fn, tp = confusion_matrix(y_real,y_noisy).ravel()
    print(f"Real:{y_real.sum()}, Heu:{y_noisy.sum()}, Actual % Bugs,Clean: {tp/(tp+fp):.3f},{tn/(tn+fn):.3f}")
    r = oob_eval(rf,X,y_noisy,y_real)
    res.append(r)
    print()

noise:0.085, imb:8.463,80,677, Shape:(757, 65)
groovy-1_5_7.csv
Real:26, Heu:80, Actual % Bugs,Clean: 0.263,0.993
pr 0.3125
rec 0.7142857142857143
f1 0.43478260869565216
roc 0.9348279457768508
mathew 0.4534102097062684
APRC 0.42738897445770757

noise:0.044, imb:34.600,200,6920, Shape:(7120, 65)
camel-2.9.0.csv
Real:199, Heu:200, Actual % Bugs,Clean: 0.215,0.977
pr 0.42857142857142855
rec 0.039473684210526314
f1 0.07228915662650602
roc 0.7567069424534099
mathew 0.12329391263424652
APRC 0.16780366786474793

noise:0.234, imb:17.341,91,1578, Shape:(1669, 65)
hbase-0.95.0.csv
Real:383, Heu:91, Actual % Bugs,Clean: 0.462,0.784
pr 1.0
rec 0.0546875
f1 0.1037037037037037
roc 0.6266425862970711
mathew 0.20890294972548062
APRC 0.3947981495219468

noise:0.185, imb:6.037,190,1147, Shape:(1337, 65)
lucene-3.0.0.csv
Real:155, Heu:190, Actual % Bugs,Clean: 0.258,0.908
pr 0.125
rec 0.06666666666666667
f1 0.08695652173913045
roc 0.7094679186228482
mathew 0.001245454601036818
APRC 0.19829206083454418

n

In [None]:
np.random.choice(DATASETS,5)

In [None]:
np.unique(train).shape

In [None]:
358/978