In [1]:
from collections import defaultdict
import numpy as np,os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, power_transform, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.utils import shuffle
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.ensemble import BalancedRandomForestClassifier
import seaborn as sns,matplotlib.pyplot as plt

from library.utils import evaluate, read_data

In [2]:
def oob_eval(clf,X,y_noisy,y_real):
    scores = defaultdict(list)
    for _ in range(100):
        train_id = np.random.choice(range(len(X)),size=len(X))
        clf = clf.fit(X[train_id],y_noisy[train_id])
        test_mask = np.ones_like(y_noisy,dtype='bool')
        test_mask[train_id] = 0
        probs = clf.predict_proba(X[test_mask])
        labels = np.argmax(probs,axis=1)
        assert probs.shape==(test_mask.sum(),2) 
        assert labels.shape==(test_mask.sum(),)
        scores['pr'] = precision_score(y_real[test_mask],labels)
        scores['rec'] = recall_score(y_real[test_mask],labels)
        scores['f1'] = f1_score(y_real[test_mask],labels)
        scores['roc'] = roc_auc_score(y_real[test_mask],probs[:,1])
    for k in scores:
        scores[k] = np.array(scores[k]).mean()
        print(k,scores[k])
    return scores

In [3]:
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]

In [4]:
rf = BalancedRandomForestClassifier(n_estimators=50,n_jobs=-1,random_state=42)

In [None]:
%%time
res = []
for d in DATASETS:
    X,y_noisy,y_real = read_data(d)
    print(d)
    tn, fp, fn, tp = confusion_matrix(y_real,y_noisy).ravel()
    print(f"Real:{y_real.sum()}, Heu:{y_noisy.sum()}, Actual % Bugs,Clean: {tp/(tp+fp):.3f},{tn/(tn+fn):.3f}")
    r = oob_eval(rf,X,y_noisy,y_real)
    res.append(r)
    print()

noise:0.058, imb:15.847,203,3217, Shape:(3420, 65)
activemq-5.8.0.csv
Real:206, Heu:203, Actual % Bugs,Clean: 0.522,0.969
pr 0.11370262390670553
rec 0.6842105263157895
f1 0.19499999999999998
roc 0.8218634822349993

noise:0.128, imb:6.017,117,704, Shape:(821, 65)
groovy-1_6_BETA_1.csv
Real:70, Heu:117, Actual % Bugs,Clean: 0.350,0.959
pr 0.23076923076923078
rec 0.782608695652174
f1 0.3564356435643565
roc 0.8273836765827612

noise:0.094, imb:15.669,142,2225, Shape:(2367, 65)
activemq-5.3.0.csv
Real:258, Heu:142, Actual % Bugs,Clean: 0.627,0.924
pr 0.3333333333333333
rec 0.6470588235294118
f1 0.44
roc 0.817630549629178

noise:0.164, imb:4.806,288,1384, Shape:(1672, 65)
wicket-1.3.0-incubating-beta-1.csv
Real:101, Heu:288, Actual % Bugs,Clean: 0.198,0.968
pr 0.18064516129032257
rec 0.7777777777777778
f1 0.2931937172774869
roc 0.8352941176470587

noise:0.175, imb:3.540,161,570, Shape:(731, 65)
jruby-1.1.csv
Real:87, Heu:161, Actual % Bugs,Clean: 0.373,0.953
pr 0.23333333333333334
rec 0.8076

In [None]:
np.random.choice(DATASETS,5)

In [None]:
np.unique(train).shape

In [None]:
358/978