In [20]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from imblearn.under_sampling import InstanceHardnessThreshold, EditedNearestNeighbours
from imblearn.combine import SMOTEENN
from imblearn.pipeline import make_pipeline

from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
import seaborn as sns,matplotlib.pyplot as plt

from library.configs import CV, SCORERS,CLFS
from library.utils import evaluate, read_data

In [11]:
DATASETS = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]
len(DATASETS)

32

In [44]:
#ENN already done in Noisy.csv, but kind_sel='mode' here, so here aggressive than before
imbs = {'ENN':EditedNearestNeighbours(n_neighbors=5,kind_sel='mode'), 
       'SmoteEnc':SMOTEENN(),'IHThreshold':InstanceHardnessThreshold()}

bases = {'nb':GaussianNB(),'dt20':DecisionTreeClassifier(max_depth=20,max_features='sqrt'),
        'dt10':DecisionTreeClassifier(max_depth=10)}

models = {}
for im,samp in imbs.items():
        models[(im,"BagNB")] = BaggingClassifier(base_estimator=make_pipeline(samp,bases['nb']),n_estimators=20)
        models[(im,"BagDT")] = BaggingClassifier(base_estimator=make_pipeline(samp,bases['dt20']),n_estimators=20)

models.keys(),len(models)

(dict_keys([('ENN', 'BagNB'), ('ENN', 'BagDT'), ('SmoteEnc', 'BagNB'), ('SmoteEnc', 'BagDT'), ('IHThreshold', 'BagNB'), ('IHThreshold', 'BagDT')]),
 6)

In [45]:
path = "Builtin32_Bag.csv"
cols = pd.MultiIndex.from_product([imbs.keys(),["AdaNB","BagNB","AdaDT","BagDT"],
                                   [f.__name__ for f in SCORERS]],names=['imb','clf','metric'])
df = pd.DataFrame(index=DATASETS,columns=cols)

In [None]:
%%time
for it,d in enumerate(DATASETS):
    print(it)
    X,y_noisy,y_real = read_data(d,stats=True)
    for k in models:
        if df.loc[d,(k[0],k[1],slice(None))].isna().sum()==0:
            continue
        print(k)
        sd = perf_counter()
        r = evaluate(models[k],X,y_noisy,y_real,CV,SCORERS)
        for f in r:
            df.loc[d,(k[0],k[1],f)] = np.nanmean(r[f])
        print(round(perf_counter()-sd,2),[round(r[f].mean(),3) for f in r])
    print()
    df.to_csv(path)

0
activemq-5.8.0.csv noise:0.058, imb:15.847,203,3217, Shape:(3420, 65)
('ENN', 'BagNB')
53.87 [0.292, 0.373]
('ENN', 'BagDT')
54.68 [0.268, 0.321]
('SmoteEnc', 'BagNB')
72.39 [0.286, 0.399]
('SmoteEnc', 'BagDT')
73.58 [0.245, 0.282]
('IHThreshold', 'BagNB')
303.23 [0.269, 0.447]
('IHThreshold', 'BagDT')
303.92 [0.278, 0.302]

1
groovy-1_6_BETA_1.csv noise:0.128, imb:6.017,117,704, Shape:(821, 65)
('ENN', 'BagNB')
6.98 [0.24, 0.365]
('ENN', 'BagDT')
7.31 [0.452, 0.451]
('SmoteEnc', 'BagNB')
8.27 [0.267, 0.411]
('SmoteEnc', 'BagDT')
8.53 [0.446, 0.407]
('IHThreshold', 'BagNB')
131.62 [0.266, 0.403]
('IHThreshold', 'BagDT')
131.47 [0.285, 0.429]

2
activemq-5.3.0.csv noise:0.094, imb:15.669,142,2225, Shape:(2367, 65)
('ENN', 'BagNB')
33.41 [0.325, 0.453]
('ENN', 'BagDT')
33.94 [0.31, 0.444]
('SmoteEnc', 'BagNB')
44.9 [0.331, 0.458]
('SmoteEnc', 'BagDT')
68.49 [0.382, 0.453]
('IHThreshold', 'BagNB')
325.31 [0.312, 0.481]
('IHThreshold', 'BagDT')
341.91 [0.34, 0.448]

3
wicket-1.3.0-incuba

In [None]:
np.nanmean(p)