In [2]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from collections import defaultdict
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.ensemble import BalancedBaggingClassifier, RUSBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.pipeline import Pipeline

from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
import seaborn as sns,matplotlib.pyplot as plt

from library.configs import IMBS, CLFS, ENSEMBLES, CV, SCORERS
from library.utils import evaluate, read_data
from library.cleaners import kDN, ih_prob,FilteringEstimator

In [3]:
DATASETS = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]
len(DATASETS)

32

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
class CLNI(BaseEstimator, ClassifierMixin):
    def __init__(self, estimator, detector, K=5, threshold=.999, random_state=None):
        self.estimator = estimator
        self.detector = detector
        self.threshold = threshold
        self.K = K
        self.random_state = random_state
        
    def clean(self,X,Y, sample_weight):
        N,alpha = 5,.60
        Xt,Yt = X.copy(),Y.copy()
        while True:
            ne = self.detector(Xt,Yt,K=self.K)
            cidx = ne<=alpha
            #print(cidx.sum(),len(Xt),cidx.sum()/len(Xt))
            N = len(Xt)
            Xt,Yt = Xt[cidx],Yt[cidx]
            try:
                sample_weight = sample_weight[cidx]
            except:
                pass
            if cidx.sum()/N>=.99:
                break
        return Xt,Yt,sample_weight

    def fit(self, X, Y,sample_weight=None):
        Xf,Yf,sample_weight = self.clean(X, Y, sample_weight)
#         a,b = np.unique(Y,return_counts=True)[1],np.unique(Yf,return_counts=True)[1]
#         print(a.max()/a.min(),b.max()/b.min())
        try:
            self.estimator = self.estimator.fit(Xf, Yf,sample_weight=sample_weight)
        except TypeError as e:
            self.estimator = self.estimator.fit(Xf, Yf)
        return self

    @property
    def classes_(self):
        return self.estimator.classes_

    def predict(self, X):
        return self.estimator.predict(X)
    
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

In [9]:
models = {}
for im,samp in IMBS.items():
    for c,clf in CLFS.items():
        models[(im,c)] = Pipeline([('samp',samp),('clf',CLNI(clf,kDN))])
    
models.keys(),len(models)

(dict_keys([('smote', 'dt'), ('smote', 'lr'), ('smote', 'nb'), ('smote', 'svm'), ('smote', 'knn'), ('smote', 'rf'), ('rus', 'dt'), ('rus', 'lr'), ('rus', 'nb'), ('rus', 'svm'), ('rus', 'knn'), ('rus', 'rf'), ('wilson', 'dt'), ('wilson', 'lr'), ('wilson', 'nb'), ('wilson', 'svm'), ('wilson', 'knn'), ('wilson', 'rf'), ('tomek', 'dt'), ('tomek', 'lr'), ('tomek', 'nb'), ('tomek', 'svm'), ('tomek', 'knn'), ('tomek', 'rf'), ('None', 'dt'), ('None', 'lr'), ('None', 'nb'), ('None', 'svm'), ('None', 'knn'), ('None', 'rf')]),
 30)

In [5]:
path = "CLNI_consensus.csv"
cols = pd.MultiIndex.from_product([IMBS.keys(),CLFS.keys(),[f.__name__ for f in SCORERS]],names=['imb','clf','metric'])
#df = pd.DataFrame(index=DATASETS,columns=cols)
#df = pd.read_csv(path,header=[0,1,2],index_col=0)

In [10]:
for it,d in enumerate(DATASETS):
    print(it)
    X,y_noisy,y_real = read_data(d,stats=True)
    if df.loc[d,:].isna().sum()==0:
        print(f"Skipping {d}")
        continue
    for k in models:
        print(k)
        sd = perf_counter()
        r = evaluate(models[k],X,y_noisy,y_real,CV,SCORERS)
        for f in r:
            df.loc[d,(k[0],k[1],f)] = r[f].mean()
        print(round(perf_counter()-sd,2),[round(r[f].mean(),3) for f in r])
    print()
    df.to_csv(path)

0
activemq-5.8.0.csv noise:0.058, imb:15.847,203,3217, Shape:(3420, 65)
Skipping activemq-5.8.0.csv
1
groovy-1_6_BETA_1.csv noise:0.128, imb:6.017,117,704, Shape:(821, 65)
Skipping groovy-1_6_BETA_1.csv
2
activemq-5.3.0.csv noise:0.094, imb:15.669,142,2225, Shape:(2367, 65)
Skipping activemq-5.3.0.csv
3
wicket-1.3.0-incubating-beta-1.csv noise:0.164, imb:4.806,288,1384, Shape:(1672, 65)
Skipping wicket-1.3.0-incubating-beta-1.csv
4
jruby-1.1.csv noise:0.175, imb:3.540,161,570, Shape:(731, 65)
Skipping jruby-1.1.csv
5
jruby-1.4.0.csv noise:0.190, imb:3.890,200,778, Shape:(978, 65)
Skipping jruby-1.4.0.csv
6
lucene-2.3.0.csv noise:0.204, imb:4.031,160,645, Shape:(805, 65)
Skipping lucene-2.3.0.csv
7
hbase-0.95.2.csv noise:0.260, imb:15.088,114,1720, Shape:(1834, 65)
('smote', 'dt')
11.05 [0.156, 0.439]
('smote', 'lr')
14.65 [0.182, 0.418]
('smote', 'nb')
9.44 [0.324, 0.52]
('smote', 'svm')
45.34 [0.196, 0.415]
('smote', 'knn')
10.3 [0.168, 0.454]
('smote', 'rf')
16.44 [0.23, 0.475]
('rus

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


6.3 [0.145, 0.58]
('None', 'lr')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


7.08 [0.113, 0.46]
('None', 'nb')
7.71 [0.222, 0.493]
('None', 'svm')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


8.54 [0.151, 0.289]
('None', 'knn')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


7.99 [0.117, 0.496]
('None', 'rf')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


8.07 [0.14, 0.446]

8
lucene-3.0.0.csv noise:0.185, imb:6.037,190,1147, Shape:(1337, 65)
('smote', 'dt')
11.82 [0.245, 0.431]
('smote', 'lr')
13.0 [0.316, 0.362]
('smote', 'nb')
11.32 [0.31, 0.335]
('smote', 'svm')
24.18 [0.322, 0.362]
('smote', 'knn')
12.73 [0.269, 0.455]
('smote', 'rf')
14.75 [0.335, 0.305]
('rus', 'dt')
9.73 [0.264, 0.494]
('rus', 'lr')
11.98 [0.332, 0.331]
('rus', 'nb')
9.53 [0.284, 0.355]
('rus', 'svm')
9.97 [0.327, 0.323]
('rus', 'knn')
9.62 [0.264, 0.397]
('rus', 'rf')
11.18 [0.337, 0.324]
('wilson', 'dt')
8.02 [0.3, 0.454]
('wilson', 'lr')
10.01 [0.327, 0.338]
('wilson', 'nb')
8.04 [0.301, 0.369]
('wilson', 'svm')
9.58 [0.298, 0.314]
('wilson', 'knn')
8.07 [0.302, 0.373]
('wilson', 'rf')
10.12 [0.329, 0.319]
('tomek', 'dt')
12.2 [0.168, 0.3]
('tomek', 'lr')
13.93 [0.206, 0.308]
('tomek', 'nb')
11.84 [0.25, 0.338]
('tomek', 'svm')
13.63 [0.135, 0.26]
('tomek', 'knn')
12.2 [0.138, 0.256]
('tomek', 'rf')
13.65 [0.123, 0.296]
('None', 'dt')
10.47 [0.15, 0.283]
('No

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


56.85 [0.18, 0.256]
('tomek', 'nb')
52.75 [0.281, 0.292]
('tomek', 'svm')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


58.86 [0.151, 0.156]
('tomek', 'knn')
60.02 [0.175, 0.259]
('tomek', 'rf')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


56.43 [0.177, 0.24]
('None', 'dt')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


26.68 [0.17, 0.301]
('None', 'lr')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


26.8 [0.162, 0.258]
('None', 'nb')
25.7 [0.274, 0.276]
('None', 'svm')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


29.77 [0.136, 0.147]
('None', 'knn')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


35.78 [0.164, 0.263]
('None', 'rf')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


31.06 [0.131, 0.247]

10
wicket-1.5.3.csv noise:0.064, imb:26.720,93,2485, Shape:(2578, 65)
('smote', 'dt')
20.73 [0.121, 0.168]
('smote', 'lr')
23.67 [0.169, 0.152]
('smote', 'nb')
17.89 [0.206, 0.352]
('smote', 'svm')
84.85 [0.15, 0.135]
('smote', 'knn')
20.14 [0.151, 0.269]
('smote', 'rf')
28.69 [0.155, 0.13]
('rus', 'dt')
8.2 [0.198, 0.42]
('rus', 'lr')
10.02 [0.203, 0.19]
('rus', 'nb')
8.71 [0.22, 0.391]
('rus', 'svm')
9.21 [0.217, 0.152]
('rus', 'knn')
8.85 [0.201, 0.355]
('rus', 'rf')
10.26 [0.219, 0.204]
('wilson', 'dt')
12.08 [0.117, 0.171]
('wilson', 'lr')
13.77 [0.137, 0.195]
('wilson', 'nb')
11.03 [0.222, 0.294]
('wilson', 'svm')
13.47 [0.146, 0.149]
('wilson', 'knn')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


12.65 [0.111, 0.175]
('wilson', 'rf')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


13.31 [0.135, 0.19]
('tomek', 'dt')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


IndexError: index 1 is out of bounds for axis 1 with size 1

In [11]:
DATASETS.index("jruby-1.1.csv")

4