In [2]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from collections import defaultdict
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from imblearn.under_sampling import InstanceHardnessThreshold, RandomUnderSampler, EditedNearestNeighbours
from imblearn.pipeline import Pipeline

from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
import seaborn as sns,matplotlib.pyplot as plt

from library.configs import CLFS, CV, SCORERS
from library.utils import evaluate, read_data
from library.cleaners import kDN, ih_prob,FilteringEstimator

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
DATASETS = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]
len(DATASETS)

32

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
class Spyder(BaseEstimator, ClassifierMixin):
    def __init__(self,estimator,K=5):
        self.estimator = estimator
        self.K = K
        
    def sample(self,X,Y,sample_weight):    # SPIDER2, relabel=False and ampl=weak by default
        if sample_weight is None:
            sample_weight = np.ones_like(Y)

        # First step, Remove unsafe samples from majority, 
        enn = EditedNearestNeighbours(sampling_strategy='majority',n_neighbors=5,kind_sel='mode')
        Xs,Ys = enn.fit_resample(X,Y)
        sample_weight = sample_weight[enn.sample_indices_]
            
        #Second, upsample unsafe minority samples
        disagreement = (kDN(Xs,Ys,K=5,weight='uniform') * 5).astype('int')
        agreement = 5 - disagreement

        n = disagreement - agreement  #Number of times to upsample
        n[Ys==0] = 0  #Don't upsample majority-class samples
        n[n<0] = 0

        Xt,Yt,SWt = Xs.copy(),Ys.copy(),sample_weight.copy()
        while n.sum()>0:
            Xtmp, Ytmp, SWtmp = Xs[n>0].copy(),Ys[n>0].copy(),sample_weight[n>0].copy()
            Xt,Yt,SWt = np.concatenate((Xt,Xtmp)),np.concatenate((Yt,Ytmp)),np.concatenate((SWt,SWtmp))
            
            n -= 1
            n[n==-1] = 0
            
        return Xt,Yt,SWt
    
    def fit(self, X, Y,sample_weight=None):
        Xf,Yf,sample_weight = self.sample(X, Y, sample_weight)
#         a,b = np.unique(Y,return_counts=True)[1],np.unique(Yf,return_counts=True)[1]
#         print(a.max()/a.min(),b.max()/b.min(),a,b,Xf.shape,len(Xf)/len(X))
        assert len(np.unique(Yf))==2,"Pos class completely filtered out"
        try:
            self.estimator = self.estimator.fit(Xf, Yf,sample_weight=sample_weight)
        except TypeError as e:
            self.estimator = self.estimator.fit(Xf, Yf)
        return self

    @property
    def classes_(self):
        return self.estimator.classes_

    def predict(self, X):
        return self.estimator.predict(X)
    
    def predict_proba(self, X):
        return self.estimator.predict_proba(X) 

In [9]:
models = {}
for c,clf in CLFS.items():
    models[('Spyder',c)] = Spyder(clf)
    
models.keys(),len(models)

(dict_keys([('Spyder', 'dt'), ('Spyder', 'lr'), ('Spyder', 'nb'), ('Spyder', 'svm'), ('Spyder', 'knn'), ('Spyder', 'rf')]),
 6)

In [10]:
path = "Spyder.csv"
cols = pd.MultiIndex.from_product([['Spyder'],CLFS.keys(),[f.__name__ for f in SCORERS]],names=['imb','clf','metric'])
df = pd.DataFrame(index=DATASETS,columns=cols)

In [11]:
%%time
for it,d in enumerate(DATASETS):
    print(it)
    X,y_noisy,y_real = read_data(d,stats=True)
    for k in models:
        print(k)
        sd = perf_counter()
        r = evaluate(models[k],X,y_noisy,y_real,CV,SCORERS)
        for f in r:
            df.loc[d,(k[0],k[1],f)] = r[f].mean()
        print(round(perf_counter()-sd,2),[round(r[f].mean(),3) for f in r])
    print()
    df.to_csv(path)

0
activemq-5.8.0.csv noise:0.058, imb:15.847,203,3217, Shape:(3420, 65)
('Spyder', 'dt')
11.26 [0.189, 0.264]
('Spyder', 'lr')
17.59 [0.288, 0.304]
('Spyder', 'nb')
10.2 [0.281, 0.385]
('Spyder', 'svm')
68.58 [0.293, 0.329]
('Spyder', 'knn')
11.95 [0.181, 0.208]
('Spyder', 'rf')
15.22 [0.278, 0.319]

1
groovy-1_6_BETA_1.csv noise:0.128, imb:6.017,117,704, Shape:(821, 65)
('Spyder', 'dt')
3.76 [0.316, 0.438]
('Spyder', 'lr')
5.96 [0.346, 0.354]
('Spyder', 'nb')
3.59 [0.226, 0.399]
('Spyder', 'svm')
7.97 [0.353, 0.404]
('Spyder', 'knn')
3.77 [0.291, 0.342]
('Spyder', 'rf')
5.63 [0.444, 0.461]

2
activemq-5.3.0.csv noise:0.094, imb:15.669,142,2225, Shape:(2367, 65)
('Spyder', 'dt')
7.73 [0.259, 0.373]
('Spyder', 'lr')
11.59 [0.364, 0.456]
('Spyder', 'nb')
7.38 [0.336, 0.453]
('Spyder', 'svm')
32.17 [0.394, 0.486]
('Spyder', 'knn')
8.39 [0.31, 0.344]
('Spyder', 'rf')
10.81 [0.327, 0.472]

3
wicket-1.3.0-incubating-beta-1.csv noise:0.164, imb:4.806,288,1384, Shape:(1672, 65)
('Spyder', 'dt'

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


12.32 [0.097, 0.156]

11
lucene-3.1.csv noise:0.120, imb:7.477,331,2475, Shape:(2806, 65)
('Spyder', 'dt')
10.04 [0.146, 0.257]
('Spyder', 'lr')
14.35 [0.189, 0.244]
('Spyder', 'nb')
9.5 [0.173, 0.233]
('Spyder', 'svm')
54.82 [0.184, 0.167]
('Spyder', 'knn')
11.27 [0.152, 0.173]
('Spyder', 'rf')
14.29 [0.182, 0.155]

12
groovy-1_6_BETA_2.csv noise:0.096, imb:7.583,103,781, Shape:(884, 65)
('Spyder', 'dt')
3.84 [0.291, 0.399]
('Spyder', 'lr')
6.06 [0.215, 0.302]
('Spyder', 'nb')
3.67 [0.221, 0.36]
('Spyder', 'svm')
8.56 [0.312, 0.305]
('Spyder', 'knn')
3.89 [0.341, 0.43]
('Spyder', 'rf')
5.67 [0.372, 0.45]

13
activemq-5.2.0.csv noise:0.113, imb:12.247,154,1886, Shape:(2040, 65)
('Spyder', 'dt')
6.9 [0.24, 0.356]
('Spyder', 'lr')
10.83 [0.366, 0.42]
('Spyder', 'nb')
6.53 [0.377, 0.536]
('Spyder', 'svm')
30.74 [0.466, 0.535]
('Spyder', 'knn')
7.54 [0.226, 0.291]
('Spyder', 'rf')
9.77 [0.269, 0.413]

14
groovy-1_5_7.csv noise:0.085, imb:8.463,80,677, Shape:(757, 65)
('Spyder', 'dt')
3.65 

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


12.54 [0.033, 0.176]

31
hive-0.9.0.csv noise:0.179, imb:25.717,53,1363, Shape:(1416, 65)
('Spyder', 'dt')
5.14 [0.225, 0.484]
('Spyder', 'lr')
8.25 [0.317, 0.532]
('Spyder', 'nb')
4.92 [0.459, 0.606]
('Spyder', 'svm')
9.82 [0.306, 0.523]
('Spyder', 'knn')
5.34 [0.211, 0.435]
('Spyder', 'rf')
7.03 [0.253, 0.524]

CPU times: user 1h 49min 35s, sys: 39.8 s, total: 1h 50min 15s
Wall time: 58min 9s
