In [4]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from collections import defaultdict
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from imblearn.under_sampling.base import BaseUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
import seaborn as sns,matplotlib.pyplot as plt

from library.configs import CLFS, CV, SCORERS
from library.utils import evaluate, read_data
from library.cleaners import kDN, ih_prob,FilteringEstimator

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
DATASETS = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]
len(DATASETS)

32

In [5]:
class NCL(BaseUnderSampler):
    def _fit_resample(self, X, y,sample=None):
        knn = KNeighborsClassifier(n_neighbors=5).fit(X,y)
        neighs = knn.kneighbors(return_distance=False)
        R = []
        for v in np.argwhere(y==1):
            nn = neighs[v]
            idx = y[nn]==0
            R.extend(list(nn[idx]))
        mask = np.ones_like(y,dtype='bool')
        mask[list(set(R))] = 0
        if sample:
            return X[mask],y[mask],sample[mask]
        return X[mask],y[mask]

In [6]:
models = {}
for c,clf in CLFS.items():
    models[('NCL',c)] = Pipeline([('samp',NCL()),('clf',clf)])
    
models.keys(),len(models)

(dict_keys([('NCL', 'dt'), ('NCL', 'lr'), ('NCL', 'nb'), ('NCL', 'svm'), ('NCL', 'knn'), ('NCL', 'rf')]),
 6)

In [7]:
path = "NCL.csv"
cols = pd.MultiIndex.from_product([['NCL'],CLFS.keys(),[f.__name__ for f in SCORERS]],names=['imb','clf','metric'])
df = pd.DataFrame(index=DATASETS,columns=cols)
#df = pd.read_csv(path,header=[0,1,2],index_col=0)

In [8]:
%%time
for it,d in enumerate(DATASETS):
    print(it)
    X,y_noisy,y_real = read_data(d,stats=True)
    for k in models:
        print(k)
        sd = perf_counter()
        r = evaluate(models[k],X,y_noisy,y_real,CV,SCORERS)
        for f in r:
            df.loc[d,(k[0],k[1],f)] = r[f].mean()
        print(round(perf_counter()-sd,2),[round(r[f].mean(),3) for f in r])
    print()
    df.to_csv(path)

0
activemq-5.8.0.csv noise:0.058, imb:15.847,203,3217, Shape:(3420, 65)
('NCL', 'dt')
7.02 [0.206, 0.294]
('NCL', 'lr')
9.0 [0.271, 0.323]
('NCL', 'nb')
6.38 [0.289, 0.409]
('NCL', 'svm')
19.64 [0.309, 0.312]
('NCL', 'knn')
8.01 [0.233, 0.266]
('NCL', 'rf')
10.27 [0.274, 0.334]

1
groovy-1_6_BETA_1.csv noise:0.128, imb:6.017,117,704, Shape:(821, 65)
('NCL', 'dt')
0.85 [0.33, 0.483]
('NCL', 'lr')
1.72 [0.423, 0.422]
('NCL', 'nb')
0.74 [0.266, 0.418]
('NCL', 'svm')
1.97 [0.366, 0.423]
('NCL', 'knn')
0.89 [0.4, 0.474]
('NCL', 'rf')
2.44 [0.449, 0.47]

2
activemq-5.3.0.csv noise:0.094, imb:15.669,142,2225, Shape:(2367, 65)
('NCL', 'dt')
4.26 [0.312, 0.42]
('NCL', 'lr')
5.66 [0.371, 0.484]
('NCL', 'nb')
3.88 [0.339, 0.46]
('NCL', 'svm')
9.62 [0.4, 0.476]
('NCL', 'knn')
4.8 [0.334, 0.43]
('NCL', 'rf')
6.67 [0.382, 0.493]

3
wicket-1.3.0-incubating-beta-1.csv noise:0.164, imb:4.806,288,1384, Shape:(1672, 65)
('NCL', 'dt')
2.66 [0.223, 0.433]
('NCL', 'lr')
3.98 [0.299, 0.416]
('NCL', 'nb')
2.3

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


10.98 [0.023, 0.128]
('NCL', 'knn')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


5.86 [0.025, 0.133]
('NCL', 'rf')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


8.03 [0.045, 0.165]

31
hive-0.9.0.csv noise:0.179, imb:25.717,53,1363, Shape:(1416, 65)
('NCL', 'dt')
1.99 [0.314, 0.527]
('NCL', 'lr')
3.69 [0.323, 0.552]
('NCL', 'nb')
1.78 [0.469, 0.606]
('NCL', 'svm')
3.14 [0.309, 0.506]
('NCL', 'knn')
2.16 [0.298, 0.526]
('NCL', 'rf')
3.64 [0.351, 0.561]

CPU times: user 39min 8s, sys: 15.8 s, total: 39min 24s
Wall time: 27min 28s
