In [None]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from collections import defaultdict
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from imblearn.under_sampling.base import BaseUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
import seaborn as sns,matplotlib.pyplot as plt

from library.configs import CLFS, CV, SCORERS
from library.utils import evaluate, read_data

%load_ext autoreload
%autoreload 2

In [None]:
# Hand-picked subset of the JIRA datasets. This list used to be assigned to
# DATASETS and was overwritten on the very next line, so it never took effect;
# it is kept under its own name for quick partial runs.
DATASETS_SUBSET = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']

# Every CSV file found in JIRA/.
#  * endswith('.csv') instead of a substring test, so files that merely contain
#    "csv" somewhere in their name are not picked up.
#  * sorted(), because os.listdir returns entries in arbitrary order and the
#    row order of the results table should be reproducible across runs.
DATASETS = sorted(f for f in os.listdir("JIRA/") if f.endswith('.csv'))
len(DATASETS)

In [None]:
class NCL(BaseUnderSampler):
    """Under-sampler that drops class-0 samples lying next to class-1 samples.

    A k-nearest-neighbour model is fitted on the data. For every sample whose
    label is 1, each of its nearest neighbours whose label is 0 is removed.
    Samples labelled 1 are never removed.
    """

    # Number of neighbours inspected around every class-1 sample.
    _N_NEIGHBORS = 5

    def _fit_resample(self, X, y, sample=None):
        """Return the data with the class-0 neighbours of class-1 samples removed.

        Parameters
        ----------
        X : indexable with a boolean mask along its first axis
            Feature matrix.
        y : array
            Labels; compared against 1 and 0. Assumed to be a 1-D NumPy array
            (it is indexed with integer arrays) -- TODO confirm with callers.
        sample : array-like, optional
            Per-sample values filtered with the same mask as ``X`` and ``y``.

        Returns
        -------
        tuple
            ``(X_kept, y_kept)``, or ``(X_kept, y_kept, sample_kept)`` when
            ``sample`` is given.
        """
        knn = KNeighborsClassifier(n_neighbors=self._N_NEIGHBORS).fit(X, y)
        # Called without a query set: one row of neighbour indices per training
        # sample (scikit-learn excludes the sample itself in this mode).
        neighs = knn.kneighbors(return_distance=False)

        # Rows of `neighs` belonging to class-1 samples, then only those
        # neighbour indices whose own label is 0.
        minority_neighs = neighs[y == 1]
        to_remove = minority_neighs[y[minority_neighs] == 0]

        mask = np.ones_like(y, dtype='bool')
        mask[to_remove] = False  # duplicate indices are harmless here

        # `if sample:` would raise "truth value of an array is ambiguous" for
        # any array with more than one element, so test against None.
        if sample is not None:
            return X[mask], y[mask], np.asarray(sample)[mask]
        return X[mask], y[mask]

In [None]:
# One imbalanced-learn pipeline per classifier in CLFS: the NCL under-sampler
# followed by the classifier, keyed by ('NCL', classifier name).
models = {
    ('NCL', name): Pipeline([('samp', NCL()), ('clf', estimator)])
    for name, estimator in CLFS.items()
}

models.keys(),len(models)

In [None]:
# File the results table is written to.
path = "NCL.csv"

# Three-level column index: sampler name / classifier name / metric name.
cols = pd.MultiIndex.from_product(
    iterables=[['NCL'], list(CLFS), [scorer.__name__ for scorer in SCORERS]],
    names=['imb', 'clf', 'metric'],
)

# Empty table with one row per dataset, filled in by the evaluation loop.
df = pd.DataFrame(columns=cols, index=DATASETS)
# To continue from a previously saved table instead of starting empty:
#df = pd.read_csv(path,header=[0,1,2],index_col=0)

In [None]:
%%time
# Evaluate every pipeline on every dataset. The table is written to disk after
# each dataset so a partial run still leaves results behind.
for it, d in enumerate(DATASETS):
    print(it)
    X, y_noisy, y_real = read_data(d, stats=True)
    for key, model in models.items():
        print(key)
        started = perf_counter()
        # presumably a mapping of metric name -> per-fold scores; verify in library.utils
        scores = evaluate(model, X, y_noisy, y_real, CV, SCORERS)
        means = {metric: scores[metric].mean() for metric in scores}
        imb, clf_name = key
        for metric, value in means.items():
            df.loc[d, (imb, clf_name, metric)] = value
        print(round(perf_counter() - started, 2), [round(value, 3) for value in means.values()])
    print()
    df.to_csv(path)