In [75]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from collections import defaultdict
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, ShuffleSplit
from imblearn.ensemble import BalancedBaggingClassifier, RUSBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.pipeline import Pipeline

from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
import seaborn as sns,matplotlib.pyplot as plt

from library.configs import IMBS, CLFS, ENSEMBLES, CV, SCORERS
from library.utils import evaluate, read_data
from library.cleaners import kDN, ih_prob,FilteringEstimator

In [11]:
# File name of the dataset under study; handed to read_data() in the evaluation cell.
DATASET = "lucene-2.3.0.csv"

In [86]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
# Running count of CLNI.fit calls; CLNI.fit increments and prints it on every call.
BOO = 0
# Timestamp of the previous CLNI.fit call (initially the time this cell ran);
# CLNI.fit prints the seconds elapsed since it and then resets it.
T = perf_counter()

class CLNI(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that filters the training set before fitting ``estimator``.

    On ``fit`` the training data is passed repeatedly through ``detector``;
    samples scoring above a fixed cut-off (0.60) are dropped, and the filtering
    stops once a pass keeps at least 99% of the samples it was given. The
    wrapped estimator is then fitted on the surviving samples. Prediction is
    delegated unchanged to the wrapped estimator.

    Parameters
    ----------
    estimator : object
        Classifier providing ``fit``, ``predict``, ``predict_proba`` and
        ``classes_``. It is fitted in place and stored back on ``self.estimator``.
    detector : callable
        Called as ``detector(X, Y, K=K)``. Expected to return an array with one
        score per row of ``X`` that supports ``<=`` and ``.sum()`` on the result.
    K : int, default=5
        Forwarded to ``detector`` as the ``K`` keyword argument.
    threshold : float, default=.999
        Stored only; the current implementation does not read it.
    random_state : default=None
        Stored only; the current implementation does not read it.
    """

    def __init__(self, estimator, detector, K=5, threshold=.999, random_state=None):
        self.estimator = estimator
        self.detector = detector
        self.threshold = threshold
        self.K = K
        self.random_state = random_state

    def clean(self,X,Y, sample_weight):
        """Iteratively drop the samples that ``detector`` scores above the cut-off.

        Parameters
        ----------
        X, Y : indexable with a boolean mask and providing ``.copy()``
            Training features and labels; the inputs themselves are not modified.
        sample_weight : array-like or None
            Optional per-sample weights, filtered with the same mask as ``X``/``Y``.

        Returns
        -------
        (Xt, Yt, sample_weight)
            The filtered features, labels and weights (``None`` stays ``None``).
        """
        alpha = .60       # samples whose detector score exceeds this are dropped
        stop_ratio = .99  # stop once a single pass keeps at least this fraction
        Xt,Yt = X.copy(),Y.copy()
        if sample_weight is not None:
            # Make sure the weights can be indexed by a boolean mask (a plain
            # list cannot), instead of silently leaving them unfiltered.
            sample_weight = np.asarray(sample_weight)
        while True:
            ne = self.detector(Xt,Yt,K=self.K)
            cidx = ne<=alpha
            n_before = len(Xt)
            n_kept = cidx.sum()
            if n_kept == 0:
                # This pass would discard every remaining sample. Keep the
                # current set and stop: continuing would re-run the detector on
                # an empty set and the stop condition could never be met.
                break
            Xt,Yt = Xt[cidx],Yt[cidx]
            if sample_weight is not None:
                sample_weight = sample_weight[cidx]
            if n_kept/n_before>=stop_ratio:
                break
        return Xt,Yt,sample_weight

    def fit(self, X, Y,sample_weight=None):
        """Filter ``(X, Y)`` with :meth:`clean`, then fit the wrapped estimator.

        ``sample_weight`` is forwarded to the estimator when given; if the
        estimator's ``fit`` rejects that keyword with ``TypeError`` the fit is
        retried without weights. Returns ``self``.
        """
        # Notebook-level progress trace: relies on the module globals BOO
        # (fit counter) and T (time of the previous fit).
        global BOO,T
        BOO += 1
        print("Fitting...",BOO,round(perf_counter()-T,3))
        T = perf_counter()

        Xf,Yf,sample_weight = self.clean(X, Y, sample_weight)
        if sample_weight is None:
            # No weights to pass: call fit directly so estimators without a
            # sample_weight keyword are fitted once and their errors surface.
            self.estimator = self.estimator.fit(Xf, Yf)
        else:
            try:
                self.estimator = self.estimator.fit(Xf, Yf,sample_weight=sample_weight)
            except TypeError:
                # Best effort: the estimator does not accept sample_weight.
                self.estimator = self.estimator.fit(Xf, Yf)
        return self

    @property
    def classes_(self):
        """Class labels as exposed by the wrapped estimator."""
        return self.estimator.classes_

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's class probabilities for ``X``."""
        return self.estimator.predict_proba(X)

In [87]:
# Base learners shared by the ensemble definitions below.
bal_nb = GaussianNB()
bal_dt_bag = DecisionTreeClassifier(max_features='sqrt', max_depth=20)
bal_dt_boost = DecisionTreeClassifier(max_depth=10)

# Balanced boosting / bagging ensembles, keyed by a short label.
ENSEMBLES = dict(
    rboost_DT=RUSBoostClassifier(n_estimators=50, algorithm='SAMME', base_estimator=bal_dt_boost),
    rboost_NB=RUSBoostClassifier(n_estimators=50, algorithm='SAMME', base_estimator=bal_nb),
    bbag_DT=BalancedBaggingClassifier(n_estimators=50, base_estimator=bal_dt_bag),
    bbag_NB=BalancedBaggingClassifier(n_estimators=50, base_estimator=bal_nb),
)

# Clone every ensemble and wrap its base learner in CLNI (kDN detector,
# default K=5, i.e. consensus among 5 neighbors).
models = {}
for m in ENSEMBLES:
    ens = clone(ENSEMBLES[m])
    ens.base_estimator = CLNI(ens.base_estimator, kDN)
    models[('ens', m)] = ens

models.keys(), len(models)

(dict_keys([('ens', 'rboost_DT'), ('ens', 'rboost_NB'), ('ens', 'bbag_DT'), ('ens', 'bbag_NB')]),
 4)

In [88]:
# Load features plus noisy / true labels, then score every boosting model on a
# single 75/25 shuffle split (bagging models are skipped).
X, y_noisy, y_real = read_data(DATASET, stats=True)
CV = ShuffleSplit(test_size=.25, n_splits=1)
for k in models:
    if 'bag' not in k[1]:
        print(k)
        sd = perf_counter()
        r = evaluate(models[k], X, y_noisy, y_real, CV, SCORERS)
        # Elapsed seconds followed by the mean of each returned score.
        print(round(perf_counter() - sd, 2), [round(r[f].mean(), 3) for f in r])

lucene-2.3.0.csv noise:0.204, imb:4.031,160,645, Shape:(805, 65)
('ens', 'rboost_DT')
Fitting... 1 0.506
Fitting... 2 0.763
Fitting... 3 0.678
Fitting... 4 0.629
2.74 [0.444, 0.654]
('ens', 'rboost_NB')
Fitting... 5 0.663
Fitting... 6 0.562
Fitting... 7 0.921
2.14 [0.472, 0.643]
