In [2]:
import numpy as np,os
import pandas as pd
from collections import defaultdict
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.ensemble import BalancedBaggingClassifier, RUSBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.pipeline import Pipeline

from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
import seaborn as sns,matplotlib.pyplot as plt

from library.utils import evaluate, read_data
from library.cleaners import kDN, ih_prob,FilteringEstimator

## Noise removal using Filtering
By setting 3 thresholds on noise probability.

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
class CleaningEstimator(BaseEstimator, ClassifierMixin):
    def __init__(self, estimator, detector, K, threshold, random_state=None):
        self.estimator = estimator
        self.detector = detector
        self.threshold = threshold
        self.K = K
        self.random_state = random_state

    def fit(self, X, Y,sample_weight=None):
        noise_prob = self.detector(X, Y, K=self.K)
        to_keep = noise_prob<self.threshold
        Xf, Yf = X[to_keep],Y[to_keep]
        a,b = np.unique(Y,return_counts=True)[1],np.unique(Yf,return_counts=True)[1]
        #print(a.max()/a.min(),b.max()/b.min())
        try:
            self.estimator = self.estimator.fit(Xf, Yf,sample_weight=sample_weight[to_keep])
        except TypeError as e:
            self.estimator = self.estimator.fit(Xf, Yf)
        return self

    @property
    def classes_(self):
        return self.estimator.classes_

    def predict(self, X):
        return self.estimator.predict(X)
    
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

In [4]:
DATASETS = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]
len(DATASETS)

32

In [5]:
imbs = {
    'smote': SMOTE(k_neighbors=5),
    'rus': RandomUnderSampler('not minority'),
    'wilson':EditedNearestNeighbours(n_neighbors=5),  #Default was 3
    'tomek': TomekLinks(),
}
clfs = {
    'dt': DecisionTreeClassifier(max_depth=20),
    'lr': LogisticRegression(solver='lbfgs',max_iter=1000),
    'nb': GaussianNB(),
    'knn': KNeighborsClassifier(n_neighbors=5),
    'rf': RandomForestClassifier(n_estimators=50),
}
bal_nb = CleaningEstimator(GaussianNB(),kDN,K=5,threshold=.999)
bal_dt_20 = CleaningEstimator(DecisionTreeClassifier(max_depth=20,max_features='sqrt'),kDN,K=5,threshold=.999)
bal_dt_boost = CleaningEstimator(DecisionTreeClassifier(max_depth=10),kDN,K=5,threshold=.999)
ensembles = {
    'rboost_DT': RUSBoostClassifier(base_estimator=clone(bal_dt_boost),algorithm='SAMME',n_estimators=10),
    'rboost_NB': RUSBoostClassifier(base_estimator=clone(bal_nb),algorithm='SAMME',n_estimators=10),
    'bbag_DT': BalancedBaggingClassifier(base_estimator=clone(bal_dt_20),n_estimators=50),
    'bbag_NB': BalancedBaggingClassifier(base_estimator=clone(bal_nb),n_estimators=50),
}
simples = {
    'LR': LogisticRegression(solver='lbfgs',max_iter=1000),
    'RF': RandomForestClassifier(n_estimators=50)
}

In [6]:
models = {}
for im,samp in imbs.items():
    for c,clf in clfs.items():
        models[(im,c)] = Pipeline([('samp',samp),('clf',CleaningEstimator(clf,kDN,K=5,threshold=.999))])

for m,ens in ensembles.items():
    models[('ens',m)] = ens
    
for m,clf in simples.items():
    models[('sim',m)] = CleaningEstimator(clf,kDN,K=5,threshold=.999)    
    
models.keys(),len(models)

(dict_keys([('smote', 'dt'), ('smote', 'lr'), ('smote', 'nb'), ('smote', 'knn'), ('smote', 'rf'), ('rus', 'dt'), ('rus', 'lr'), ('rus', 'nb'), ('rus', 'knn'), ('rus', 'rf'), ('wilson', 'dt'), ('wilson', 'lr'), ('wilson', 'nb'), ('wilson', 'knn'), ('wilson', 'rf'), ('tomek', 'dt'), ('tomek', 'lr'), ('tomek', 'nb'), ('tomek', 'knn'), ('tomek', 'rf'), ('ens', 'rboost_DT'), ('ens', 'rboost_NB'), ('ens', 'bbag_DT'), ('ens', 'bbag_NB'), ('sim', 'LR'), ('sim', 'RF')]),
 26)

In [7]:
cv = RepeatedStratifiedKFold(n_splits=10,n_repeats=3,random_state=None)
def pr_rec_score(y,yp):
    prec, rec, _ = precision_recall_curve(y,yp)
    return auc(rec,prec)
scorers = [matthews_corrcoef,pr_rec_score]

In [8]:
cols = pd.MultiIndex.from_product([imbs.keys(),clfs.keys(),[f.__name__ for f in scorers]],names=['imb','clf','metric'])
df = pd.DataFrame(index=DATASETS,columns=cols)

In [None]:
for d in DATASETS:
    X,y_noisy,y_real = read_data(d,stats=True)
    for k in models:
        print(k)
        r = evaluate(models[k],X,y_noisy,y_real,cv,scorers)
        for f in r:
            df.loc[d,(k[0],k[1],f)] = r[f].mean()
    df.to_csv("Balancing->Filtering.csv")

activemq-5.8.0.csv noise:0.058, imb:15.847,203,3217, Shape:(3420, 65)
('smote', 'dt')
('smote', 'lr')
('smote', 'nb')
('smote', 'knn')
('smote', 'rf')
('rus', 'dt')
('rus', 'lr')
('rus', 'nb')
('rus', 'knn')
('rus', 'rf')
('wilson', 'dt')
('wilson', 'lr')
('wilson', 'nb')
('wilson', 'knn')
('wilson', 'rf')
('tomek', 'dt')
('tomek', 'lr')
('tomek', 'nb')
('tomek', 'knn')
('tomek', 'rf')
('ens', 'rboost_DT')
('ens', 'rboost_NB')
('ens', 'bbag_DT')
('ens', 'bbag_NB')
('sim', 'LR')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


('sim', 'RF')
groovy-1_6_BETA_1.csv noise:0.128, imb:6.017,117,704, Shape:(821, 65)
('smote', 'dt')
('smote', 'lr')
('smote', 'nb')
('smote', 'knn')
('smote', 'rf')
('rus', 'dt')
('rus', 'lr')
('rus', 'nb')
('rus', 'knn')
('rus', 'rf')
('wilson', 'dt')
('wilson', 'lr')
('wilson', 'nb')
('wilson', 'knn')
('wilson', 'rf')
('tomek', 'dt')
('tomek', 'lr')
('tomek', 'nb')
('tomek', 'knn')
('tomek', 'rf')
('ens', 'rboost_DT')
('ens', 'rboost_NB')
('ens', 'bbag_DT')
('ens', 'bbag_NB')
('sim', 'LR')
('sim', 'RF')
activemq-5.3.0.csv noise:0.094, imb:15.669,142,2225, Shape:(2367, 65)
('smote', 'dt')
('smote', 'lr')
('smote', 'nb')
('smote', 'knn')
('smote', 'rf')
('rus', 'dt')
('rus', 'lr')
('rus', 'nb')
('rus', 'knn')
('rus', 'rf')
('wilson', 'dt')
('wilson', 'lr')
('wilson', 'nb')
('wilson', 'knn')
('wilson', 'rf')
('tomek', 'dt')
('tomek', 'lr')
('tomek', 'nb')
('tomek', 'knn')
('tomek', 'rf')
('ens', 'rboost_DT')
('ens', 'rboost_NB')
('ens', 'bbag_DT')
('ens', 'bbag_NB')


In [20]:
bal_dt_boost.estimator.fit(X,y_noisy)
hasattr(bal_dt_boost.estimator,'classes_')

True