In [2]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from collections import defaultdict
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from imblearn.under_sampling import InstanceHardnessThreshold, RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
import seaborn as sns,matplotlib.pyplot as plt

from library.configs import IMBS, CLFS, ENSEMBLES, CV, SCORERS
from library.utils import evaluate, read_data
from library.cleaners import kDN, ih_prob,FilteringEstimator

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Evaluate every CSV file in the JIRA/ directory.
# `endswith('.csv')` avoids picking up unrelated names that merely contain
# "csv", and sorting makes the dataset order (and therefore the row order of
# the results frame) independent of the filesystem's listing order.
DATASETS = sorted(f for f in os.listdir("JIRA/") if f.endswith('.csv'))
len(DATASETS)

32

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
class IHFilter(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that removes hard training instances before fitting.

    Instance hardness is computed as ``1 - p``, where ``p`` is the out-of-fold
    probability that a 50-tree random forest (5-fold stratified CV) assigns to
    the instance's own label. Instances whose hardness exceeds ``threshold``
    are dropped, and the wrapped ``estimator`` is fitted on what remains.

    Parameters
    ----------
    estimator : estimator object
        Classifier fitted on the filtered data. It is fitted in place and
        stored back on ``self.estimator``.
    threshold : float, default=.5
        Maximum hardness an instance may have to be kept.
    random_state : int, RandomState instance or None, default=None
        Forwarded to both the CV splitter and the random forest so the
        filtering can be made reproducible. ``None`` keeps the filtering
        unseeded.
    """

    def __init__(self,estimator,threshold=.5,random_state=None):
        self.estimator = estimator
        self.threshold = threshold
        self.random_state = random_state

    def clean(self,X,Y,sample_weight):
        """Return ``(X, Y, sample_weight)`` restricted to the easy instances.

        ``X`` and ``Y`` are indexed positionally with integer and boolean
        arrays, so they are expected to be array-like in the NumPy sense.
        ``sample_weight`` may be ``None``, in which case ``None`` is returned
        in its place.
        """
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)
        rf = RandomForestClassifier(n_estimators=50, n_jobs=4, random_state=self.random_state)

        probabilities = np.zeros(Y.shape[0], dtype=float)
        for train_index, test_index in skf.split(X,Y):
            rf.fit(X[train_index], Y[train_index])
            probs = rf.predict_proba(X[test_index])
            y_test = Y[test_index]
            # predict_proba columns follow rf.classes_ (sorted), not the raw
            # label values, so translate each label to its column position.
            cols = np.searchsorted(rf.classes_, y_test)
            cols = np.clip(cols, 0, len(rf.classes_) - 1)
            # A label missing from this training fold has no column: treat its
            # predicted probability as 0 (i.e. maximal hardness).
            seen = rf.classes_[cols] == y_test
            own_label_prob = probs[np.arange(len(test_index)), cols]
            probabilities[test_index] = np.where(seen, own_label_prob, 0.0)

        hardness = 1 - probabilities
        clean_idx = hardness <= self.threshold

        # Filter the weights explicitly instead of swallowing every exception;
        # a failure here would otherwise leave weights misaligned with Xt/Yt.
        if sample_weight is not None:
            sample_weight = np.asarray(sample_weight)[clean_idx]
        Xt,Yt = X[clean_idx],Y[clean_idx]
        return Xt,Yt,sample_weight

    def fit(self, X, Y,sample_weight=None):
        """Filter ``(X, Y)`` with :meth:`clean`, then fit the wrapped estimator.

        Raises ``AssertionError`` when the filtered labels no longer contain
        exactly two classes. Returns ``self``.
        """
        Xf,Yf,sample_weight = self.clean(X, Y, sample_weight)
        assert len(np.unique(Yf))==2,"Pos class completely filtered out"
        if sample_weight is None:
            # Nothing to forward; avoids a doomed first attempt on estimators
            # whose fit() has no sample_weight parameter.
            self.estimator = self.estimator.fit(Xf, Yf)
        else:
            try:
                self.estimator = self.estimator.fit(Xf, Yf,sample_weight=sample_weight)
            except TypeError:
                # Best-effort fallback: the wrapped estimator does not accept
                # sample_weight, so fit it unweighted.
                self.estimator = self.estimator.fit(Xf, Yf)
        return self

    @property
    def classes_(self):
        """Class labels exposed by the fitted wrapped estimator."""
        return self.estimator.classes_

    def predict(self, X):
        """Delegate prediction to the wrapped estimator."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Delegate probability estimation to the wrapped estimator."""
        return self.estimator.predict_proba(X)

In [5]:
# Each (imbalance handler, classifier) pair becomes a two-step pipeline:
# the sampler followed by the classifier wrapped in an IHFilter.
models = {
    (imb_name, clf_name): Pipeline([('samp', sampler), ('clf', IHFilter(base_clf))])
    for imb_name, sampler in IMBS.items()
    for clf_name, base_clf in CLFS.items()
}

# Ensembles are cloned first so the shared ENSEMBLES entries stay untouched,
# then their base estimator is wrapped in an IHFilter.
for ens_name, ens_template in ENSEMBLES.items():
    filtered_ens = clone(ens_template)
    filtered_ens.base_estimator = IHFilter(filtered_ens.base_estimator)
    models[('ens', ens_name)] = filtered_ens

models.keys(), len(models)

(dict_keys([('smote', 'dt'), ('smote', 'lr'), ('smote', 'nb'), ('smote', 'svm'), ('smote', 'knn'), ('smote', 'rf'), ('rus', 'dt'), ('rus', 'lr'), ('rus', 'nb'), ('rus', 'svm'), ('rus', 'knn'), ('rus', 'rf'), ('wilson', 'dt'), ('wilson', 'lr'), ('wilson', 'nb'), ('wilson', 'svm'), ('wilson', 'knn'), ('wilson', 'rf'), ('tomek', 'dt'), ('tomek', 'lr'), ('tomek', 'nb'), ('tomek', 'svm'), ('tomek', 'knn'), ('tomek', 'rf'), ('None', 'dt'), ('None', 'lr'), ('None', 'nb'), ('None', 'svm'), ('None', 'knn'), ('None', 'rf'), ('ens', 'rboost_DT'), ('ens', 'rboost_NB'), ('ens', 'bbag_DT'), ('ens', 'bbag_NB')]),
 34)

In [6]:
# Results frame: one row per dataset, one column per (imb, clf, metric).
# The columns are derived from the keys of `models` rather than from
# IMBS x CLFS, so the ('ens', ...) entries that the evaluation loop writes
# also have pre-allocated columns instead of relying on implicit enlargement.
metric_names = [f.__name__ for f in SCORERS]
cols = pd.MultiIndex.from_tuples(
    [(imb, clf_name, metric) for imb, clf_name in models for metric in metric_names],
    names=['imb','clf','metric'])
df = pd.DataFrame(index=DATASETS,columns=cols)
# Uncomment to resume from a previously saved run instead of starting empty:
#df = pd.read_csv("IHFilter.csv",header=[0,1,2],index_col=0)

In [None]:
# Evaluate every model on every dataset and checkpoint the results frame to
# IHFilter.csv after each dataset.
for d in DATASETS:
    # Check for a fully populated row *before* loading, so completed datasets
    # are skipped without reading the file.
    if df.loc[d,:].notna().all():
        print(f"SKIPPING {d}\n")
        continue
    X,y_noisy,y_real = read_data(d,stats=True)
    for k in models:
        print(k)
        sd = perf_counter()
        r = evaluate(models[k],X,y_noisy,y_real,CV,SCORERS)
        # Compute each metric's mean once; reuse it for storage and logging.
        means = {f: r[f].mean() for f in r}
        for f, value in means.items():
            df.loc[d,(k[0],k[1],f)] = value
        print(round(perf_counter()-sd,2),[round(value,3) for value in means.values()])
    print()
    df.to_csv("IHFilter.csv")

activemq-5.8.0.csv noise:0.058, imb:15.847,203,3217, Shape:(3420, 65)
('smote', 'dt')
43.01 [0.202, 0.282]
('smote', 'lr')
50.67 [0.259, 0.305]
('smote', 'nb')
40.56 [0.29, 0.402]
('smote', 'svm')
198.38 [0.251, 0.201]
('smote', 'knn')
42.85 [0.215, 0.29]
('smote', 'rf')
52.37 [0.266, 0.289]
('rus', 'dt')
27.45 [0.229, 0.466]
('rus', 'lr')
27.98 [0.247, 0.311]
('rus', 'nb')
27.25 [0.273, 0.451]
('rus', 'svm')
27.69 [0.243, 0.243]
('rus', 'knn')
27.62 [0.241, 0.407]
('rus', 'rf')
28.88 [0.253, 0.314]
('wilson', 'dt')
33.73 [0.263, 0.328]
('wilson', 'lr')
36.19 [0.282, 0.322]
('wilson', 'nb')
33.06 [0.293, 0.399]
('wilson', 'svm')
37.72 [0.299, 0.294]
('wilson', 'knn')
36.85 [0.251, 0.304]
('wilson', 'rf')
39.35 [0.299, 0.338]
('tomek', 'dt')
34.18 [0.246, 0.372]
('tomek', 'lr')
35.08 [0.266, 0.34]
('tomek', 'nb')
33.48 [0.303, 0.346]
('tomek', 'svm')
35.73 [0.273, 0.207]
('tomek', 'knn')
34.22 [0.25, 0.392]
('tomek', 'rf')
34.68 [0.28, 0.341]
('None', 'dt')
28.84 [0.25, 0.391]
('None', 