In [7]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from collections import defaultdict
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.ensemble import BalancedBaggingClassifier, RUSBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks, RepeatedEditedNearestNeighbours
from imblearn.pipeline import Pipeline, make_pipeline

from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
import seaborn as sns,matplotlib.pyplot as plt

from library.utils import evaluate, read_data
from library.cleaners import kDN, ih_prob,FilteringEstimator

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Notes
+ Need to take best value among samplers, since some don't completely balance, and IPF sucks there

In [3]:
DATASETS = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]
len(DATASETS)

32

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
class IPF(BaseEstimator, ClassifierMixin):
    def __init__(self, estimator, n=5, max_iter = 3,random_state=None):
        self.estimator = estimator
        self.n = n
        self.max_iter = max_iter
        self.random_state = random_state
        
    def clean(self,X,Y, sample_weight):
        Xt,Yt = shuffle(X,Y)
        orig_size = len(X)
        n_iters_with_small_change = 0
        tmp = 0
        while n_iters_with_small_change<self.max_iter:
            tmp += 1
            cur_size = len(Xt)
            breaks = [(len(Xt)//self.n)*i for i in range(1,self.n)]
            Xs,Ys = np.split(Xt,breaks),np.split(Yt,breaks)
            
            clfs = []
            for i in range(self.n):
                c = DecisionTreeClassifier(max_depth=2).fit(Xs[i],Ys[i])
                clfs.append(c)

            preds = np.zeros((len(Xt),self.n))
            for i in range(self.n):
                preds[:,i] = clfs[i].predict(Xt)
            eqs = preds==Yt.reshape(-1,1)  # Shape: (len(Xt),self.n)
            clean_idx = eqs.sum(axis=1)>=(self.n/2)  # Idx of clean samples
            
            try:
                sample_weight = sample_weight[clean_idx]
            except:
                pass
            Xt,Yt = Xt[clean_idx],Yt[clean_idx]
            
            cur_change = cur_size - len(Xt)
            if cur_change<=.01*orig_size:
                n_iters_with_small_change += 1
            else:
                n_iters_with_small_change = 0  #Because these small change has to be consecutively 3 times
            #print(tmp,cur_change,orig_size,cur_change/orig_size)  
        return Xt,Yt,sample_weight


    def fit(self, X, Y,sample_weight=None):
        Xf,Yf,sample_weight = self.clean(X, Y, sample_weight)
        a,b = np.unique(Y,return_counts=True)[1],np.unique(Yf,return_counts=True)[1]
        #print(a.max()/a.min(),b.max()/b.min(),a,b,Xf.shape,len(Xf)/len(X))
        assert len(np.unique(Yf))==2,"Pos class completely filtered out"
        try:
            self.estimator = self.estimator.fit(Xf, Yf,sample_weight=sample_weight)
        except TypeError as e:
            self.estimator = self.estimator.fit(Xf, Yf)
        return self

    @property
    def classes_(self):
        return self.estimator.classes_

    def predict(self, X):
        return self.estimator.predict(X)
    
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

In [None]:
imbs = {
    'smote': SMOTE(k_neighbors=5),
    'rus': RandomUnderSampler('not minority'),
    'wilson':EditedNearestNeighbours(n_neighbors=5,kind_sel='all'),  #Default was 3
    'tomek': TomekLinks(),
    'None': 'passthrough',
}
clfs = {
    'dt': DecisionTreeClassifier(max_depth=20),
    'lr': LogisticRegression(solver='lbfgs',max_iter=1000),
    'nb': GaussianNB(),
    'svm': SVC(probability=True),
    'knn': KNeighborsClassifier(n_neighbors=5),
    'rf': RandomForestClassifier(n_estimators=50),
}
bal_nb = IPF(GaussianNB())
bal_dt_bag = IPF(DecisionTreeClassifier(max_depth=20,max_features='sqrt'))
bal_dt_boost = IPF(DecisionTreeClassifier(max_depth=10))
ensembles = {
    'rboost_DT': RUSBoostClassifier(base_estimator=clone(bal_dt_boost),algorithm='SAMME',n_estimators=50),
    'rboost_NB': RUSBoostClassifier(base_estimator=clone(bal_nb),algorithm='SAMME',n_estimators=50),
    'bbag_DT': BalancedBaggingClassifier(base_estimator=clone(bal_dt_bag),n_estimators=50),
    'bbag_NB': BalancedBaggingClassifier(base_estimator=clone(bal_nb),n_estimators=50),
}

In [8]:
models = {}
# for im,samp in imbs.items():
#     for c,clf in clfs.items():
#         models[(im,c)] = Pipeline([('samp',samp),('clf',IPF(clf))])

# for m,ens in ensembles.items():
#     models[('ens',m)] = ens

from sklearn.ensemble import BaggingClassifier
nb = GaussianNB()
dt = DecisionTreeClassifier(max_depth=20,max_features='sqrt')
models[('Spyder',"BagNB")] = make_pipeline(SMOTE(),IPF(BaggingClassifier(base_estimator=nb,n_estimators=20)))
models[('Spyder',"BagDT")] = make_pipeline(SMOTE(),IPF(BaggingClassifier(base_estimator=dt,n_estimators=20)))
    
    
    
models.keys(),len(models)

(dict_keys([('Spyder', 'BagNB'), ('Spyder', 'BagDT')]), 2)

In [9]:
cv = RepeatedStratifiedKFold(n_splits=5,n_repeats=2,random_state=99)
def pr_rec_score(y,yp):
    prec, rec, _ = precision_recall_curve(y,yp)
    return auc(rec,prec)
scorers = [matthews_corrcoef,pr_rec_score]

In [11]:
cols = pd.MultiIndex.from_product([['Spyder'],["BagNB","BagDT"],[f.__name__ for f in scorers]],names=['imb','clf','metric'])
df = pd.DataFrame(index=DATASETS,columns=cols)
#df = pd.read_csv("IPF.csv",header=[0,1,2],index_col=0)

In [12]:
for d in DATASETS:
    X,y_noisy,y_real = read_data(d,stats=True)
    if df.loc[d,:].isna().sum()==0:
        print(f"SKIPPING {d}\n")
        continue
    for k in models:
        print(k)
        sd = perf_counter()
        r = evaluate(models[k],X,y_noisy,y_real,cv,scorers)
        for f in r:
            df.loc[d,(k[0],k[1],f)] = r[f].mean()
        print(round(perf_counter()-sd,2),[round(r[f].mean(),3) for f in r])
    print()
    df.to_csv("Smote_IPF.csv")

activemq-5.8.0.csv noise:0.058, imb:15.847,203,3217, Shape:(3420, 65)
('Spyder', 'BagNB')
1.79 [0.262, 0.421]
('Spyder', 'BagDT')
2.06 [0.246, 0.278]

groovy-1_6_BETA_1.csv noise:0.128, imb:6.017,117,704, Shape:(821, 65)
('Spyder', 'BagNB')
0.6 [0.302, 0.433]
('Spyder', 'BagDT')
0.62 [0.386, 0.515]

activemq-5.3.0.csv noise:0.094, imb:15.669,142,2225, Shape:(2367, 65)
('Spyder', 'BagNB')
1.44 [0.333, 0.464]
('Spyder', 'BagDT')
1.8 [0.328, 0.405]

wicket-1.3.0-incubating-beta-1.csv noise:0.164, imb:4.806,288,1384, Shape:(1672, 65)
('Spyder', 'BagNB')
0.89 [0.293, 0.442]
('Spyder', 'BagDT')
0.92 [0.283, 0.393]

jruby-1.1.csv noise:0.175, imb:3.540,161,570, Shape:(731, 65)
('Spyder', 'BagNB')
0.55 [0.403, 0.611]
('Spyder', 'BagDT')
0.54 [0.438, 0.595]

jruby-1.4.0.csv noise:0.190, imb:3.890,200,778, Shape:(978, 65)
('Spyder', 'BagNB')
0.72 [0.433, 0.609]
('Spyder', 'BagDT')
0.69 [0.381, 0.589]

lucene-2.3.0.csv noise:0.204, imb:4.031,160,645, Shape:(805, 65)
('Spyder', 'BagNB')
0.57 [0.48

In [13]:
ipf = pd.read_csv("IPF.csv",header=[0,1,2],index_col=0)
ipf = ipf.drop(columns=['rboost_DT','rboost_NB'],level=1)
smote_ipf = ipf['smote']

In [21]:
bag = pd.read_csv("Smote_IPF.csv",header=[0,1,2],index_col=0).droplevel(0,axis=1)

In [22]:
smote_ipf.shape,bag.shape

((32, 12), (32, 4))

In [23]:
smote_ipf.columns,bag.columns

(MultiIndex([( 'dt', 'matthews_corrcoef'),
             ( 'dt',      'pr_rec_score'),
             ( 'lr', 'matthews_corrcoef'),
             ( 'lr',      'pr_rec_score'),
             ( 'nb', 'matthews_corrcoef'),
             ( 'nb',      'pr_rec_score'),
             ('svm', 'matthews_corrcoef'),
             ('svm',      'pr_rec_score'),
             ('knn', 'matthews_corrcoef'),
             ('knn',      'pr_rec_score'),
             ( 'rf', 'matthews_corrcoef'),
             ( 'rf',      'pr_rec_score')],
            names=['clf', 'metric']),
 MultiIndex([('BagNB', 'matthews_corrcoef'),
             ('BagNB',      'pr_rec_score'),
             ('BagDT', 'matthews_corrcoef'),
             ('BagDT',      'pr_rec_score')],
            names=['clf', 'metric']))

In [24]:
final = pd.concat([smote_ipf,bag],axis=1)

In [27]:
final.to_csv("Smote-IPF.csv")