In [3]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from collections import defaultdict
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.ensemble import BalancedBaggingClassifier, RUSBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.pipeline import Pipeline

from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
import seaborn as sns,matplotlib.pyplot as plt

from library.utils import evaluate, read_data
from library.cleaners import kDN, ih_prob,FilteringEstimator

In [4]:
# CSV files this notebook works with, one entry per line for easier diffing.
# Alternative: take every file in JIRA/ whose name contains 'csv':
#   DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]
DATASETS = [
    'groovy-1_5_7.csv',
    'jruby-1.4.0.csv',
    'lucene-2.9.0.csv',
    'jruby-1.7.0.preview1.csv',
    'groovy-1_6_BETA_1.csv',
    'derby-10.2.1.6.csv',
    'wicket-1.5.3.csv',
    'camel-2.9.0.csv',
    'camel-1.4.0.csv',
    'activemq-5.8.0.csv',
]
len(DATASETS)

10

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
class CLNI(BaseEstimator, ClassifierMixin):
    """Decision tree trained on data cleaned by a partition-vote noise filter.

    ``fit`` first runs :meth:`clean`, which repeatedly shuffles the training
    set, splits it into ``n`` partitions, trains one clone of the base tree on
    each partition, lets every clone predict the whole set, and drops the
    samples whose label is matched by fewer than half of the clones.  The
    base tree is then fitted on what is left.

    Parameters
    ----------
    n : int
        Number of partitions (and therefore voting classifiers) per round.
    iters : int, default=3
        Maximum number of filtering rounds.
    random_state : int, numpy RandomState instance or None, default=None
        Seed for the shuffle done before partitioning and for the
        per-partition classifiers.
    """

    def __init__(self, n, iters = 3,random_state=None):
        self.estimator = DecisionTreeClassifier(max_depth=20)
        self.n = n
        # Stored under the parameter's own name so that get_params()/clone(),
        # which look attributes up by __init__ argument name, keep working.
        self.iters = iters
        # Legacy alias: earlier versions exposed the value only as `iter`.
        self.iter = iters
        self.random_state = random_state

    def clean(self, X, Y, sample_weight=None):
        """Return ``(X, Y, sample_weight)`` with suspected label noise removed.

        A sample is treated as noisy in a round when fewer than half of the
        per-partition classifiers predict its given label.  Filtering stops
        early when a round flags nothing, or when it would flag every sample
        (in which case that round's verdict is discarded).

        Inputs are converted with ``np.asarray``; the originals are never
        modified.  ``sample_weight`` may be ``None`` and is filtered with the
        same mask as ``X`` and ``Y`` otherwise.
        """
        Xt, Yt = np.asarray(X), np.asarray(Y)
        weights = None if sample_weight is None else np.asarray(sample_weight)

        if isinstance(self.random_state, np.random.RandomState):
            rng = self.random_state
        else:
            rng = np.random.RandomState(self.random_state)

        for _ in range(self.iters):
            # Never ask for more partitions than there are samples, otherwise
            # some classifier would be fitted on an empty partition.
            n_parts = min(self.n, len(Xt))
            if n_parts < 1:
                break

            # Shuffle indices, then cut them into near-equal partitions.
            partitions = np.array_split(rng.permutation(len(Xt)), n_parts)

            # agrees[s, p] is True when partition p's classifier predicts the
            # given label of sample s.
            agrees = np.zeros((len(Xt), n_parts), dtype=bool)
            for p, idx in enumerate(partitions):
                clf = clone(self.estimator)
                if 'random_state' in clf.get_params():
                    clf.set_params(random_state=rng.randint(2**31 - 1))
                clf.fit(Xt[idx], Yt[idx])
                agrees[:, p] = clf.predict(Xt) == Yt

            noisy = agrees.sum(axis=1) < n_parts / 2
            if not noisy.any() or noisy.all():
                break

            keep = ~noisy
            Xt, Yt = Xt[keep], Yt[keep]
            if weights is not None:
                weights = weights[keep]

        return Xt, Yt, weights

    def fit(self, X, Y,sample_weight=None):
        """Clean the training data with :meth:`clean`, then fit the base tree.

        Returns ``self``.
        """
        Xf,Yf,sample_weight = self.clean(X, Y, sample_weight)
        try:
            self.estimator = self.estimator.fit(Xf, Yf,sample_weight=sample_weight)
        except TypeError:
            # Best-effort fallback for estimators whose fit() does not accept
            # a sample_weight argument.
            self.estimator = self.estimator.fit(Xf, Yf)
        return self

    @property
    def classes_(self):
        """Class labels known to the fitted base estimator."""
        return self.estimator.classes_

    def predict(self, X):
        """Predict labels for ``X`` with the fitted base estimator."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Predict class probabilities for ``X`` with the fitted base estimator."""
        return self.estimator.predict_proba(X)

In [11]:
# N is the number of partitions / voting classifiers used by the cells below.
N = 5
# DATASETS[4] is 'groovy-1_6_BETA_1.csv'; read_data (from library.utils) yields
# the features plus two label vectors, bound here as y_noisy and y_real.
X,y_noisy,y_real = read_data(DATASETS[4])

groovy-1_6_BETA_1.csv noise:0.128, imb:6.017,117,704, Shape:(821, 65)


In [12]:
# Undersample with respect to y_real, shuffle, and cut the result into N
# contiguous partitions.
# fit_resample replaces fit_sample, which newer imbalanced-learn releases no
# longer provide. Both random steps are seeded so that a re-run reproduces
# the same partitions.
Xt,Yt = RandomUnderSampler(random_state=0).fit_resample(X,y_real)
# The cells below use .reshape and positional slicing, so force plain ndarrays
# in case the sampler hands back pandas objects.
Xt,Yt = np.asarray(Xt),np.asarray(Yt)
Xt,Yt = shuffle(Xt,Yt,random_state=0)
# Split points at multiples of len(Xt)//N; any remainder rows end up in the
# last partition.
breaks = [(len(Xt)//N)*i for i in range(1,N)]
Xs,Ys = np.split(Xt,breaks),np.split(Yt,breaks)
clfs = []
breaks

[28, 56, 84, 112]

In [13]:
# Train one decision tree per partition.
# The list is rebuilt here so that re-running this cell does not keep
# appending to a list created in an earlier cell.
clfs = []
for i in range(N):
    print(Xs[i].shape)
    # Fixed seed so the fitted trees are the same on every run.
    c = DecisionTreeClassifier(max_depth=10, random_state=0).fit(Xs[i],Ys[i])
    clfs.append(c)

(28, 65)
(28, 65)
(28, 65)
(28, 65)
(28, 65)


In [14]:
# Vote matrix: one row per sample in Xt, one column per partition classifier.
# It starts as zeros and is filled in by the following cell.
preds = np.zeros((len(Xt),N))
preds.shape,Yt.shape

((140, 5), (140,))

In [15]:
# Each classifier was fitted on a single partition but predicts the whole
# resampled set, so column i holds classifier i's vote for every sample.
for i in range(N):
    preds[:,i] = clfs[i].predict(Xt)

In [16]:
# First ten rows of votes (floats, because preds was allocated with np.zeros).
preds[:10]

array([[0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1.],
       [0., 0., 1., 0., 1.],
       [0., 0., 0., 0., 0.],
       [0., 1., 0., 1., 1.],
       [0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 1.],
       [1., 0., 0., 0., 0.],
       [1., 1., 0., 1., 1.],
       [1., 1., 0., 1., 1.]])

In [17]:
# Labels of the same ten samples, for comparison against the votes.
Yt[:10]

array([0, 1, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int8)

In [18]:
# Yt is reshaped to a column so it broadcasts across the N vote columns:
# eqs[s, i] is True when classifier i predicted sample s's label.
eqs = preds==Yt.reshape(-1,1)

In [23]:
# Flag a sample when fewer than half of the N classifiers agree with its label.
ns = eqs.sum(axis=1)<N/2
# Number of flagged samples.
ns.sum()

27

In [24]:
# Total number of samples covered by the mask.
len(ns)

140

In [25]:
# Fraction of samples flagged by the vote.
ns.sum()/len(ns)

0.19285714285714287

In [26]:
# Integer half of N. The mask above compares against N/2 (true division), so
# with N = 5 a sample is flagged when at most N//2 = 2 classifiers agree.
N//2

2