In [1]:
import numpy as np,os
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PowerTransformer, FunctionTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier, RUSBoostClassifier
from imblearn.pipeline import make_pipeline
from imblearn.metrics import geometric_mean_score
from libs import read_data, evaluate

In [2]:
class NCL(RandomUnderSampler):
    """Under-sampler that removes class-0 samples neighbouring nonzero-labelled ones.

    A 3-nearest-neighbour model is fitted on the data. Every sample labelled 0
    that appears among the three nearest neighbours of a sample with a nonzero
    label is dropped; all other samples are kept.
    """

    def _fit_resample(self, X, y, sample=None):
        """Return ``X`` and ``y`` with the flagged class-0 rows removed.

        Parameters
        ----------
        X : array
            Feature matrix; must support boolean-mask row indexing.
        y : 1-D array
            Labels aligned with ``X``. Rows labelled 0 are the only ones that
            can be removed.
        sample : array, optional
            Extra per-row array aligned with ``y``. When given it is filtered
            with the same mask and returned as a third element.

        Returns
        -------
        tuple
            ``(X_kept, y_kept)``, or ``(X_kept, y_kept, sample_kept)`` when
            ``sample`` is provided.
        """
        knn = KNeighborsClassifier(n_neighbors=3).fit(X, y)
        # Called without a query set, kneighbors() returns the neighbours of
        # each training point, excluding the point itself.
        neighs = knn.kneighbors(return_distance=False)

        # Neighbour indices of every nonzero-labelled row, shape (n_nonzero, 3).
        nonzero_rows = np.flatnonzero(y)
        candidates = neighs[nonzero_rows]
        # Keep only those neighbours that carry label 0; these get removed.
        to_drop = np.unique(candidates[y[candidates] == 0])

        mask = np.ones_like(y, dtype=bool)
        mask[to_drop] = False
        # Record which original rows survived, as RandomUnderSampler does.
        self.sample_indices_ = np.flatnonzero(mask)

        # `if sample:` would raise on a multi-element array (ambiguous truth
        # value), so test explicitly against None.
        if sample is not None:
            return X[mask], y[mask], sample[mask]
        return X[mask], y[mask]
    
class BaseAdaboost(AdaBoostClassifier):
    """AdaBoost classifier trained on a random under-sample of its input."""

    def fit(self, X, y, sample_weight=None):
        """Fit on the rows picked by a default ``RandomUnderSampler``.

        The sampler is run only to obtain the selected row indices; ``X``,
        ``y`` and (when given) ``sample_weight`` are then sliced with those
        indices before being handed to ``AdaBoostClassifier.fit``.
        """
        rus = RandomUnderSampler()
        rus.fit_resample(X, y)
        keep = rus.sample_indices_
        # Leaving the weights as None is the same as not passing them at all.
        weights = None if sample_weight is None else sample_weight[keep]
        return super().fit(X[keep], y[keep], weights)
    
class SMTree(LogisticRegression):
    """Logistic regression fitted on a SMOTE-oversampled copy of the training data."""

    def fit(self, X, y, sample_weight=None):
        """Oversample ``(X, y)`` with SMOTE, then fit the logistic regression.

        Parameters
        ----------
        X : array
            Training features.
        y : 1-D array
            Training labels aligned with ``X``.
        sample_weight : 1-D array, optional
            Per-row weights for the original rows. Synthetic rows created by
            SMOTE receive the mean weight of the original rows of their class.

        Returns
        -------
        self
            The fitted estimator, as returned by ``LogisticRegression.fit``.
        """
        # SMOTE synthesises new rows, so unlike the under-samplers it exposes
        # no `sample_indices_`; the resampled arrays must be used directly.
        X_res, y_res = SMOTE().fit_resample(X, y)
        if sample_weight is None:
            return super().fit(X_res, y_res)

        y_orig = np.asarray(y)
        n_orig = len(y_orig)
        sample_weight = np.asarray(sample_weight, dtype=float)

        # NOTE(review): assumes SMOTE returns the original rows first and
        # appends the synthetic ones after them -- this matches the current
        # imbalanced-learn implementation; confirm when upgrading.
        weights = np.empty(len(y_res), dtype=float)
        weights[:n_orig] = sample_weight
        synth_labels = np.asarray(y_res)[n_orig:]
        synth_weights = weights[n_orig:]  # view: writes land in `weights`
        for label in np.unique(synth_labels):
            synth_weights[synth_labels == label] = sample_weight[y_orig == label].mean()
        return super().fit(X_res, y_res, weights)

In [3]:
#X,y_noisy,y_real = read_data("JIRA/derby-10.5.1.1.csv")

# Cross-validation splitter shared by every evaluation below:
# 4 stratified folds, repeated 10 times, with a fixed seed.
cv = RepeatedStratifiedKFold(
    n_splits=4,
    n_repeats=10,
    random_state=42,
)

# Registry of estimators to evaluate, keyed by a short display name.
models = dict()

In [4]:
# ERUS: an outer AdaBoost whose base estimator is the under-sampling BaseAdaboost.
erus_base = BaseAdaboost(n_estimators=5)
models['ERUS'] = AdaBoostClassifier(base_estimator=erus_base, n_estimators=5)
# "proposed" = NCL cleaning step followed by the same ERUS ensemble.
models['proposed'] = make_pipeline(NCL(), models['ERUS'])

# Baselines: random forest and Gaussian naive Bayes (plain, log-transformed, resampled).
models['rf'] = RandomForestClassifier(n_estimators=100)
models['nb'] = GaussianNB()
models['log+nb'] = Pipeline([
    ('log', FunctionTransformer(np.log1p, validate=False)),
    ('nb', models['nb']),
])  # Power needs to be checked
models['sm+nb'] = make_pipeline(SMOTE(), models['nb'])
models['rus+nb'] = make_pipeline(RandomUnderSampler(), models['nb'])

# Imbalance-aware ensembles from imbalanced-learn.
models['SMTBoost'] = RUSBoostClassifier(sampling_strategy='not majority')  # THIS IS DIFFERENT FROM PAPER
models['RUSBoost'] = RUSBoostClassifier(sampling_strategy='not minority')
models['BalancedRF'] = BalancedRandomForestClassifier(n_estimators=100)

# log1p-transformed variants of already-registered models; each pipeline gets
# its own FunctionTransformer but reuses the registered estimator instance.
models.update({
    label: Pipeline([
        ('log', FunctionTransformer(np.log1p, validate=False)),
        (step, models[source]),
    ])
    for label, step, source in (
        ('log+BalancedRF', 'bRF', 'BalancedRF'),
        ('log+ERUS', 'erus', 'ERUS'),
        ('log+prop', 'prop', 'proposed'),
    )
})
len(models)

13

In [None]:
# Quick check of every registered model on a single dataset.
# NOTE(review): X, y_noisy and y_real are not defined by any earlier cell as
# written (the read_data call in the setup cell is commented out), so on a
# fresh kernel each iteration just prints a NameError message.
for name, clf in models.items():
    try:
        scores = evaluate(clf, X, y_noisy, y_real, cv)
    except Exception as e:
        print(name, e)
    else:
        print(name, scores)

In [8]:
# Evaluate every registered model on every CSV dataset under JIRA/, resuming
# from previously saved per-dataset result files where they exist.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before any expensive work is done.
os.makedirs("JIRA/results", exist_ok=True)

for f in sorted(os.listdir("JIRA/")):
    if not f.endswith('.csv'):
        continue
    dataset = os.path.splitext(f)[0]
    print(dataset)
    results_path = f"JIRA/results/{dataset}.csv"
    try:
        df = pd.read_csv(results_path, index_col=0)
        print("Appending to existing Dataframe")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only a missing or empty results file means "start fresh"; any other
        # error (permissions, malformed CSV, Ctrl-C) should surface.
        df = pd.DataFrame(columns=['auc_mean','auc_std','gmean_mean','gmean_std'])
        print("Creating new Dataframe")

    X,y_noisy,y_real = read_data("JIRA/"+f)
    try:
        for name,clf in models.items():
            # Models already present in the saved results are not re-evaluated.
            if name in df.index:
                continue
            try:
                res = evaluate(clf,X,y_noisy,y_real,cv)
            except Exception as e:
                # Report the failure instead of hiding it behind the sentinel.
                print(f"{name} failed on {dataset}: {e!r}")
                res = [-1,-1,-1,-1]  # sentinel row marking a failed evaluation
            df.loc[name,:] = res
    finally:
        # Persist whatever finished, even if the run is interrupted mid-dataset.
        df.to_csv(results_path)

activemq-5.0.0
Appending to existing Dataframe
noise:0.13853503184713375, imb:0.046,82,1802, Shape:(1884, 65)
activemq-5.1.0
Appending to existing Dataframe
noise:0.08274111675126904, imb:0.076,139,1831, Shape:(1970, 65)
activemq-5.2.0
Appending to existing Dataframe
noise:0.11323529411764706, imb:0.082,154,1886, Shape:(2040, 65)
activemq-5.3.0
Appending to existing Dataframe
noise:0.09378960709759189, imb:0.064,142,2225, Shape:(2367, 65)
activemq-5.8.0
Appending to existing Dataframe
noise:0.05760233918128655, imb:0.063,203,3217, Shape:(3420, 65)
camel-1.4.0
Appending to existing Dataframe
noise:0.2811881188118812, imb:0.315,363,1152, Shape:(1515, 65)
camel-2.10.0
Appending to existing Dataframe
noise:0.05294414960828911, imb:0.041,311,7603, Shape:(7914, 65)
camel-2.11.0
Appending to existing Dataframe
noise:0.023513452407867962, imb:0.023,200,8646, Shape:(8846, 65)
camel-2.9.0
Appending to existing Dataframe
noise:0.04396067415730337, imb:0.029,200,6920, Shape:(7120, 65)
derby-10.2.1

In [6]:
# Display the results frame left over from the most recent dataset processed
# by the batch-evaluation loop.
df

Unnamed: 0,auc_mean,auc_std,gmean_mean,gmean_std
ERUS,0.740254,0.026282,0.735015,0.028282
proposed,0.737993,0.023988,0.733518,0.026655
rf,0.521987,0.01212,0.206414,0.062329
nb,0.666078,0.024298,0.604302,0.040115
log+nb,0.736829,0.028413,0.718678,0.036066
sm+nb,0.693483,0.025815,0.650268,0.037477
rus+nb,0.679755,0.03135,0.641565,0.047094
SMTBoost,0.549106,0.024573,0.310749,0.087712
RUSBoost,0.616546,0.04021,0.553274,0.086641
BalancedRF,0.771767,0.027423,0.767988,0.029692


In [None]:
# Show the results indexed by model name. The 'Unnamed: 0' column only exists
# when the CSV was read without index_col=0; when the frame is already indexed
# by model name (as the batch loop loads it) set_index would raise KeyError,
# so fall back to the frame itself.
df.set_index('Unnamed: 0') if 'Unnamed: 0' in df.columns else df

In [None]:
# Inspect the column labels of the current results frame.
df.columns