In [2]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, power_transform, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.utils import shuffle
from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.ensemble import RUSBoostClassifier, BalancedRandomForestClassifier, BalancedBaggingClassifier
from imblearn.pipeline import Pipeline
import seaborn as sns,matplotlib.pyplot as plt

from library.configs import IMBS, CLFS, ENSEMBLES, CV, SCORERS
from library.utils import evaluate, read_data

In [3]:
DATASETS = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]
len(DATASETS)

32

In [4]:
models = {}
# for im,samp in IMBS.items():
#     for c,clf in CLFS.items():
#         models[(im,c)] = Pipeline([('samp',samp),('clf',clf)])

bases = {'nb':GaussianNB(),'dt':DecisionTreeClassifier(max_depth=20,max_features='sqrt')}
models[('UBag',"BagNB")] = BalancedBaggingClassifier(base_estimator=bases['nb'],n_estimators=20)
models[('UBag',"BagDT")] = BalancedBaggingClassifier(base_estimator=bases['dt'],n_estimators=20)
    
models.keys(),len(models)

(dict_keys([('UBag', 'BagNB'), ('UBag', 'BagDT')]), 2)

In [5]:
path = "Noisy_Bag.csv"
cols = pd.MultiIndex.from_product([['UBag'],["BagNB","BagDT"],[f.__name__ for f in SCORERS]],names=['imb','clf','metric'])
df = pd.DataFrame(index=DATASETS,columns=cols)
#df = pd.read_csv(path,header=[0,1,2],index_col=0)

In [6]:
%%time
for it,d in enumerate(DATASETS):
    print(it)
    X,y_noisy,y_real = read_data(d,stats=True)
    for k in models:
        print(k)
        sd = perf_counter()
        r = evaluate(models[k],X,y_noisy,y_real,CV,SCORERS)
        for f in r:
            df.loc[d,(k[0],k[1],f)] = r[f].mean()
        print(round(perf_counter()-sd,2),[round(r[f].mean(),3) for f in r])
    print()
    df.to_csv(path)

0
activemq-5.8.0.csv noise:0.058, imb:15.847,203,3217, Shape:(3420, 65)
('UBag', 'BagNB')
0.63 [0.284, 0.375]
('UBag', 'BagDT')
0.68 [0.258, 0.284]

1
groovy-1_6_BETA_1.csv noise:0.128, imb:6.017,117,704, Shape:(821, 65)
('UBag', 'BagNB')
0.45 [0.249, 0.374]
('UBag', 'BagDT')
0.47 [0.375, 0.406]

2
activemq-5.3.0.csv noise:0.094, imb:15.669,142,2225, Shape:(2367, 65)
('UBag', 'BagNB')
0.51 [0.328, 0.456]
('UBag', 'BagDT')
0.55 [0.346, 0.433]

3
wicket-1.3.0-incubating-beta-1.csv noise:0.164, imb:4.806,288,1384, Shape:(1672, 65)
('UBag', 'BagNB')
0.48 [0.327, 0.394]
('UBag', 'BagDT')
0.6 [0.279, 0.291]

4
jruby-1.1.csv noise:0.175, imb:3.540,161,570, Shape:(731, 65)
('UBag', 'BagNB')
0.4 [0.381, 0.651]
('UBag', 'BagDT')
0.46 [0.439, 0.56]

5
jruby-1.4.0.csv noise:0.190, imb:3.890,200,778, Shape:(978, 65)
('UBag', 'BagNB')
0.47 [0.45, 0.613]
('UBag', 'BagDT')
0.5 [0.411, 0.567]

6
lucene-2.3.0.csv noise:0.204, imb:4.031,160,645, Shape:(805, 65)
('UBag', 'BagNB')
0.41 [0.417, 0.61]
('UBag

In [None]:
df.shape

## Effect of Noise on Performance

In [None]:
noise = pd.read_csv("Imb X Clf- Noise.csv",header=[0,1,2],index_col=0)
clean = pd.read_csv("Imb X Clf- Clean.csv",header=[0,1,2],index_col=0)
noise.shape,clean.shape

In [None]:
wilcoxon(noise.values.reshape(-1),clean.values.reshape(-1))

In [None]:
noise.values.mean(),clean.values.mean()

## Statistical Analysis

In [None]:
from scipy.stats import wilcoxon, friedmanchisquare
import scikit_posthocs as sp

In [None]:
boo = df.rename_axis('Datasets').reset_index()

In [None]:
res = pd.melt(boo,id_vars=['Datasets'])
res.columns

In [None]:
res

### Compare DT & RF & NB

In [None]:
mathew =  res[res['metric']=='matthews_corrcoef'].drop(columns=['metric'])

In [None]:
mathew = mathew[mathew['clf'].isin(['dt','rf','nb'])]
mathew.shape

In [None]:
tmp = mathew.set_index(['Datasets','imb'])
tmp

In [None]:
hey = tmp.pivot(columns='clf')
hey

In [None]:
dog = hey.reset_index(drop=True)
dog.columns = dog.columns.droplevel(0)
dog

In [None]:
friedmanchisquare(*(dog[c] for c in dog.columns))

In [None]:
dog.mean()f

In [None]:
pvals = pd.DataFrame(columns=dog.columns,index=dog.columns,dtype='float')
for c in dog.columns:
    for d in dog.columns:
        if c==d: continue
        pvals.loc[c,d] = float(wilcoxon(dog[c],dog[d]).pvalue)
pvals

In [None]:
from statsmodels.stats.multitest import multipletests

In [None]:
ps = pvals.values.reshape(-1)
idx = np.isnan(ps)
idx

In [None]:
ps[~idx]

In [None]:
multipletests(ps[~idx],method='fdr_by')