In [10]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, power_transform, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.utils import shuffle
from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.ensemble import RUSBoostClassifier, BalancedRandomForestClassifier, BalancedBaggingClassifier
from imblearn.pipeline import Pipeline
import seaborn as sns,matplotlib.pyplot as plt

from library.configs import IMBS, CLFS, ENSEMBLES, CV, SCORERS
from library.utils import evaluate, read_data

In [11]:
DATASETS = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]
len(DATASETS)

32

In [12]:
models = {}
for im,samp in IMBS.items():
    for c,clf in CLFS.items():
        models[(im,c)] = Pipeline([('samp',samp),('clf',clf)])
    
models.keys(),len(models)

(dict_keys([('smote', 'dt'), ('smote', 'lr'), ('smote', 'nb'), ('smote', 'svm'), ('smote', 'knn'), ('smote', 'rf'), ('rus', 'dt'), ('rus', 'lr'), ('rus', 'nb'), ('rus', 'svm'), ('rus', 'knn'), ('rus', 'rf'), ('wilson', 'dt'), ('wilson', 'lr'), ('wilson', 'nb'), ('wilson', 'svm'), ('wilson', 'knn'), ('wilson', 'rf'), ('tomek', 'dt'), ('tomek', 'lr'), ('tomek', 'nb'), ('tomek', 'svm'), ('tomek', 'knn'), ('tomek', 'rf'), ('None', 'dt'), ('None', 'lr'), ('None', 'nb'), ('None', 'svm'), ('None', 'knn'), ('None', 'rf')]),
 30)

In [13]:
path = "Clean.csv"
cols = pd.MultiIndex.from_product([IMBS.keys(),CLFS.keys(),[f.__name__ for f in SCORERS]],names=['imb','clf','metric'])
df = pd.DataFrame(index=DATASETS,columns=cols)
#df = pd.read_csv(path,header=[0,1,2],index_col=0)

In [None]:
%%time
for it,d in enumerate(DATASETS):
    print(it)
    X,y_noisy,y_real = read_data(d,stats=True)
    for k in models:
        print(k)
        sd = perf_counter()
        r = evaluate(models[k],X,y_real,y_real,CV,SCORERS)
        for f in r:
            df.loc[d,(k[0],k[1],f)] = r[f].mean()
        print(round(perf_counter()-sd,2),[round(r[f].mean(),3) for f in r])
    print()
    df.to_csv(path)

0
activemq-5.8.0.csv noise:0.058, imb:15.847,203,3217, Shape:(3420, 65)
('smote', 'dt')
3.56 [0.19, 0.256]
('smote', 'lr')
10.38 [0.298, 0.317]
('smote', 'nb')
0.32 [0.29, 0.391]
('smote', 'svm')
165.59 [0.292, 0.234]
('smote', 'knn')
3.22 [0.251, 0.31]
('smote', 'rf')
15.1 [0.262, 0.306]
('rus', 'dt')
0.2 [0.212, 0.431]
('rus', 'lr')
1.25 [0.276, 0.312]
('rus', 'nb')
0.14 [0.288, 0.391]
('rus', 'svm')
1.18 [0.266, 0.253]
('rus', 'knn')
0.47 [0.241, 0.298]
('rus', 'rf')
1.85 [0.276, 0.278]
('wilson', 'dt')
7.16 [0.241, 0.336]
('wilson', 'lr')
9.7 [0.339, 0.356]
('wilson', 'nb')
6.61 [0.284, 0.422]
('wilson', 'svm')
19.25 [0.32, 0.346]
('wilson', 'knn')
8.1 [0.281, 0.295]
('wilson', 'rf')
9.95 [0.318, 0.352]
('tomek', 'dt')
6.04 [0.224, 0.292]
('tomek', 'lr')
8.8 [0.297, 0.355]
('tomek', 'nb')
5.17 [0.294, 0.395]
('tomek', 'svm')
23.49 [0.3, 0.334]
('tomek', 'knn')
6.91 [0.254, 0.285]
('tomek', 'rf')
9.85 [0.311, 0.368]
('None', 'dt')
1.12 [0.188, 0.261]
('None', 'lr')
3.56 [0.28, 0.354

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


1.51 [0.26, 0.428]
('None', 'knn')
0.27 [0.459, 0.57]
('None', 'rf')
1.93 [0.518, 0.629]

2
activemq-5.3.0.csv noise:0.094, imb:15.669,142,2225, Shape:(2367, 65)
('smote', 'dt')
1.96 [0.278, 0.396]
('smote', 'lr')
5.46 [0.371, 0.47]
('smote', 'nb')
0.24 [0.337, 0.461]
('smote', 'svm')
74.48 [0.384, 0.434]
('smote', 'knn')
1.8 [0.28, 0.391]
('smote', 'rf')
8.84 [0.39, 0.481]
('rus', 'dt')
0.18 [0.264, 0.477]
('rus', 'lr')
1.17 [0.368, 0.465]
('rus', 'nb')
0.1 [0.335, 0.457]
('rus', 'svm')
1.57 [0.35, 0.441]
('rus', 'knn')
0.38 [0.306, 0.409]
('rus', 'rf')
1.82 [0.34, 0.453]
('wilson', 'dt')
3.84 [0.348, 0.484]
('wilson', 'lr')
5.38 [0.387, 0.485]
('wilson', 'nb')
3.58 [0.328, 0.477]
('wilson', 'svm')
10.26 [0.393, 0.469]
('wilson', 'knn')
4.24 [0.341, 0.42]
('wilson', 'rf')
6.34 [0.387, 0.479]
('tomek', 'dt')
3.71 [0.289, 0.399]
('tomek', 'lr')
5.7 [0.385, 0.49]
('tomek', 'nb')
3.14 [0.336, 0.455]
('tomek', 'svm')
15.85 [0.375, 0.474]
('tomek', 'knn')
4.27 [0.353, 0.431]
('tomek', 'rf')

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


15.74 [0.096, 0.167]
('tomek', 'knn')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


6.55 [0.103, 0.168]
('tomek', 'rf')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


8.39 [0.111, 0.211]
('None', 'dt')
0.66 [0.132, 0.188]
('None', 'lr')
2.21 [0.14, 0.22]
('None', 'nb')
0.11 [0.241, 0.307]
('None', 'svm')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


11.62 [0.11, 0.166]
('None', 'knn')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


1.58 [0.096, 0.168]
('None', 'rf')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


3.75 [0.159, 0.206]

11
lucene-3.1.csv noise:0.120, imb:7.477,331,2475, Shape:(2806, 65)
('smote', 'dt')
3.66 [0.102, 0.154]
('smote', 'lr')
9.1 [0.163, 0.168]
('smote', 'nb')
0.22 [0.168, 0.256]
('smote', 'svm')
131.18 [0.145, 0.112]
('smote', 'knn')
3.05 [0.154, 0.199]
('smote', 'rf')
14.19 [0.13, 0.164]
('rus', 'dt')
0.13 [0.116, 0.366]
('rus', 'lr')
0.9 [0.157, 0.165]
('rus', 'nb')
0.1 [0.171, 0.241]
('rus', 'svm')
0.5 [0.166, 0.122]
('rus', 'knn')
0.26 [0.129, 0.16]
('rus', 'rf')
1.54 [0.171, 0.146]
('wilson', 'dt')
7.1 [0.189, 0.251]
('wilson', 'lr')
9.17 [0.18, 0.2]
('wilson', 'nb')
6.25 [0.168, 0.239]
('wilson', 'svm')
15.35 [0.159, 0.172]
('wilson', 'knn')
7.66 [0.152, 0.145]
('wilson', 'rf')
9.77 [0.257, 0.257]
('tomek', 'dt')
5.91 [0.143, 0.194]
('tomek', 'lr')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


8.22 [0.198, 0.2]
('tomek', 'nb')
5.01 [0.17, 0.229]
('tomek', 'svm')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


17.9 [0.103, 0.165]
('tomek', 'knn')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


6.76 [0.119, 0.161]
('tomek', 'rf')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


9.35 [0.197, 0.275]
('None', 'dt')
1.0 [0.135, 0.184]
('None', 'lr')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


2.99 [0.197, 0.2]
('None', 'nb')
0.12 [0.17, 0.227]
('None', 'svm')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


14.07 [0.058, 0.163]
('None', 'knn')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


1.9 [0.119, 0.169]
('None', 'rf')
5.09 [0.207, 0.266]

12
groovy-1_6_BETA_2.csv noise:0.096, imb:7.583,103,781, Shape:(884, 65)
('smote', 'dt')
0.95 [0.362, 0.451]
('smote', 'lr')
2.8 [0.398, 0.428]
('smote', 'nb')
0.14 [0.275, 0.41]
('smote', 'svm')
9.4 [0.427, 0.411]
('smote', 'knn')
0.45 [0.38, 0.524]
('smote', 'rf')
4.14 [0.487, 0.575]
('rus', 'dt')
0.1 [0.299, 0.494]
('rus', 'lr')
0.6 [0.372, 0.363]
('rus', 'nb')
0.09 [0.256, 0.393]
('rus', 'svm')
0.25 [0.379, 0.317]
('rus', 'knn')
0.12 [0.374, 0.465]
('rus', 'rf')
1.44 [0.425, 0.508]
('wilson', 'dt')
0.98 [0.402, 0.508]
('wilson', 'lr')
1.83 [0.402, 0.392]
('wilson', 'nb')
0.89 [0.236, 0.411]
('wilson', 'svm')
1.86 [0.331, 0.35]
('wilson', 'knn')
1.02 [0.417, 0.439]
('wilson', 'rf')
2.57 [0.492, 0.535]
('tomek', 'dt')
0.9 [0.391, 0.47]
('tomek', 'lr')
2.03 [0.415, 0.471]
('tomek', 'nb')
0.73 [0.262, 0.395]
('tomek', 'svm')
2.47 [0.223, 0.416]
('tomek', 'knn')
0.94 [0.38, 0.461]
('tomek', 'rf')
2.63 [0.532, 0.622]
('None', 'dt')
0

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


1.89 [0.235, 0.432]
('None', 'knn')
0.28 [0.379, 0.461]
('None', 'rf')
2.04 [0.539, 0.638]

13
activemq-5.2.0.csv noise:0.113, imb:12.247,154,1886, Shape:(2040, 65)
('smote', 'dt')
1.55 [0.374, 0.476]
('smote', 'lr')
4.89 [0.453, 0.606]
('smote', 'nb')
0.22 [0.419, 0.576]
('smote', 'svm')
47.87 [0.473, 0.55]
('smote', 'knn')
1.53 [0.383, 0.492]
('smote', 'rf')
6.91 [0.525, 0.607]
('rus', 'dt')
0.16 [0.325, 0.514]
('rus', 'lr')
1.07 [0.446, 0.606]
('rus', 'nb')
0.1 [0.416, 0.571]
('rus', 'svm')
1.12 [0.445, 0.567]
('rus', 'knn')
0.33 [0.366, 0.505]
('rus', 'rf')
1.69 [0.444, 0.574]
('wilson', 'dt')
3.49 [0.446, 0.56]
('wilson', 'lr')
4.76 [0.512, 0.619]
('wilson', 'nb')
3.12 [0.428, 0.587]
('wilson', 'svm')
7.99 [0.517, 0.578]
('wilson', 'knn')
3.85 [0.45, 0.551]
('wilson', 'rf')
5.92 [0.522, 0.596]
('tomek', 'dt')
3.46 [0.399, 0.496]
('tomek', 'lr')
5.14 [0.523, 0.623]
('tomek', 'nb')
2.92 [0.433, 0.577]
('tomek', 'svm')
11.22 [0.513, 0.59]
('tomek', 'knn')
3.84 [0.438, 0.504]
('tomek'

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


1.71 [0.412, 0.436]
('tomek', 'nb')
0.67 [0.307, 0.463]
('tomek', 'svm')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


1.46 [0.382, 0.428]
('tomek', 'knn')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


0.89 [0.36, 0.419]
('tomek', 'rf')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


2.46 [0.464, 0.42]
('None', 'dt')
0.2 [0.257, 0.302]
('None', 'lr')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


0.93 [0.414, 0.437]
('None', 'nb')
0.06 [0.308, 0.458]
('None', 'svm')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


0.82 [0.392, 0.416]
('None', 'knn')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


0.25 [0.353, 0.445]
('None', 'rf')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


1.83 [0.411, 0.423]

15
hbase-0.95.0.csv noise:0.234, imb:17.341,91,1578, Shape:(1669, 65)
('smote', 'dt')
1.46 [0.347, 0.561]
('smote', 'lr')
5.45 [0.356, 0.565]
('smote', 'nb')
0.39 [0.297, 0.472]
('smote', 'svm')
34.66 [0.406, 0.577]
('smote', 'knn')
1.21 [0.379, 0.577]
('smote', 'rf')
6.15 [0.501, 0.664]
('rus', 'dt')
0.31 [0.313, 0.575]
('rus', 'lr')
1.91 [0.345, 0.536]
('rus', 'nb')
0.1 [0.292, 0.462]
('rus', 'svm')
3.72 [0.369, 0.549]
('rus', 'knn')
0.44 [0.381, 0.557]
('rus', 'rf')
2.38 [0.451, 0.632]
('wilson', 'dt')
2.23 [0.363, 0.604]
('wilson', 'lr')
3.9 [0.353, 0.538]
('wilson', 'nb')
1.97 [0.289, 0.484]
('wilson', 'svm')
7.12 [0.394, 0.541]
('wilson', 'knn')
2.4 [0.402, 0.608]
('wilson', 'rf')
4.45 [0.45, 0.62]
('tomek', 'dt')
2.57 [0.353, 0.562]
('tomek', 'lr')
5.01 [0.386, 0.566]
('tomek', 'nb')
2.02 [0.297, 0.462]
('tomek', 'svm')
12.58 [0.371, 0.584]
('tomek', 'knn')
2.66 [0.423, 0.595]
('tomek', 'rf')
5.48 [0.479, 0.657]
('None', 'dt')
0.66 [0.332, 0.544]
('None', 'l

In [21]:
df.shape

(32, 42)

## Effect of Noise on Performance

In [None]:
noise = pd.read_csv("Imb X Clf- Noise.csv",header=[0,1,2],index_col=0)
clean = pd.read_csv("Imb X Clf- Clean.csv",header=[0,1,2],index_col=0)
noise.shape,clean.shape

In [None]:
wilcoxon(noise.values.reshape(-1),clean.values.reshape(-1))

In [None]:
noise.values.mean(),clean.values.mean()

## Statistical Analysis

In [None]:
from scipy.stats import wilcoxon, friedmanchisquare
import scikit_posthocs as sp

In [None]:
boo = df.rename_axis('Datasets').reset_index()

In [None]:
res = pd.melt(boo,id_vars=['Datasets'])
res.columns

In [None]:
res

### Compare DT & RF & NB

In [None]:
mathew =  res[res['metric']=='matthews_corrcoef'].drop(columns=['metric'])

In [None]:
mathew = mathew[mathew['clf'].isin(['dt','rf','nb'])]
mathew.shape

In [None]:
tmp = mathew.set_index(['Datasets','imb'])
tmp

In [None]:
hey = tmp.pivot(columns='clf')
hey

In [None]:
dog = hey.reset_index(drop=True)
dog.columns = dog.columns.droplevel(0)
dog

In [None]:
friedmanchisquare(*(dog[c] for c in dog.columns))

In [None]:
dog.mean()f

In [None]:
pvals = pd.DataFrame(columns=dog.columns,index=dog.columns,dtype='float')
for c in dog.columns:
    for d in dog.columns:
        if c==d: continue
        pvals.loc[c,d] = float(wilcoxon(dog[c],dog[d]).pvalue)
pvals

In [None]:
from statsmodels.stats.multitest import multipletests

In [None]:
ps = pvals.values.reshape(-1)
idx = np.isnan(ps)
idx

In [None]:
ps[~idx]

In [None]:
multipletests(ps[~idx],method='fdr_by')