In [4]:
import numpy as np,os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, power_transform, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.utils import shuffle
from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.ensemble import RUSBoostClassifier, BalancedRandomForestClassifier, BalancedBaggingClassifier
from imblearn.pipeline import make_pipeline
import seaborn as sns,matplotlib.pyplot as plt

from library.utils import evaluate, read_data

In [5]:
# CSV file names of the datasets used in the experiment, in evaluation order.
DATASETS = [
    'groovy-1_5_7.csv',
    'jruby-1.4.0.csv',
    'lucene-2.9.0.csv',
    'jruby-1.7.0.preview1.csv',
    'groovy-1_6_BETA_1.csv',
    'derby-10.2.1.6.csv',
    'wicket-1.5.3.csv',
    'camel-2.9.0.csv',
    'camel-1.4.0.csv',
    'activemq-5.8.0.csv',
]

In [24]:
# Single seed shared by every stochastic sampler/learner below so that a re-run of the
# notebook reproduces the same numbers (the CV splitter is seeded with the same value).
SEED = 42

# Resampling strategies; each one is later placed in front of a classifier in a pipeline.
imbs = {
    'smote': SMOTE(k_neighbors=5, random_state=SEED),
    # `sampling_strategy` is passed by keyword: newer imbalanced-learn releases
    # deprecate and then reject positional constructor arguments.
    'rus': RandomUnderSampler(sampling_strategy='not minority', random_state=SEED),
    'wilson':EditedNearestNeighbours(n_neighbors=5),  #Default was 3
}

# Plain classifiers that are combined with each resampling strategy above.
clfs = {
    'dt': DecisionTreeClassifier(max_depth=20, random_state=SEED),
    'lr': LogisticRegression(solver='lbfgs',max_iter=1000),
    'nb': GaussianNB(),
    'knn': KNeighborsClassifier(n_neighbors=5),
    'rf': RandomForestClassifier(n_estimators=50, random_state=SEED),
}

# Ensemble methods that balance the classes internally, so they are used without a sampler.
# NOTE(review): `base_estimator` is kept as in the original; newer scikit-learn /
# imbalanced-learn releases rename it to `estimator` - adjust when upgrading.
ensembles = {
    'rboost_DT': RUSBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5),algorithm='SAMME',n_estimators=10,random_state=SEED),
    'rboost_NB': RUSBoostClassifier(base_estimator=GaussianNB(),algorithm='SAMME',n_estimators=10,random_state=SEED),
    'bbag_DT': BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=20,max_features='sqrt'),random_state=SEED),
    'bbag_NB': BalancedBaggingClassifier(base_estimator=GaussianNB(),random_state=SEED),
}

In [25]:
# Every (sampler, classifier) combination becomes an imblearn pipeline keyed by
# the pair of short names, e.g. ('smote', 'dt').
models = {
    (imb_name, clf_name): make_pipeline(sampler, classifier)
    for imb_name, sampler in imbs.items()
    for clf_name, classifier in clfs.items()
}

# The ensembles are used as-is and filed under the pseudo-sampler name 'ens'.
models.update({('ens', ens_name): ensemble for ens_name, ensemble in ensembles.items()})
models.keys()

dict_keys([('smote', 'dt'), ('smote', 'lr'), ('smote', 'nb'), ('smote', 'knn'), ('smote', 'rf'), ('rus', 'dt'), ('rus', 'lr'), ('rus', 'nb'), ('rus', 'knn'), ('rus', 'rf'), ('wilson', 'dt'), ('wilson', 'lr'), ('wilson', 'nb'), ('wilson', 'knn'), ('wilson', 'rf'), ('ens', 'rboost_DT'), ('ens', 'rboost_NB'), ('ens', 'bbag_DT'), ('ens', 'bbag_NB')])

In [8]:
def pr_rec_score(y,yp):
    """Return the area under the precision-recall curve.

    `y` holds the reference labels and `yp` the predictions/scores; both are
    forwarded unchanged to `precision_recall_curve`, and the resulting
    (recall, precision) points are integrated with `auc`.
    """
    precision, recall, _thresholds = precision_recall_curve(y, yp)
    return auc(recall, precision)


# Stratified 10-fold cross-validation, repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=42)

# Metric callables; each function's __name__ labels the 'metric' level of the result table.
scorers = [matthews_corrcoef, pr_rec_score]

In [9]:
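# Disabled smoke test: evaluate a single pipeline on a single dataset.
# NOTE: `SHORT` is not defined in the cells above; `DATASETS` is probably the intended list.
# X,y_noisy,y_real = read_data(SHORT[5])
# evaluate(models[('smote','dt')],X,y_noisy,y_real,cv,scorers)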

In [10]:
# Result table: one row per dataset, one column per (imb, clf, metric) triple.
# The columns are derived from `models` rather than from imbs x clfs, so the
# ('ens', <ensemble>) entries get their columns up front as well instead of
# being created implicitly when the evaluation loop first assigns to them.
metric_names = [f.__name__ for f in scorers]
cols = pd.MultiIndex.from_tuples(
    [(imb, clf, metric) for (imb, clf) in models for metric in metric_names],
    names=['imb','clf','metric'],
)
# Float dtype: every cell holds a mean score, and NaN marks "not computed yet".
df = pd.DataFrame(index=DATASETS,columns=cols,dtype=float)

In [28]:
# Evaluate every model on every dataset and store the mean of each metric in `df`.
# The cell is resumable: (dataset, model) pairs whose metric cells are all filled in
# are skipped, so it can be re-run after an interruption without redoing finished work.
for k in models:
    print(k)
    imb_name, clf_name = k
    for d in DATASETS:
        try:
            # True when no metric of this (dataset, model) pair is still NaN.
            done = df.loc[d,(imb_name,clf_name,slice(None))].notna().all()
        except KeyError:
            # The table has no columns for this model yet, so nothing was computed.
            # Only KeyError is caught: a bare `except` would also swallow
            # KeyboardInterrupt and hide genuine errors.
            done = False
        if done:
            print("Skipping ",d,imb_name,clf_name)
            continue
        X,y_noisy,y_real = read_data(d,stats=False)
        # `r` is iterated by key and `.mean()` is taken of each value - presumably a
        # mapping of metric name -> per-fold scores (TODO confirm in library.utils).
        r = evaluate(models[k],X,y_noisy,y_real,cv,scorers)
        for f in r:
            df.loc[d,(imb_name,clf_name,f)] = r[f].mean()

('smote', 'dt')
Skipping  groovy-1_5_7.csv smote dt
Skipping  jruby-1.4.0.csv smote dt
Skipping  lucene-2.9.0.csv smote dt
Skipping  jruby-1.7.0.preview1.csv smote dt
Skipping  groovy-1_6_BETA_1.csv smote dt
Skipping  derby-10.2.1.6.csv smote dt
Skipping  wicket-1.5.3.csv smote dt
Skipping  camel-2.9.0.csv smote dt
Skipping  camel-1.4.0.csv smote dt
Skipping  activemq-5.8.0.csv smote dt
('smote', 'lr')
Skipping  groovy-1_5_7.csv smote lr
Skipping  jruby-1.4.0.csv smote lr
Skipping  lucene-2.9.0.csv smote lr
Skipping  jruby-1.7.0.preview1.csv smote lr
Skipping  groovy-1_6_BETA_1.csv smote lr
Skipping  derby-10.2.1.6.csv smote lr
Skipping  wicket-1.5.3.csv smote lr
Skipping  camel-2.9.0.csv smote lr
Skipping  camel-1.4.0.csv smote lr
Skipping  activemq-5.8.0.csv smote lr
('smote', 'nb')
Skipping  jruby-1.4.0.csv smote nb
Skipping  lucene-2.9.0.csv smote nb
Skipping  jruby-1.7.0.preview1.csv smote nb
Skipping  groovy-1_6_BETA_1.csv smote nb
Skipping  derby-10.2.1.6.csv smote nb
Skipping  

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  recall = tps / tps[-1]
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  recall = tps / tps[-1]


Skipping  lucene-2.9.0.csv rus dt
Skipping  jruby-1.7.0.preview1.csv rus dt
Skipping  groovy-1_6_BETA_1.csv rus dt
Skipping  derby-10.2.1.6.csv rus dt
Skipping  wicket-1.5.3.csv rus dt
Skipping  camel-2.9.0.csv rus dt
Skipping  camel-1.4.0.csv rus dt
Skipping  activemq-5.8.0.csv rus dt
('rus', 'lr')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  recall = tps / tps[-1]


Skipping  jruby-1.4.0.csv rus lr
Skipping  lucene-2.9.0.csv rus lr
Skipping  jruby-1.7.0.preview1.csv rus lr
Skipping  groovy-1_6_BETA_1.csv rus lr
Skipping  derby-10.2.1.6.csv rus lr
Skipping  wicket-1.5.3.csv rus lr
Skipping  camel-2.9.0.csv rus lr
Skipping  camel-1.4.0.csv rus lr
Skipping  activemq-5.8.0.csv rus lr
('rus', 'nb')
Skipping  groovy-1_5_7.csv rus nb
Skipping  jruby-1.4.0.csv rus nb
Skipping  lucene-2.9.0.csv rus nb
Skipping  jruby-1.7.0.preview1.csv rus nb
Skipping  groovy-1_6_BETA_1.csv rus nb
Skipping  derby-10.2.1.6.csv rus nb
Skipping  wicket-1.5.3.csv rus nb
Skipping  camel-2.9.0.csv rus nb
Skipping  camel-1.4.0.csv rus nb
Skipping  activemq-5.8.0.csv rus nb
('rus', 'knn')
Skipping  groovy-1_5_7.csv rus knn
Skipping  jruby-1.4.0.csv rus knn
Skipping  lucene-2.9.0.csv rus knn
Skipping  jruby-1.7.0.preview1.csv rus knn
Skipping  groovy-1_6_BETA_1.csv rus knn
Skipping  derby-10.2.1.6.csv rus knn
Skipping  wicket-1.5.3.csv rus knn
Skipping  camel-2.9.0.csv rus knn
Skip

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  recall = tps / tps[-1]
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  recall = tps / tps[-1]


Skipping  jruby-1.4.0.csv wilson nb
Skipping  lucene-2.9.0.csv wilson nb
Skipping  jruby-1.7.0.preview1.csv wilson nb
Skipping  groovy-1_6_BETA_1.csv wilson nb
Skipping  derby-10.2.1.6.csv wilson nb
Skipping  wicket-1.5.3.csv wilson nb
Skipping  camel-2.9.0.csv wilson nb
Skipping  camel-1.4.0.csv wilson nb
Skipping  activemq-5.8.0.csv wilson nb
('wilson', 'knn')
Skipping  groovy-1_5_7.csv wilson knn
Skipping  jruby-1.4.0.csv wilson knn
Skipping  lucene-2.9.0.csv wilson knn
Skipping  jruby-1.7.0.preview1.csv wilson knn
Skipping  groovy-1_6_BETA_1.csv wilson knn
Skipping  derby-10.2.1.6.csv wilson knn
Skipping  wicket-1.5.3.csv wilson knn
Skipping  camel-2.9.0.csv wilson knn
Skipping  camel-1.4.0.csv wilson knn
Skipping  activemq-5.8.0.csv wilson knn
('wilson', 'rf')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  recall = tps / tps[-1]


Skipping  jruby-1.4.0.csv wilson rf
Skipping  lucene-2.9.0.csv wilson rf
Skipping  jruby-1.7.0.preview1.csv wilson rf
Skipping  groovy-1_6_BETA_1.csv wilson rf
Skipping  derby-10.2.1.6.csv wilson rf
Skipping  wicket-1.5.3.csv wilson rf
Skipping  camel-2.9.0.csv wilson rf
Skipping  camel-1.4.0.csv wilson rf
Skipping  activemq-5.8.0.csv wilson rf
('ens', 'rboost_DT')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  recall = tps / tps[-1]


Skipping  jruby-1.4.0.csv ens rboost_DT
Skipping  lucene-2.9.0.csv ens rboost_DT
Skipping  jruby-1.7.0.preview1.csv ens rboost_DT
Skipping  groovy-1_6_BETA_1.csv ens rboost_DT
Skipping  derby-10.2.1.6.csv ens rboost_DT
Skipping  wicket-1.5.3.csv ens rboost_DT
Skipping  camel-2.9.0.csv ens rboost_DT
Skipping  camel-1.4.0.csv ens rboost_DT
Skipping  activemq-5.8.0.csv ens rboost_DT
('ens', 'rboost_NB')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  recall = tps / tps[-1]


('ens', 'bbag_DT')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  recall = tps / tps[-1]


('ens', 'bbag_NB')


In [31]:
# Total number of result cells that are still NaN across all datasets, models and metrics.
df.isna().sum().sum()

7

In [36]:
# Boolean mask of the result table: True marks a (dataset, model, metric) cell that is NaN.
df.isna()

imb,smote,smote,smote,smote,smote,smote,smote,smote,smote,smote,...,wilson,wilson,ens,ens,ens,ens,ens,ens,ens,ens
clf,dt,dt,lr,lr,nb,nb,knn,knn,rf,rf,...,rf,rf,rboost_DT,rboost_DT,rboost_NB,rboost_NB,bbag_DT,bbag_DT,bbag_NB,bbag_NB
metric,matthews_corrcoef,pr_rec_score,matthews_corrcoef,pr_rec_score,matthews_corrcoef,pr_rec_score,matthews_corrcoef,pr_rec_score,matthews_corrcoef,pr_rec_score,...,matthews_corrcoef,pr_rec_score,matthews_corrcoef,pr_rec_score,matthews_corrcoef,pr_rec_score,matthews_corrcoef,pr_rec_score,matthews_corrcoef,pr_rec_score
groovy-1_5_7.csv,False,False,False,False,False,True,False,False,False,False,...,False,True,False,True,False,True,False,True,False,False
jruby-1.4.0.csv,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
lucene-2.9.0.csv,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
jruby-1.7.0.preview1.csv,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
groovy-1_6_BETA_1.csv,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
derby-10.2.1.6.csv,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
wicket-1.5.3.csv,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
camel-2.9.0.csv,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
camel-1.4.0.csv,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
activemq-5.8.0.csv,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [32]:
# Persist the result table as CSV (path is relative to the current working directory).
# The columns form a three-level MultiIndex (imb, clf, metric), so reading the file back
# needs e.g. pd.read_csv(path, header=[0, 1, 2], index_col=0).
df.to_csv("Imb X Clf.csv")

## Statistical Analysis

In [None]:
from scipy.stats import wilcoxon, friedmanchisquare
import scikit_posthocs as sp

In [None]:
# Name the row index 'Datasets' and turn it into a regular column so it can act as
# the identifier column when the table is melted to long format.
boo = df.rename_axis('Datasets').reset_index()

In [None]:
# Long format: one row per (dataset, imb, clf, metric) combination; the column-index
# levels become ordinary columns and the score lands in the melted value column.
res = boo.melt(id_vars=['Datasets'])
res.columns

In [None]:
# Display the long-format result table.
res

### Compare DT & RF & NB

In [None]:
# Keep only the Matthews-correlation rows; the now-constant 'metric' column is dropped.
is_mcc = res['metric'] == 'matthews_corrcoef'
mathew = res.loc[is_mcc].drop(columns=['metric'])

In [None]:
# Restrict the comparison to the three classifiers of interest: DT, RF and NB.
selected_clf = mathew['clf'].isin(['dt','rf','nb'])
mathew = mathew.loc[selected_clf]
mathew.shape

In [None]:
# Index the MCC rows by (dataset, resampling method); the classifier name and the
# score remain as ordinary columns.
tmp = mathew.set_index(['Datasets','imb'])
tmp

In [None]:
# Spread the classifiers into columns: rows stay keyed by (dataset, resampling method)
# and each classifier gets its own column of scores (nested under the remaining
# value column as an extra column level).
hey = tmp.pivot(columns='clf')
hey

In [None]:
# Flatten to a plain matrix of scores: discard the (dataset, imb) row index and the
# outer column level so that the columns are just the classifier names.
dog = hey.reset_index(drop=True).droplevel(0, axis=1)
dog

In [None]:
# Friedman test across the classifier columns; each row of `dog` is one matched block.
friedmanchisquare(*[dog[clf_name] for clf_name in dog.columns])

In [None]:
# Mean score of each classifier column over all rows of the comparison matrix.
dog.mean()

In [None]:
# Pairwise Wilcoxon signed-rank p-values between the classifier columns of `dog`.
# The matrix is symmetric and its diagonal is left as NaN (no self-comparison).
pvals = pd.DataFrame(columns=dog.columns,index=dog.columns,dtype='float')
for i,c in enumerate(dog.columns):
    for d in dog.columns[i+1:]:
        # The default two-sided test gives the same p-value for (c, d) and (d, c),
        # so each unordered pair is tested once and the result is mirrored.
        p = float(wilcoxon(dog[c],dog[d]).pvalue)
        pvals.loc[c,d] = p
        pvals.loc[d,c] = p
pvals

In [None]:
from statsmodels.stats.multitest import multipletests

In [None]:
# Collect the p-values for multiple-testing correction.
# `pvals` is symmetric (entry [c, d] equals entry [d, c]), so flattening the whole
# matrix would hand every hypothesis to the correction twice and inflate the number
# of tests. Only the strict upper triangle is taken: one value per unordered pair.
upper = np.triu_indices(len(pvals), k=1)
ps = pvals.values[upper]
# Mask of entries without a p-value, kept so the following cells can filter on it.
idx = np.isnan(ps)
idx

In [None]:
# The p-values that are not NaN, i.e. the ones that go into the correction.
ps[~idx]

In [None]:
# Adjust the non-NaN p-values for multiple comparisons using the 'fdr_by'
# (Benjamini-Yekutieli false discovery rate) method of statsmodels.
multipletests(ps[~idx],method='fdr_by')