In [1]:
import numpy as np
import pandas as pd
from scipy.stats import wilcoxon, friedmanchisquare
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
# Metric removed from level 2 of every result table loaded in this notebook
# (values noted by the author: pr_rec_score, matthews_corrcoef).
METRIC_TO_DROP = 'matthews_corrcoef'

# IPF results: the first three CSV rows form a 3-level column index and the
# first CSV column becomes the row index.
ipf = pd.read_csv("IPF.csv", index_col=0, header=[0, 1, 2])
ipf = (
    # discard every top-level group whose label contains 'ens'
    ipf.drop(columns=[top for top, *_ in ipf.columns if 'ens' in top], level=0)
       # discard the unwanted metric, then remove level 2 itself
       .drop(columns=[METRIC_TO_DROP], level=2)
       .droplevel(2, axis=1)
)
print(f"ipf:{ipf.shape}")

ipf:(32, 30)


In [22]:
# IHFilter results: 3-level column index from the first three CSV rows,
# row index from the first CSV column.
ihf = pd.read_csv("IHFilter.csv", index_col=0, header=[0, 1, 2])
ihf = (
    # discard every top-level group whose label contains 'ens'
    ihf.drop(columns=[top for top, *_ in ihf.columns if 'ens' in top], level=0)
       # discard the unwanted metric, then remove level 2 itself
       .drop(columns=[METRIC_TO_DROP], level=2)
       .droplevel(2, axis=1)
)
print(f"ihf:{ihf.shape}")

ihf:(32, 30)


In [23]:
# CLNI (consensus) results: 3-level column index from the first three CSV rows,
# row index from the first CSV column.
clni = pd.read_csv("CLNI_consensus.csv", index_col=0, header=[0, 1, 2])
clni = (
    # discard every top-level group whose label contains 'ens'
    clni.drop(columns=[top for top, *_ in clni.columns if 'ens' in top], level=0)
        # discard the unwanted metric, then remove level 2 itself
        .drop(columns=[METRIC_TO_DROP], level=2)
        .droplevel(2, axis=1)
)
print(f"clni:{clni.shape}")

clni:(32, 30)


In [24]:
# Results from "Noisy.csv" (stored under the key 'none' in later cells):
# 3-level column index from the first three CSV rows, row index from the
# first CSV column.
none = pd.read_csv("Noisy.csv", header=[0, 1, 2], index_col=0)
# Remove top-level groups whose label contains 'ens', judged on THIS frame's
# own labels rather than on another table's columns. A boolean mask is used
# instead of drop(), so the step is a harmless no-op when no label matches.
none = none.loc[:, ['ens' not in top for top in none.columns.get_level_values(0)]]
# Discard the unwanted metric, then remove level 2 itself.
none = none.drop(columns=[METRIC_TO_DROP], level=2).droplevel(2, axis=1)
none.shape

(32, 30)

In [25]:
# Builtin3 results: load with a 3-level column index, discard the unwanted
# metric from level 2, then remove level 2 itself.
built3 = (
    pd.read_csv("Builtin3.csv", index_col=0, header=[0, 1, 2])
      .drop(columns=[METRIC_TO_DROP], level=2)
      .droplevel(2, axis=1)
)
built3.shape

(32, 18)

In [26]:
# Spyder results: load with a 3-level column index, discard the unwanted metric
# from level 2, then strip level 2 and finally the top level, leaving a flat
# column index.
spyder = (
    pd.read_csv("Spyder.csv", index_col=0, header=[0, 1, 2])
      .drop(columns=[METRIC_TO_DROP], level=2)
      .droplevel(2, axis=1)
      .droplevel(0, axis=1)
)
spyder.shape

(32, 6)

In [27]:
# Results from "Tackling.csv" (stored under the key 'ncl' in later cells): load
# with a 3-level column index, discard the unwanted metric from level 2, then
# strip level 2 and finally the top level, leaving a flat column index.
ncl = (
    pd.read_csv("Tackling.csv", index_col=0, header=[0, 1, 2])
      .drop(columns=[METRIC_TO_DROP], level=2)
      .droplevel(2, axis=1)
      .droplevel(0, axis=1)
)
ncl.shape

(32, 6)

In [28]:
# Split the Builtin3 table into one sub-table per top-level column label.
enn, smote_enc, IHThreshold = (
    built3[label] for label in ('ENN', 'SmoteEnc', 'IHThreshold')
)

In [29]:
# Keep the 'smote' group of the IPF table on its own; this has to happen before
# `ipf` is collapsed to per-label medians in the following cell.
smote_ipf = ipf.loc[:, 'smote']

In [30]:
def _median_over_level(frame, level=1):
    """Collapse a 2-level column index to `level`, taking the median per label.

    `DataFrame.median(axis=1, level=...)` was deprecated in pandas 1.3 and
    removed in 2.0; transposing and grouping the rows is the replacement.
    `sort=False` keeps the labels in their original column order.

    A frame whose columns are already single-level is returned unchanged, so
    re-running this cell does not fail.
    """
    if frame.columns.nlevels == 1:
        return frame
    return frame.T.groupby(level=level, sort=False).median().T


# Reduce each table to one column per level-1 label (the labels used as
# classifier names later on, e.g. 'dt', 'lr'): median across top-level groups.
ipf = _median_over_level(ipf)
ihf = _median_over_level(ihf)
clni = _median_over_level(clni)
none = _median_over_level(none)

In [31]:
# Result tables grouped by filter family; the keys become the top-level column
# labels of the combined frame built in a later cell.
focused = dict(spyder=spyder, smote_ipf=smote_ipf, smote_enc=smote_enc)
general = dict(ipf=ipf, ihf=ihf, clni=clni)
one_sided = dict(ihthres=IHThreshold, enn=enn, ncl=ncl)
nofil = dict(none=none)
# Merge the groups into one mapping, preserving the group order above.
filters = {
    key: table
    for group in (focused, general, one_sided, nofil)
    for key, table in group.items()
}
len(filters)

10

In [32]:
# Sanity check per table: key, shape, number of missing cells, and number of
# cells with a value below 0.01.
for f, d in filters.items():
    print(f, d.shape, int(d.isna().to_numpy().sum()), int(d.lt(.01).to_numpy().sum()))

spyder (32, 6) 0 0
smote_ipf (32, 6) 0 0
smote_enc (32, 6) 0 0
ipf (32, 6) 0 1
ihf (32, 6) 0 0
clni (32, 6) 0 0
ihthres (32, 6) 0 0
enn (32, 6) 0 0
ncl (32, 6) 0 0
none (32, 6) 0 0


In [33]:
# Put all tables side by side; the dictionary keys become the top level of the
# resulting column index.
df = pd.concat(list(filters.values()), axis=1, keys=list(filters.keys()))
df.shape

(32, 60)

In [34]:
# Persist the combined table in the working directory.
df.to_csv(path_or_buf="All_filter_PRC.csv")

## Classifier-based View

In [None]:
# One column per filter: every value of that filter's sub-table, flattened.
filt = pd.DataFrame({key: df[key].to_numpy().ravel() for key in filters})
# Friedman test across all filter columns at once.
print(friedmanchisquare(*[filt[col] for col in filt.columns]).pvalue)
# Wilcoxon signed-rank test of each filter column against the 'none' column.
print({col: wilcoxon(filt[col], filt['none']).pvalue for col in filt.columns if col != 'none'})
plt.figure(figsize=(10, 6))
sns.boxplot(data=filt);

In [None]:
def plot_clf(name):
    """Compare all filters for one second-level column label of the global `df`.

    Selects the columns of `df` whose second-level label equals `name`, prints
    the Friedman test p-value across those columns and a dict of Wilcoxon
    signed-rank p-values of every column against the 'none' column, then draws
    a box plot of each column's difference to 'none'.

    Parameters
    ----------
    name : str
        Second-level column label of `df` (called with 'dt', 'lr', 'nb', 'rf',
        'svm' and 'knn' in this notebook).

    Returns
    -------
    None
    """
    clf = df.loc[:, (slice(None), name)].droplevel(1, axis=1)
    print(friedmanchisquare(*(clf[c] for c in clf)).pvalue)
    print({c: wilcoxon(clf[c], clf['none']).pvalue for c in clf if c != 'none'})
    # Subtract the baseline row by row. A plain `clf - clf['none']` aligns the
    # Series' row index against the column labels and produces only NaN.
    diff_to_none = clf.sub(clf['none'], axis=0)
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=diff_to_none)
    plt.title(f"{name}: difference to 'none'")

In [None]:
# Run the per-classifier comparison for each label in turn. The label is
# printed first and the figure is shown right after each call, so the printed
# p-values and the plot of one classifier stay together in the output.
for clf_name in ('dt', 'lr', 'nb', 'rf', 'svm', 'knn'):
    print(f"--- {clf_name} ---")
    plot_clf(clf_name)
    plt.show()

## Filter-based View

In [None]:
# Distinct second-level column labels of df.
CLFS = set(df.columns.get_level_values(1))
# Build one flattened column per label. Iterate in first-appearance column
# order rather than over the set itself: a set of strings has no stable
# iteration order between interpreter runs, which would reshuffle the boxes.
d = {
    c: df.loc[:, (slice(None), c)].droplevel(1, axis=1).values.reshape(-1)
    for c in df.columns.get_level_values(1).unique()
}
plt.figure(figsize=(10, 6))
sns.boxplot(data=pd.DataFrame(d));

In [None]:
# One box plot per top-level column label of df, each showing that label's
# sub-table column by column. Looping over the labels covers every filter
# present in df (including 'clni') and stays in sync if filters are added or
# removed. Each figure is titled with its label and shown immediately.
for filter_name in df.columns.get_level_values(0).unique():
    filt = df[filter_name]
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=filt)
    plt.title(filter_name)
    plt.show()