In [2]:
import numpy as np
import pandas as pd,os
from sklearn.metrics import precision_score, recall_score, confusion_matrix
from scipy.stats import wilcoxon, friedmanchisquare, rankdata
import scikit_posthocs as sp
import pingouin as pg
import seaborn as sns
import matplotlib.pyplot as plt
from library.utils import read_data

In [4]:
clean = pd.read_csv("Results/Clean.csv",header=[0,1,2],index_col=0)  # All_filter_MCC.csv
noisy = pd.read_csv("Results/Noisy.csv",header=[0,1,2],index_col=0)  #
clean = clean.drop(columns=['pr_rec_score'],level=2).droplevel(2,axis=1)  # matthews_corrcoef
noisy = noisy.drop(columns=['pr_rec_score'],level=2).droplevel(2,axis=1)
df = clean-noisy
clean.shape, noisy.shape

((32, 32), (32, 32))

## Dataset Characteristic vs MCC

In [5]:
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]

In [8]:
noisy.columns

MultiIndex([( 'smote',    'dt'),
            ( 'smote',    'lr'),
            ( 'smote',    'nb'),
            ( 'smote',   'svm'),
            ( 'smote',   'knn'),
            ( 'smote',    'rf'),
            (   'rus',    'dt'),
            (   'rus',    'lr'),
            (   'rus',    'nb'),
            (   'rus',   'svm'),
            (   'rus',   'knn'),
            (   'rus',    'rf'),
            ('wilson',    'dt'),
            ('wilson',    'lr'),
            ('wilson',    'nb'),
            ('wilson',   'svm'),
            ('wilson',   'knn'),
            ('wilson',    'rf'),
            ( 'tomek',    'dt'),
            ( 'tomek',    'lr'),
            ( 'tomek',    'nb'),
            ( 'tomek',   'svm'),
            ( 'tomek',   'knn'),
            ( 'tomek',    'rf'),
            (  'None',    'dt'),
            (  'None',    'lr'),
            (  'None',    'nb'),
            (  'None',   'svm'),
            (  'None',   'knn'),
            (  'None',    'rf'),
          

In [24]:
info = pd.DataFrame(columns=['noise','FP','FN','TP','TN','nIR','IR','PR','Rec','pos','neg'])
for d in DATASETS:
    X,y_noisy,y_real = read_data(d,stats=False)
    noise = (y_noisy!=y_real).sum()/len(y_noisy)
    tn, fp, fn, tp = confusion_matrix(y_real,y_noisy).ravel()
    nIR = (y_noisy==0).sum()/y_noisy.sum()
    IR = (y_real==0).sum()/y_real.sum()
    pr = precision_score(y_real,y_noisy)
    rec = recall_score(y_real,y_noisy)
    pos = (y_real==1).sum()
    neg = (y_real==0).sum()
    info.loc[len(info)] = [noise,fp,fn,tp,tn,nIR,IR,pr,rec,pos,neg]
        
info['size'] = info['TN']+info['FN']+info['TP']+info['FP']
info.shape,info.isna().sum().sum()

((32, 12), 0)

In [25]:
info.corr()

Unnamed: 0,noise,FP,FN,TP,TN,nIR,IR,PR,Rec,pos,neg,size
noise,1.0,0.003989,0.530921,0.394874,-0.604112,-0.510341,-0.745125,0.334167,-0.151932,0.559278,-0.597665,-0.552804
FP,0.003989,1.0,-0.394866,0.09079,0.245745,-0.319471,0.383242,-0.688314,0.363664,-0.33275,0.282585,0.256064
FN,0.530921,-0.394866,1.0,0.405652,-0.003925,0.188557,-0.418024,0.644037,-0.69764,0.98024,-0.019441,0.056795
TP,0.394874,0.09079,0.405652,1.0,0.035472,-0.34125,-0.26802,0.534875,0.105204,0.578441,0.03868,0.083549
TN,-0.604112,0.245745,-0.003925,0.035472,1.0,0.61992,0.808073,-0.189593,-0.166302,0.004174,0.99927,0.99727
nIR,-0.510341,-0.319471,0.188557,-0.34125,0.61992,1.0,0.419101,0.013025,-0.574179,0.094426,0.600879,0.60682
IR,-0.745125,0.383242,-0.418024,-0.26802,0.808073,0.419101,1.0,-0.541347,0.238465,-0.431069,0.814759,0.779358
PR,0.334167,-0.688314,0.644037,0.534875,-0.189593,0.013025,-0.541347,1.0,-0.302146,0.690527,-0.214738,-0.160566
Rec,-0.151932,0.363664,-0.69764,0.105204,-0.166302,-0.574179,0.238465,-0.302146,1.0,-0.599841,-0.150242,-0.196516
pos,0.559278,-0.33275,0.98024,0.578441,0.004174,0.094426,-0.431069,0.690527,-0.599841,1.0,-0.008979,0.068768


In [34]:
NP = info['FN']/info['pos']

In [35]:
NP.describe()*100

count    3200.000000
mean       63.058679
std        20.012955
min        19.230769
25%        48.476733
50%        63.278254
75%        80.962621
max        93.427230
dtype: float64

In [42]:
(info['nIR']>3.94).sum()

27

In [50]:
t = (1-.6327)
rb = 12.65*.0412
t,rb,t/(t+rb)

(0.36729999999999996, 0.52118, 0.41340266522600394)

In [None]:
plt.figure(figsize=(7,5))
plt.scatter(df['noise'],df['MCC'],s=20,color='y')
f = sns.regplot(x='noise',y='MCC',data=df,robust=False,order=2,ci=95,scatter=False); #line_kws={'color':'green'}
plt.xlabel("Noise Level");

In [None]:
plt.figure(figsize=(7,5))
plt.scatter(df['nIR'],df['MCC'],s=20,color='y')
f = sns.regplot(x='nIR',y='MCC',data=df,robust=False,order=2,ci=95,scatter=False); #line_kws={'color':'green'}
plt.xlabel("Noise Level");

In [None]:
for col in df:
    if col=='MCC': continue
    print(col)
    print(pg.corr(df[col],df['MCC'],method='percbend'),'\n\n') #shepherd , percbend