In [1]:
%matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
import pandas as pd
import math as m
import itertools
import os
import numpy as np

In [None]:
# Function that computes the Imbalance ratio of a dataframe
def compute_IR(dataframe):
    """Compute the imbalance ratio of a dataframe's ``class`` column.

    The ratio is the number of rows of the least frequent class divided by
    the number of rows of the most frequent class, rounded to 3 decimals.
    A value of 1.0 means all classes are equally represented.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``class`` column holding the label of each row.

    Returns
    -------
    float
        ``min_count / max_count`` rounded to 3 decimal places.

    Raises
    ------
    KeyError
        If ``dataframe`` has no ``class`` column.
    ValueError
        If the ``class`` column holds no labels to count.
    """
    # Count every class once; value_counts() skips missing labels by default.
    class_counts = dataframe['class'].value_counts()
    if class_counts.empty:
        raise ValueError("cannot compute an imbalance ratio: the 'class' column has no labels")
    return round(class_counts.min() / class_counts.max(), 3)

In [2]:
# Loads all the imbalanced datasets created for a dataset from its "imbalanced" directory
# and computes their imbalance ratios
def all_IRs(dataset):
    """Return the imbalance ratios of every training CSV stored for ``dataset``.

    The first ratio comes from ``<dataset>_train_balanced.csv`` when that file
    exists, otherwise from ``<dataset>_train.csv``. The following 19 ratios come
    from ``<dataset>_train_<p>.csv`` for p = 5, 10, ..., 95, in that order.

    Parameters
    ----------
    dataset : str
        Dataset name, used both as folder name and as file-name prefix.

    Returns
    -------
    numpy.ndarray
        The 20 values returned by ``compute_IR``, one per CSV file.
    """
    folder = "../../DataDeterioration/DeterioratedDatasets/" + dataset + "/imbalanced/"
    balanced_csv = folder + dataset + "_train_balanced.csv"
    # Fall back to the plain training file when no balanced version was written.
    reference_csv = balanced_csv if os.path.isfile(balanced_csv) else folder + dataset + "_train.csv"
    csv_files = [reference_csv] + [
        folder + dataset + "_train_" + str(pct) + ".csv" for pct in range(5, 100, 5)
    ]
    # Each frame is only needed for its ratio, so it is read and dropped immediately.
    return np.array([compute_IR(pd.read_csv(csv_file)) for csv_file in csv_files])

In [3]:
def load_cimb_score(dataset):
    """Load the saved class-balance scores of ``dataset`` for every imbalance step.

    One ``.npy`` file is read for the reference step and one for each step
    p = 5, 10, ..., 95. From each file the entry at index 0 is kept whole and
    the first element of the entries at index 1 and 2 is kept (named ``x``,
    ``qa`` and ``qf`` after the file-name suffix). A step whose file is missing
    contributes NaN placeholders instead.

    Parameters
    ----------
    dataset : str
        Dataset name used as file-name prefix.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, qa, qf)``, each with one entry per step (20 in total).
    """
    folder = "../Output/class balance/scores/"
    suffix = "_(x,qa,qf,time).npy"

    # The reference run of these three datasets was saved under a different naming scheme.
    if dataset == 'iris':
        reference_file = folder + dataset + "_3class_1_1_(x,y,y1,y2,z,z1,z2,time).npy"
    elif dataset in ['adult', 'cancer']:
        reference_file = folder + dataset + "_1_1_(x,y,y1,y2,z,z1,z2,time).npy"
    else:
        reference_file = folder + dataset + "_0" + suffix

    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only safe for trusted files.
    reference = np.load(reference_file, allow_pickle=True)
    x = [reference[0]]
    qa = [reference[1][0]]
    qf = [reference[2][0]]

    for pct in range(5, 100, 5):
        score_file = folder + dataset + "_" + str(pct) + suffix
        if not os.path.isfile(score_file):
            # Keep the series aligned with the step index even when a run is missing.
            x.append([np.nan, np.nan, np.nan])
            qa.append(np.nan)
            qf.append(np.nan)
            continue
        scores = np.load(score_file, allow_pickle=True)
        x.append(scores[0])
        qa.append(scores[1][0])
        qf.append(scores[2][0])

    return np.array(x), np.array(qa), np.array(qf)
        

In [4]:
def load_base(dataset):
    """Load the mean base accuracy and F1 score of ``dataset`` for every imbalance step.

    For each step the files ``<dataset>_<p>_base_accs.npy`` and
    ``<dataset>_<p>_base_f1s.npy`` are averaged. NaN entries are replaced by 0
    (``np.nan_to_num``) before averaging, so they lower the mean instead of
    being skipped. The reference step (p = 0) is NaN for 'iris', 'adult' and
    'cancer'; any later step (p = 5, ..., 95) is NaN unless both of its files exist.

    Parameters
    ----------
    dataset : str
        Dataset name used as file-name prefix.

    Returns
    -------
    tuple of numpy.ndarray
        ``(accs, f1s)``, each holding 20 values (reference step first).
    """
    prefix = "../Output/class balance/base_scores/" + dataset + "_"

    def mean_score(npy_file):
        # Zero-fill NaNs first; nanmean then behaves like a plain mean.
        return np.nanmean(np.nan_to_num(np.load(npy_file)))

    if dataset in ['iris', 'adult', 'cancer']:
        accs, f1s = [np.nan], [np.nan]
    else:
        accs = [mean_score(prefix + "0_base_accs.npy")]
        f1s = [mean_score(prefix + "0_base_f1s.npy")]

    for pct in range(5, 100, 5):
        acc_file = prefix + str(pct) + "_base_accs.npy"
        f1_file = prefix + str(pct) + "_base_f1s.npy"
        if os.path.isfile(acc_file) and os.path.isfile(f1_file):
            accs.append(mean_score(acc_file))
            f1s.append(mean_score(f1_file))
        else:
            # A step is only reported when both of its score files are present.
            accs.append(np.nan)
            f1s.append(np.nan)

    return np.array(accs), np.array(f1s)

In [5]:
def load_var(dataset):
    """Load the mean variation accuracy and F1 score of ``dataset`` for every imbalance step.

    For each step the files ``<dataset>_<p>_var_accs.npy`` and
    ``<dataset>_<p>_var_f1s.npy`` are averaged. NaN entries are replaced by 0
    (``np.nan_to_num``) before averaging, so they lower the mean instead of
    being skipped. The reference step (p = 0) is NaN for 'iris', 'adult' and
    'cancer'; any later step (p = 5, ..., 95) is NaN unless both of its files exist.

    Parameters
    ----------
    dataset : str
        Dataset name used as file-name prefix.

    Returns
    -------
    tuple of numpy.ndarray
        ``(accs, f1s)``, each holding 20 values (reference step first).
    """
    prefix = "../Output/class balance/variations/" + dataset + "_"

    def mean_score(npy_file):
        # Zero-fill NaNs first; nanmean then behaves like a plain mean.
        return np.nanmean(np.nan_to_num(np.load(npy_file)))

    if dataset in ['iris', 'adult', 'cancer']:
        accs, f1s = [np.nan], [np.nan]
    else:
        accs = [mean_score(prefix + "0_var_accs.npy")]
        f1s = [mean_score(prefix + "0_var_f1s.npy")]

    for pct in range(5, 100, 5):
        acc_file = prefix + str(pct) + "_var_accs.npy"
        f1_file = prefix + str(pct) + "_var_f1s.npy"
        if os.path.isfile(acc_file) and os.path.isfile(f1_file):
            accs.append(mean_score(acc_file))
            f1s.append(mean_score(f1_file))
        else:
            # A step is only reported when both of its score files are present.
            accs.append(np.nan)
            f1s.append(np.nan)

    return np.array(accs), np.array(f1s)

In [6]:
def _load_dataset_results(dataset):
    """Load every per-step series used by the plots for one dataset.

    Parameters
    ----------
    dataset : str
        Dataset name passed unchanged to the loader functions.

    Returns
    -------
    tuple
        ``(x, qa, qf, accs, f1s, var_accs, var_f1s, irs)`` where

        * ``x, qa, qf`` come from ``load_cimb_score``;
        * ``accs, f1s`` come from ``load_base``;
        * ``var_accs, var_f1s`` are lists holding, per step, the absolute
          difference (rounded to 3 decimals) between the base score and the
          score returned by ``load_var``;
        * ``irs`` comes from ``all_IRs``.
    """
    x, qa, qf = load_cimb_score(dataset)
    accs, f1s = load_base(dataset)
    raw_var_accs, raw_var_f1s = load_var(dataset)
    # Report how far the variation scores are from the base scores, not the raw values.
    var_accs = [round(abs(acc - acc_var), 3) for acc, acc_var in zip(accs, raw_var_accs)]
    var_f1s = [round(abs(f1 - f1_var), 3) for f1, f1_var in zip(f1s, raw_var_f1s)]
    irs = all_IRs(dataset)
    return x, qa, qf, accs, f1s, var_accs, var_f1s, irs


# One call per dataset; the per-dataset variable names are kept because later cells use them.
(iris_x, iris_qa, iris_qf, iris_accs, iris_f1s,
 iris_var_accs, iris_var_f1s, iris_irs) = _load_dataset_results('iris')

(cancer_x, cancer_qa, cancer_qf, cancer_accs, cancer_f1s,
 cancer_var_accs, cancer_var_f1s, cancer_irs) = _load_dataset_results('cancer')

(adult_x, adult_qa, adult_qf, adult_accs, adult_f1s,
 adult_var_accs, adult_var_f1s, adult_irs) = _load_dataset_results('adult')

(heart_x, heart_qa, heart_qf, heart_accs, heart_f1s,
 heart_var_accs, heart_var_f1s, heart_irs) = _load_dataset_results('heart')

(abalone_x, abalone_qa, abalone_qf, abalone_accs, abalone_f1s,
 abalone_var_accs, abalone_var_f1s, abalone_irs) = _load_dataset_results('abalone')

(statlog_x, statlog_qa, statlog_qf, statlog_accs, statlog_f1s,
 statlog_var_accs, statlog_var_f1s, statlog_irs) = _load_dataset_results('statlog')

(spambase_x, spambase_qa, spambase_qf, spambase_accs, spambase_f1s,
 spambase_var_accs, spambase_var_f1s, spambase_irs) = _load_dataset_results('spambase')

(bean_x, bean_qa, bean_qf, bean_accs, bean_f1s,
 bean_var_accs, bean_var_f1s, bean_irs) = _load_dataset_results('bean')

NameError: name 'compute_IR' is not defined

In [None]:
def _pairwise_max(first, second):
    """Return an array holding ``max(a, b)`` for each aligned pair of the two inputs."""
    # The built-in max is applied pair by pair, stopping at the shorter input.
    return np.array(list(map(max, first, second)))


# Per step, keep the larger of the qa and qf scores of each dataset.
iris_dq = _pairwise_max(iris_qa, iris_qf)
cancer_dq = _pairwise_max(cancer_qa, cancer_qf)
adult_dq = _pairwise_max(adult_qa, adult_qf)

heart_dq = _pairwise_max(heart_qa, heart_qf)
abalone_dq = _pairwise_max(abalone_qa, abalone_qf)
statlog_dq = _pairwise_max(statlog_qa, statlog_qf)
spambase_dq = _pairwise_max(spambase_qa, spambase_qf)
bean_dq = _pairwise_max(bean_qa, bean_qf)

In [None]:
def _coarse_series(values):
    """Keep every fourth entry of ``values`` and then append the entry at index 19."""
    # For the 20-entry series built above this selects indices 0, 4, 8, 12, 16 and 19.
    return np.array(list(values[::4]) + [values[19]])


# Thinned-out score series, one point per x position of the summary plot.
simple_heart_dq = _coarse_series(heart_dq)
simple_abalone_dq = _coarse_series(abalone_dq)
simple_statlog_dq = _coarse_series(statlog_dq)
simple_spambase_dq = _coarse_series(spambase_dq)
simple_bean_dq = _coarse_series(bean_dq)

# Imbalance ratios thinned out the same way.
simple_heart_irs = _coarse_series(heart_irs)
simple_abalone_irs = _coarse_series(abalone_irs)
simple_statlog_irs = _coarse_series(statlog_irs)
simple_spambase_irs = _coarse_series(spambase_irs)
simple_bean_irs = _coarse_series(bean_irs)

In [None]:
# x position of each plotted point: 1 - a/b for the 'a/b' split named in the tick labels below.
x = [1-(50/50), 1-(40/60), 1-(30/70), 1-(20/80), 1-(10/90), 1-(5/95)]

# Three horizontal background bands covering y in [0, 0.3], [0.3, 0.6] and [0.6, 1.0].
ys = [0, 0.3, 0.6]
heights = [0.3, 0.3, 0.4]
colors = cm.rainbow([0.5, 0.75, 1])
zones = []
for j in range(3):
    zones.append(patches.Rectangle((0, ys[j]), 1, heights[j], color=colors[j], alpha=0.33))

fig, ax = plt.subplots()

# One curve per dataset, drawn on the explicit Axes rather than the pyplot state machine.
line4a, = ax.plot(x, simple_heart_dq, color='c')
line5a, = ax.plot(x, simple_abalone_dq, color='y')
line6a, = ax.plot(x, simple_statlog_dq, color='m')
line7a, = ax.plot(x, simple_spambase_dq, color='k')
line8a, = ax.plot(x, simple_bean_dq, color='0.5')

ax.set_ylim([0, 1])
ax.set_xlim([0, 0.95])

# match_original=True keeps each rectangle's own colour and alpha inside the collection.
pc = PatchCollection(zones, match_original=True)
ax.add_collection(pc)

legend1 = ax.legend([line4a, line5a, line6a, line7a, line8a], ['Heart Disease', 'Abalone', 'Statlog', 'Spambase', 'Dry Beans'])
# Registering the legend as a plain artist keeps it visible if another legend is created later.
ax.add_artist(legend1)

ymajor_ticks = np.arange(0, 11, 1)/10
yminor_ticks = np.arange(0, 20, 1)/20
ax.set_yticks(ymajor_ticks)
ax.set_yticks(yminor_ticks, minor=True)
# Pin the tick positions to the plotted x values first; otherwise the six labels would be
# attached to matplotlib's automatic ticks and no longer match the data points.
ax.set_xticks(x)
ax.set_xticklabels(['50/50', '40/60', '30/70', '20/80', '10/90', '5/95'])
ax.grid(which='minor', linestyle='--')
ax.grid(alpha=0.5)
fig.tight_layout()
plt.show()