In [1]:
%matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
import pandas as pd
import math as m
import itertools
import os
import numpy as np

In [2]:
# Function that computes the Imbalance ratio of a dataframe
def compute_IR(dataframe):
    classes = dataframe['class'].unique()
    maj_class = max(dataframe['class'].value_counts())
    min_class = min(dataframe['class'].value_counts())
    return round(min_class/maj_class, 3)

In [3]:
# Loads all the imbalanced datasets created for a dataset in its imbalanced repertory 
# and compute their imbalance ratios
def all_IRs(dataset):
    path = "../../DataDeterioration/DeterioratedDatasets/" + dataset + "/imbalanced/"
    if os.path.isfile(path + dataset + "_train_balanced.csv"):
        df = pd.read_csv(path + dataset + "_train_balanced.csv")
    else:
        df = pd.read_csv(path + dataset + "_train.csv")
    irs = [compute_IR(df)]
    del df
    for p in range(5, 100, 5):
        df = pd.read_csv(path + dataset + "_train_" + str(p) + ".csv")
        irs.append(compute_IR(df))
        del df
    return np.array(irs)

In [4]:
iris_qa, iris_qf, _ = np.load("../OutputForVisualisation/scores/iris_imbalanced_(qa,qf,time).npy", allow_pickle=True)
cancer_qa, cancer_qf, _ = np.load("../OutputForVisualisation/scores/cancer_imbalanced_(qa,qf,time).npy", allow_pickle=True)
adult_qa, adult_qf, _ = np.load("../OutputForVisualisation/scores/adult_imbalanced_(qa,qf,time).npy", allow_pickle=True)

heart_qa, heart_qf, _ = np.load("../OutputForVisualisation/scores/heart_imbalanced_(qa,qf,time).npy", allow_pickle=True)
statlog_qa, statlog_qf, _ = np.load("../OutputForVisualisation/scores/statlog_imbalanced_(qa,qf,time).npy", allow_pickle=True)
spambase_qa, spambase_qf, _ = np.load("../OutputForVisualisation/scores/spambase_imbalanced_(qa,qf,time).npy", allow_pickle=True)
bean_qa, bean_qf, _ = np.load("../OutputForVisualisation/scores/bean_imbalanced_(qa,qf,time).npy", allow_pickle=True)
abalone_qa, abalone_qf, _ = np.load("../OutputForVisualisation/scores/abalone_imbalanced_(qa,qf,time).npy", allow_pickle=True)

In [5]:
iris_irs = all_IRs('iris')
cancer_irs = all_IRs('cancer')
adult_irs = all_IRs('adult')

heart_irs = all_IRs('heart')
abalone_irs = all_IRs('abalone')
statlog_irs = all_IRs('statlog')
spambase_irs = all_IRs('spambase')
bean_irs = all_IRs('bean')

In [6]:
iris_dq = np.array([max(qa[0],qf[0]) for qa,qf in zip(iris_qa, iris_qf)])
cancer_dq = np.array([max(qa[0],qf[0]) for qa,qf in zip(cancer_qa, cancer_qf)])
adult_dq = np.array([max(qa[0],qf[0]) for qa,qf in zip(adult_qa, adult_qf)])

heart_dq = np.array([max(qa[0],qf[0]) for qa,qf in zip(heart_qa, heart_qf)])
abalone_dq = np.array([max(qa[0],qf[0]) for qa,qf in zip(abalone_qa, abalone_qf)])
statlog_dq = np.array([max(qa[0],qf[0]) for qa,qf in zip(statlog_qa, statlog_qf)])
spambase_dq = np.array([max(qa[0],qf[0]) for qa,qf in zip(spambase_qa, spambase_qf)])
bean_dq = np.array([max(qa[0],qf[0]) for qa,qf in zip(bean_qa, bean_qf)])

In [7]:
simple_heart_dq = np.array([heart_dq[k] for k in range(0, len(heart_dq), 4)] + [heart_dq[19]])
simple_abalone_dq = np.array([abalone_dq[k] for k in range(0, len(abalone_dq), 4)] + [abalone_dq[19]])
simple_statlog_dq = np.array([statlog_dq[k] for k in range(0, len(statlog_dq), 4)] + [statlog_dq[19]])
simple_spambase_dq = np.array([spambase_dq[k] for k in range(0, len(spambase_dq), 4)] + [spambase_dq[19]])
simple_bean_dq = np.array([bean_dq[k] for k in range(0, len(bean_dq), 4)] + [bean_dq[19]])

simple_heart_irs = np.array([heart_irs[k] for k in range(0, len(heart_irs), 4)] + [heart_irs[19]])
simple_abalone_irs = np.array([abalone_irs[k] for k in range(0, len(abalone_irs), 4)] + [abalone_irs[19]])
simple_statlog_irs = np.array([statlog_irs[k] for k in range(0, len(statlog_irs), 4)] + [statlog_irs[19]])
simple_spambase_irs = np.array([spambase_irs[k] for k in range(0, len(spambase_irs), 4)] + [spambase_irs[19]])
simple_bean_irs = np.array([bean_irs[k] for k in range(0, len(bean_irs), 4)] + [bean_irs[19]])

In [8]:
x = [1-(50/50), 1-(40/60), 1-(30/70), 1-(20/80), 1-(10/90), 1-(5/95)]
ys = [0, 0.3, 0.6]
heights = [0.3, 0.3, 0.4]
colors = cm.rainbow([0.5, 0.75, 1])
zones = []
for j in range(3):
    zones.append(patches.Rectangle((0, ys[j]), 1, heights[j], color=colors[j], alpha=0.33))

fig, ax = plt.subplots()

line4a, = plt.plot(x, simple_heart_dq, color='c')
line5a, = plt.plot(x, simple_abalone_dq, color='y')
line6a, = plt.plot(x, simple_statlog_dq, color='m')
line7a, = plt.plot(x, simple_spambase_dq, color='k')
line8a, = plt.plot(x, simple_bean_dq, color='0.5')

plt.ylim([0, 1])
plt.xlim([0, 0.95])

pc = PatchCollection(zones, match_original=True)
ax.add_collection(pc)

legend1 = plt.legend([line4a, line5a, line6a, line7a, line8a], ['Heart Disease', 'Abalone', 'Statlog', 'Spambase', 'Dry Beans'])
plt.gca().add_artist(legend1)

ymajor_ticks = np.arange(0, 11, 1)/10
yminor_ticks = np.arange(0, 20, 1)/20
ax.set_yticks(ymajor_ticks)
ax.set_yticks(yminor_ticks, minor=True)
ax.set_xticklabels(['50/50', '40/60', '30/70', '20/80', '10/90', '5/95'])
ax.grid(which='minor', linestyle='--')
plt.grid(alpha=0.5)
fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

  ax.set_xticklabels(['50/50', '40/60', '30/70', '20/80', '10/90', '5/95'])
