In [15]:
#Utils
import os
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon
from itertools import combinations
#Plotting
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [16]:
# Directory holding the regular test-set results; it is also reused later as the
# output directory for the corresponding figures.
results_path_reg = './regression_results/regular/'

# Two CSVs are read separately (the second one is the '_gcn' file) ...
test_performance_ml = pd.read_csv(os.path.join(results_path_reg, 'performance_test.csv'))
test_performance_gcn = pd.read_csv(os.path.join(results_path_reg, 'performance_test_gcn.csv'))

# ... then stacked row-wise and indexed by the 'Target ID' column.
test_performance = (
    pd.concat([test_performance_ml, test_performance_gcn])
    .set_index('Target ID')
)
test_performance

Unnamed: 0_level_0,Algorithm,Metric,Value,trial,Approach,Approach_trial
Target ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
279,kNN,MAE,0.603979,0,Complete set,0
279,kNN,MSE,0.739169,0,Complete set,0
279,kNN,R2,0.345384,0,Complete set,0
279,kNN,RMSE,0.859749,0,Complete set,0
279,SVR,MAE,0.883091,0,Complete set,0
...,...,...,...,...,...,...
1865,GCN,RMSE,1.010412,3,Diverse set,0
1865,GCN,MAE,0.906499,4,Diverse set,0
1865,GCN,MSE,1.277601,4,Diverse set,0
1865,GCN,R2,-0.203066,4,Diverse set,0


# Load y-randomization data

In [17]:
# Directory holding the y-randomization results; it is also reused later as the
# output directory for the corresponding figures.
results_path_y_rand = './regression_results/y_rand/'

# Two CSVs are read separately (the second one is the '_gcn' file) ...
test_performance_ml_rand = pd.read_csv(os.path.join(results_path_y_rand, 'performance_test_y_rand.csv'))
test_performance_gcn_rand = pd.read_csv(os.path.join(results_path_y_rand, 'performance_test_gcn_y_rand.csv'))

# ... then stacked row-wise and indexed by the 'Target ID' column.
test_performance_yrand = (
    pd.concat([test_performance_ml_rand, test_performance_gcn_rand])
    .set_index('Target ID')
)
test_performance_yrand

Unnamed: 0_level_0,Algorithm,Metric,Value,trial,Approach,Approach_trial
Target ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
279,kNN,MAE,0.603979,0,Complete set,0
279,kNN,MSE,0.739169,0,Complete set,0
279,kNN,R2,0.345384,0,Complete set,0
279,kNN,RMSE,0.859749,0,Complete set,0
279,SVR,MAE,0.883091,0,Complete set,0
...,...,...,...,...,...,...
1865,GCN,RMSE,1.150123,3,Diverse set,0
1865,GCN,MAE,1.090739,4,Diverse set,0
1865,GCN,MSE,1.684687,4,Diverse set,0
1865,GCN,R2,-0.556567,4,Diverse set,0


# Load Cluster and Potent data

In [None]:
# Directory holding the cluster/potent results; it is also reused later as the
# output directory for the corresponding figures.
results_path_cp = './regression_results/cluster_potent/'

# Read both CSVs (the second one is the '_gcn' file). The first name is rebound
# below to the combined table, so after this cell it no longer refers to the raw CSV.
test_performance_cluster_potent = pd.read_csv(os.path.join(results_path_cp, 'performance_test_cluster_potent.csv'))
test_performance_gcn_cluster_potent = pd.read_csv(os.path.join(results_path_cp, 'performance_test_gcn_cluster_potent.csv'))

# Stack the two tables row-wise and index the result by the 'Target ID' column.
test_performance_cluster_potent = (
    pd.concat([test_performance_cluster_potent, test_performance_gcn_cluster_potent])
    .set_index('Target ID')
)
test_performance_cluster_potent

# Metrics summary

In [None]:
# Mean and standard deviation of 'Value' for every
# (Target ID, Algorithm, Approach, Metric) combination, rounded to 3 decimals.
# groupby() does not modify test_performance, so the result is built in one chain.
summary_keys = ["Target ID", "Algorithm", "Approach", "Metric"]
performance_test_df_mean = (
    test_performance
    .groupby(summary_keys)
    .agg({"Value": ["mean", "std"]})
    .round(decimals=3)
)
display(performance_test_df_mean)

# Boxplot results (Complete, Random, Diverse sets)

In [21]:
def plot_results(df, metric, savepath=None):
    """Draw one box-plot panel per target for a single metric and optionally save it.

    The x axis shows the 'Complete set', 'Random set' and 'Diverse set' approaches,
    boxes are coloured by algorithm, and panels are wrapped two per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format results with 'Metric', 'Approach', 'Value' and 'Algorithm'
        columns. 'Target ID' must be available as a column after
        ``reset_index()`` (e.g. it is the index of ``df``).
    metric : str
        Value of the 'Metric' column to plot; also used as the y-axis label and
        in the output file name.
    savepath : str, optional
        Directory in which ``all_classes_{metric}.png`` is written at 300 dpi.
        Nothing is saved when this is None or empty.

    Returns
    -------
    None

    Notes
    -----
    Resets the global matplotlib rcParams to their defaults and then sets the
    font size to 20, so it affects figures drawn after the call as well.
    """
    # Global style: start from matplotlib defaults, then enlarge the font.
    matplotlib.rcdefaults()
    matplotlib.rc('font', size=20)

    # Keep only the requested metric; reset_index() turns the index into a column
    # so that 'Target ID' can be used for faceting.
    metric_df = df.loc[df['Metric'] == metric].reset_index()

    # catplot is a figure-level function that creates its own figure, so no
    # plt.figure() call is made beforehand (it would only leave an empty figure behind).
    grid = sns.catplot(x="Approach", y="Value",
                       hue="Algorithm", hue_order=['kNN', 'SVR', 'RFR', 'DNN', 'GCN', 'MR'],
                       data=metric_df,
                       kind="box",
                       col='Target ID',
                       order=['Complete set', 'Random set', 'Diverse set'],
                       col_wrap=2,
                       aspect=1.5,
                       palette=["tab:blue", "tab:orange", "tab:purple", "tab:green", "tab:red", "tab:gray"],
                       width=0.8)

    grid.set_titles("{col_var}: {col_name}")
    grid.set_ylabels(f"{metric}", fontsize=20)
    grid.set_xlabels(" ")

    # move_legend() works in place and returns None, so its result is not assigned.
    sns.move_legend(grid, "lower center", bbox_to_anchor=(.485, 0.05), ncol=6, title=None, frameon=False)

    # Act on the grid's own figure rather than on pyplot's implicit current figure.
    grid.figure.subplots_adjust(bottom=0.1)
    if savepath:
        # os.path.join works whether or not savepath ends with a path separator.
        grid.figure.savefig(os.path.join(savepath, f'all_classes_{metric}.png'), dpi=300)

In [None]:
# One figure per error metric, saved next to the regular results.
for metric_name in ('MAE', 'RMSE'):
    plot_results(test_performance, metric_name, results_path_reg)

# Boxplot results (Complete, Random, Diverse sets) - Y_randomization

In [23]:
def plot_results_y_rand(df, metric, savepath=None):
    """Draw one box-plot panel per target for a single metric of the y-randomization runs.

    The x axis shows the 'Complete set', 'Random set' and 'Diverse set' approaches;
    only the 'kNN', 'SVR' and 'MR' algorithms are drawn. Panels are wrapped two
    per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format results with 'Metric', 'Approach', 'Value' and 'Algorithm'
        columns. 'Target ID' must be available as a column after
        ``reset_index()`` (e.g. it is the index of ``df``).
    metric : str
        Value of the 'Metric' column to plot; also used as the y-axis label and
        in the output file name.
    savepath : str, optional
        Directory in which ``y_rand_{metric}.png`` is written at 300 dpi.
        Nothing is saved when this is None or empty.

    Returns
    -------
    None

    Notes
    -----
    Resets the global matplotlib rcParams to their defaults and then sets the
    font size to 20, so it affects figures drawn after the call as well.
    """
    # Global style: start from matplotlib defaults, then enlarge the font.
    matplotlib.rcdefaults()
    matplotlib.rc('font', size=20)

    # Keep only the requested metric; reset_index() turns the index into a column
    # so that 'Target ID' can be used for faceting.
    metric_df = df.loc[df['Metric'] == metric].reset_index()

    # catplot is a figure-level function that creates its own figure, so no
    # plt.figure() call is made beforehand (it would only leave an empty figure behind).
    grid = sns.catplot(x="Approach", y="Value",
                       hue="Algorithm", hue_order=['kNN', 'SVR', 'MR'],
                       data=metric_df,
                       kind="box",
                       col='Target ID',
                       order=['Complete set', 'Random set', 'Diverse set'],
                       col_wrap=2,
                       aspect=2,
                       palette=["tab:blue", "tab:orange", "tab:gray"],
                       width=0.8)

    grid.set_titles("{col_var}: {col_name}")
    grid.set_ylabels(f"{metric}", fontsize=20)
    grid.set_xlabels(" ")

    # move_legend() works in place and returns None, so its result is not assigned.
    sns.move_legend(grid, "lower center", bbox_to_anchor=(.485, 0.05), ncol=5, title=None, frameon=False)

    # Act on the grid's own figure rather than on pyplot's implicit current figure.
    grid.figure.subplots_adjust(bottom=0.1)
    if savepath:
        # os.path.join works whether or not savepath ends with a path separator.
        grid.figure.savefig(os.path.join(savepath, f'y_rand_{metric}.png'), dpi=300)

In [None]:
# One figure per error metric, saved next to the y-randomization results.
for metric_name in ('MAE', 'RMSE'):
    plot_results_y_rand(test_performance_yrand, metric_name, results_path_y_rand)

# Barplot results (Cluster, Potent sets)

In [25]:
def plot_results_cluster_potent(df, set, metric, savepath=None):
    """Draw one bar-plot panel per target for a single metric and a single approach.

    Only the approach named ``f'{set} set'`` is shown on the x axis; bars are
    coloured by algorithm and panels are wrapped two per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format results with 'Metric', 'Approach', 'Value' and 'Algorithm'
        columns. 'Target ID' must be available as a column after
        ``reset_index()`` (e.g. it is the index of ``df``).
    set : str
        Prefix of the approach to plot; the 'Approach' value selected is
        ``f'{set} set'`` (the notebook calls this with 'Cluster' and 'Potent').
        The name shadows the ``set`` builtin inside this function; it is kept
        because it is part of the signature callers may use by keyword.
    metric : str
        Value of the 'Metric' column to plot; also used as the y-axis label and
        in the output file name.
    savepath : str, optional
        Directory in which ``all_classes_{metric}_{set}.png`` is written at
        300 dpi. Nothing is saved when this is None or empty.

    Returns
    -------
    None

    Notes
    -----
    Resets the global matplotlib rcParams to their defaults and then sets the
    font size to 20, so it affects figures drawn after the call as well.
    """
    # Global style: start from matplotlib defaults, then enlarge the font.
    matplotlib.rcdefaults()
    matplotlib.rc('font', size=20)

    # Keep only the requested metric; reset_index() turns the index into a column
    # so that 'Target ID' can be used for faceting.
    metric_df = df.loc[df['Metric'] == metric].reset_index()

    # catplot is a figure-level function that creates its own figure, so no
    # plt.figure() call is made beforehand (it would only leave an empty figure behind).
    grid = sns.catplot(x="Approach", y="Value",
                       hue="Algorithm", hue_order=['kNN', 'SVR', 'RFR', 'DNN', 'GCN', 'MR'],
                       data=metric_df,
                       kind="bar",
                       col='Target ID',
                       order=[f'{set} set'],
                       col_wrap=2,
                       aspect=1.5,
                       palette=["tab:blue", "tab:orange", "tab:purple", "tab:green", "tab:red", "tab:gray"],
                       )

    grid.set_titles("{col_var}: {col_name}")
    grid.set_ylabels(f"{metric}", fontsize=20)
    grid.set_xlabels(" ")

    # move_legend() works in place and returns None, so its result is not assigned.
    sns.move_legend(grid, "lower center", bbox_to_anchor=(.485, 0.05), ncol=6, title=None, frameon=False)

    # Act on the grid's own figure rather than on pyplot's implicit current figure.
    grid.figure.subplots_adjust(bottom=0.1)

    if savepath:
        # os.path.join works whether or not savepath ends with a path separator.
        grid.figure.savefig(os.path.join(savepath, f'all_classes_{metric}_{set}.png'), dpi=300)

In [None]:
# One figure per (metric, subset) pair; the metric varies in the outer loop so the
# figures are produced in the order Cluster/MAE, Potent/MAE, Cluster/RMSE, Potent/RMSE.
for metric_name in ('MAE', 'RMSE'):
    for subset_name in ('Cluster', 'Potent'):
        plot_results_cluster_potent(test_performance_cluster_potent, subset_name, metric_name, results_path_cp)