In [1]:
#Utils
import os
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon
from itertools import combinations
#Plotting
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib
%load_ext autoreload
%autoreload 2

# Load data

In [None]:
# Directory that holds the regression result CSV files.
results_path = './regression_results/'

# Test-set performance is stored in two files: one for the classical ML
# models and one for the GCN models.
test_performance_ml = pd.read_csv(os.path.join(results_path, 'performance_test.csv'))
test_performance_gcn = pd.read_csv(os.path.join(results_path, 'performance_test_gcn.csv'))

# Stack both tables and index the combined table by target.
test_performance = pd.concat(
    [test_performance_ml, test_performance_gcn]
).set_index('Target ID')
test_performance

# Load y-randomization data

In [None]:
# Directory that holds the regression result CSV files.
results_path = './regression_results/'

# y-randomization results are stored in two files: one for the classical ML
# models and one for the GCN models.
test_performance_ml_rand = pd.read_csv(os.path.join(results_path, 'performance_test_y_rand.csv'))
test_performance_gcn_rand = pd.read_csv(os.path.join(results_path, 'performance_test_gcn_y_rand.csv'))

# Stack both tables and index the combined table by target.
test_performance_yrand = pd.concat(
    [test_performance_ml_rand, test_performance_gcn_rand]
).set_index('Target ID')
test_performance_yrand

# Load Cluster and Potent data

In [None]:
# Cluster/Potent set results: the first file is read into the name that will
# finally hold the combined table, the GCN file into its own frame.
test_performance_cluster_potent = pd.read_csv(
    os.path.join(results_path, 'performance_test_cluster_potent.csv')
)
test_performance_gcn_cluster_potent = pd.read_csv(
    os.path.join(results_path, 'performance_test_gcn_cluster_potent.csv')
)

# Stack both tables and index the combined table by target.
test_performance_cluster_potent = pd.concat(
    [test_performance_cluster_potent, test_performance_gcn_cluster_potent]
).set_index('Target ID')
test_performance_cluster_potent

# Metrics summary

In [None]:
# Mean and standard deviation of 'Value' for every
# (Target ID, Algorithm, Approach, Metric) group, rounded to 3 decimals.
# groupby/agg build a new frame, so the source table is left untouched.
performance_test_df_mean = (
    test_performance
    .groupby(["Target ID", "Algorithm", "Approach", "Metric"])
    .agg({"Value": ["mean", "std"]})
    .round(decimals=3)
)
display(performance_test_df_mean)

# Plotting function (Complete, Random, Diverse sets)

In [10]:
def plot_results(df, metric, savepath=None):
    """Box-plot one metric per approach and algorithm, one panel per target.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with 'Metric', 'Approach', 'Algorithm' and 'Value'
        columns. 'Target ID' must be available as a column after
        ``reset_index()`` (i.e. it may be the index of ``df``).
    metric : str
        Entry of the 'Metric' column to plot. 'RMSE' gets its own y-axis
        range; any other value uses the range chosen for MAE.
    savepath : str, optional
        Directory in which ``all_classes_{metric}.png`` is written at 300 dpi.
        Nothing is saved when it is None or empty.
    """
    algorithms = ['kNN', 'SVR', 'RFR', 'DNN', 'GCN', 'MR']

    # Plot parameters: start from the matplotlib defaults, then enlarge the font.
    matplotlib.rcdefaults()
    matplotlib.rc('font', size=20)

    # No plt.figure() call here: catplot creates its own figure, so an extra
    # one would only leave an empty figure behind.

    # Keep only the requested metric; reset_index() exposes the index as a column.
    metric_df = df.loc[df['Metric'] == metric].reset_index()

    grid = sns.catplot(x="Approach", y="Value",
                       hue="Algorithm", hue_order=algorithms,
                       data=metric_df,
                       kind="box",
                       col='Target ID',
                       order=['Complete set', 'Random set', 'Diverse set'],
                       col_wrap=2,
                       aspect=1.5,
                       palette=["tab:blue", "tab:orange", "tab:purple", "tab:green", "tab:red", "tab:gray"],
                       width=0.8)

    grid.set_titles("{col_var}: {col_name}")
    grid.set_ylabels(f"{metric}", fontsize=20)
    grid.set_xlabels(" ")
    if metric == 'RMSE':
        grid.set(ylim=(0.4, 1.68), yticks=[0.4, 0.6, 0.8, 1, 1.2, 1.4, 1.6])
    else:
        # MAE
        grid.set(ylim=(0.3, 1.3), yticks=[0.4, 0.6, 0.8, 1, 1.2])

    # move_legend changes the grid's legend in place and returns None, so its
    # result must not be assigned back to the grid variable.
    sns.move_legend(grid, "lower center", bbox_to_anchor=(.485, 0.05),
                    ncol=len(algorithms), title=None, frameon=False)
    grid.figure.subplots_adjust(bottom=0.1)

    if savepath:
        # Write into the directory the caller passed, not a notebook global.
        grid.figure.savefig(os.path.join(savepath, f'all_classes_{metric}.png'), dpi=300)

# Plot results (Complete, Random, Diverse sets)

In [None]:
# MAE box plots for all targets; the figure is saved because a save path is passed.
plot_results(test_performance, metric='MAE', savepath=results_path)

# Plot results (Complete, Random, Diverse sets) - y-randomization

In [13]:
def plot_results_y_rand(df, metric, savepath=None):
    """Box-plot one metric of the y-randomization results, one panel per target.

    Only the kNN, SVR and MR algorithms are drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with 'Metric', 'Approach', 'Algorithm' and 'Value'
        columns. 'Target ID' must be available as a column after
        ``reset_index()`` (i.e. it may be the index of ``df``).
    metric : str
        Entry of the 'Metric' column to plot. 'RMSE' gets its own y-axis
        range; any other value uses the range chosen for MAE.
    savepath : str, optional
        Directory in which ``y_rand_{metric}.png`` is written at 300 dpi.
        Nothing is saved when it is None or empty.
    """
    algorithms = ['kNN', 'SVR', 'MR']

    # Plot parameters: start from the matplotlib defaults, then enlarge the font.
    matplotlib.rcdefaults()
    matplotlib.rc('font', size=20)

    # No plt.figure() call here: catplot creates its own figure, so an extra
    # one would only leave an empty figure behind.

    # Keep only the requested metric; reset_index() exposes the index as a column.
    metric_df = df.loc[df['Metric'] == metric].reset_index()

    grid = sns.catplot(x="Approach", y="Value",
                       hue="Algorithm", hue_order=algorithms,
                       data=metric_df,
                       kind="box",
                       col='Target ID',
                       order=['Complete set', 'Random set', 'Diverse set'],
                       col_wrap=2,
                       aspect=2,
                       palette=["tab:blue", "tab:orange", "tab:gray"],
                       width=0.8)

    grid.set_titles("{col_var}: {col_name}")
    grid.set_ylabels(f"{metric}", fontsize=20)
    grid.set_xlabels(" ")
    if metric == 'RMSE':
        grid.set(ylim=(0.4, 1.68), yticks=[0.4, 0.6, 0.8, 1, 1.2, 1.4, 1.6])
    else:
        # MAE
        grid.set(ylim=(0.3, 1.4), yticks=[0.4, 0.6, 0.8, 1, 1.2])

    # move_legend changes the grid's legend in place and returns None, so its
    # result must not be assigned back to the grid variable. One legend column
    # per plotted algorithm keeps the legend on a single row.
    sns.move_legend(grid, "lower center", bbox_to_anchor=(.485, 0.05),
                    ncol=len(algorithms), title=None, frameon=False)
    grid.figure.subplots_adjust(bottom=0.1)

    if savepath:
        # Write into the directory the caller passed, not a notebook global.
        grid.figure.savefig(os.path.join(savepath, f'y_rand_{metric}.png'), dpi=300)

In [None]:
# MAE box plots of the y-randomization runs; no save path, so nothing is written to disk.
plot_results_y_rand(test_performance_yrand, metric='MAE')

# Plot results (Cluster, Potent sets)

In [None]:
def plot_results_barplot(df, set, metric, savepath=None):
    """Bar-plot one metric for a single approach, one panel per target.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with 'Metric', 'Approach', 'Algorithm' and 'Value'
        columns. 'Target ID' must be available as a column after
        ``reset_index()`` (i.e. it may be the index of ``df``).
    set : str
        Prefix of the approach to plot; the 'Approach' value drawn is
        ``f'{set} set'`` (e.g. 'Cluster' -> 'Cluster set'). The name shadows
        the builtin ``set`` inside this function; it is kept so existing
        keyword calls continue to work.
    metric : str
        Entry of the 'Metric' column to plot.
    savepath : str, optional
        Directory in which ``all_classes_{metric}_{set}.png`` is written at
        300 dpi. Nothing is saved when it is None or empty.
    """
    algorithms = ['kNN', 'SVR', 'RFR', 'DNN', 'GCN', 'MR']

    # Plot parameters: start from the matplotlib defaults, then enlarge the font.
    matplotlib.rcdefaults()
    matplotlib.rc('font', size=20)

    # No plt.figure() call here: catplot creates its own figure, so an extra
    # one would only leave an empty figure behind.

    # Keep only the requested metric; reset_index() exposes the index as a column.
    metric_df = df.loc[df['Metric'] == metric].reset_index()

    grid = sns.catplot(x="Approach", y="Value",
                       hue="Algorithm", hue_order=algorithms,
                       data=metric_df,
                       kind="bar",
                       col='Target ID',
                       order=[f'{set} set'],
                       col_wrap=2,
                       aspect=1.5,
                       palette=["tab:blue", "tab:orange", "tab:purple", "tab:green", "tab:red", "tab:gray"],
                       width=0.8)

    grid.set_titles("{col_var}: {col_name}")
    grid.set_ylabels(f"{metric}", fontsize=20)
    grid.set_xlabels(" ")

    # move_legend changes the grid's legend in place and returns None, so its
    # result must not be assigned back to the grid variable.
    sns.move_legend(grid, "lower center", bbox_to_anchor=(.485, 0.05),
                    ncol=len(algorithms), title=None, frameon=False)
    grid.figure.subplots_adjust(bottom=0.1)

    if savepath:
        # Write into the directory the caller passed, not a notebook global.
        grid.figure.savefig(os.path.join(savepath, f'all_classes_{metric}_{set}.png'), dpi=300)

In [None]:
# MAE bar plots for the 'Cluster set' approach; no save path, so nothing is written to disk.
plot_results_barplot(test_performance_cluster_potent, set='Cluster', metric='MAE')

# Statistical Analysis (Wilcoxon test)

In [None]:
# reset_index(inplace=True) returns None, which would leave test_stats empty.
# The non-inplace form returns a new frame with 'Target ID' as a regular
# column and leaves test_performance untouched.
test_stats = test_performance.reset_index()
test_stats

In [None]:
# Pairwise Wilcoxon signed-rank tests between all algorithm pairs, for every
# approach and target, on the values of a single metric.
metric = 'MAE'
algorithms = ['kNN', 'SVR', 'RFR', 'DNN', 'GCN', 'MR']

# One record per (approach, target, algorithm pair); turned into a frame at the end.
mut_records = []
for app in test_stats.Approach.unique():
    for target in test_stats['Target ID'].unique():
        # Rows for this target/approach/metric, selected once for all algorithm pairs.
        df = test_stats.loc[(test_stats['Target ID'] == target)
                            & (test_stats['Approach'] == app)
                            & (test_stats['Metric'] == metric)]
        for alg in combinations(algorithms, 2):
            print(app, target, alg)

            alg1 = df.loc[df['Algorithm'] == alg[0], 'Value']
            alg2 = df.loc[df['Algorithm'] == alg[1], 'Value']

            # The test pairs values by position.
            # NOTE(review): this assumes both algorithms list their trials in
            # the same order and have equally many values - confirm against
            # how the result files are written.
            _, p_value = wilcoxon(list(alg1), list(alg2))

            mut_records.append({"Approach": app,
                                "Algorithm_1": f'{alg[0]}',
                                "Algorithm_2": f'{alg[1]}',
                                "Target": target,
                                "p_value": round(p_value, 3)})

mut_result = pd.DataFrame(mut_records)
display(mut_result)
mut_result.to_csv(os.path.join(results_path, 'stat_test_p_values.csv'))