# Analyzing the effect of number of support points on recovery rate

In [13]:
import os
from collections import defaultdict

import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib import rcParams

LOGS_PATH = "../../logs"

sns.set_theme(style="ticks")
sns.set_style('whitegrid')

get_data = lambda path:  pd.read_excel(path,engine="openpyxl", usecols="B:H")

In [14]:
def get_exp_results_by_points(noise:float=0.0):
    
    exp_data = defaultdict(dict)

    for dataset in os.listdir(LOGS_PATH):

        dataset_path = os.path.join(LOGS_PATH, dataset)
        model_results = defaultdict(dict)

        for model in sorted(os.listdir(dataset_path), key=lambda x: x.lower()):

            model_path = os.path.join(dataset_path, model)

            filter_fn = lambda x: (
                x[-4:] == "xlsx" and \
                int(x.split("_")[1].split("-")[1]) == noise
            )
            for file in filter(filter_fn, os.listdir(model_path)):
                header = file.split("_")
                model = header[0]
                points = header[2].split("-")[1]
                
                if model_results[model].get(points) is None:
                    tmp = get_data(os.path.join(model_path, file))
                    tmp["trial"] = [0]*len(tmp)
                    model_results[model][points] = [tmp]
                    continue
                
                tmp = get_data(os.path.join(model_path, file))
                tmp["trial"] = [len(model_results[model][points])]*len(tmp)
                
                model_results[model][points].append(tmp)
                
        exp_data[dataset] = dict(model_results)
            
    return exp_data
            

exp_data = get_exp_results_by_points()


FileNotFoundError: [Errno 2] No such file or directory: '../../logs/nguyen-12/num_points'

In [8]:
dataset_df = {}

for dataset in os.listdir(LOGS_PATH):

    acc, tim = [], []

    df_dict = {
        "model":[],
        "accuracy":[],
        "support points":[],
        "duration":[],
        "trial":[],
    }    
    
    models = sorted(exp_data[dataset].keys())
    for model in models:
        
        points = sorted(exp_data[dataset][model].keys(), key=lambda x: int(x))
        for point in points:
            
            for df_col in exp_data[dataset][model][point]:
            
                acc = len(df_col[df_col["accuracy"]>=0.95])/len(df_col)        
                dur = df_col.describe()['time']['mean']
                trial = int(df_col.describe()['trial']['mean'])

                df_dict["model"].append(model)
                df_dict["accuracy"].append(acc)
                df_dict["support points"].append(point)
                df_dict["duration"].append(dur)
                df_dict["trial"].append(trial)
            
    df = pd.DataFrame(data=df_dict)
    dataset_df[dataset] = df

In [9]:
def show_acc_plot(datasets:list):
    
    f, axs = plt.subplots(1,len(datasets),
                      figsize=(13,5),
                      )
    
    for i, dataset in enumerate(datasets):
        
        fig = sns.lineplot(x="support points", y="accuracy",
            hue="model",
            data=dataset_df[dataset], marker="o", 
            linewidth = 0.8,  ax=axs[i]
        )

        fig.set_title(f"\n{dataset.capitalize()}: Effect of number of support points on recovery rate\n")
        fig.axhline(dataset_df[dataset]["accuracy"].max(), linestyle="--", linewidth=1.2, color='gray')
    
        fig.set_ylim(-0.1, 1.0)
    
        handles, labels = axs[i].get_legend_handles_labels()
        fig.legend(handles, labels, loc='upper right')
        fig.set_xlabel("Number of support points")
        fig.set_ylabel("Recovery rate")

show_acc_plot(["Feynman-03", "nguyen-12"])