In [3]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    df = pd.read_excel(file_name)
    return df

def read_txt(file_name):
    file = open(file_name)
    lines = file.readlines()
    return(lines[0])

In [4]:
import os
import glob

def get_files(subfolder, extension):
    dir = f"{os.getcwd()}/content/{subfolder}/"
    tables = glob.glob(f"{dir}*.{extension}")
    return tables

In [5]:
class Analizer:
    def __init__(self, boundary):
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary
    
    def has_minimum_requirements(self, df, sort_by="r2"):
        sorted_df = df.sort_values(by=sort_by, ascending=False)
        top_r2 = sorted_df.head(1)[sort_by].values[0]
        if top_r2 < self.boundary:
            return False
        return True
    
    def concatenate_df(self, df, architecture):
        if self.has_minimum_requirements(df):
            df['Architecture'] = architecture
            df = df.rename(columns={'Unnamed: 0': 'model'})
            self.results_df = pd.concat([self.results_df, df], ignore_index=True) 

    def create_results_df(self):
        for file in self.results:
            df = read_excel(file)
            architecture = read_txt(file.replace(".xlsx", ".txt"))
            self.concatenate_df(df, architecture)
        self.results_df = self.results_df.sort_values(by="r2", ascending=False, ignore_index=True)

    def discard_below_average(self, sort_by):
        column_mean = self.results_df[sort_by].mean()      
        self.results_df = self.results_df[self.results_df[sort_by] >= column_mean]
    
    def discard_high_standard_deviation(self):
        r2_val, r2_test = self.results_df['r2_val'], self.results_df['r2_test']
        std_devs = np.abs(r2_val - r2_test)
        mean_std_dev = std_devs.mean()
        self.results_df = self.results_df[std_devs < mean_std_dev]

    def clean_folder(self, subfolder, extension, remove_last=True):
        files = get_files(subfolder, extension)
        models = self.results_df["model"]
        if (remove_last):
            models = models.apply(lambda x: '_'.join(x.rsplit('_', 1)[:-1]))
        for file in files:
            file_name = os.path.basename(file).split('.')[0]
            file_parts = file_name.split('_')            
            dataset_model = f"model_{file_parts[1]}_{file_parts[2]}" 
            if (remove_last == False):
                dataset_model = (f"{dataset_model}_{file_parts[3]}")
            if dataset_model not in models.values:
                os.remove(file)   
        
    def Analize(self):
        self.create_results_df()
        self.discard_below_average(sort_by="r2")
        self.discard_below_average(sort_by="r2_vt")
        self.discard_high_standard_deviation()
        self.results_df.sort_values(by="mse_sup", ascending=True, ignore_index=True)
        self.results_df.to_excel(f"better_results.xlsx", index=True)
        display(self.results_df)


In [8]:
analize = Analizer(0.9)
analize.Analize()
analize.clean_folder(subfolder="dataset", extension="pkl")
analize.clean_folder(subfolder="results", extension="xlsx")
analize.clean_folder(subfolder="results", extension="txt")
analize.clean_folder(subfolder="models", extension="keras", remove_last=False)



Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,r2_vt,mse,mse_sup,mse_test,mse_val,mse_vt,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_30_9_4,0.999809,0.647331,0.999580,0.999223,0.999447,0.000077,0.209359,0.000151,0.000249,0.000200,0.002206,0.008777,1.000052,0.008847,1184.942630,2838.494981,"Hidden Size=[26, 18], regularizer=0.2, learnin..."
1,model_30_9_5,0.999805,0.647650,0.999460,0.999260,0.999404,0.000079,0.209170,0.000195,0.000237,0.000216,0.002302,0.008869,1.000053,0.008940,1184.900702,2838.453053,"Hidden Size=[26, 18], regularizer=0.2, learnin..."
2,model_30_9_3,0.999804,0.646942,0.999677,0.999149,0.999462,0.000079,0.209590,0.000117,0.000273,0.000195,0.002095,0.008899,1.000054,0.008971,1184.887043,2838.439395,"Hidden Size=[26, 18], regularizer=0.2, learnin..."
3,model_30_9_6,0.999796,0.647857,0.999338,0.999274,0.999349,0.000082,0.209047,0.000239,0.000233,0.000236,0.002361,0.009062,1.000056,0.009135,1184.814505,2838.366857,"Hidden Size=[26, 18], regularizer=0.2, learnin..."
4,model_30_9_2,0.999786,0.646419,0.999738,0.999033,0.999442,0.000086,0.209901,0.000095,0.000309,0.000202,0.001980,0.009282,1.000058,0.009357,1184.718638,2838.270990,"Hidden Size=[26, 18], regularizer=0.2, learnin..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851,model_4_7_6,0.997905,0.652331,0.997975,0.997911,0.997968,0.000845,0.206391,0.000741,0.000599,0.000670,0.007003,0.029064,1.000160,0.029297,3532.153079,8521.172954,"Hidden Size=[40, 39], regularizer=0.0, learnin..."
854,model_19_8_7,0.997901,0.665671,0.998739,0.997966,0.998461,0.000846,0.198472,0.000466,0.000458,0.000462,0.008286,0.029090,1.001650,0.029324,582.149443,1387.653505,"Hidden Size=[15, 14], regularizer=0.0, learnin..."
858,model_23_8_10,0.997884,0.641628,0.997253,0.998220,0.997850,0.000853,0.212745,0.001456,0.000935,0.001196,0.013926,0.029202,1.001663,0.029437,582.134049,1387.638111,"Hidden Size=[15, 14], regularizer=0.5, learnin..."
862,model_19_8_6,0.997872,0.665722,0.998763,0.998015,0.998494,0.000858,0.198442,0.000457,0.000447,0.000452,0.008312,0.029286,1.001673,0.029521,582.122614,1387.626676,"Hidden Size=[15, 14], regularizer=0.0, learnin..."
