In [7]:
import pandas as pd
import numpy as np

def read_excel(file_name):
    """Load the spreadsheet at ``file_name`` with ``pd.read_excel``.

    Args:
        file_name: Path (or any other source accepted by ``pd.read_excel``).

    Returns:
        Whatever ``pd.read_excel`` returns for that source, using its
        default options.
    """
    return pd.read_excel(file_name)

def read_txt(file_name):
    """Return the first line of the text file at ``file_name``.

    The file is opened inside a ``with`` block so the handle is always
    closed, and only the first line is read instead of the whole file.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The first line exactly as stored (including its trailing newline,
        if present). An empty file yields ``""``.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_name) as handle:
        return handle.readline()

In [4]:
import os
import glob

def get_files(subfolder, extension):
    """List the files with a given extension under ``<cwd>/content/<subfolder>/``.

    Args:
        subfolder: Name of the folder inside ``content`` to search.
        extension: File extension to match, without the leading dot.

    Returns:
        The list of matching paths produced by ``glob.glob`` (non-recursive).
    """
    folder = f"{os.getcwd()}/content/{subfolder}/"
    pattern = f"{folder}*.{extension}"
    return glob.glob(pattern)

In [5]:
class Analizer:
    """Gather per-run metric tables, keep the best rows and prune other files.

    Attributes:
        results: Paths of the ``.xlsx`` tables found by
            ``get_files(subfolder="results", extension="xlsx")``.
        results_df: Accumulated table of the rows that are currently kept.
        boundary: Minimum best score a table must reach to be accepted.
    """

    def __init__(self, boundary):
        """Locate the result tables and start with an empty accumulator.

        Args:
            boundary: Threshold used by ``has_minimum_requirements``.
        """
        self.results = get_files(subfolder="results", extension="xlsx")
        self.results_df = pd.DataFrame()
        self.boundary = boundary

    def has_minimum_requirements(self, df, sort_by="r2"):
        """Tell whether the best ``sort_by`` value in ``df`` reaches the boundary.

        Args:
            df: Table holding a ``sort_by`` column.
            sort_by: Name of the column whose maximum is checked.

        Returns:
            ``True`` when the column maximum is >= ``self.boundary``;
            ``False`` otherwise, including for an empty table or a column
            with no valid (non-NaN) value.
        """
        if df.empty:
            # Nothing to rank; previously this raised IndexError.
            return False
        best = df[sort_by].max()
        # A NaN maximum compares False, so an all-NaN column is rejected.
        return bool(best >= self.boundary)

    def concatenate_df(self, df, architecture):
        """Append ``df`` to ``results_df`` if it meets the minimum requirements.

        The caller's frame is left untouched: the ``Architecture`` column and
        the ``'Unnamed: 0'`` -> ``'model'`` rename are applied to a copy.

        Args:
            df: Table read from one results file.
            architecture: Value stored in the ``Architecture`` column of
                every appended row.
        """
        if not self.has_minimum_requirements(df):
            return
        labelled = df.rename(columns={'Unnamed: 0': 'model'}).assign(
            Architecture=architecture
        )
        if self.results_df.empty:
            # Avoid concatenating onto an empty frame (deprecated in pandas
            # and able to disturb column dtypes).
            self.results_df = labelled.reset_index(drop=True)
        else:
            self.results_df = pd.concat(
                [self.results_df, labelled], ignore_index=True
            )

    def create_results_df(self):
        """Rebuild ``results_df`` from every file listed in ``self.results``.

        For each ``.xlsx`` path the sibling file with the same stem and a
        ``.txt`` extension supplies the architecture string. The accumulator
        is reset first, so calling this method again does not duplicate rows.
        The result is sorted by ``"r2"`` in descending order; when no table
        qualified, ``results_df`` is left empty.
        """
        self.results_df = pd.DataFrame()
        for file in self.results:
            df = read_excel(file)
            # splitext only touches the final extension, unlike str.replace,
            # which could also rewrite ".xlsx" inside a directory name.
            architecture = read_txt(os.path.splitext(file)[0] + ".txt")
            self.concatenate_df(df, architecture)
        if self.results_df.empty:
            # No "r2" column to sort by when nothing was accepted.
            return
        self.results_df = self.results_df.sort_values(
            by="r2", ascending=False, ignore_index=True
        )

    def discard_below_average(self, sort_by):
        """Keep only rows whose ``sort_by`` value is >= the column mean.

        Args:
            sort_by: Name of the column to filter on.

        Does nothing when ``results_df`` is empty.
        """
        if self.results_df.empty:
            return
        column_mean = self.results_df[sort_by].mean()
        self.results_df = self.results_df[self.results_df[sort_by] >= column_mean]

    def discard_high_standard_deviation(self):
        """Keep rows whose ``|r2_val - r2_test|`` gap is at most the mean gap.

        The comparison is inclusive, so a single remaining row (whose gap
        equals the mean) is kept instead of emptying the table.
        Does nothing when ``results_df`` is empty.
        """
        if self.results_df.empty:
            return
        gap = np.abs(self.results_df['r2_val'] - self.results_df['r2_test'])
        self.results_df = self.results_df[gap <= gap.mean()]

    def clean_folder(self, subfolder, extension, remove_last=True):
        """Delete files in a folder that do not belong to a kept model.

        Each file name (text before the first ``.``) is split on ``_`` and
        rebuilt as ``model_<part1>_<part2>`` (plus ``_<part3>`` when
        ``remove_last`` is false), then looked up among the kept ``model``
        values. With ``remove_last`` true, the last ``_``-separated field of
        every kept model name is dropped before the comparison.

        Args:
            subfolder: Folder passed to ``get_files``.
            extension: Extension passed to ``get_files``.
            remove_last: Whether to ignore the last field of the model name.

        Safety:
            * If there are no kept models, nothing is deleted and a warning
              is issued (previously every file would have been removed).
            * Files whose names have too few ``_`` parts are left in place
              (previously they raised IndexError part-way through).
        """
        import warnings

        if self.results_df.empty or "model" not in self.results_df.columns:
            warnings.warn(
                f"No surviving models; leaving '{subfolder}' (*.{extension}) untouched.",
                stacklevel=2,
            )
            return

        models = self.results_df["model"]
        if remove_last:
            keep = {'_'.join(name.rsplit('_', 1)[:-1]) for name in models}
        else:
            keep = set(models)

        # Index of the last file-name part needed, exclusive.
        needed_parts = 3 if remove_last else 4
        for file in get_files(subfolder, extension):
            file_parts = os.path.basename(file).split('.')[0].split('_')
            if len(file_parts) < needed_parts:
                continue
            dataset_model = '_'.join(["model", *file_parts[1:needed_parts]])
            if dataset_model not in keep:
                os.remove(file)

    def Analize(self):
        """Run the full selection pipeline, save it and display the result.

        Builds the table, drops rows below the mean of ``r2`` and then of
        ``r2_vt``, drops rows with an above-average val/test gap, writes the
        result to ``better_results.xlsx`` and displays it.
        """
        self.create_results_df()
        self.discard_below_average(sort_by="r2")
        self.discard_below_average(sort_by="r2_vt")
        self.discard_high_standard_deviation()
        self.results_df.to_excel("better_results.xlsx", index=True)
        # `display` is provided by the IPython/Jupyter runtime.
        display(self.results_df)


In [12]:
# Select the best models (tables must reach a best r2 of 0.9), then prune
# every artifact that does not belong to a surviving model.
analize = Analizer(0.9)
analize.Analize()
# These folders are matched on the model name without its last field.
for folder, ext in (("dataset", "pkl"), ("results", "xlsx"), ("results", "txt")):
    analize.clean_folder(subfolder=folder, extension=ext)
# Saved models are matched on the full model name.
analize.clean_folder(subfolder="models", extension="keras", remove_last=False)



Unnamed: 0,model,r2,r2_sup,r2_test,r2_val,r2_vt,mse,mse_sup,mse_test,mse_val,mse_vt,mape,rmse,r2_adj,rsd,aic,bic,Architecture
0,model_42_9_0,0.999977,0.986335,0.999964,0.999971,0.999968,0.000014,0.008112,0.000020,0.000015,0.000017,0.001405,0.003689,1.000004,0.003765,702.409820,1352.497642,"Hidden Size=[21, 12], regularizer=0.2, learnin..."
1,model_42_9_1,0.999974,0.986333,0.999945,0.999938,0.999943,0.000016,0.008113,0.000030,0.000032,0.000031,0.001740,0.003991,1.000004,0.004074,702.094412,1352.182233,"Hidden Size=[21, 12], regularizer=0.2, learnin..."
2,model_42_8_24,0.999963,0.986201,0.999835,0.999847,0.999866,0.000022,0.008192,0.000110,0.000030,0.000072,0.001167,0.004736,1.000006,0.004834,701.409902,1351.497724,"Hidden Size=[21, 12], regularizer=0.2, learnin..."
3,model_42_8_23,0.999962,0.986188,0.999836,0.999842,0.999866,0.000023,0.008199,0.000109,0.000031,0.000073,0.001196,0.004760,1.000006,0.004858,701.389881,1351.477703,"Hidden Size=[21, 12], regularizer=0.2, learnin..."
4,model_42_8_22,0.999962,0.986173,0.999837,0.999835,0.999865,0.000023,0.008208,0.000109,0.000032,0.000073,0.001227,0.004790,1.000006,0.004889,701.364876,1351.452698,"Hidden Size=[21, 12], regularizer=0.2, learnin..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,model_48_8_1,0.999208,0.948242,0.999815,0.999833,0.999830,0.000477,0.030726,0.000157,0.000148,0.000153,0.007357,0.021831,1.000133,0.022281,695.297721,1345.385543,"Hidden Size=[21, 12], regularizer=0.5, learnin..."
485,model_44_4_3,0.999194,0.952761,0.998735,0.998993,0.999267,0.000485,0.028043,0.000250,0.000678,0.000450,0.008891,0.022032,1.000136,0.022486,695.261081,1345.348903,"Hidden Size=[21, 12], regularizer=0.0, learnin..."
497,model_50_6_3,0.999173,0.961243,0.999726,0.999796,0.999779,0.000498,0.023008,0.000094,0.000190,0.000139,0.009015,0.022310,1.000057,0.022770,1541.210850,3000.084403,"Hidden Size=[30, 21], regularizer=0.2, learnin..."
505,model_54_7_1,0.999148,0.952277,0.999877,0.999729,0.999791,0.000513,0.028330,0.000073,0.000253,0.000157,0.018035,0.022640,1.000058,0.023107,1541.152154,3000.025707,"Hidden Size=[30, 21], regularizer=0.2, learnin..."
