In [1]:
import os
import json
import pandas as pd

def flatten_json(y):
    """Flatten a nested JSON-like structure into a single-level dict.

    Nested dict keys are joined with ``.`` to form the output key. Lists are
    never expanded element by element; they are summarised instead:

    * an empty list is stored as ``None`` under the list's own key;
    * a list whose items are all ``int``/``float`` yields four entries,
      ``<key>.mean``, ``<key>.min``, ``<key>.max`` and ``<key>.last``;
    * any other list yields ``<key>.length`` holding its length.

    Every other value is stored unchanged under its dotted key.
    """

    def walk(node, prefix):
        # `prefix` always ends with '.' below the root; it is '' at the root.
        if isinstance(node, dict):
            for key, child in node.items():
                yield from walk(child, f'{prefix}{key}.')
            return

        if not isinstance(node, list):
            # Leaf value: drop the trailing '.' from the accumulated prefix.
            yield prefix[:-1], node
            return

        if not node:
            yield prefix[:-1], None
        elif all(isinstance(item, (int, float)) for item in node):
            # Numeric list: keep summary statistics rather than the items.
            yield prefix + 'mean', sum(node) / len(node)
            yield prefix + 'min', min(node)
            yield prefix + 'max', max(node)
            yield prefix + 'last', node[-1]
        else:
            # Non-numeric list: only its length is recorded.
            yield prefix + 'length', len(node)

    flat = {}
    for flat_key, value in walk(y, ''):
        flat[flat_key] = value
    return flat

def load_results_to_dataframe(results_dir):
    """Collect every ``*.json`` file under *results_dir* into one DataFrame.

    The directory tree is walked recursively. Each file that parses is
    flattened with ``flatten_json`` and becomes one row; the file's base name
    (without its directory) is stored in the ``file_name`` column. Files that
    cannot be decoded are reported on stdout and skipped.

    Args:
        results_dir: Root directory to search.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully parsed file.
        It has no rows when nothing could be loaded.
    """
    data_list = []
    for root, _dirs, files in os.walk(results_dir):
        for file in files:
            if not file.endswith('.json'):
                continue
            json_path = os.path.join(root, file)
            # JSON text is UTF-8 by specification; being explicit avoids
            # depending on the platform's locale default encoding.
            with open(json_path, 'r', encoding='utf-8') as f:
                try:
                    data = json.load(f)
                except (json.JSONDecodeError, UnicodeDecodeError) as e:
                    # Undecodable bytes are treated like malformed JSON:
                    # report the file and carry on with the rest.
                    print(f"Error decoding JSON from file {json_path}: {e}")
                    continue
            flat_data = flatten_json(data)
            flat_data['file_name'] = file
            data_list.append(flat_data)
    return pd.DataFrame(data_list)

# Usage example
if __name__ == "__main__":
    # Root of the directory tree that is searched recursively for result files.
    results_dir = '/home/skage/projects/ikt450_deep-neural-networks/stocknet-project-ikt450/skage/results'  # Replace with your actual path
    df = load_results_to_dataframe(results_dir)

    # Select relevant columns
    columns_of_interest = [
        'file_name',
        'Model',
        'Dataclass',
        'Results Testset.accuracy_train',
        'Results Testset.accuracy_test',
        'Results Testset.accuracy_eval',
        'Results Testset.F1_test',
        'Results Testset.MCC_test',
        'Config.LEARNING_RATE'
    ]
    # Keep only the columns that are present in the DataFrame
    columns_present = [col for col in columns_of_interest if col in df.columns]
    df_selected = df[columns_present]

    # Sort by test accuracy in descending order. The column can be missing
    # (for example when no result file was loaded), in which case sorting by
    # it would raise KeyError, so fall back to the unsorted selection.
    sort_column = 'Results Testset.accuracy_test'
    if sort_column in df_selected.columns:
        df_sorted = df_selected.sort_values(by=sort_column, ascending=False)
    else:
        print(f"Column '{sort_column}' not found; leaving rows unsorted.")
        df_sorted = df_selected

    # Display the sorted DataFrame
    print(df_sorted)

    df_sorted.to_csv('temp.csv')

    print('All keys:', df.columns)

    file_name          Model                           Dataclass  \
24  0000.json  BILSTM_4_FC_3  TwitterSentimentVolumePriceXPriceY   
7   0003.json    LSTM_4_FC_3  TwitterSentimentVolumePriceXPriceY   
13  0013.json    LSTM_4_FC_3  TwitterSentimentVolumePriceXPriceY   
50  0005.json     GRU_4_FC_3  TwitterSentimentVolumePriceXPriceY   
65  0001.json  BILSTM_4_FC_3       NormSentimentNormPriceXPriceY   
..        ...            ...                                 ...   
44  0001.json     GRU_4_FC_3  TwitterSentimentVolumePriceXPriceY   
17  0000.json     RNN_1_FC_2  TwitterSentimentVolumePriceXPriceY   
25  0005.json  BILSTM_4_FC_3  TwitterSentimentVolumePriceXPriceY   
1   0011.json    LSTM_4_FC_3  TwitterSentimentVolumePriceXPriceY   
18  0001.json    LSTM_4_FC_3       NormSentimentNormPriceXPriceY   

    Results Testset.accuracy_train  Results Testset.accuracy_test  \
24                        0.525731                       0.536538   
7                         0.507904           