### dataset_history
Produces the history interval (from the earliest to the latest value of `time_feature`) for the rows where `main_feature == main_value`,
across all the CSV files listed in `df_list`.

In [1]:
import pandas as pd

def dataset_history(
        df_list,
        main_feature,
        time_feature,
        main_value,
):
    """Return the overall time span of ``main_value`` across several CSV files.

    Each path in ``df_list`` is read with ``pandas.read_csv`` (parsing
    ``time_feature`` as dates), the rows where ``main_feature == main_value``
    are selected, and the earliest / latest ``time_feature`` values found in
    any file are kept.

    Args:
        df_list: Iterable of CSV file paths.
        main_feature: Name of the column that identifies the entity.
        time_feature: Name of the timestamp column.
        main_value: Value of ``main_feature`` whose history is wanted.

    Returns:
        ``[start_time, end_time]``. Both are UTC-labelled ``pandas.Timestamp``
        objects when at least one file has matching rows with a timestamp.
        Both are ``None`` when ``df_list`` is empty, and both are ``pandas.NaT``
        when files were read but none had a usable timestamp for
        ``main_value``.

    Note:
        The timestamps are *relabelled* as UTC: any offset already attached to
        a value is dropped (not converted) and the remaining wall-clock time
        is tagged as UTC. Use ``tz_convert`` on the result if another zone is
        needed.
    """
    start_time = None
    end_time = None

    for df_path in df_list:
        df = pd.read_csv(df_path, parse_dates=[time_feature])
        times = df.loc[df[main_feature] == main_value, time_feature]

        min_time = times.min()
        max_time = times.max()

        if pd.isna(min_time):
            # No usable timestamp for main_value in this file. Storing NaT
            # here would make every later "<" / ">" comparison False and hide
            # data from the following files, so only remember it when nothing
            # has been recorded yet.
            if start_time is None:
                start_time, end_time = pd.NaT, pd.NaT
            continue

        # Drop any existing offset and label the wall-clock value as UTC so
        # that naive and aware timestamps from different files can be compared.
        min_time_utc = min_time.tz_localize(None).tz_localize('UTC')
        max_time_utc = max_time.tz_localize(None).tz_localize('UTC')

        # pd.isna() is True for both the initial None and a placeholder NaT.
        if pd.isna(start_time) or min_time_utc < start_time:
            start_time = min_time_utc
        if pd.isna(end_time) or max_time_utc > end_time:
            end_time = max_time_utc

    return [start_time, end_time]


# Example usage: the processed Fitbit CSV exports to scan.
my_df_list = [
    "datasets/fitbit/processed_data/calories.csv",
    "datasets/fitbit/processed_data/distance.csv",
    "datasets/fitbit/processed_data/exercise.csv",
    "datasets/fitbit/processed_data/heart_rate.csv",
    "datasets/fitbit/processed_data/sleep_score.csv",
    "datasets/fitbit/processed_data/sleep.csv",
    "datasets/fitbit/processed_data/steps.csv",
]
# Uncomment to get the [start, end] history of participant "p04":
# dataset_history(my_df_list, "participant", "vt", "p04")

###  dataset_to_histories
The result of this function is a dataframe containing, for each individual participant, `hs` (history start) and `he` (history end) across all the dataframes addressed.

In [2]:
def dataset_to_histories(
    df_list,
    main_feature,
    time_feature
):
    """Build a table with the history start/end of every ``main_feature`` value.

    Each CSV in ``df_list`` is read once; the earliest and latest
    ``time_feature`` per distinct ``main_feature`` value are computed with a
    group-by and merged across files.

    Args:
        df_list: Iterable of CSV file paths.
        main_feature: Name of the column that identifies the entity.
        time_feature: Name of the timestamp column (parsed as dates).

    Returns:
        A ``pandas.DataFrame`` indexed by ``main_value`` (in order of first
        appearance) with two columns: ``hs`` (history start) and ``he``
        (history end). Timestamps are labelled as UTC: any offset already
        attached is dropped, not converted. A value whose timestamps are all
        missing keeps ``NaT``. Rows whose ``main_feature`` is missing are
        ignored.
    """
    def as_utc(ts):
        # Keep NaT as-is; otherwise drop any offset and label wall time as UTC
        # so values coming from different files are comparable.
        if pd.isna(ts):
            return pd.NaT
        return ts.tz_localize(None).tz_localize('UTC')

    histories_dict = {}

    for df_path in df_list:
        df = pd.read_csv(df_path, parse_dates=[time_feature])

        # One pass per file instead of re-reading the CSV for every value.
        # sort=False keeps the order in which values first appear.
        bounds = df.groupby(main_feature, sort=False)[time_feature].agg(['min', 'max'])

        for main_value, first_seen, last_seen in bounds.itertuples(name=None):
            start = as_utc(first_seen)
            end = as_utc(last_seen)

            entry = histories_dict.get(main_value)
            if entry is None:
                histories_dict[main_value] = {'hs': start, 'he': end}
                continue

            # A stored NaT must be replaceable: comparisons against NaT are
            # always False, so test for it explicitly.
            if pd.isna(entry['hs']) or start < entry['hs']:
                entry['hs'] = start
            if pd.isna(entry['he']) or end > entry['he']:
                entry['he'] = end

    df_result = pd.DataFrame.from_dict(histories_dict, orient='index')
    df_result.index.name = 'main_value'

    return df_result

# Example usage: compute the history table for every participant found in the
# processed Fitbit CSV exports, then display it.
my_df_list = [
    "datasets/fitbit/processed_data/calories.csv",
    "datasets/fitbit/processed_data/distance.csv",
    "datasets/fitbit/processed_data/exercise.csv",
    "datasets/fitbit/processed_data/heart_rate.csv",
    "datasets/fitbit/processed_data/sleep_score.csv",
    "datasets/fitbit/processed_data/sleep.csv",
    "datasets/fitbit/processed_data/steps.csv",
]

histories_df = dataset_to_histories(
    my_df_list,
    main_feature="participant",
    time_feature="vt",
)
print(histories_df)

                                  hs                        he
main_value                                                    
p01        2019-11-01 00:00:00+00:00 2020-03-31 23:59:00+00:00
p02        2019-11-01 00:00:00+00:00 2020-03-31 23:59:00+00:00
p03        2019-11-01 00:00:00+00:00 2020-03-31 23:59:00+00:00
p04        2019-11-01 00:00:00+00:00 2020-03-31 17:12:00+00:00
p05        2019-11-01 00:00:00+00:00 2020-03-31 23:59:00+00:00
p06        2019-11-01 00:00:00+00:00 2020-03-31 23:59:00+00:00
p07        2019-11-05 00:00:00+00:00 2020-03-31 18:55:00+00:00
p08        2019-11-10 00:00:00+00:00 2020-03-31 23:59:00+00:00
p09        2019-11-01 00:00:00+00:00 2020-03-31 23:59:00+00:00
p10        2019-11-05 00:00:00+00:00 2020-03-31 23:59:00+00:00
p11        2019-11-01 00:00:00+00:00 2020-03-31 23:59:00+00:00
p12        2019-11-01 00:00:00+00:00 2020-03-31 23:59:00+00:00
p13        2019-11-01 00:00:00+00:00 2020-03-31 23:59:00+00:00
p14        2019-11-13 00:00:00+00:00 2020-03-31 23:30:3

### dataset_to_histories_json

In [3]:
import os
import json

def dataset_to_histories_json( 
    df_list,
    main_feature,
    time_feature,
    output_path="datasets/fitbit/processed_data/"
):
    """Write the history start/end of every ``main_feature`` value to JSON.

    The histories are computed by ``dataset_to_histories`` and saved as
    ``histories_dict.json`` inside ``output_path``. The file maps each
    ``main_feature`` value to ``{"hs": <start>, "he": <end>}``, with both
    timestamps stored as their ``str()`` representation.

    Args:
        df_list: Iterable of CSV file paths.
        main_feature: Name of the column that identifies the entity.
        time_feature: Name of the timestamp column.
        output_path: Directory in which the JSON file is written; it is
            created if it does not exist yet.

    Returns:
        The path of the JSON file that was written.
    """
    # Reuse the DataFrame builder instead of duplicating its merge loop and
    # round-tripping timestamps through strings for every comparison.
    histories = dataset_to_histories(df_list, main_feature, time_feature)

    # Timestamp objects are not JSON-serialisable, so store them as strings.
    histories_dict = {
        main_value: {'hs': str(row['hs']), 'he': str(row['he'])}
        for main_value, row in histories.iterrows()
    }

    # An empty output_path means "current directory"; makedirs('') would fail.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Saves the histories_dict as a JSON file
    output_file_path = os.path.join(output_path, "histories_dict.json")
    with open(output_file_path, 'w', encoding='utf-8') as file:
        json.dump(histories_dict, file)

    return output_file_path

# Example usage: the processed Fitbit CSV exports whose histories are saved.
my_df_list = [
    "datasets/fitbit/processed_data/calories.csv",
    "datasets/fitbit/processed_data/distance.csv",
    "datasets/fitbit/processed_data/exercise.csv",
    "datasets/fitbit/processed_data/heart_rate.csv",
    "datasets/fitbit/processed_data/sleep_score.csv",
    "datasets/fitbit/processed_data/sleep.csv",
    "datasets/fitbit/processed_data/steps.csv",
]

# Uncomment to write histories_dict.json and report where it was saved:
# result_file_path = dataset_to_histories_json(my_df_list, "participant", "vt")
# print(f"Histories dictionary saved at: {result_file_path}")
