In [1]:
import os
import pandas as pd
import fetchData
import numpy as np
import re
import glob

In [2]:
class Timestamp:
    """Lightweight date/time holder whose components may each be missing.

    Every component defaults to ``None``. Values are stored exactly as
    given; no validation or type conversion is performed.
    """

    def __init__(self, year=None, month=None, day=None, hour=None, minute=None):
        self.year = year
        self.month = month
        self.day = day
        self.hour = hour
        self.minute = minute

    def __repr__(self):
        return (f"Timestamp(year={self.year}, month={self.month}, day={self.day}, "
                f"hour={self.hour}, minute={self.minute})")

    def to_string(self, format="%Y-%m-%d %H:%M"):
        """Render the timestamp using a small strftime-like template.

        Supported placeholders are ``%Y``, ``%m``, ``%d``, ``%H`` and ``%M``.
        All but ``%Y`` are zero-padded to two characters. A component that is
        ``None`` is rendered as an empty string, after which doubled spaces,
        doubled dashes, `` :`` fragments and any leading/trailing ``-``, ``:``
        or spaces are removed so no dangling separator is left behind.

        Args:
            format: Template string. The name shadows the builtin but is kept
                so existing keyword callers continue to work.

        Returns:
            The formatted string.
        """
        # Placeholder -> replacement text ("" when the component is missing).
        fields = {
            "%Y": str(self.year) if self.year is not None else "",
            "%m": str(self.month).zfill(2) if self.month is not None else "",
            "%d": str(self.day).zfill(2) if self.day is not None else "",
            "%H": str(self.hour).zfill(2) if self.hour is not None else "",
            "%M": str(self.minute).zfill(2) if self.minute is not None else "",
        }

        formatted = format
        for key, value in fields.items():
            formatted = formatted.replace(key, value)

        # Collapse separators that became redundant because a part was empty.
        formatted = formatted.replace("  ", " ").replace("--", "-").replace(" :", "").strip()

        # Missing leading/trailing parts leave a dangling separator
        # (e.g. "2023-01-"); trim those so missing parts are really omitted.
        return formatted.strip("-: ")


# Quick demo: one fully specified timestamp and one with only year/month.
timestamp1 = Timestamp(year=2022, month=9, day=24, hour=15, minute=30)
timestamp2 = Timestamp(year=2023, month=1)

# repr() of each object, one per line.
print(timestamp1, timestamp2, sep="\n")

# Formatted output: default template, then a year-month template.
print(timestamp1.to_string(), timestamp2.to_string("%Y-%m"), sep="\n")


Timestamp(year=2022, month=9, day=24, hour=15, minute=30)
Timestamp(year=2023, month=1, day=None, hour=None, minute=None)
2022-09-24 15:30
2023-01


In [3]:
# Dataset family and the dataset splits that are traversed.
data_type = 'CWAM'
dataset_purposes = ['train', 'test']

# Folder-name prefixes 'part_1_' ... 'part_12_'. The trailing underscore keeps
# 'part_1_' from also matching 'part_10_', 'part_11_' and 'part_12_'.
path_levels = [f'part_{part_no}_' for part_no in range(1, 13)]


In [4]:
def get_full_folder_path(dataset_purpose):
    """Return the entry names under ``<base>/<data_type>/<dataset_purpose>``
    that correspond to the configured ``path_levels`` prefixes.

    For each prefix, the first directory entry (in ``os.listdir`` order)
    whose name starts with that prefix is kept; prefixes without a match are
    skipped. Despite the function name, the returned items are bare entry
    names, not full paths.
    """
    purpose_dir = os.path.join(fetchData.get_defult_base_dir(), data_type, dataset_purpose)
    entries = os.listdir(purpose_dir)

    selected = []
    for prefix in path_levels:
        first_hit = next((entry for entry in entries if entry.startswith(prefix)), None)
        if first_hit is not None:
            selected.append(first_hit)
    return selected

In [5]:
def get_star_and_end_timestamp(part_path):
    """Parse the start and end dates encoded in a part folder name.

    The name must begin with ``part_<n>_<YYMMDD>_<YYMMDD>``; the first date
    is the start and the second is the end. Two-digit years are prefixed
    with ``'20'``.

    Args:
        part_path: Folder name such as ``'part_1_220901_220924'``.

    Returns:
        ``[start_time, end_time]`` as two ``Timestamp`` objects whose
        ``year``/``month``/``day`` fields hold the matched text as strings
        (e.g. ``'2022'``, ``'09'``, ``'01'``).

    Raises:
        ValueError: If ``part_path`` does not start with the expected pattern.
    """
    pattern = re.compile(r"part_(\d+)"
                         r"_(?P<start_year>\d{2})(?P<start_month>\d{2})(?P<start_day>\d{2})"
                         r"_(?P<end_year>\d{2})(?P<end_month>\d{2})(?P<end_day>\d{2})")
    match = pattern.match(part_path)
    if match is None:
        # Without this guard the return below would hit unbound locals and
        # surface as an opaque UnboundLocalError.
        raise ValueError(
            f"Folder name {part_path!r} does not match 'part_<n>_<YYMMDD>_<YYMMDD>'"
        )

    parts = match.groupdict()
    start_time = Timestamp(year='20' + parts["start_year"],
                           month=parts["start_month"],
                           day=parts["start_day"])
    end_time = Timestamp(year='20' + parts["end_year"],
                         month=parts["end_month"],
                         day=parts["end_day"])
    return [start_time, end_time]
            

In [6]:
def get_date_range(timerangelist):
    """Expand a ``[start, end]`` pair into one ``Timestamp`` per calendar day.

    The first two items of ``timerangelist`` are rendered as ``YYYY-MM-DD``
    strings and passed to ``pd.date_range``, so both end points are included.
    The returned ``Timestamp`` objects carry only year, month and day.
    """
    first_day = timerangelist[0].to_string("%Y-%m-%d")
    last_day = timerangelist[1].to_string("%Y-%m-%d")

    days = []
    for moment in pd.date_range(start=first_day, end=last_day):
        days.append(Timestamp(year=moment.year, month=moment.month, day=moment.day))
    return days

In [7]:
def find_forecast_files(directory, date):
    """List the forecast files in ``directory`` issued on ``date``.

    File names are expected to look like
    ``YYYY_MM_DD_hh_mm_GMT.Forecast.h5.CWAM.h5``; the hour and minute
    positions are matched with two-character glob wildcards. Returns whatever
    ``glob.glob`` yields (a possibly empty list, in no guaranteed order).
    """
    year, month, day = (date.to_string(code) for code in ("%Y", "%m", "%d"))
    return glob.glob(
        os.path.join(directory, f"{year}_{month}_{day}_??_??_GMT.Forecast.h5.CWAM.h5")
    )


In [8]:
def set_hour_minute(file_name, date):
    """Copy the hour and minute encoded in a forecast file name onto ``date``.

    ``file_name`` must start with ``YYYY_MM_DD_hh_mm_GMT.Forecast``. On a
    match, ``date.hour`` and ``date.minute`` are overwritten in place with
    the parsed integers. If the name does not match, ``date`` is left
    untouched. Nothing is returned.

    Args:
        file_name: Forecast file name (without directory).
        date: Object with writable ``hour`` and ``minute`` attributes.
    """
    pattern = re.compile(r"(\d{4})"
                         r"_(\d{2})"
                         r"_(\d{2})"
                         r"_(?P<hour>\d{2})"
                         r"_(?P<minute>\d{2})"
                         # The dot is escaped so only a literal "." is accepted.
                         r"_GMT\.Forecast")
    match = pattern.match(file_name)
    if match:
        date.hour = int(match.group("hour"))
        date.minute = int(match.group("minute"))

In [9]:
def to_pandas_timestamp(timestamp):
    """Convert a ``Timestamp``-like object into a UTC-aware ``pd.Timestamp``.

    ``year``, ``month`` and ``day`` are used as given; a ``None`` hour or
    minute is treated as 0. The naive result is then localized to UTC.
    """
    hour = 0 if timestamp.hour is None else timestamp.hour
    minute = 0 if timestamp.minute is None else timestamp.minute

    naive = pd.Timestamp(
        year=timestamp.year,
        month=timestamp.month,
        day=timestamp.day,
        hour=hour,
        minute=minute,
    )
    return naive.tz_localize('UTC')


In [10]:
def expand_coordinates(data_df):
    """Explode the list-like 'Latitudes' and then 'Longitudes' columns into rows.

    The input frame is not modified; a new frame is returned.

    NOTE(review): because the two columns are exploded one after the other,
    each original row yields every latitude/longitude combination rather
    than element-wise pairs. Confirm this is intended if the two lists are
    meant to be parallel.
    """
    return data_df.explode('Latitudes').explode('Longitudes')

In [None]:
def aggregate_cwam_data(data_df, base_time):
    """Aggregate one forecast file's rows into 15-minute bins.

    Steps:
      1. "Forecast Time (FCST)" strings have the text 'FCST' removed and are
         read as integer minutes, then converted to timedeltas.
      2. "Actual Time" = ``base_time`` (as a UTC timestamp) + that timedelta.
      3. The first run of digits in "Threshold (TRSH)" is parsed as a float.
      4. "Latitudes" / "Longitudes" are exploded into rows.
      5. Rows are grouped by "Actual Time" in 15-minute bins and the three
         value columns are averaged.

    The input frame is left untouched: the work is done on a copy, so the
    function can be called repeatedly on the same frame.

    Args:
        data_df: Frame with the columns "Forecast Time (FCST)",
            "Threshold (TRSH)", "Latitudes" and "Longitudes".
        base_time: ``Timestamp`` marking the forecast's reference time.

    Returns:
        A new DataFrame with columns "Actual Time", "Threshold (TRSH)",
        "Latitudes" and "Longitudes", one row per 15-minute bin.
    """
    base_time = to_pandas_timestamp(base_time)

    # Work on a copy; the column assignments below would otherwise rewrite
    # the caller's frame and make a second call fail on the converted dtype.
    data_df = data_df.copy()

    data_df["Forecast Time (FCST)"] = data_df["Forecast Time (FCST)"].str.replace('FCST', '').astype(int)
    data_df["Forecast Time (FCST)"] = pd.to_timedelta(data_df["Forecast Time (FCST)"], unit='m')
    data_df["Actual Time"] = base_time + data_df["Forecast Time (FCST)"]

    data_df["Threshold (TRSH)"] = data_df["Threshold (TRSH)"].str.extract(r'(\d+)').astype(float)
    data_df = expand_coordinates(data_df)

    data_df = data_df.set_index("Actual Time")

    aggregated_df = data_df.groupby(pd.Grouper(freq="15min")).agg({
        "Threshold (TRSH)": "mean",
        "Latitudes": "mean",
        "Longitudes": "mean"
    }).reset_index()
    return aggregated_df

In [None]:
# Trial run on a single forecast file before looping over the whole dataset.
date = Timestamp(year=2022, month=9, day=1, hour=0, minute=0) 

# Build a 15-minute UTC grid from the start of the day through the start of
# the next day (pd.date_range includes both end points).
data_range = date.to_string("%Y-%m-%d")
start_date = pd.to_datetime(data_range).tz_localize('UTC')
end_date = start_date + pd.Timedelta(days=1)
complete_times = pd.date_range(start=start_date, end=end_date, freq='15min')
complete_index = pd.MultiIndex.from_product([complete_times], names=['timestamp_15mins'])
# NOTE(review): complete_df is built here but not used in this cell.
complete_df = pd.DataFrame(index=complete_index).reset_index()

# Load one hard-coded forecast file; the file name repeats the date/time
# held in `date` above.
df = fetchData.load_data( 
                    data_type=data_type, 
                    dataset_purpose='train', 
                    path_level='part_1_220901_220924', 
                    month=date.to_string("%m"), 
                    day=date.to_string("%d"), 
                    file_name="2022_09_01_00_00_GMT.Forecast"
                )
# Rebind df to the 15-minute aggregate and show its first rows.
df = aggregate_cwam_data(df, date)
print(df.head())

In [None]:
# Walk every dataset split -> part folder -> calendar day -> forecast file.
for dataset_purpose in dataset_purposes:
    folder_path_levels = get_full_folder_path(dataset_purpose)
    for part_path in folder_path_levels:
        # The date span covered by a part is encoded in its folder name.
        timerangelist = get_star_and_end_timestamp(part_path)
        timestamp_list = get_date_range(timerangelist)
        for date in timestamp_list:
            month = date.to_string("%m")
            day = date.to_string("%d")
            # Files live under <base>/<data_type>/<purpose>/<part>/<MM>/<DD>.
            directory = os.path.join(
                fetchData.get_defult_base_dir(), data_type, dataset_purpose, part_path, month, day
            )

            # NOTE(review): df_list is never appended to in this loop.
            df_list = []

            # 15-minute UTC grid spanning this day (both end points included).
            data_range = date.to_string("%Y-%m-%d")
            start_date = pd.to_datetime(data_range).tz_localize('UTC')
            end_date = start_date + pd.Timedelta(days=1)
            complete_times = pd.date_range(start=start_date, end=end_date, freq='15min')
            complete_index = pd.MultiIndex.from_product([complete_times], names=['timestamp_15mins'])
            complete_df = pd.DataFrame(index=complete_index).reset_index()

            forecast_files = find_forecast_files(directory, date)
            for forecast_file in forecast_files:
                # Drop the directory and the trailing 11 characters
                # (".h5.CWAM.h5") to get the name passed to load_data.
                file_name = os.path.basename(forecast_file)[:-11]
                # Updates date.hour / date.minute in place from the file name.
                set_hour_minute(file_name, date)
                # NOTE(review): the loaded frame is not stored or aggregated;
                # each iteration overwrites df.
                df = fetchData.load_data(
                    data_type=data_type,
                    dataset_purpose=dataset_purpose,
                    path_level=part_path,
                    month=month,
                    day=day,
                    file_name=file_name,
                )
                
                
            
    

KeyboardInterrupt: 