In [None]:
import calendar
from datetime import datetime, time, timedelta

import numpy as np
import pandas as pd
from tqdm import tqdm
# Per-month (start, end) time-of-day pair, keyed by month number 1-12.
# get_image_times() steps hourly from the start hour (inclusive) up to the end
# hour (exclusive).  NOTE(review): presumably approximate daylight hours - confirm.
month_to_times = {
    month: (time(first_hour), time(last_hour))
    for month, (first_hour, last_hour) in enumerate(
        [
            (8, 16),  # January
            (8, 17),  # February
            (7, 18),  # March
            (7, 19),  # April
            (6, 20),  # May
            (5, 20),  # June
            (5, 20),  # July
            (6, 20),  # August
            (7, 19),  # September
            (7, 18),  # October
            (7, 16),  # November
            (8, 16),  # December
        ],
        start=1,
    )
}
# Per-month sample cap, keyed by month number 1-12 (December is 0).
# In the visible code it is referenced only by a commented-out sampling step
# in the monthly load loop.
MAX_SAMPLES = dict(
    zip(
        range(1, 13),
        (
            75087,
            135478,
            148795,
            142888,
            146131,
            139695,
            144006,
            141826,
            135122,
            141410,
            363602,
            0,
        ),
    )
)
def get_image_times(year, month):
    """Yield hourly datetimes for every day of ``month`` within its time window.

    The window for the month is looked up in the module-level ``month_to_times``
    mapping as a ``(start_time, end_time)`` pair.

    Args:
        year: Calendar year.
        month: Month number (1-12); also the key into ``month_to_times``.

    Yields:
        ``datetime`` objects one hour apart for each day of the month, starting
        at ``start_time`` (inclusive) and stopping before ``end_time``
        (exclusive).

    Raises:
        ValueError: if ``month`` is not a valid calendar month.
        KeyError: if ``month`` has no entry in ``month_to_times``.
    """
    # monthrange gives the real month length, so 29 February is included in
    # leap years (a fixed 28 would silently drop it).
    days_in_month = calendar.monthrange(year, month)[1]
    start_time, end_time = month_to_times[month]
    one_hour = timedelta(hours=1)

    for day in range(1, days_in_month + 1):
        current_time = datetime.combine(datetime(year, month, day), start_time)
        while current_time.time() < end_time:
            yield current_time
            current_time += one_hour

# Read the twelve monthly PV parquet files for 2021, print each file's row
# count, and stack them into one frame.
dfs = []
for month in tqdm(range(1, 13)):
    pv_file_path = f"/data/pv/2021/{month}.parquet"
    df = pd.read_parquet(pv_file_path)
    # Disabled subsampling (kept for reference): keep only on-the-hour rows whose
    # hour lies within month_to_times[month] (both bounds inclusive), then draw
    # at most MAX_SAMPLES[month] rows with df.sample().
    dfs.append(df)
    print(df.shape[0])
df_concat = pd.concat(dfs, axis=0)

In [None]:
# Persist the concatenated PV frame; a later cell reloads it from this path.
df_concat.to_parquet(path="/data/pv/concat.parquet")

In [None]:
import json

# Carry on with the concatenated frame under the shorter name used below.
df = df_concat

sat_type = "nonhrv"
pv_metadata_file = "/data/pv/metadata.csv"

# indices.json is read as {data source: {site id: location}}.  Site ids are cast
# to int and the first two entries of each location are kept as an int pair.
with open("./indices.json", "r") as f:
    site_locations = {
        source_name: {
            int(site_id): (int(coords[0]), int(coords[1]))
            for site_id, coords in per_site.items()
        }
        for source_name, per_site in json.load(f).items()
    }

# Site ids that have a location for the selected satellite data source.
sites = [*site_locations[sat_type]]

In [None]:
pv_data = df
pv_metadata_file = "/data/pv/metadata.csv"
# Site metadata keyed by ss_id so a site's row can be fetched with .loc below.
with open(pv_metadata_file, "r") as f:
    pv_metadata = pd.read_csv(f).set_index("ss_id")
NWP_FEATURES = ["t_500", "clcl", "alb_rad", "tot_prec", "ww", "relhum_2m", "h_snow", "aswdir_s", "td_2m", "omega_1000"]
EXTRA_FEATURES = ["latitude_rounded", "longitude_rounded", "orientation", "tilt"]
columns = ['pv', sat_type, 'nwp', 'extra', 'y', 'time']
# train on 2021, val on 2020
set_type = "train"
train_data = []
# Exception type name -> number of (timestamp, site) pairs skipped for that
# reason, so skipped samples are visible instead of silently discarded.
skip_counts = {}

# The 'timestamp' level repeats once per index row, so iterate over the unique
# on-the-hour values; otherwise each hour is processed (and every sample
# appended) once per row that shares the timestamp.
hourly_timestamps = [
    timestamp
    for timestamp in df.index.get_level_values('timestamp').unique()
    if timestamp.minute == 0
]

# The loop variable is deliberately not called `time`, which would shadow the
# `datetime.time` class imported at the top of the notebook.
for timestamp in tqdm(hourly_timestamps):
    # Input window: the hour starting at `timestamp` (up to +55 minutes).
    first_hour = slice(str(timestamp), str(timestamp + timedelta(minutes=55)))
    pv_features = pv_data.xs(first_hour, drop_level=False)  # type: ignore
    # Target window: the following four hours (+1 h up to +4 h 55 min).
    pv_targets = pv_data.xs(
        slice(  # type: ignore
            str(timestamp + timedelta(hours=1)),
            str(timestamp + timedelta(hours=4, minutes=55)),
        ),
        drop_level=False,
    )

    for site in sites:
        try:
            # Solar PV features and targets for this site.
            site_features = pv_features.xs(site, level=1).to_numpy().squeeze(-1)
            site_targets = pv_targets.xs(site, level=1).to_numpy().squeeze(-1)
            # Only complete windows are kept: 12 input steps and 48 target steps.
            if site_features.shape != (12,) or site_targets.shape != (48,):
                raise ValueError("incomplete PV feature/target window")

            # Location of the site for the satellite source; only the coordinate
            # pair is stored here, no image crop is taken in this cell.
            sat_x, sat_y = site_locations[sat_type][site]
            sat = (sat_x, sat_y)

            # Location of the site for the weather (NWP) source.
            x_nwp, y_nwp = site_locations["weather"][site]
            nwp = (x_nwp, y_nwp)

            # Static per-site metadata features.
            extra = pv_metadata.loc[site, EXTRA_FEATURES].to_numpy().astype(np.float32)
            if extra.shape != (len(EXTRA_FEATURES),):
                raise ValueError("unexpected shape for extra site features")
        except Exception as err:
            # Best-effort: skip sites with missing or incomplete data, but keep a
            # tally.  KeyboardInterrupt is no longer swallowed, so the loop can
            # be interrupted.
            reason = type(err).__name__
            skip_counts[reason] = skip_counts.get(reason, 0) + 1
            continue

        train_data.append([site_features, sat, nwp, extra, site_targets, timestamp])

print(f"built {len(train_data)} samples; skipped by reason: {skip_counts}")
        

In [None]:
import pandas as pd
# Load the processed PV frame from disk.
df = pd.read_parquet(path='/data/pv/proc.parquet')

In [None]:
# Sort by both index levels, drop the generation column, and write the frame back
# over the file it was loaded from.  errors="ignore" keeps this cell re-runnable:
# once the file has been rewritten the column is already gone, and a plain drop
# would raise KeyError on the next load-and-run.
df_sorted = df.sort_index(level=[0, 1], ascending=True).drop(
    columns="generation_wh", errors="ignore"
)
df_sorted.to_parquet('/data/pv/proc.parquet')

In [35]:
import pandas as pd
from datetime import datetime, time, timedelta, timezone
# Reload both on-disk frames: the processed frame whose rows drive the export
# loop, and the concatenated PV readings (without the generation column) that
# the feature/target windows are sliced from.
df = pd.read_parquet(path='/data/pv/proc.parquet')
pv_data = pd.read_parquet(path='/data/pv/concat.parquet').drop(columns="generation_wh")

In [38]:
import json

import h5py
import numpy as np
from tqdm import tqdm

sat_type = "nonhrv"
pv_metadata_file = "/data/pv/metadata.csv"
# Site metadata keyed by ss_id so a site's row can be fetched with .loc below.
with open(pv_metadata_file, "r") as f:
    pv_metadata = pd.read_csv(f).set_index("ss_id")
NWP_FEATURES = ["t_500", "clcl", "alb_rad", "tot_prec", "ww", "relhum_2m", "h_snow", "aswdir_s", "td_2m", "omega_1000"]
# NWP_FEATURES = ["t_500", "clct", "alb_rad", "tot_prec", "aswdifd_s"]
EXTRA_FEATURES = ["latitude_rounded", "longitude_rounded", "orientation", "tilt"]
# indices.json is read as {data source: {site id: location}}; site ids are cast
# to int and the first two entries of each location are kept as an int pair.
with open("./indices.json") as f:
    site_locations = {
        data_source: {
            int(site): (int(location[0]), int(location[1]))
            for site, location in locations.items()
        }
        for data_source, locations in json.load(f).items()
    }

# One HDF5 group per sample component; each sample i is stored as `data_{i}` in
# every group of the training file.
group_names = ('pv', sat_type, 'nwp', 'extra', 'y', 'time')
# Exception type name -> number of rows skipped for that reason.
skip_counts = {}

with (
    h5py.File('/data/processed_data/processed_train_all.hdf5', 'w') as f_train,
    h5py.File('/data/processed_data/processed_val_all.hdf5', 'w') as f_val,
):
    train_groups = {name: f_train.create_group(name) for name in group_names}
    # The validation file gets the same group layout, but nothing in this cell
    # writes samples into it.
    for name in group_names:
        f_val.create_group(name)

    for i, (index_key, _) in enumerate(tqdm(df.iterrows(), total=len(df))):
        dataset_name = f'data_{i}'
        try:
            # Each index entry is a (timestamp, ss_id) pair.  The timestamp is not
            # bound to `time`, which would shadow the imported datetime.time.
            timestamp, site = index_key

            # Input window: the hour starting at `timestamp` (up to +55 minutes).
            first_hour = slice(str(timestamp), str(timestamp + timedelta(minutes=55)))
            pv_features = pv_data.xs(first_hour, drop_level=False)  # type: ignore
            # Target window: the following four hours (+1 h up to +4 h 55 min).
            pv_targets = pv_data.xs(
                slice(  # type: ignore
                    str(timestamp + timedelta(hours=1)),
                    str(timestamp + timedelta(hours=4, minutes=55)),
                ),
                drop_level=False,
            )

            # Solar PV features and targets for this site.
            site_features = pv_features.xs(site, level=1).to_numpy().squeeze(-1)
            site_targets = pv_targets.xs(site, level=1).to_numpy().squeeze(-1)
            # Only complete windows are kept: 12 input steps and 48 target steps.
            if site_features.shape != (12,) or site_targets.shape != (48,):
                raise ValueError("incomplete PV feature/target window")

            # Location of the site for the satellite source; only the coordinate
            # pair is stored, no image crop is taken in this cell.
            sat_x, sat_y = site_locations[sat_type][site]
            # Location of the site for the weather (NWP) source.
            x_nwp, y_nwp = site_locations["weather"][site]

            # Static per-site metadata features.
            extra = pv_metadata.loc[site, EXTRA_FEATURES].to_numpy().astype(np.float32)
            if extra.shape != (len(EXTRA_FEATURES),):
                raise ValueError("unexpected shape for extra site features")

            # The time is stored as a POSIX timestamp with the index value
            # labelled as UTC.
            timen = timestamp.replace(tzinfo=timezone.utc)
            sample = {
                'pv': site_features,
                sat_type: (sat_x, sat_y),
                'nwp': (x_nwp, y_nwp),
                'extra': extra,
                'y': site_targets,
                'time': np.array([timen.timestamp()]),
            }
            for name, value in sample.items():
                train_groups[name].create_dataset(dataset_name, data=value, compression="lzf")
        except Exception as err:
            # Best-effort: skip rows with missing or incomplete data.  Remove any
            # datasets already written for this row so the groups stay in step,
            # and keep a tally.  KeyboardInterrupt is no longer swallowed.
            for group in train_groups.values():
                if dataset_name in group:
                    del group[dataset_name]
            reason = type(err).__name__
            skip_counts[reason] = skip_counts.get(reason, 0) + 1

print(f"skipped {sum(skip_counts.values())} of {len(df)} rows by reason: {skip_counts}")

3355it [00:21, 237.54it/s]