# 1.0 Preprocessing

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import massbalancemachine as mbm

# Build the MassBalanceMachine configuration object (no arguments given);
# it is handed to mbm.Dataset further down.
cfg = mbm.config.Config()

In [None]:
# Directory and file name of the raw stake dataset
filepath = 'C:/Users/kasj/MassBalanceMachine/regions/Norway/data/'
filename = '2024-09-11_stake_dataset_Norway.csv'

# Load the CSV into a DataFrame: the two date columns are read as strings
# and the first CSV column becomes the index.
date_column_dtypes = {"FROM_DATE": str, "TO_DATE": str}
data = pd.read_csv(filepath + filename, dtype=date_column_dtypes, index_col=0)

In [None]:
# Restrict the table to the WGMS and topographical columns
columns_to_keep = [
    "RGIId",
    "POINT_ELEVATION",
    "POINT_BALANCE",
    "FROM_DATE",
    "TO_DATE",
    "POINT_LAT",
    "POINT_LON",
    "aspect",
    "slope",
    "YEAR",
    "MEASUREMENT_ID",
    "BALANCE_CODE",
    "POINT_ID",
]
data = data[columns_to_keep]

In [None]:
# Keep only the rows whose BALANCE_CODE is 'BA' (stored as the annual subset)
is_ba_record = data['BALANCE_CODE'] == 'BA'
data_annual = data[is_ba_record]

In [None]:
# Create the Dataset from the 'BA' subset selected in the previous cell.
# The original cell passed `data_annual_crop`, a name that is never defined in
# this notebook and therefore raised a NameError; `data_annual` is the
# DataFrame that actually exists at this point.
dataset_annual = mbm.Dataset(
    cfg,
    data=data_annual,
    region_name='Norway',
    data_path='C:/Users/kasj/MassBalanceMachine/regions/Norway/data/',
)

In [None]:
# Climate data files whose contents will be matched with the stake coordinates
era5_climate_data = 'C:/Users/kasj/MassBalanceMachine/regions/Norway/data/data.nc'
geopotential_data = 'C:/Users/kasj/MassBalanceMachine/regions/Norway/data/geo.nc'

# Short names of the topographical and climate variables available in the dataset
voi_topographical = ['aspect', 'slope']
vois_climate = ['t2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str']

# Attach the climate features from the ERA5Land netCDF file to every stake measurement
dataset_annual.get_climate_features(
    geopotential_data=geopotential_data,
    climate_data=era5_climate_data,
)

In [None]:
# Inspect the column names after the get_climate_features call
dataset_annual.data.columns
# Debug note: there are no 'var_nan' columns in the dataset at this point.
# Open question: is the problem in generating the month names from TO_DATE and FROM_DATE?

In [None]:
# Show the FROM_DATE column as it is currently stored
dataset_annual.data["FROM_DATE"]

In [None]:
# Parse both date columns from 'YYYYMMDD' strings into pandas datetimes
for date_column in ("FROM_DATE", "TO_DATE"):
    dataset_annual.data[date_column] = pd.to_datetime(
        dataset_annual.data[date_column], format="%Y%m%d"
    )


In [None]:
# Display the full table to check the parsed FROM_DATE / TO_DATE columns
dataset_annual.data

In [None]:
def _month_abbreviations(row):
    """Return lower-cased '%b' month names for a row's measurement period.

    One entry is produced per month-start date (freq="MS") lying between
    the row's FROM_DATE and TO_DATE.
    """
    month_starts = pd.date_range(start=row["FROM_DATE"], end=row["TO_DATE"], freq="MS")
    return month_starts.strftime("%b").str.lower().tolist()


# Store the list of month names of each record in a new MONTHS column
dataset_annual.data["MONTHS"] = dataset_annual.data.apply(_month_abbreviations, axis=1)

In [None]:
def nearest_start_of_month(date):
    """
    Round to the nearest start of the month.
    If day < 15, round down to start of current month.
    If day >= 15, round up to start of next month.

    Parameters
    ----------
    date : pandas.Timestamp
        The date to round. Any time-of-day component is left unchanged.

    Returns
    -------
    pandas.Timestamp
        The first day of the current month (day < 15) or of the following
        month (day >= 15).
    """
    if date.day < 15:
        # `date - MonthBegin(1)` would jump to the *previous* month when the
        # date already is the 1st; replacing the day always stays in the
        # current month.
        return date.replace(day=1)
    else:
        return date + pd.offsets.MonthBegin(1)

def create_date_range(row):
    """Return the lower-cased '%b' month names spanned by a record.

    The row's FROM_DATE and TO_DATE are first snapped to a month start with
    nearest_start_of_month; one name is then produced for every month-start
    date (freq='MS') between the two snapped dates.
    """
    first_month = nearest_start_of_month(row['FROM_DATE'])
    last_month = nearest_start_of_month(row['TO_DATE'])

    month_starts = pd.date_range(start=first_month, end=last_month, freq='MS')
    month_names = month_starts.strftime('%b').str.lower()
    return month_names.tolist()

# Try the helper on the first ten rows only and print the resulting month lists
first_ten_rows = dataset_annual.data.iloc[:10]
result = first_ten_rows.apply(create_date_range, axis=1)
print(result)

In [None]:
# Inspect the month list of a single record from the trial run.
# NOTE(review): with an integer index this looks the entry up by index label 1,
# not by position; presumably label 1 exists among the first ten rows. Confirm,
# or use result.iloc[1] for positional access.
result[1]

In [None]:
#dataset_annual.data["MONTHS"] = dataset_annual.data.apply(
#        lambda row: pd.date_range(start=row["FROM_DATE"], end=row["TO_DATE"], freq="MS")
#        .strftime("%b")
#        .str.lower()
#        .tolist(),
#        axis=1,
#    )

In [None]:
# Variables of interest handed to the monthly conversion
voi_topographical = ['aspect', 'slope']
vois_climate = ['t2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str']

# Convert every record to a monthly time resolution
dataset_annual.convert_to_monthly(
    vois_topographical=voi_topographical,
    vois_climate=vois_climate,
)

In [None]:
# Inspect the table after the convert_to_monthly call.
# The original cell evaluated `dataset`, a name that is never defined in this
# notebook (NameError); the dataset object used throughout is `dataset_annual`.
dataset_annual.data