# 1.0 Preprocessing

In [1]:
# Import libraries
from pathlib import Path

import numpy as np
import pandas as pd
import massbalancemachine as mbm

In [2]:
# Directory and file name of the raw stake-measurement CSV.
# NOTE(review): this is an absolute, machine-specific path; consider pointing it at a
# repository-relative data directory so the notebook can run on other machines.
filepath = 'C:/Users/kasj/MassBalanceMachine/regions/Norway/data/'
filename = '2024-09-11_stake_dataset_Norway.csv'

# Read the CSV into a DataFrame. The two date columns are forced to `str` so values such
# as "20120925" stay strings instead of being inferred as numbers, and the first CSV
# column is used as the row index.
# The path is joined with pathlib rather than string concatenation, so it stays valid
# even if `filepath` is later edited to omit the trailing slash.
data = pd.read_csv(
    Path(filepath) / filename,
    dtype={"FROM_DATE": str, "TO_DATE": str},
    index_col=0,
)

In [3]:
# Restrict the frame to the WGMS measurement columns plus the topographical features
selected_columns = [
    "RGIId",
    "POINT_ELEVATION",
    "POINT_BALANCE",
    "FROM_DATE",
    "TO_DATE",
    "POINT_LAT",
    "POINT_LON",
    "aspect",
    "slope",
    "YEAR",
    "MEASUREMENT_ID",
    "BALANCE_CODE",
    "POINT_ID",
]
data = data[selected_columns]

In [15]:
# Keep only the rows whose balance code is 'BA' (the annual records, going by the variable name)
is_annual = data['BALANCE_CODE'] == 'BA'
data_annual = data.loc[is_annual]

In [25]:
# Work with the first 100 annual records only (positional slice)
data_annual_crop = data_annual.iloc[:100]

In [26]:
# Wrap the cropped annual measurements in a MassBalanceMachine Dataset for the Norway region
dataset_annual = mbm.Dataset(
    data=data_annual_crop,
    region_name='Norway',
    data_path='C:/Users/kasj/MassBalanceMachine/regions/Norway/data/',
)

In [27]:
# Short names of the climate and topographical variables of interest
# (not consumed in this cell; the monthly-conversion cell passes them on)
vois_climate = ['t2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str']
voi_topographical = ['aspect', 'slope']

# netCDF inputs: the ERA5Land climate file and the geopotential file
geopotential_data = 'C:/Users/kasj/MassBalanceMachine/regions/Norway/data/geo.nc'
era5_climate_data = 'C:/Users/kasj/MassBalanceMachine/regions/Norway/data/data.nc'

# Attach the climate features from the netCDF files to each stake measurement in the dataset
dataset_annual.get_climate_features(
    climate_data=era5_climate_data,
    geopotential_data=geopotential_data,
)

In [28]:
# Inspect the dataset after the climate features have been matched
dataset_annual.data

Unnamed: 0,RGIId,POINT_ELEVATION,POINT_BALANCE,FROM_DATE,TO_DATE,POINT_LAT,POINT_LON,aspect,slope,YEAR,...,tp_feb,tp_mar,tp_apr,tp_may,tp_jun,tp_jul,tp_aug,tp_sep,ALTITUDE_CLIMATE,ELEVATION_DIFFERENCE
0,RGI60-08.01258,471.0,-4.92,20120925,20131107,70.125889,21.777016,0.629597,0.278773,2013,...,0.005116,0.002688,0.002334,0.001159,0.003054,0.003977,0.004691,0.001449,341.344485,-129.655515
1,RGI60-08.01258,439.0,-2.67,20131107,20140924,70.125784,21.776300,0.585924,0.296590,2014,...,0.001831,0.004597,0.005634,0.002244,0.003412,0.000891,0.002129,0.004860,341.344485,-97.655515
2,RGI60-08.01258,444.0,-3.41,20140924,20150923,70.125732,21.775903,0.663261,0.302508,2015,...,0.004949,0.003286,0.003739,0.004709,0.005111,0.001253,0.002506,0.003256,341.344485,-102.655515
3,RGI60-08.01258,444.0,-3.59,20150923,20160922,70.125732,21.775903,0.663261,0.302508,2016,...,0.002495,0.002263,0.001256,0.002129,0.003969,0.003559,0.002119,0.002851,341.344485,-102.655515
4,RGI60-08.01258,468.0,-2.46,20160922,20170929,70.125536,21.773523,0.836757,0.309196,2017,...,0.003314,0.004417,0.002704,0.003991,0.001780,0.004257,0.005193,0.000710,341.344485,-126.655515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,RGI60-08.01258,728.0,-0.45,20021012,20031002,70.128473,21.738492,1.521716,0.084739,2003,...,0.004442,0.007023,0.001871,0.002366,0.002139,0.002598,0.002842,0.005029,306.927436,-421.072564
93,RGI60-08.01258,728.0,-1.17,20031002,20041004,70.128292,21.738750,1.377000,0.091205,2004,...,0.006573,0.001889,0.001142,0.001552,0.004369,0.001684,0.003808,0.006340,306.927436,-421.072564
94,RGI60-08.01258,727.0,-1.85,20041004,20051026,70.128515,21.739153,1.521716,0.084739,2005,...,0.002746,0.003652,0.003084,0.003837,0.003756,0.002763,0.004584,0.005162,306.927436,-420.072564
95,RGI60-08.01258,884.0,-1.26,20011007,20021012,70.125338,21.717716,2.671492,0.050478,2002,...,0.003433,0.003766,0.001926,0.002618,0.001111,0.002805,0.004974,0.005950,306.927436,-577.072564


In [29]:
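# NOTE(review): disabled scratch code, kept for reference. It would parse the "YYYYMMDD"
# date strings into datetimes; delete this cell once it is confirmed to be unneeded.
#dataset_annual.data["FROM_DATE"] = pd.to_datetime(dataset_annual.data["FROM_DATE"], format="%Y%m%d")
#dataset_annual.data["TO_DATE"] = pd.to_datetime(dataset_annual.data["TO_DATE"], format="%Y%m%d")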


In [30]:
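# NOTE(review): disabled scratch code, kept for reference. It would build, per row, the list
# of lowercase month abbreviations between FROM_DATE and TO_DATE. The frame produced by
# `convert_to_monthly` further down has a MONTHS column, so this is presumably redundant;
# confirm and delete this cell.
#dataset_annual.data["MONTHS"] = dataset_annual.data.apply(
#        lambda row: pd.date_range(start=row["FROM_DATE"], end=row["TO_DATE"], freq="MS")
#        .strftime("%b")
#        .str.lower()
#        .tolist(),
#        axis=1,
#    )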

In [31]:
# Variables of interest to carry into the monthly records
# (same values as the lists declared in the climate-feature cell)
vois_climate = ['t2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str']
voi_topographical = ['aspect', 'slope']

# Convert every record to a monthly time resolution
dataset_annual.convert_to_monthly(
    vois_topographical=voi_topographical,
    vois_climate=vois_climate,
)

In [32]:
# Inspect the dataset after the conversion to monthly resolution
dataset_annual.data

Unnamed: 0,YEAR,POINT_LON,POINT_LAT,POINT_BALANCE,ALTITUDE_CLIMATE,ELEVATION_DIFFERENCE,POINT_ELEVATION,RGIId,POINT_ID,ID,...,MONTHS,aspect,slope,t2m,tp,slhf,sshf,ssrd,fal,str
0,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,oct,0.629597,0.278773,272.721559,0.003846,-5.533076e+04,6.574301e+05,2.041005e+06,0.439743,-2.504289e+06
1,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,nov,0.629597,0.278773,268.369414,0.003902,1.782743e+05,9.093649e+05,1.577804e+05,0.817482,-1.714226e+06
2,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,dec,0.629597,0.278773,261.676842,0.000812,1.249153e+05,1.028680e+06,0.000000e+00,0.735525,-1.698121e+06
3,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,jan,0.629597,0.278773,265.982680,0.005081,1.455951e+05,8.587204e+05,3.325961e+04,0.839843,-1.628942e+06
4,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,feb,0.629597,0.278773,265.532511,0.005116,1.180220e+05,7.823245e+05,1.183961e+06,0.803068,-1.415184e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1166,2003,21.717690,70.125329,-0.90,306.927436,-577.072564,884.0,RGI60-08.01258,99,96,...,may,2.671492,0.050478,274.797046,0.002366,-2.258752e+05,3.089274e+05,1.978501e+07,0.634963,-3.552583e+06
1167,2003,21.717690,70.125329,-0.90,306.927436,-577.072564,884.0,RGI60-08.01258,99,96,...,jun,2.671492,0.050478,278.963202,0.002139,-1.995657e+06,-6.571803e+05,2.075481e+07,0.432191,-3.890606e+06
1168,2003,21.717690,70.125329,-0.90,306.927436,-577.072564,884.0,RGI60-08.01258,99,96,...,jul,2.671492,0.050478,285.889589,0.002598,-5.675638e+06,-2.813862e+06,1.697984e+07,0.131847,-4.932677e+06
1169,2003,21.717690,70.125329,-0.90,306.927436,-577.072564,884.0,RGI60-08.01258,99,96,...,aug,2.671492,0.050478,282.771045,0.002842,-3.644678e+06,-1.758054e+06,1.123607e+07,0.135796,-4.175007e+06


In [35]:
# Display the cropped annual input frame that was passed to mbm.Dataset
data_annual_crop

Unnamed: 0,RGIId,POINT_ELEVATION,POINT_BALANCE,FROM_DATE,TO_DATE,POINT_LAT,POINT_LON,aspect,slope,YEAR,MEASUREMENT_ID,BALANCE_CODE,POINT_ID
0,RGI60-08.01258,471.0,-4.92,20120925,20131107,70.125889,21.777016,0.629597,0.278773,2013,0,BA,0
1,RGI60-08.01258,439.0,-2.67,20131107,20140924,70.125784,21.776300,0.585924,0.296590,2014,1,BA,1
2,RGI60-08.01258,444.0,-3.41,20140924,20150923,70.125732,21.775903,0.663261,0.302508,2015,2,BA,2
3,RGI60-08.01258,444.0,-3.59,20150923,20160922,70.125732,21.775903,0.663261,0.302508,2016,3,BA,3
4,RGI60-08.01258,468.0,-2.46,20160922,20170929,70.125536,21.773523,0.836757,0.309196,2017,4,BA,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,RGI60-08.01258,728.0,-0.45,20021012,20031002,70.128473,21.738492,1.521716,0.084739,2003,96,BA,95
96,RGI60-08.01258,728.0,-1.17,20031002,20041004,70.128292,21.738750,1.377000,0.091205,2004,97,BA,96
97,RGI60-08.01258,727.0,-1.85,20041004,20051026,70.128515,21.739153,1.521716,0.084739,2005,98,BA,97
98,RGI60-08.01258,884.0,-1.26,20011007,20021012,70.125338,21.717716,2.671492,0.050478,2002,99,BA,98


In [38]:
# Show the first 20 rows of the monthly dataset
dataset_annual.data.head(20)

Unnamed: 0,YEAR,POINT_LON,POINT_LAT,POINT_BALANCE,ALTITUDE_CLIMATE,ELEVATION_DIFFERENCE,POINT_ELEVATION,RGIId,POINT_ID,ID,...,MONTHS,aspect,slope,t2m,tp,slhf,sshf,ssrd,fal,str
0,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,oct,0.629597,0.278773,272.721559,0.003846,-55330.76,657430.1,2041005.0,0.439743,-2504289.0
1,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,nov,0.629597,0.278773,268.369414,0.003902,178274.3,909364.9,157780.4,0.817482,-1714226.0
2,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,dec,0.629597,0.278773,261.676842,0.000812,124915.3,1028680.0,0.0,0.735525,-1698121.0
3,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,jan,0.629597,0.278773,265.98268,0.005081,145595.1,858720.4,33259.61,0.839843,-1628942.0
4,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,feb,0.629597,0.278773,265.532511,0.005116,118022.0,782324.5,1183961.0,0.803068,-1415184.0
5,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,mar,0.629597,0.278773,262.800368,0.002688,82279.19,876317.2,6495359.0,0.842935,-2615744.0
6,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,apr,0.629597,0.278773,269.858086,0.002334,-156687.3,487899.9,14428180.0,0.76715,-3465103.0
7,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,may,0.629597,0.278773,277.67798,0.001159,-237874.6,428671.6,20979520.0,0.580414,-3367192.0
8,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,jun,0.629597,0.278773,283.33964,0.003054,-4024574.0,-2133166.0,17728590.0,0.198752,-3994740.0
9,2013,21.777016,70.125889,-4.92,341.344485,-129.655515,471.0,RGI60-08.01258,0,0,...,jul,0.629597,0.278773,283.829285,0.003977,-5184939.0,-2731458.0,15538730.0,0.131911,-4661453.0
