# Pre-processing of GLAMOS point mass balance (MB) data:

## Setting up:

In [1]:
import pandas as pd
import os
import warnings
from tqdm.notebook import tqdm
import re
import massbalancemachine as mbm
import geopandas as gpd
from shapely.geometry import Polygon, LineString, Point

from scripts.helpers import *
from scripts.glamos_preprocess import *

warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

In [2]:
# Raw GLAMOS point mass balance files, with one sub-folder per period.
path_PMB_GLAMOS_raw = '../../../data/GLAMOS/point/raw/'
path_PMB_GLAMOS_w_raw, path_PMB_GLAMOS_a_raw = (
    path_PMB_GLAMOS_raw + period_dir for period_dir in ('winter/', 'annual/'))

# Destination of the converted .csv files, using the same sub-folder layout.
path_PMB_GLAMOS_csv = '../../../data/GLAMOS/point/csv/'
path_PMB_GLAMOS_csv_w, path_PMB_GLAMOS_csv_a = (
    path_PMB_GLAMOS_csv + period_dir for period_dir in ('winter/', 'annual/'))


## Transform .dat files to .csv:

In [3]:
# Collect the names of the raw point mass balance files (annual and winter).
# Only regular files are kept; sub-directories are skipped.
glamosfiles_mb_a = [
    entry for entry in os.listdir(path_PMB_GLAMOS_a_raw)
    if os.path.isfile(os.path.join(path_PMB_GLAMOS_a_raw, entry))
]
glamosfiles_mb_w = [
    entry for entry in os.listdir(path_PMB_GLAMOS_w_raw)
    if os.path.isfile(os.path.join(path_PMB_GLAMOS_w_raw, entry))
]

print('Examples of index stake raw files:\n', glamosfiles_mb_a[:5])

# Convert every raw .dat file to .csv.
# Set RUN to False to keep the .csv files that are already on disk.
RUN = True
if RUN:
    # Both output folders are emptied first (project helper `emptyfolder`).
    emptyfolder(path_PMB_GLAMOS_csv_a)
    emptyfolder(path_PMB_GLAMOS_csv_w)

    conversions = [
        (glamosfiles_mb_a, path_PMB_GLAMOS_a_raw, path_PMB_GLAMOS_csv_a),
        (glamosfiles_mb_w, path_PMB_GLAMOS_w_raw, path_PMB_GLAMOS_csv_w),
    ]
    for raw_files, raw_dir, csv_dir in conversions:
        for file in raw_files:
            # processDatFile receives the file name without its extension.
            # os.path.splitext strips only the real extension, whereas the
            # regex '.dat' (unescaped dot) also matched e.g. 'xdat' inside
            # a glacier name.
            fileName = os.path.splitext(file)[0]
            processDatFile(fileName, raw_dir, csv_dir)

# Split the combined Clariden files into one file per stake name ('L' and
# 'U'); the original note calls these "clariden II and III".
# This only runs when the .csv files were just regenerated (RUN is True):
# the combined files are deleted at the end, so executing this step again
# without regenerating them would fail on the missing input.
if RUN:
    clariden_dirs = {
        'annual': path_PMB_GLAMOS_csv_a,
        'winter': path_PMB_GLAMOS_csv_w,
    }
    for season, csv_dir in clariden_dirs.items():
        clariden_csv = pd.read_csv(csv_dir + 'clariden_' + season + '.csv',
                                   sep=',',
                                   header=0,
                                   encoding='latin-1')
        for stake in ('L', 'U'):
            # e.g. 'claridenL_annual.csv'
            clariden_csv[clariden_csv['# name'] == stake].to_csv(
                csv_dir + 'clariden' + stake + '_' + season + '.csv',
                index=False)

    # Delete the combined files only once all split files have been written.
    for season, csv_dir in clariden_dirs.items():
        os.remove(csv_dir + 'clariden_' + season + '.csv')

# Example: preview the first two rows of one converted annual file.
fileName = 'aletsch_annual.csv'
csv_read_options = dict(sep=',', header=0, encoding='latin-1')
aletsch_csv = pd.read_csv(path_PMB_GLAMOS_csv_a + fileName, **csv_read_options)
aletsch_csv.head(2)

Examples of index stake raw files:
 ['tortin_annual.dat', 'forno_annual.dat', 'rosatsch_annual.dat', 'petitplanneve_annual.dat', 'corvatsch_annual.dat']


Unnamed: 0,# name,date0,time0,date1,time1,period,date_quality,x_pos,y_pos,z_pos,...,density,density_quality,mb_we,measurement_quality,measurement_type,mb_error,reading_error,density_error,error_evaluation_method,source
0,NMF,19120909,1200,19130920,1200,376.0,0,647166.0,150081.0,2850.0,...,700,4,-1120,2,6,1094,1085,134,0,glrep
1,NMG1,19120924,1200,19130920,1200,361.0,0,647089.0,150780.0,2802.0,...,539,6,592,4,6,873,870,71,0,glrep


## Assemble into one table:

In [4]:
# Load the table of glacier ids, strip stray whitespace from the column
# names, and index it by the glacier short name (sorted alphabetically).
path_rgi = '../../../data/GLAMOS/CH_glacier_ids_long.csv'
rgi_df = (pd.read_csv(path_rgi, sep=',')
          .rename(columns=str.strip)
          .sort_values(by='short_name')
          .set_index('short_name'))
rgi_df.head(2)

Unnamed: 0_level_0,full_name,sgi-id,rgi_id.v7,Issue
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
adler,Adler,B56/03,RGI2000-v7.0-G-11-01075,False
albigna,Albigna,,RGI2000-v7.0-G-11-02309,True


In [5]:
# Assemble all annual .csv files into one table and save it.
# Set RUN to False to reuse the 'point_all.csv' that is already on disk.
RUN = True
if RUN:
    # Annual:
    # NOTE(review): only the annual folder is read here, so the table holds
    # no 'winter' rows; the progress-bar label still says 'Summer stakes'.
    annual_frames = []
    for file in tqdm(os.listdir(path_PMB_GLAMOS_csv_a), desc='Summer stakes'):
        # The glacier name is the part of the file name before the first '_'
        # (e.g. 'aletsch_annual.csv' -> 'aletsch'). os.path.splitext removes
        # the extension literally instead of via the regex '.csv', whose
        # unescaped dot matched any character.
        fileName = os.path.splitext(file)[0]
        glacierName = fileName.split('_')[0]
        df = pd.read_csv(path_PMB_GLAMOS_csv_a + file,
                         sep=',',
                         header=0,
                         encoding='latin-1')
        df['glacier'] = glacierName
        # Overwrites the 'period' column read from the file with a label.
        df['period'] = 'annual'

        # Correct years and add hydrol. year:
        df_processed = transformDates(df)

        # Remove obvious duplicates:
        df_processed = df_processed.drop_duplicates()

        # Transform to lat/lon system
        df_processed = LV03toWGS84(df_processed)

        annual_frames.append(df_processed)

    # Concatenate once at the end: growing a DataFrame with pd.concat inside
    # the loop copies all previously collected rows on every iteration.
    df_all_raw = pd.concat(annual_frames)

    # Get the year:
    df_all_raw['YEAR'] = df_all_raw['date1'].apply(
        lambda x: pd.to_datetime(x).year)

    # Save the coordinates of all stakes:
    df_all_raw[['glacier', '# name', 'lat', 'lon',
                'period']].to_csv(path_PMB_GLAMOS_csv + 'coordinates_all.csv')

    # Save all stakes:
    df_all_raw.to_csv(path_PMB_GLAMOS_csv + 'point_all.csv')

# Read the assembled table back from disk; the saved index arrives as the
# 'Unnamed: 0' column and is dropped.
df_all_raw = pd.read_csv(path_PMB_GLAMOS_csv + 'point_all.csv',
                         sep=',',
                         header=0,
                         encoding='latin-1').drop(columns='Unnamed: 0')

# Reshape to WGMS format:
# column order of the reshaped table (still using the raw column names)
wgms_column_order = [
    'YEAR', '# name', 'glacier', 'date0', 'date1', 'lat', 'lon', 'height',
    'mb_we', 'period', 'date_fix0', 'date_fix1', 'time0', 'time1',
    'date_quality', 'position_quality', 'mb_raw', 'density', 'density_quality',
    'measurement_quality', 'measurement_type', 'mb_error', 'reading_error',
    'density_error', 'error_evaluation_method', 'source'
]
# raw column name -> WGMS column name (each key listed exactly once)
wgms_column_names = {
    '# name': 'POINT_ID',
    'lat': 'POINT_LAT',
    'lon': 'POINT_LON',
    'height': 'POINT_ELEVATION',
    'date0': 'FROM_DATE',
    'date1': 'TO_DATE',
    'mb_we': 'POINT_BALANCE',
    'glacier': 'GLACIER',
    'period': 'PERIOD'
}
# Select, rename and de-duplicate in one chain, so that no in-place rename is
# applied to a column subset of the original frame.
df_all_raw = (df_all_raw[wgms_column_order]
              .rename(columns=wgms_column_names)
              .drop_duplicates())

# Report the number of samples, in total and per measurement period.
n_winter_samples = int((df_all_raw['PERIOD'] == 'winter').sum())
n_annual_samples = int((df_all_raw['PERIOD'] == 'annual').sum())
print('Number of winter and annual samples:', len(df_all_raw))
print('Number of winter samples:', n_winter_samples)
print('Number of annual samples:', n_annual_samples)

df_all_raw.head(2)

Summer stakes:   0%|          | 0/57 [00:00<?, ?it/s]

Number of winter and annual samples: 10688
Number of winter samples: 0
Number of annual samples: 10688


Unnamed: 0,YEAR,POINT_ID,GLACIER,FROM_DATE,TO_DATE,POINT_LAT,POINT_LON,POINT_ELEVATION,POINT_BALANCE,PERIOD,...,mb_raw,density,density_quality,measurement_quality,measurement_type,mb_error,reading_error,density_error,error_evaluation_method,source
0,2003,1,oberaar,2002-10-06,2003-10-11,46.538806,8.233237,2389.812633,-6174,annual,...,-686,900,1,1,1,102,45,92,0,hm
1,2003,2,oberaar,2002-10-06,2003-10-11,46.536611,8.225514,2499.825727,-5310,annual,...,-590,900,1,1,1,91,45,79,0,hm


## Add RGIs:

In [6]:
# Restrict the table to the columns used in the following steps.
pmb_columns = [
    'YEAR', 'POINT_ID', 'GLACIER', 'FROM_DATE', 'TO_DATE',
    'POINT_LAT', 'POINT_LON', 'POINT_ELEVATION', 'POINT_BALANCE', 'PERIOD',
]
df_pmb = df_all_raw[pmb_columns]
df_pmb.head(3)

Unnamed: 0,YEAR,POINT_ID,GLACIER,FROM_DATE,TO_DATE,POINT_LAT,POINT_LON,POINT_ELEVATION,POINT_BALANCE,PERIOD
0,2003,1,oberaar,2002-10-06,2003-10-11,46.538806,8.233237,2389.812633,-6174,annual
1,2003,2,oberaar,2002-10-06,2003-10-11,46.536611,8.225514,2499.825727,-5310,annual
2,2003,3,oberaar,2002-10-06,2003-10-11,46.532136,8.207734,2679.854419,-4320,annual


In [7]:
# Add RGIs:
# Specify the shape filename of the glaciers outline obtained from RGIv6
glacier_outline_fname = '../../../data/GLAMOS/nsidc0770_11.rgi60.CentralEurope/11_rgi60_CentralEurope.shp'

# Load the glacier outlines
glacier_outline = gpd.read_file(glacier_outline_fname)

# Add RGI IDs through intersection with shapefiles:
df_pmb = mbm.utils.get_rgi(data=df_pmb, glacier_outlines=glacier_outline)

# Points whose RGIId is still NaN after the intersection (1325 in the
# recorded run) get the RGI ID of the closest outline polygon instead.
no_match_df = df_pmb[df_pmb.RGIId.isna()]
geometry = [
    Point(lon, lat)
    for lon, lat in zip(no_match_df["POINT_LON"], no_match_df["POINT_LAT"])
]
points_gdf = gpd.GeoDataFrame(no_match_df,
                              geometry=geometry,
                              crs=glacier_outline.crs)
# NOTE(review): distances are computed in the outlines' CRS; if that CRS is
# geographic (degrees), "closest" is only approximate - confirm.
# `geometry` is in the same row order as `no_match_df`, so the points are
# paired with their index labels directly instead of looked up one by one.
for index, point in tqdm(zip(no_match_df.index, geometry),
                         total=len(geometry)):
    # idxmin finds the nearest polygon without sorting all distances.
    polygon_index = glacier_outline.distance(point).idxmin()
    df_pmb.at[index, 'RGIId'] = glacier_outline.at[polygon_index, 'RGIId']

  0%|          | 0/1325 [00:00<?, ?it/s]

In [8]:
# List the distinct RGI IDs that were assigned to each glacier.
glacier_rgi_pairs = df_pmb[['GLACIER', 'RGIId']]
rgiids6 = glacier_rgi_pairs.sort_values(by='GLACIER').drop_duplicates()
rgis = {
    name: list(rgiids6.loc[rgiids6['GLACIER'] == name, 'RGIId'])
    for name in rgiids6['GLACIER'].unique()
}
rgis

{'adler': ['RGI60-11.02764'],
 'albigna': ['RGI60-11.02299', 'RGI60-11.02285', 'RGI60-11.02282'],
 'aletsch': ['RGI60-11.01450'],
 'allalin': ['RGI60-11.02704'],
 'arolla': ['RGI60-11.02810'],
 'basodino': ['RGI60-11.01987'],
 'bertol': ['RGI60-11.02779'],
 'blauschnee': ['RGI60-11.00638'],
 'cantun': ['RGI60-11.02268'],
 'chessjen': ['RGI60-11.02674'],
 'claridenL': ['RGI60-11.00817'],
 'claridenU': ['RGI60-11.00843'],
 'corbassiere': ['RGI60-11.02766'],
 'corvatsch': ['RGI60-11.01962'],
 'damma': ['RGI60-11.01246'],
 'diablerets': ['RGI60-11.02261'],
 'diavolezza': ['RGI60-11.02013'],
 'err': ['RGI60-11.01516', 'RGI60-11.01549'],
 'findelen': ['RGI60-11.02773'],
 'forno': ['RGI60-11.02245'],
 'gietro': ['RGI60-11.02774'],
 'gorner': ['RGI60-11.02822'],
 'gries': ['RGI60-11.01876', 'RGI60-11.02441'],
 'gurschen': ['RGI60-11.01344'],
 'hohlaub': ['RGI60-11.02679'],
 'joeri': ['RGI60-11.01063'],
 'limmern': ['RGI60-11.00918', 'RGI60-11.00915'],
 'misaun': ['RGI60-11.01945'],
 'morterats

In [9]:
# Manual pre-processing and removal of errors.
# Work on a copy so that df_pmb itself stays untouched.
df_pmb_clean = df_pmb.copy()

# Silvretta: weird outlier coordinate
silvretta_outlier = ((df_pmb_clean.GLACIER == 'silvretta')
                     & (df_pmb_clean.POINT_LAT > 46.9))
df_pmb_clean = df_pmb_clean.drop(index=df_pmb_clean.index[silvretta_outlier])

# For these glaciers, keep only the stakes assigned to the listed RGI ID and
# drop every other stake of that glacier:
expected_rgi = {
    'silvretta': 'RGI60-11.00804',  # one stake is on the neighbouring glacier
    'albigna': 'RGI60-11.02285',  # stakes on two neighbouring glaciers
    'err': 'RGI60-11.01516',  # stakes on neighbouring glacier
    'gries': 'RGI60-11.01876',  # weird outlier coordinate
    'limmern': 'RGI60-11.00918',  # three stakes on neighbouring glacier
}
for glacier, rgi_id in expected_rgi.items():
    off_glacier = ((df_pmb_clean.GLACIER == glacier)
                   & (df_pmb_clean.RGIId != rgi_id))
    df_pmb_clean = df_pmb_clean.drop(index=df_pmb_clean.index[off_glacier])

# Ofental: on no RGI v6 outline, so all of its stakes are removed.
# The copy keeps the assignments below from writing into a filtered slice.
df_pmb_clean = df_pmb_clean[df_pmb_clean.GLACIER != 'ofental'].copy()

# Orny and Plattalva: overwrite the assigned RGI ID with the correct one.
corrected_rgi = {
    'orny': 'RGI60-11.02775',
    'plattalva': 'RGI60-11.00892',
}
for glacier, rgi_id in corrected_rgi.items():
    df_pmb_clean.loc[df_pmb_clean.GLACIER == glacier, 'RGIId'] = rgi_id
    
# List the distinct RGI IDs per glacier in the cleaned table.
glacier_rgi_pairs = df_pmb_clean[['GLACIER', 'RGIId']]
rgiids6 = glacier_rgi_pairs.sort_values(by='GLACIER').drop_duplicates()
rgis = {
    name: list(rgiids6.loc[rgiids6['GLACIER'] == name, 'RGIId'])
    for name in rgiids6['GLACIER'].unique()
}
rgis

{'adler': ['RGI60-11.02764'],
 'albigna': ['RGI60-11.02285'],
 'aletsch': ['RGI60-11.01450'],
 'allalin': ['RGI60-11.02704'],
 'arolla': ['RGI60-11.02810'],
 'basodino': ['RGI60-11.01987'],
 'bertol': ['RGI60-11.02779'],
 'blauschnee': ['RGI60-11.00638'],
 'cantun': ['RGI60-11.02268'],
 'chessjen': ['RGI60-11.02674'],
 'claridenL': ['RGI60-11.00817'],
 'claridenU': ['RGI60-11.00843'],
 'corbassiere': ['RGI60-11.02766'],
 'corvatsch': ['RGI60-11.01962'],
 'damma': ['RGI60-11.01246'],
 'diablerets': ['RGI60-11.02261'],
 'diavolezza': ['RGI60-11.02013'],
 'err': ['RGI60-11.01516'],
 'findelen': ['RGI60-11.02773'],
 'forno': ['RGI60-11.02245'],
 'gietro': ['RGI60-11.02774'],
 'gorner': ['RGI60-11.02822'],
 'gries': ['RGI60-11.01876'],
 'gurschen': ['RGI60-11.01344'],
 'hohlaub': ['RGI60-11.02679'],
 'joeri': ['RGI60-11.01063'],
 'limmern': ['RGI60-11.00918'],
 'misaun': ['RGI60-11.01945'],
 'morteratsch': ['RGI60-11.01946'],
 'murtel': ['RGI60-11.02024'],
 'oberaar': ['RGI60-11.01509'],
 '

In [10]:
# Write the cleaned stake table to disk, without the index column.
wgms_csv_path = path_PMB_GLAMOS_csv + 'CH_wgms_dataset.csv'
df_pmb_clean.to_csv(wgms_csv_path, index=False)

## Add topographical features from OGGM:

In [11]:
# Build the MassBalanceMachine dataset from the cleaned stake table.
# df_pmb_clean is used instead of df_pmb so that the manual corrections
# (removed stakes, corrected RGI IDs) are carried into the dataset; it is the
# same table that was saved as 'CH_wgms_dataset.csv'.
dataset = mbm.Dataset(data=df_pmb_clean,
                      region_name='CH',
                      data_path=path_PMB_GLAMOS_csv)
dataset

<Dataset.Dataset at 0x7f5dc0760160>

In [12]:
# Topographical variables to attach to each stake measurement.
# The OGGM documentation lists the available variables (e.g. 'topo',
# 'slope_factor', 'dis_from_border'):
# https://oggm.org/tutorials/stable/notebooks/10minutes/machine_learning.html
voi_topographical = [
    'aspect',
    'slope',
]

# Retrieve these features for every stake and add them to the dataset.
dataset.get_topo_features(vois=voi_topographical)

2024-07-31 09:46:43: oggm.cfg: Reading default parameters from the OGGM `params.cfg` configuration file.
2024-07-31 09:46:43: oggm.cfg: Multiprocessing switched OFF according to the parameter file.
2024-07-31 09:46:43: oggm.cfg: Multiprocessing: using all available processors (N=32)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

#