# Pre-processing of GLAMOS point mass balance (MB) data:

## Setting up:

In [1]:
import pandas as pd
import os
import warnings
from tqdm.notebook import tqdm
import re

from scripts.helpers import *
from scripts.glamos_preprocess import *

warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

In [2]:
# Raw GLAMOS point mass balance files (.dat), one sub-folder per period:
path_PMB_GLAMOS_raw = '../../../data/GLAMOS/point/raw/'
path_PMB_GLAMOS_w_raw = f'{path_PMB_GLAMOS_raw}winter/'
path_PMB_GLAMOS_a_raw = f'{path_PMB_GLAMOS_raw}annual/'

# Converted .csv files, same winter/annual layout as the raw folder:
path_PMB_GLAMOS_csv = '../../../data/GLAMOS/point/csv/'
path_PMB_GLAMOS_csv_w = f'{path_PMB_GLAMOS_csv}winter/'
path_PMB_GLAMOS_csv_a = f'{path_PMB_GLAMOS_csv}annual/'


## Transform .dat files to .csv:

In [3]:
# Get all files with pmb (for winter and annual mb).
# Only regular files are kept; sub-directories of the raw folders are ignored.
glamosfiles_mb_a = [
    file for file in os.listdir(path_PMB_GLAMOS_a_raw)
    if os.path.isfile(os.path.join(path_PMB_GLAMOS_a_raw, file))
]
glamosfiles_mb_w = [
    file for file in os.listdir(path_PMB_GLAMOS_w_raw)
    if os.path.isfile(os.path.join(path_PMB_GLAMOS_w_raw, file))
]

print('Examples of index stake raw files:\n', glamosfiles_mb_a[:5])

# Transform all files to csv.
# Everything that rewrites the csv folders lives under this flag, so the cell
# can be re-run with RUN = False without touching (or requiring) files that a
# previous run already replaced.
RUN = True
if RUN:
    emptyfolder(path_PMB_GLAMOS_csv_a)
    emptyfolder(path_PMB_GLAMOS_csv_w)

    # (raw files, raw folder, csv folder) for the annual and winter periods:
    conversions = [
        (glamosfiles_mb_a, path_PMB_GLAMOS_a_raw, path_PMB_GLAMOS_csv_a),
        (glamosfiles_mb_w, path_PMB_GLAMOS_w_raw, path_PMB_GLAMOS_csv_w),
    ]
    for raw_files, path_raw, path_csv in conversions:
        for file in raw_files:
            # os.path.splitext strips only the real extension. The previous
            # re.split('.dat', ...) treated '.' as "any character" and would
            # cut the name at the first '<char>dat' it found.
            fileName, extension = os.path.splitext(file)
            if extension != '.dat':
                # Stray non-.dat files (e.g. OS metadata) are not stake data.
                continue
            processDatFile(fileName, path_raw, path_csv)

    # Split the combined clariden files by the '# name' column into its 'L'
    # and 'U' stakes (written as claridenL_* / claridenU_*), then delete the
    # combined file so only the split files remain in the csv folders.
    clariden_csv_a = pd.read_csv(path_PMB_GLAMOS_csv_a + 'clariden_annual.csv',
                                 sep=',',
                                 header=0,
                                 encoding='latin-1')
    clariden_csv_w = pd.read_csv(path_PMB_GLAMOS_csv_w + 'clariden_winter.csv',
                                 sep=',',
                                 header=0,
                                 encoding='latin-1')
    clariden_splits = [
        (clariden_csv_a, path_PMB_GLAMOS_csv_a, 'annual'),
        (clariden_csv_w, path_PMB_GLAMOS_csv_w, 'winter'),
    ]
    for clariden_csv, path_csv, period_label in clariden_splits:
        for stake_name in ('L', 'U'):
            clariden_csv[clariden_csv['# name'] == stake_name].to_csv(
                path_csv + 'clariden' + stake_name + '_' + period_label +
                '.csv',
                index=False)
        os.remove(path_csv + 'clariden_' + period_label + '.csv')

# Example:
fileName = 'aletsch_annual.csv'
aletsch_csv = pd.read_csv(path_PMB_GLAMOS_csv_a + fileName,
                          sep=',',
                          header=0,
                          encoding='latin-1')
aletsch_csv.head(2)

Examples of index stake raw files:
 ['tortin_annual.dat', 'forno_annual.dat', 'rosatsch_annual.dat', 'petitplanneve_annual.dat', 'corvatsch_annual.dat']


Unnamed: 0,# name,date0,time0,date1,time1,period,date_quality,x_pos,y_pos,z_pos,...,density,density_quality,mb_we,measurement_quality,measurement_type,mb_error,reading_error,density_error,error_evaluation_method,source
0,NMF,19120909,1200,19130920,1200,376.0,0,647166.0,150081.0,2850.0,...,700,4,-1120,2,6,1094,1085,134,0,glrep
1,NMG1,19120924,1200,19130920,1200,361.0,0,647089.0,150780.0,2802.0,...,539,6,592,4,6,873,870,71,0,glrep


## Assemble into one table:

In [4]:
# RGI Ids:
# Load the glacier id lookup table, strip stray whitespace from the column
# names, then sort by and index on the glacier short name.
path_rgi = '../../../data/GLAMOS/CH_glacier_ids_long.csv'
rgi_df = (pd.read_csv(path_rgi, sep=',')
          .rename(columns=lambda col: col.strip())
          .sort_values(by='short_name')
          .set_index('short_name'))
rgi_df.head(2)

Unnamed: 0_level_0,full_name,sgi-id,rgi_id.v7,Issue
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
adler,Adler,B56/03,RGI2000-v7.0-G-11-01075,False
albigna,Albigna,,RGI2000-v7.0-G-11-02309,True


In [48]:
# Assemble all into one csv file:
# With RUN = True the per-glacier csv files are processed and written to
# 'point_all.csv'; with RUN = False the previously written file is reused.
RUN = False
if RUN:
    # One entry per measurement period:
    # (csv folder, value stored in the 'period' column, progress-bar label).
    # The annual bar was previously labelled 'Summer stakes'.
    period_sources = [
        (path_PMB_GLAMOS_csv_a, 'annual', 'Annual stakes'),
        (path_PMB_GLAMOS_csv_w, 'winter', 'Winter stakes'),
    ]

    # Collect the processed per-glacier frames and concatenate them once,
    # instead of re-copying the growing table on every iteration.
    processed_frames = []
    for path_csv, period_label, bar_label in period_sources:
        for file in tqdm(os.listdir(path_csv), desc=bar_label):
            # File names look like '<glacier>_<period>.csv'. os.path.splitext
            # strips only the extension (re.split('.csv', ...) treated the
            # dot as a wildcard).
            fileName = os.path.splitext(file)[0]
            glacierName = fileName.split('_')[0]
            df = pd.read_csv(path_csv + file,
                             sep=',',
                             header=0,
                             encoding='latin-1')
            df['glacier'] = glacierName
            # NOTE: the csv files already carry a numeric 'period' column;
            # it is overwritten here with the period label.
            df['period'] = period_label

            # Correct years and add hydrol. year:
            df_processed = transformDates(df)

            # Remove obvious duplicates:
            df_processed = df_processed.drop_duplicates()

            # Transform to lat/lon system
            df_processed = LV03toWGS84(df_processed)

            # Add rgi id and known-issue flag (scalars broadcast to all rows):
            rgi_id = rgi_df.loc[glacierName, 'rgi_id.v7']
            known_issue = rgi_df.loc[glacierName, 'Issue']
            df_processed['rgi_id'] = rgi_id
            df_processed['known_issue'] = known_issue

            processed_frames.append(df_processed)

    df_all_raw = pd.concat(processed_frames)

    # Get the year:
    df_all_raw['YEAR'] = df_all_raw['date1'].apply(
        lambda x: pd.to_datetime(x).year)

    # download all stakes coordinates:
    df_all_raw[['glacier', '# name', 'lat', 'lon',
                'period']].to_csv(path_PMB_GLAMOS_csv + 'coordinates_all.csv')

    # Save all stakes:
    df_all_raw.to_csv(path_PMB_GLAMOS_csv + 'point_all.csv')

# Always reload from disk so the steps below start from the same state,
# whether or not the table was rebuilt above. 'Unnamed: 0' is the index
# column written by to_csv.
df_all_raw = pd.read_csv(path_PMB_GLAMOS_csv + 'point_all.csv',
                         sep=',',
                         header=0,
                         encoding='latin-1').drop(columns='Unnamed: 0')

# Reshape to WGMS format:
# re order columns, then rename the core ones to upper-case WGMS-style names:
df_all_raw = df_all_raw[[
    'YEAR', '# name', 'glacier', 'rgi_id', 'date0', 'date1', 'lat', 'lon',
    'height', 'mb_we', 'period', 'known_issue', 'date_fix0', 'date_fix1',
    'time0', 'time1', 'date_quality', 'position_quality', 'mb_raw', 'density',
    'density_quality', 'measurement_quality', 'measurement_type', 'mb_error',
    'reading_error', 'density_error', 'error_evaluation_method', 'source'
]].rename(
    columns={
        '# name': 'POINT_ID',
        'lat': 'POINT_LAT',
        'lon': 'POINT_LON',
        'height': 'POINT_ELEVATION',
        'date0': 'FROM_DATE',
        'date1': 'TO_DATE',
        'mb_we': 'POINT_BALANCE',
        'glacier': 'GLACIER',
        'rgi_id': 'RGI_ID',
        'period': 'PERIOD'
    })
# remove duplicates:
df_all_raw = df_all_raw.drop_duplicates()

# remove stakes that had issues with their location and glacier outline
# (explicit comparison kept so rows with a missing flag are dropped as well):
df_all_raw = df_all_raw[df_all_raw['known_issue'] == False]

print('Number of winter and annual samples:', len(df_all_raw))
df_all_raw.head(2)

Number of winter and annual samples: 55177


Unnamed: 0,YEAR,POINT_ID,GLACIER,RGI_ID,FROM_DATE,TO_DATE,POINT_LAT,POINT_LON,POINT_ELEVATION,POINT_BALANCE,...,mb_raw,density,density_quality,measurement_quality,measurement_type,mb_error,reading_error,density_error,error_evaluation_method,source
0,2003,1,oberaar,RGI2000-v7.0-G-11-02622,2002-10-06,2003-10-11,46.538806,8.233237,2389.812633,-6174,...,-686,900,1,1,1,102,45,92,0,hm
1,2003,2,oberaar,RGI2000-v7.0-G-11-02622,2002-10-06,2003-10-11,46.536611,8.225514,2499.825727,-5310,...,-590,900,1,1,1,91,45,79,0,hm


In [52]:
# Columns retained for the final point mass balance dataset:
wgms_columns = [
    'YEAR', 'POINT_ID', 'GLACIER', 'RGI_ID',
    'FROM_DATE', 'TO_DATE',
    'POINT_LAT', 'POINT_LON', 'POINT_ELEVATION',
    'POINT_BALANCE', 'PERIOD',
]
df_pmb = df_all_raw[wgms_columns]

# Write the reduced table to disk (row index not written) and preview it:
df_pmb.to_csv(path_PMB_GLAMOS_csv + 'CH_wgms_dataset.csv', index=False)
df_pmb.head(3)

Unnamed: 0,YEAR,POINT_ID,GLACIER,RGI_ID,FROM_DATE,TO_DATE,POINT_LAT,POINT_LON,POINT_ELEVATION,POINT_BALANCE,PERIOD
0,2003,1,oberaar,RGI2000-v7.0-G-11-02622,2002-10-06,2003-10-11,46.538806,8.233237,2389.812633,-6174,annual
1,2003,2,oberaar,RGI2000-v7.0-G-11-02622,2002-10-06,2003-10-11,46.536611,8.225514,2499.825727,-5310,annual
2,2003,3,oberaar,RGI2000-v7.0-G-11-02622,2002-10-06,2003-10-11,46.532136,8.207734,2679.854419,-4320,annual


#