# Compile CSVs exported from the GlaSEE pipeline

__NOTE:__ You must do one of the following to access your CSVs:

- Upload this notebook to your Google Drive and run as a Colab notebook.

- Download the CSVs locally.

- Install Google Drive for Desktop (or other software) to mount your Drive locally.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
import os
import seaborn as sns
from tqdm import tqdm

## Define path to files

In [None]:
# If using Google Colab, mount your Drive.
# The import is guarded so the notebook also runs outside Colab (e.g. with
# locally downloaded CSVs or a locally mounted Drive).
try:
    from google.colab import drive
except ImportError:
    print('google.colab is not available: skipping Drive mount. '
          'Set the paths below to your local folders.')
else:
    drive.mount('/content/drive')

# Define the path to the Google Drive folder with the exported CSV files
out_path = '/content/drive/My Drive/glacier_snow_cover_exports/'
# Folder where the compiled per-glacier time series CSVs are written
compiled_out_path = '/content/drive/My Drive/glacier_snow_cover_exports_compiled/'

## Compile CSVs

In [None]:
# -----Option 1: enter glacier IDs manually
# May be desired if some glaciers are still exporting, etc.
# glacier_IDs = ['G219787E60289N']

# -----Option 2: Grab all the glacier IDs in the folder
# The ID is taken as the part of each CSV file name before the first
# underscore. os.path.basename is used (rather than splitting on '/') so this
# also works with Windows-style path separators.
all_files = glob(os.path.join(out_path, '*.csv'))
glacier_IDs = sorted({os.path.basename(file).split('_')[0] for file in all_files})

print('Number of unique glacier IDs:', len(glacier_IDs))
print(glacier_IDs)

In [None]:
# Compile the raw snow cover stats CSVs of each glacier into a single
# time series CSV, merging with any previously compiled CSV.

# Columns removed from the compiled time series before saving
DROP_COLUMNS = ['system:index', '.geo']

# Errors that mean "this CSV cannot be used" (e.g. empty or malformed file)
CSV_READ_ERRORS = (pd.errors.EmptyDataError, pd.errors.ParserError,
                   UnicodeDecodeError, OSError)


def _read_csv(fn):
    """Read one CSV; 'round_trip' float parsing keeps values identical after a
    save/reload cycle so that duplicate rows can be detected reliably."""
    return pd.read_csv(fn, float_precision='round_trip')


def _read_raw_csvs(files):
    """Read every raw CSV in `files`, skipping those that cannot be read."""
    dfs = []
    n_skipped = 0
    for file in tqdm(files):
        try:
            dfs.append(_read_csv(file))
        except CSV_READ_ERRORS:
            # CSVs fail opening when empty
            n_skipped += 1
    if n_skipped > 0:
        print(f'Skipped {n_skipped} empty or unreadable CSVs.')
    return dfs


def _tidy(df):
    """Drop the unneeded columns and any fully duplicated rows."""
    return df.drop(columns=DROP_COLUMNS, errors='ignore').drop_duplicates()


# Make sure the output folder exists, otherwise saving would fail
os.makedirs(compiled_out_path, exist_ok=True)

# Iterate over glacier IDs
for i, glacier_ID in enumerate(glacier_IDs):
    print('\n', i, glacier_ID)

    # Define output file name for compiled time series
    out_fn = os.path.join(compiled_out_path, glacier_ID + '_timeseries.csv')

    # Check if compiled CSV already exists and if there are raw files
    files = glob(os.path.join(out_path, glacier_ID + '*snow_cover_stats*.csv'))
    compiled_exists = os.path.exists(out_fn)

    if not files:
        if compiled_exists:
            print('Compiled CSV already exists and no new snow cover stats files found. Skipping.')
        else:
            print('No compiled CSV and no snow cover stats files found. Skipping.')
        continue

    # Load the existing compiled CSV, if any
    existing_df = None
    if compiled_exists:
        print('Compiled CSV already exists. Checking for new snow cover stats files.')
        try:
            existing_df = _tidy(_read_csv(out_fn))
            print('Existing compiled time series loaded.')
        except CSV_READ_ERRORS:
            print(f'Error reading existing compiled CSV: {out_fn}. Recompiling all files.')
    else:
        print(f'Compiled CSV does not exist. Compiling {len(files)} CSVs.')

    # Read the raw CSVs
    print(f'Found {len(files)} CSVs to potentially add')
    raw_dfs = _read_raw_csvs(files)
    if not raw_dfs:
        if existing_df is not None:
            print('No new data to add.')
        else:
            print('No data found to compile.')
        continue

    # Merge existing and raw data. The compiled CSV no longer contains
    # 'system:index' (it is dropped before saving), so rows that were already
    # compiled are detected by de-duplicating whole rows rather than by
    # matching on that column.
    frames = raw_dfs if existing_df is None else [existing_df] + raw_dfs
    glacier_df = _tidy(pd.concat(frames, ignore_index=True))

    # Only rewrite the file if the raw CSVs contributed new rows
    if existing_df is not None and len(glacier_df) == len(existing_df):
        print('No new data to add.')
        continue

    # sort by date
    glacier_df = glacier_df.sort_values(by='date')

    # save to file
    glacier_df.to_csv(out_fn, index=False)
    if existing_df is not None:
        print('Compiled time series updated and saved to file:', out_fn)
    else:
        print('Compiled time series saved to file:', out_fn)

## Optional: delete the raw files

Commented out for now to avoid accidents!

In [None]:
# WARNING: uncommenting this cell permanently removes every raw
# '*snow_cover_stats*.csv' file in out_path for the listed glacier IDs.
# Check that the compiled time series were saved correctly before running it.
# for glacier_ID in tqdm(glacier_IDs):
#   raw_files = glob(os.path.join(out_path, glacier_ID + '*snow_cover_stats*.csv'))
#   for file in raw_files:
#     os.remove(file)

## Plot some time series data for each glacier

In [None]:
# Iterate over glacier IDs
for glacier_ID in glacier_IDs:
  # load compiled time series
  df = pd.read_csv(os.path.join(out_path, glacier_ID + '_timeseries.csv'))
  df['date'] = pd.to_datetime(df['date'])

  # plot
  fig, ax = plt.subplots(figsize=(8,3))
  sns.scatterplot(df, x='date', y='transient_AAR', hue='source', sizes=10)
  ax.set_title(glacier_ID)
  ax.set_ylim(-0.1, 1.1)
  plt.grid()
  plt.show()