## ML Preprocessing
### DSCI 525 - Group 10


In [21]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

### Data prep

We are going to read in multiple weather model forecasts for Australia from parquet files, trim to Sydney forecasts and then join on actual observed rainfall data. We will aggregate to *mean* daily rainfall from the models to compare to actuals. The final data frame will have a separate column for each weather model to be used for machine learning modelling of the actual rainfall in Sydney.

In [22]:
# INPUTS ------------------------------------------
# S3 bucket permissions were unreliable, so the shared local folder is the
# active location; the S3 URI for each path is kept next to it as the alternative.
# path_to_input_datasets = "s3://mds-s3-student71/"
path_to_input_datasets = "/srv/data/my_shared_data_folder/"

# path_to_output = "s3://mds-s3-student71/output/"
path_to_output = "/srv/data/my_shared_data_folder/"

# CONSTANTS ----------------------------------------
# Latitude / longitude used to pick out the Sydney rows of the model data.
syd_lat, syd_lon = -33.86, 151.21

# Observed daily rainfall for Sydney. The file's own header row is discarded
# (header=0) in favour of the two names given here, and the "time" column is
# parsed as dates and used as the index.
df_actuals = pd.read_csv(
    os.path.join(path_to_input_datasets, "observed_daily_rainfall_SYD.csv"),
    header=0,
    names=["time", "observed_rainfall"],
    index_col="time",
    parse_dates=True,
)

# Build the modelling table: one row per day, one column per weather model
# (mean daily rainfall), plus the observed rainfall for the same day.
df_modelling = (
    pd.read_parquet(os.path.join(path_to_input_datasets, "combined_model_data_parti.parquet"))
    # Keep only rows whose lat/lon bounds strictly contain the Sydney coordinates.
    .query(f"lat_min < {syd_lat} and lat_max > {syd_lat} and lon_min < {syd_lon} and lon_max > {syd_lon}")
    .loc[:, ["time", "model", "rain (mm/day)"]]
    # Truncate timestamps to midnight while staying in datetime64. `.dt.date`
    # would produce an object column of Python dates and leave the index merge
    # below dependent on pandas implicitly promoting it back to datetimes.
    .assign(time=lambda x: x["time"].dt.normalize())
    # Average every reading of a model on a given day into a single value.
    .pivot_table(index="time", columns="model", values="rain (mm/day)", aggfunc="mean")
    # Inner join on the date index: only days present in both tables are kept.
    .merge(df_actuals[["observed_rainfall"]], left_index=True, right_index=True, how="inner")
)

df_modelling.head()

Unnamed: 0_level_0,ACCESS-CM2,ACCESS-ESM1-5,AWI-ESM-1-1-LR,BCC-CSM2-MR,BCC-ESM1,CMCC-CM2-HR4,CMCC-CM2-SR5,CMCC-ESM2,CanESM5,EC-Earth3-Veg-LR,...,MPI-ESM-1-2-HAM,MPI-ESM1-2-HR,MPI-ESM1-2-LR,MRI-ESM2-0,NESM3,NorESM2-LM,NorESM2-MM,SAM0-UNICON,TaiESM1,observed_rainfall
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1889-01-01,0.040427,1.814552,35.579336,4.268112,0.001107466,11.410537,3.322009e-08,2.6688,1.321215,1.515293,...,4.244226e-13,1.390174e-13,6.537884e-05,3.445495e-06,15.76096,4.759651e-05,2.451075,0.221324,2.257933,0.006612
1889-01-02,0.073777,0.303965,4.59652,1.190141,0.0001015323,4.014984,1.3127,0.946211,2.788724,4.771375,...,4.409552,0.1222283,1.049131e-13,4.791993e-09,0.367551,0.4350863,0.477231,3.757179,2.287381,0.090422
1889-01-03,0.232656,0.019976,5.927467,1.003845e-09,1.760345e-05,9.660565,9.10372,0.431999,0.003672,4.23398,...,0.22693,0.3762301,9.758706e-14,0.6912302,0.1562869,9.561101,0.023083,0.253357,1.199909,1.401452
1889-01-04,0.911319,13.623777,8.029624,0.08225225,0.1808932,3.951528,13.1716,0.368693,0.013578,15.252495,...,0.02344586,0.4214019,0.007060915,0.03835721,2.472226e-07,0.5301038,0.002699,2.185454,2.106737,14.869798
1889-01-05,0.698013,0.021048,2.132686,2.496841,4.708019e-09,2.766362,18.2294,0.339267,0.002468,11.920356,...,4.270161e-13,0.1879692,4.504985,3.506923e-07,1.949792e-13,1.460928e-10,0.001026,2.766507,1.763335,0.467628


We write the result to the output location configured above (an `S3` bucket in the intended setup) for object persistence and downstream modelling.

In [23]:
# Persist the modelling table (its index is written as the first CSV column).
df_modelling.to_csv(path_or_buf=os.path.join(path_to_output, "ml_data_SYD.csv"))