In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd



## Setting up the instance

### Screenshot

![](https://github.com/UBC-MDS/web_cloud_comp_group2/raw/main/notebooks/img/image_1.png)

## Setting up JupyterHub

### Screenshot

![](https://github.com/UBC-MDS/web_cloud_comp_group2/raw/main/notebooks/img/image_2.png)

## Setting up the server

### Screenshot

![](https://github.com/UBC-MDS/web_cloud_comp_group2/raw/main/notebooks/img/image_3.png)

## Setting up the S3 bucket and moving the data

### Screenshot

![](https://github.com/UBC-MDS/web_cloud_comp_group2/raw/main/notebooks/img/image_4.png)

## Wrangling the data

In [None]:
# Necessary metadata: where the dataset lives on figshare and where to store it locally
article_id = 14226968  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"  # figshare API endpoint for this article
headers = {"Content-Type": "application/json"}  # headers sent with the API request
output_directory = "/srv/data/my_shared_data_folder/"  # directory the archive is downloaded to and extracted in

In [None]:
# Fetch the article record from the figshare API.
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than as a KeyError on "files" below.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()  # this contains all the article's data, feel free to check it out
files = data["files"]   # this is just the data about the files, which is what we want
files

In [None]:
# Only the partitioned parquet archive is downloaded from the figshare article.
files_to_dl = ["combined_model_data_parti.parquet.zip"]

# Create the target directory once, up front, instead of on every matching file.
os.makedirs(output_directory, exist_ok=True)
for file in files:
    if file["name"] in files_to_dl:
        # os.path.join builds a valid path whether or not output_directory
        # ends with a path separator (plain concatenation would not).
        urlretrieve(file["download_url"], os.path.join(output_directory, file["name"]))

In [None]:
# Unpack the downloaded archive into the same directory it was saved in.
zip_path = os.path.join(output_directory, "combined_model_data_parti.parquet.zip")
with zipfile.ZipFile(zip_path, mode="r") as f:
    f.extractall(path=output_directory)

In [2]:
# Passing your credentials
## IMPORTANT: never include your AWS key and secret in the notebook you submit.
# The credentials are read from the standard AWS environment variables so they never
# have to be typed into (and later scrubbed from) a cell. When a variable is not set,
# the value falls back to the empty-string placeholder.
aws_credentials = {
    "key": os.environ.get("AWS_ACCESS_KEY_ID", ""),
    "secret": os.environ.get("AWS_SECRET_ACCESS_KEY", ""),
}

# Combined model output (parquet) and the observed Sydney rainfall (CSV, indexed by
# its 'time' column), both read directly from the S3 bucket.
df = pd.read_parquet("s3://mds-s3-student1/combined_model_data_parti.parquet", storage_options=aws_credentials)
df_syd_observed = pd.read_csv("s3://mds-s3-student1/observed_daily_rainfall_SYD.csv", storage_options=aws_credentials, index_col='time')

In [3]:
# Dimensions (rows, columns) of the combined model data read from S3.
df.shape

(62513863, 7)

In [4]:
# Dimensions (rows, columns) of the observed Sydney rainfall data read from S3.
df_syd_observed.shape

(46020, 1)

In [5]:
# Latitude / longitude used to pick, for each model, the grid cell whose
# [lat_min, lat_max] x [lon_min, lon_max] box contains this point.
syd_lat = -33.86
syd_lon = 151.21

# One pipeline: keep the matching grid cells, index by time, spread the models into
# columns, average to one row per day, then append the observed rainfall.
# NOTE(review): the observed values are attached by position, not by date — this
# assumes df_syd_observed has exactly one row per resampled day, in the same order.
df = (
    df.query("lat_min <= @syd_lat <= lat_max ")
    .query("lon_min <= @syd_lon <= lon_max ")
    .set_index('time', drop=False)
    .drop(columns=['lat_min', 'lat_max', 'lon_min', 'lon_max', 'time'])
    .pivot(columns='model', values='rain (mm/day)')
    .resample('D')
    .mean()
    .assign(observed=df_syd_observed['rain (mm/day)'].to_numpy())
)

In [6]:
# Display the wrangled frame: one column per model plus the 'observed' column, one row per day.
df

model,ACCESS-CM2,ACCESS-ESM1-5,AWI-ESM-1-1-LR,BCC-CSM2-MR,BCC-ESM1,CMCC-CM2-HR4,CMCC-CM2-SR5,CMCC-ESM2,CanESM5,EC-Earth3-Veg-LR,...,MPI-ESM-1-2-HAM,MPI-ESM1-2-HR,MPI-ESM1-2-LR,MRI-ESM2-0,NESM3,NorESM2-LM,NorESM2-MM,SAM0-UNICON,TaiESM1,observed
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1889-01-01,0.040427,1.814552,3.557934e+01,4.268112e+00,1.107466e-03,1.141054e+01,3.322009e-08,2.668800,1.321215,1.515293,...,4.244226e-13,1.390174e-13,6.537884e-05,3.445495e-06,1.576096e+01,4.759651e-05,2.451075,0.221324,2.257933,0.006612
1889-01-02,0.073777,0.303965,4.596520e+00,1.190141e+00,1.015323e-04,4.014984e+00,1.312700e+00,0.946211,2.788724,4.771375,...,4.409552e+00,1.222283e-01,1.049131e-13,4.791993e-09,3.675510e-01,4.350863e-01,0.477231,3.757179,2.287381,0.090422
1889-01-03,0.232656,0.019976,5.927467e+00,1.003845e-09,1.760345e-05,9.660565e+00,9.103720e+00,0.431999,0.003672,4.233980,...,2.269300e-01,3.762301e-01,9.758706e-14,6.912302e-01,1.562869e-01,9.561101e+00,0.023083,0.253357,1.199909,1.401452
1889-01-04,0.911319,13.623777,8.029624e+00,8.225225e-02,1.808932e-01,3.951528e+00,1.317160e+01,0.368693,0.013578,15.252495,...,2.344586e-02,4.214019e-01,7.060915e-03,3.835721e-02,2.472226e-07,5.301038e-01,0.002699,2.185454,2.106737,14.869798
1889-01-05,0.698013,0.021048,2.132686e+00,2.496841e+00,4.708019e-09,2.766362e+00,1.822940e+01,0.339267,0.002468,11.920356,...,4.270161e-13,1.879692e-01,4.504985e+00,3.506923e-07,1.949792e-13,1.460928e-10,0.001026,2.766507,1.763335,0.467628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-12-27,0.033748,0.123476,1.451179e+00,3.852845e+01,2.061717e-03,8.179260e-09,1.171263e-02,0.090786,59.895053,5.071783,...,4.726998e-13,1.326889e-01,1.827857e+00,6.912632e-03,2.171327e-03,1.620489e+00,2.084252,0.868046,17.444923,0.037472
2014-12-28,0.094198,2.645496,4.249335e+01,5.833801e-01,5.939502e-09,8.146937e-01,4.938899e-01,0.000000,0.512632,1.578188,...,4.609420e-13,1.644482e+00,7.242920e-01,2.836752e-03,1.344768e+01,2.391159e+00,1.644527,0.782258,1.569647,0.158061
2014-12-29,0.005964,3.041667,2.898325e+00,9.359547e-02,2.000051e-08,2.532205e-01,1.306046e+00,0.000002,37.169669,1.565885,...,2.016156e+01,1.506439e+00,1.049481e-01,8.137182e+00,2.547820e+01,1.987695e-12,0.205036,2.140723,1.444630,0.025719
2014-12-30,0.000028,1.131412,2.516381e-01,1.715028e-01,7.191735e-05,8.169252e-02,1.722262e-01,0.788577,7.361246,0.025749,...,9.420543e+00,6.242895e+00,1.245115e-01,9.305263e-03,4.192948e+00,2.150346e+00,0.000017,29.714692,0.716019,0.729390


In [7]:
# Write the wrangled frame to the S3 bucket as CSV, authenticating with aws_credentials.
df.to_csv('s3://mds-s3-student1/output/ml_data_SYD.csv', storage_options=aws_credentials)

### Screenshot

![](https://github.com/UBC-MDS/web_cloud_comp_group2/raw/main/notebooks/img/image_5.png)