# DSCI 525: Web and Cloud Computing

## Milestone 2: Data Wrangling for Machine Learning

### Group 13
Authors: Ivy Zhang, Mike Lynch, Selma Duric, William Xu

### Imports

In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd



### 1. Set up EC2 instance

<img src="../img/1_result.png" alt="1_result" style="width: 1000px;"/>

### 2. Set up JupyterHub

<img src="../img/2_result.png" alt="2_result" style="width: 1000px;"/>

### 3. Set up the server

<img src="../img/3_result.png" alt="3_result" style="width: 1000px;"/>

### 4. Data Wrangling

In [2]:
# Configuration for the figshare download
output_directory = "/srv/data/figsharerainfall/"  # local folder the archive is saved to and extracted in
headers = {"Content-Type": "application/json"}    # headers sent with the API request
article_id = 14226968                             # figshare's unique identifier for the article
url = f"https://api.figshare.com/v2/articles/{article_id}"  # metadata endpoint for that article

In [3]:
# Fetch the article record from the figshare API.
# The timeout stops the cell from hanging forever if the API is unreachable, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# KeyError on "files" below.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()   # the full article record returned by the API
files = data["files"]    # per-file metadata (name, download_url, size, md5, ...)
files

[{'is_link_only': False,
  'name': 'allyears.csv.zip',
  'supplied_md5': '9e046ac05ecd2c32a256a47dd1098b81',
  'computed_md5': '9e046ac05ecd2c32a256a47dd1098b81',
  'id': 26844650,
  'download_url': 'https://ndownloader.figshare.com/files/26844650',
  'size': 2405908113},
 {'is_link_only': False,
  'name': 'individual_years.zip',
  'supplied_md5': '921da748974b07b2a70bbfcc04535a77',
  'computed_md5': '921da748974b07b2a70bbfcc04535a77',
  'id': 26863682,
  'download_url': 'https://ndownloader.figshare.com/files/26863682',
  'size': 1896206676},
 {'is_link_only': False,
  'name': 'combined_model_data.csv.zip',
  'supplied_md5': '7638434c44a7d29cbb29fe200b4fd65d',
  'computed_md5': '7638434c44a7d29cbb29fe200b4fd65d',
  'id': 27515426,
  'download_url': 'https://ndownloader.figshare.com/files/27515426',
  'size': 821308997},
 {'is_link_only': False,
  'name': 'combined_model_data_parti.parquet.zip',
  'supplied_md5': '02f4e3df8d16580a02291de225072689',
  'computed_md5': '02f4e3df8d16580a02

In [4]:
# Download only the partitioned parquet archive.
files_to_dl = ["combined_model_data_parti.parquet.zip"]

# Create the target directory once, up front, instead of once per matching file.
os.makedirs(output_directory, exist_ok=True)

for file in files:
    if file["name"] not in files_to_dl:
        continue
    destination = os.path.join(output_directory, file["name"])
    # Skip the transfer when a complete copy is already on disk (its size matches
    # the size reported by figshare), so re-running the notebook stays cheap.
    if os.path.exists(destination) and os.path.getsize(destination) == file["size"]:
        continue
    urlretrieve(file["download_url"], destination)

In [5]:
# Unpack the downloaded archive into the same directory it was saved to.
archive_path = os.path.join(output_directory, "combined_model_data_parti.parquet.zip")
with zipfile.ZipFile(archive_path, mode="r") as archive:
    archive.extractall(path=output_directory)

### 5. Set up S3 bucket and move data

<img src="../img/4_result.png" alt="4_result" style="width: 1000px;"/>

### 6. Wrangle the data in preparation for machine learning

In [6]:
# Read the AWS credentials from the environment so that no secret is ever stored
# in the notebook. Export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before
# starting the kernel; a missing variable fails fast with a KeyError.
# NOTE(review): any key pair that was previously committed in this notebook must
# be treated as compromised and rotated in IAM.
aws_credentials = {
    "key": os.environ["AWS_ACCESS_KEY_ID"],
    "secret": os.environ["AWS_SECRET_ACCESS_KEY"],
}
# Load the partitioned parquet dataset straight from the S3 bucket.
df = pd.read_parquet("s3://mds-s3-student85/combined_model_data_parti.parquet", storage_options=aws_credentials)

In [7]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13,ACCESS-CM2
1,1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
2,1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
3,1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
4,1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,0.01047658,ACCESS-CM2


In [8]:
# Keep only the rows whose grid cell contains the point (lat -33.86, lon 151.21),
# drop the bounding-box columns that are no longer needed, and index by timestamp.
# The whole transformation is one expression, so `df` is rebound exactly once.
bounding_box_columns = ["lat_min", "lat_max", "lon_min", "lon_max"]
df = (
    df.query("lat_min <= -33.86 & lat_max >= -33.86 & lon_min <= 151.21 & lon_max >= 151.21")
    .drop(columns=bounding_box_columns)
    .set_index("time")
)

In [9]:
# Preview the first five rows after filtering and re-indexing
df.head(n=5)

Unnamed: 0_level_0,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1
1889-01-01 12:00:00,0.040427,ACCESS-CM2
1889-01-02 12:00:00,0.073777,ACCESS-CM2
1889-01-03 12:00:00,0.232656,ACCESS-CM2
1889-01-04 12:00:00,0.911319,ACCESS-CM2
1889-01-05 12:00:00,0.698013,ACCESS-CM2


In [10]:
# (rows, columns) of the filtered frame
df.shape

(1150049, 2)

In [11]:
# Reshape to one column per model, then aggregate each column to daily totals.
# NOTE(review): sum() returns 0 (not NaN) for a day on which a model has no
# value - confirm that this is the intended treatment of missing days.
df_models = (
    df.pivot(columns="model", values="rain (mm/day)")
    .resample("1D")
    .sum()
)

In [12]:
# Load the observed daily rainfall from S3, index it by its "time" column and
# give the rainfall column a name that will not clash with the model columns.
df_syd = (
    pd.read_csv("s3://mds-s3-student85/observed_daily_rainfall_SYD.csv", storage_options=aws_credentials)
    .set_index("time")
    .rename(columns={"rain (mm/day)": "observed_rainfall"})
)

In [13]:
# Align model output and observations on their actual dates.
# Passing df_models.index as `on` to merge() would use that one array as the key
# for BOTH frames, i.e. pair rows purely by position and never look at the
# timestamps stored in df_syd. Joining on the indexes matches real dates.
observed = df_syd.copy()
# df_syd's index comes from the CSV's "time" column; convert it to timestamps and
# strip any time-of-day so it lines up with the daily-resampled model index.
observed.index = pd.to_datetime(observed.index).normalize()
df_combined = df_models.join(observed, how="inner")
df_combined.index.name = "time"
# An empty result means the two indexes did not line up (e.g. timezone mismatch).
assert len(df_combined) > 0, "no overlapping dates between model data and observations"

In [14]:
# Preview the first five rows of the combined model + observation table
df_combined.head(n=5)

Unnamed: 0_level_0,ACCESS-CM2,ACCESS-ESM1-5,AWI-ESM-1-1-LR,BCC-CSM2-MR,BCC-ESM1,CMCC-CM2-HR4,CMCC-CM2-SR5,CMCC-ESM2,CanESM5,EC-Earth3-Veg-LR,...,MPI-ESM-1-2-HAM,MPI-ESM1-2-HR,MPI-ESM1-2-LR,MRI-ESM2-0,NESM3,NorESM2-LM,NorESM2-MM,SAM0-UNICON,TaiESM1,observed_rainfall
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1889-01-01,0.040427,1.814552,35.579336,4.268112,0.001107466,11.410537,3.322009e-08,2.6688,1.321215,1.515293,...,4.244226e-13,1.390174e-13,6.537884e-05,3.445495e-06,15.76096,4.759651e-05,2.451075,0.221324,2.257933,0.006612
1889-01-02,0.073777,0.303965,4.59652,1.190141,0.0001015323,4.014984,1.3127,0.946211,2.788724,4.771375,...,4.409552,0.1222283,1.049131e-13,4.791993e-09,0.367551,0.4350863,0.477231,3.757179,2.287381,0.090422
1889-01-03,0.232656,0.019976,5.927467,1.003845e-09,1.760345e-05,9.660565,9.10372,0.431999,0.003672,4.23398,...,0.22693,0.3762301,9.758706e-14,0.6912302,0.1562869,9.561101,0.023083,0.253357,1.199909,1.401452
1889-01-04,0.911319,13.623777,8.029624,0.08225225,0.1808932,3.951528,13.1716,0.368693,0.013578,15.252495,...,0.02344586,0.4214019,0.007060915,0.03835721,2.472226e-07,0.5301038,0.002699,2.185454,2.106737,14.869798
1889-01-05,0.698013,0.021048,2.132686,2.496841,4.708019e-09,2.766362,18.2294,0.339267,0.002468,11.920356,...,4.270161e-13,0.1879692,4.504985,3.506923e-07,1.949792e-13,1.460928e-10,0.001026,2.766507,1.763335,0.467628


In [16]:
# (rows, columns) of the combined table
df_combined.shape

(46020, 26)

In [17]:
# Write the combined table back to the S3 bucket as CSV for the next milestone's ML step.
output_uri = 's3://mds-s3-student85/output/ml_data_SYD.csv'
df_combined.to_csv(output_uri, storage_options=aws_credentials)