# Milestone 2

In this milestone, we will be migrating our analysis to the **AWS cloud**. The process is as such:  

(1) Set up collaborative environment via EC2 instance with JupyterHub, set up S3 bucket  
(2) Migrate data from Milestone 1 to S3  
(3) Wrangle data in preparation for Machine Learning

## 1. Setup Collaborative Environment

### 1.1 Setup EC2 instance

![](img/EC2_screenshot.PNG)

### 1.2. Setup JupyterHub

 ![](img/jupyterhub_screenshot.PNG)

### 1.3 Setup the server

![](img/server_screenshot.PNG)

## 2. Migrate data from Milestone 1 to S3

### 2.1 Setup S3 bucket and move data

![](img/S3_bucket.PNG)

## 3. Wrangle Data

#### Installing packages

In [1]:
# !pip install pandas

In [2]:
# !pip install pyarrow

In [3]:
# !pip install s3fs

#### Reading parquet file from S3 bucket

In [4]:
import pandas as pd
aws_credentials = {"key" : "ASIATPTZJGVKER44PNNH",
                "secret": "CJyy69lmWSDqyJTef2LU1kTNOLE8Cg7MVHjds7Lx",
                "token": "FwoGZXIvYXdzEFgaDA5nyZSZt+cg1eB6oCLEAbgmthG3gMBUl6sU4v5E0KLLdD7NRSu3u/RN6miN/xbdap9C+LLT/tgoRKFOWGRwmiOZXSIdqw5duL0ygTp6EGupFjhghF0yFeEaXEXHJRXfpG0WiK0bcLy/w1FpZWHFaJzJsX1QRuoWXU2IYy7IxaGt5mYzQG2gLPCjC0ImdbEVuRRzB5DtKIkRJ9uOHTDmmgoeB/ahndy24QeZ9ZYOB4lTZWDfRe7drck/WZEmwlnVo+Yd87Oz4XnuJx5JsB7t42ZiwyMo7PnCkgYyLaAGNnGwbkPGhgif+7rSKh0wENigsUx+CgFv1y+XKmr92FgZmkM6IB0N11GiDw=="}

combined_df = pd.read_parquet('s3://mds-s3-5/combined_model_data_parti.parquet',
                     storage_options=aws_credentials)
combined_df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13,ACCESS-CM2
1,1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
2,1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
3,1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
4,1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,0.01047658,ACCESS-CM2


#### Filtering Sydney lat-lon

In [5]:
combined_df['valid_min_lat'] = combined_df['lat_min'] < -33.86
combined_df['valid_max_lat'] = combined_df['lat_max'] > -33.86
combined_df['valid_min_lon'] = combined_df['lon_min'] < 151.21
combined_df['valid_max_lon'] = combined_df['lon_max'] > 151.21

combined_df = combined_df[combined_df['valid_min_lat'] == True][combined_df['valid_max_lat'] == True][combined_df['valid_min_lon'] == True][combined_df['valid_max_lon'] == True]

combined_df.head()


  combined_df = combined_df[combined_df['valid_min_lat'] == True][combined_df['valid_max_lat'] == True][combined_df['valid_min_lon'] == True][combined_df['valid_max_lon'] == True]


Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model,valid_min_lat,valid_max_lat,valid_min_lon,valid_max_lon
552240,1889-01-01 12:00:00,-35.0,-33.75,150.0,151.875,0.040427,ACCESS-CM2,True,True,True,True
552241,1889-01-02 12:00:00,-35.0,-33.75,150.0,151.875,0.073777,ACCESS-CM2,True,True,True,True
552242,1889-01-03 12:00:00,-35.0,-33.75,150.0,151.875,0.232656,ACCESS-CM2,True,True,True,True
552243,1889-01-04 12:00:00,-35.0,-33.75,150.0,151.875,0.911319,ACCESS-CM2,True,True,True,True
552244,1889-01-05 12:00:00,-35.0,-33.75,150.0,151.875,0.698013,ACCESS-CM2,True,True,True,True


#### Keep relevant columns and adjusting 'time' column to keep only date

In [6]:
combined_df = combined_df[['time', 'rain (mm/day)', 'model']]

In [7]:
combined_dates = combined_df['time'].tolist()
only_dates = []
for v in combined_dates:
    only_dates.append(str(v).split(" ")[0])

In [8]:
combined_df['time'] = only_dates
combined_df.head()

Unnamed: 0,time,rain (mm/day),model
552240,1889-01-01,0.040427,ACCESS-CM2
552241,1889-01-02,0.073777,ACCESS-CM2
552242,1889-01-03,0.232656,ACCESS-CM2
552243,1889-01-04,0.911319,ACCESS-CM2
552244,1889-01-05,0.698013,ACCESS-CM2


#### Reading observed data from S3 bucket

In [9]:
obs = pd.read_csv('s3://mds-s3-5/observed_daily_rainfall_SYD.csv', storage_options=aws_credentials)
obs.head()

Unnamed: 0,time,rain (mm/day)
0,1889-01-01,0.006612
1,1889-01-02,0.090422
2,1889-01-03,1.401452
3,1889-01-04,14.869798
4,1889-01-05,0.467628


#### Added 'model' column and adjust 'time' column to keep only date

In [10]:
observed_dates = obs['time'].tolist()
obs_dates = []
for v in observed_dates:
    obs_dates.append(str(v).split(" ")[0])

obs['time'] = obs_dates
obs['model'] = 'observed_rainfall'

obs.head()

Unnamed: 0,time,rain (mm/day),model
0,1889-01-01,0.006612,observed_rainfall
1,1889-01-02,0.090422,observed_rainfall
2,1889-01-03,1.401452,observed_rainfall
3,1889-01-04,14.869798,observed_rainfall
4,1889-01-05,0.467628,observed_rainfall


#### Combine both dataframes

In [11]:
combined_df = pd.concat((combined_df, obs))

#### Pivot dataframe to get the desired output

In [12]:
combined_df = combined_df.pivot(index='time', columns='model', values='rain (mm/day)')
combined_df.head()

model,ACCESS-CM2,ACCESS-ESM1-5,AWI-ESM-1-1-LR,BCC-CSM2-MR,BCC-ESM1,CMCC-CM2-HR4,CMCC-CM2-SR5,CMCC-ESM2,CanESM5,EC-Earth3-Veg-LR,...,MPI-ESM-1-2-HAM,MPI-ESM1-2-HR,MPI-ESM1-2-LR,MRI-ESM2-0,NESM3,NorESM2-LM,NorESM2-MM,SAM0-UNICON,TaiESM1,observed_rainfall
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1889-01-01,0.040427,1.814552,35.579336,4.268112,0.001107466,11.410537,3.322009e-08,2.6688,1.321215,1.515293,...,4.244226e-13,1.390174e-13,6.537884e-05,3.445495e-06,15.76096,4.759651e-05,2.451075,0.221324,2.257933,0.006612
1889-01-02,0.073777,0.303965,4.59652,1.190141,0.0001015323,4.014984,1.3127,0.946211,2.788724,4.771375,...,4.409552,0.1222283,1.049131e-13,4.791993e-09,0.367551,0.4350863,0.477231,3.757179,2.287381,0.090422
1889-01-03,0.232656,0.019976,5.927467,1.003845e-09,1.760345e-05,9.660565,9.10372,0.431999,0.003672,4.23398,...,0.22693,0.3762301,9.758706e-14,0.6912302,0.1562869,9.561101,0.023083,0.253357,1.199909,1.401452
1889-01-04,0.911319,13.623777,8.029624,0.08225225,0.1808932,3.951528,13.1716,0.368693,0.013578,15.252495,...,0.02344586,0.4214019,0.007060915,0.03835721,2.472226e-07,0.5301038,0.002699,2.185454,2.106737,14.869798
1889-01-05,0.698013,0.021048,2.132686,2.496841,4.708019e-09,2.766362,18.2294,0.339267,0.002468,11.920356,...,4.270161e-13,0.1879692,4.504985,3.506923e-07,1.949792e-13,1.460928e-10,0.001026,2.766507,1.763335,0.467628


In [13]:
combined_df.shape

(46020, 26)

#### Save data to S3 bucket

In [14]:
combined_df.to_csv('s3://mds-s3-5/output/ml_data_SYD.csv', storage_options=aws_credentials)

![](img/output_file_S3.PNG)