# Milestone 1

<br>

### Imports

In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import dask
import dask.dataframe as dd
import seaborn as sns

### Variables

In [2]:
# Settings for the Figshare API request
article_id = 14096681
headers = {"Content-Type": "application/json"}
url = f"https://api.figshare.com/v2/articles/{article_id}"  # endpoint for the article above

# Local folder that receives the downloaded and extracted files
output_directory = "figsharerainfall/"

### 3. Send an API request to Figshare to retrieve the article's metadata

In [3]:
# Fetch the article's metadata from the Figshare API
response = requests.request("GET", url, headers=headers, timeout=30)
response.raise_for_status()  # stop here on an HTTP error instead of failing later on a missing key
data = response.json()       # article metadata
files = data["files"]        # one metadata dict per file attached to the article
# Look the archive up by name rather than by position, so a change in the
# article's file order cannot silently select a different file
next(file for file in files if file["name"] == "data.zip")

{'id': 26766812,
 'name': 'data.zip',
 'size': 814041183,
 'is_link_only': False,
 'download_url': 'https://ndownloader.figshare.com/files/26766812',
 'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
 'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9'}

In [4]:
# Show the metadata of every file attached to the article
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

### Download the `data.zip` archive from the list of `files`

In [5]:
%%time
files_to_dl = ["data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 4.19 s, sys: 7.02 s, total: 11.2 s
Wall time: 36 s


### Extract the files from the `data.zip` archive

In [6]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory + "/data")

CPU times: user 14.8 s, sys: 2.17 s, total: 17 s
Wall time: 19.6 s


### 4. Combine the CSVs in the `data` folder

In [25]:
# Preview the concatenation of the per-model CSVs.
# The path list is built here because, on a fresh Restart & Run All, `files`
# still holds the Figshare metadata dicts at this point, which pd.read_csv cannot read.
csv_paths = [
    path
    for path in glob.glob(os.path.join(output_directory, "data", "*.csv"))
    # the observed-rainfall file is left out, as in the merge cell below
    if os.path.basename(path) != "observed_daily_rainfall_SYD.csv"
]
pd.concat(pd.read_csv(path, index_col=0) for path in csv_paths)

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13
1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13
1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13
1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13
1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13
...,...,...,...,...,...
2014-12-27 12:00:00,-30.157068,-29.214660,153.1250,154.3750,6.689683e+00
2014-12-28 12:00:00,-30.157068,-29.214660,153.1250,154.3750,7.862555e+00
2014-12-29 12:00:00,-30.157068,-29.214660,153.1250,154.3750,1.000503e+01
2014-12-30 12:00:00,-30.157068,-29.214660,153.1250,154.3750,8.541592e+00


In [26]:
%%time
## merging files
import pandas as pd
files = glob.glob('figsharerainfall/data/*.csv')
files = [file for file in files if file != 'figsharerainfall/data/observed_daily_rainfall_SYD.csv']
df = pd.concat((pd.read_csv(file, index_col=0) 
                .assign(model=re.findall(r'[^\/]+(?=\_daily_rainfall_NSW.csv)', file)[0])
                for file in files)
              )
df.to_csv("figsharerainfall/combined_data.csv")

CPU times: user 6min 12s, sys: 23.3 s, total: 6min 36s
Wall time: 6min 47s


| Team Member       | Operating System | RAM  | Processor | Is SSD | Time taken |
|-------------------|------------------|------|-----------|--------|------------|
| Karanpreet Kaur   |                  |      |           |        |            |
| Melisa Maidana    | mac              | 8 GB | M1        | yes    |            |
| Nagraj Rao        |                  |      |           |        |            |
| Ting Zhe (TZ) Yan |                  |      |           |        |            |

In [30]:
%%sh
# Report the on-disk size of the combined CSV in human-readable units
du -sh figsharerainfall/combined_data.csv

5.6G	figsharerainfall/combined_data.csv


In [31]:
# (rows, columns) of the combined frame; a bare last expression is displayed by the notebook
df.shape

(62467843, 6)


In [32]:
# Peek at the first rows of the combined frame, including the added `model` column
df.head()

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


### 5. Load the combined CSV to memory and perform a simple EDA

#### Approach #1

In [None]:
# Column dtypes of the combined frame currently held in memory
df.dtypes

| Team Member       | Operating System | RAM  | Processor | Is SSD | Time taken |
|-------------------|------------------|------|-----------|--------|------------|
| Karanpreet Kaur   |                  |      |           |        |            |
| Melisa Maidana    | mac              | 8 GB | M1        | yes    |            |
| Nagraj Rao        |                  |      |           |        |            |
| Ting Zhe (TZ) Yan |                  |      |           |        |            |

#### Approach #2

In [78]:
# Bind the Dask frame to its own name: rebinding `dd` would shadow the
# dask.dataframe module alias and make this cell fail when it is run again
rain_ddf = dd.read_csv('figsharerainfall/combined_data.csv')

# Mean rainfall per model; .compute() materialises the aggregated result
dd_plot = (
    rain_ddf.groupby('model')['rain (mm/day)']
    .mean()
    .compute()
    .to_frame()
    .sort_values('rain (mm/day)')
)

# Plot
ax = sns.barplot(data=dd_plot, x='rain (mm/day)', y=dd_plot.index)
ax.set(title='Mean daily rainfall by model', xlabel='rain (mm/day)', ylabel='model');

#### Summary of results

| Team Member       | Operating System | RAM  | Processor | Is SSD | Time taken |
|-------------------|------------------|------|-----------|--------|------------|
| Karanpreet Kaur   |                  |      |           |        |            |
| Melisa Maidana    | mac              | 8 GB | M1        | yes    |            |
| Nagraj Rao        |                  |      |           |        |            |
| Ting Zhe (TZ) Yan |                  |      |           |        |            |

### 6. Perform a simple EDA in R

#### Approach to transfer the dataframe from python to R

#### Discuss why you chose this approach over others

#### Summary of results

| Team Member       | Operating System | RAM  | Processor | Is SSD | Time taken |
|-------------------|------------------|------|-----------|--------|------------|
| Karanpreet Kaur   |                  |      |           |        |            |
| Melisa Maidana    | mac              | 8 GB | M1        | yes    |            |
| Nagraj Rao        |                  |      |           |        |            |
| Ting Zhe (TZ) Yan |                  |      |           |        |            |