# Milestone 1

## Import

In [1]:
import json
import os
import requests
import pandas as pd
import zipfile

## Download the Data

### Download from API

In [2]:
# figshare article ID whose metadata is requested from the API
article_id = 14096681
# figshare v2 API endpoint describing this article
url = f"https://api.figshare.com/v2/articles/{article_id}"
# Relative path of the directory that holds the downloaded archive and derived files
output_dir = "../data/"

In [3]:
# Request the article metadata from the figshare API. The timeout keeps the
# cell from hanging indefinitely on a stalled connection.
resp = requests.get(url, timeout=30)
# Fail fast with a clear HTTP error instead of a confusing KeyError or JSON
# decoding error further down when the API answers with a 4xx/5xx status.
resp.raise_for_status()

# The 'files' entry of the response body is the list of file records; the
# download cell reads each record's 'name' and 'download_url'.
file_list = resp.json()['files']

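**Note:** The cell below takes around 1.5 minutes to run when `data.zip` has to be downloaded; if the file already exists, the download is skipped and the cell finishes almost instantly.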

In [4]:
%%time
# Create the output directory (including any missing parents); this is a
# no-op when the directory already exists.
os.makedirs(output_dir, exist_ok=True)

# Name of the only file in the article's listing that is downloaded.
target_name = "data.zip"
# Bound before the loop so that later cells can always refer to output_name,
# even if the API listing does not contain the archive.
output_name = os.path.join(output_dir, target_name)

for file in file_list:
    if file['name'] != target_name:
        continue
    if os.path.exists(output_name):
        print(f"{file['name']} already exists!")
    else:
        print(f"Downloading {file['name']} to {output_dir}...")
        # Download into a temporary ".part" file first: an interrupted
        # transfer must not leave a truncated data.zip behind, because the
        # existence check above would then skip the download on the next run.
        partial_name = output_name + ".part"
        # stream=True keeps the archive out of memory; without it the whole
        # body is fetched before iter_content() yields the first chunk.
        with requests.get(file['download_url'], stream=True, timeout=60) as file_resp:
            # Abort on HTTP errors rather than saving an error page as the zip.
            file_resp.raise_for_status()
            with open(partial_name, "wb") as f:
                for chunk in file_resp.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        # Move the completed download into its final place.
        os.replace(partial_name, output_name)
        print("Download finished")

data.zip already exists!
CPU times: user 323 µs, sys: 121 µs, total: 444 µs
Wall time: 313 µs


### Unzip data

In [5]:
# Sub-directory of output_dir that the archive is extracted into
raw_csv_dir = os.path.join(output_dir, "raw")

In [6]:
%%time
# Open the downloaded archive read-only (the default mode) and unpack all of
# its members into raw_csv_dir.
with zipfile.ZipFile(output_name) as archive:
    archive.extractall(path=raw_csv_dir)

CPU times: user 17.4 s, sys: 4.06 s, total: 21.5 s
Wall time: 26.2 s


## Combine CSV Files

In [7]:
# Names of all entries in the extraction directory (files and sub-directories)
dir_files = os.listdir(raw_csv_dir)
# Placeholder for the combined result; starts out empty
output_df = pd.DataFrame()

In [8]:
%%time
# Combine every "*_NSW.csv" file into a single frame, tagging each row with
# the model it came from. Entries without that suffix (e.g. the MACOSX_
# folder and observed_daily_rainfall_SYD.csv) are skipped.
#
# The frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop copies the accumulated data again on
# every iteration. Rebinding output_df here also makes the cell safe to
# re-run, since it no longer appends to whatever output_df already held.
frames = []
for fname in dir_files:
    if not fname.endswith("_NSW.csv"):
        continue

    # The model name is the part of the file name before the first underscore.
    model_name = fname.split('_')[0]

    df = pd.read_csv(os.path.join(raw_csv_dir, fname), index_col=0)
    df['model'] = model_name
    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# file matched (this mirrors the result of the loop-based version).
output_df = pd.concat(frames) if frames else pd.DataFrame()
# Drop the per-file references so only the combined frame stays alive.
del frames

output_df.to_csv(os.path.join(output_dir, "combined.csv"))

CPU times: user 6min 39s, sys: 25.7 s, total: 7min 5s
Wall time: 7min 6s


In [9]:
%%sh
# Print the on-disk size of the combined CSV in human-readable units
du -sh ../data/combined.csv

5.6G	../data/combined.csv


In [10]:
# (rows, columns) of the combined frame
output_df.shape

(62467843, 6)

In [11]:
# Preview the first five rows of the combined frame
output_df.head()

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,0.03129635,AWI-ESM-1-1-LR
1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,1.083881e-13,AWI-ESM-1-1-LR
1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,1.056313e-13,AWI-ESM-1-1-LR
1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,1.08051e-13,AWI-ESM-1-1-LR
1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,9.914916e-14,AWI-ESM-1-1-LR


In [12]:
# Drop the reference to the large combined frame so its memory can be reclaimed
del output_df

### Comparison

| Team Member| Operating System | RAM        | Processor                        | Is SSD | CPU Time   | Wall Time |
|:----------:|:----------------:|:----------:|:--------------------------------:|:------:|:----------:|:---------:|
| James      |                  |            |                                  |        |            |           |
| Kyle       |                  |            |                                  |        |            |           |
| LG         |                  |            |                                  |        |            |           |
| Philson    | Ubuntu 20.04     | 16 GB DDR4 | 1.8GHz Quad-Core Intel i7-8565U  | Yes    | 6min 39s   | 7min 06s  |

**Discussion:**
> Placeholder

## Load the Combined CSV and simple EDA

### Approach 1

#### Comparison

| Team Member| Operating System | RAM        | Processor                        | Is SSD | CPU Time   | Wall Time |
|:----------:|:----------------:|:----------:|:--------------------------------:|:------:|:----------:|:---------:|
| James      |                  |            |                                  |        |            |           |
| Kyle       |                  |            |                                  |        |            |           |
| LG         |                  |            |                                  |        |            |           |
| Philson    | Ubuntu 20.04     | 16 GB DDR4 | 1.8GHz Quad-Core Intel i7-8565U  | Yes    |            |           |

**Discussion:**
> Placeholder

### Approach 2

#### Comparison

| Team Member| Operating System | RAM        | Processor                        | Is SSD | CPU Time   | Wall Time |
|:----------:|:----------------:|:----------:|:--------------------------------:|:------:|:----------:|:---------:|
| James      |                  |            |                                  |        |            |           |
| Kyle       |                  |            |                                  |        |            |           |
| LG         |                  |            |                                  |        |            |           |
| Philson    | Ubuntu 20.04     | 16 GB DDR4 | 1.8GHz Quad-Core Intel i7-8565U  | Yes    |            |           |

**Discussion:**
> Placeholder

## Simple EDA in R