In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

# 3. Download Data

### 3.1 Download with figshare API

In [2]:
# Use API to download data
article_id = "14096681"
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "NSWrainfall/"
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  
files = data["files"]     

## 3.2 Extract Zip File

In [3]:
%%time
# create zip file
files_to_dl = ["data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 5.29 s, sys: 6.59 s, total: 11.9 s
Wall time: 3min 20s


In [4]:
%%time
# extract data into directory
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: user 17.5 s, sys: 4.45 s, total: 22 s
Wall time: 24.8 s


# 4. Combine CSV

In [5]:
%%time
excluded_files = ["NSWrainfall\\observed_daily_rainfall_SYD.csv"]
files = glob.glob('NSWrainfall/*.csv')
files = list(set(files) - set(excluded_files))
df = pd.concat((pd.read_csv(file, index_col=0)
                .assign(model=re.findall("/([^_]*)", file)[0])
                for file in files)
              )
df.to_csv("NSWrainfall/combined_data.csv")

CPU times: user 6min 5s, sys: 31.2 s, total: 6min 36s
Wall time: 6min 44s


| Team Member | Operating System | RAM | Processor | Is SSD | Time Taken |
| --- | --- | --- | --- | --- | --- |
| Austin Shih | MacOS | 64GB | Intel i9 2.4 GHz 8-Core | Yes | 6m 20s |
| Renee Kwon | MacOS | 8GB | Dual-Core Intel Core i5 | Yes | 12min 27s |
| Wilfred Hass | Ubuntu 22.04 | 16G | Intel i7-7700HQ 2.8 GHz 4-core | Yes | 6m 44s |
| Fujie Sun | WinOS | 16GB | AMD Ryzen 7 5800HS  | Yes | 8min 34s  |

In [6]:
df.head()

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-36.0,-34.5,141.0,143.0,3.176243e-08,INM-CM4-8
1889-01-02 12:00:00,-36.0,-34.5,141.0,143.0,3.678816e-09,INM-CM4-8
1889-01-03 12:00:00,-36.0,-34.5,141.0,143.0,2.017436e-07,INM-CM4-8
1889-01-04 12:00:00,-36.0,-34.5,141.0,143.0,3.976414,INM-CM4-8
1889-01-05 12:00:00,-36.0,-34.5,141.0,143.0,2.978595,INM-CM4-8


# 5. Simple EDA

| Team Member | Operating System | RAM | Processor | Is SSD | Time Taken |
| --- | --- | --- | --- | --- | --- |
| Austin Shih | MacOS | 64GB | Intel i9 2.4 GHz 8-Core | Yes | --- |
| Renee Kwon | MacOS | 8GB | Dual-Core Intel Core i5  | Yes | --- |
| Wilfred Hass | --- | --- | ---  | --- | --- |
| Fujie Sun | WinOS | 16GB | AMD Ryzen 7 5800HS  | Yes | --- |