In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

In [2]:
# Register the IPython extensions used by this notebook.
# memory_profiler supplies the %%memit cell magic used below to record peak memory.
# NOTE(review): no R magic is used in the cells visible here — confirm rpy2 is still needed.
%load_ext rpy2.ipython
%load_ext memory_profiler

## Download the data 

In [3]:
article_id = 14096681
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "../data/"  # This notebook should be run manually
archive_name = "data.zip"
archive_path = os.path.join(output_directory, archive_name)

# Fetch the article metadata from the figshare API. Fail fast on HTTP errors
# so a bad response is not mistaken for a missing "files" key further down.
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()
data = response.json()
files = data["files"]

# Download the archive listed in the article metadata.
for file in files:
    if file["name"] == archive_name:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], archive_path)
        break
else:
    # No matching entry: stop here with a clear message instead of letting
    # zipfile fail below on a file that was never downloaded.
    raise FileNotFoundError(
        f"{archive_name!r} is not listed in figshare article {article_id}"
    )

# Unpack every member of the archive next to it.
with zipfile.ZipFile(archive_path, "r") as f:
    f.extractall(output_directory)

## Combine the data

In [4]:
import dask.dataframe as dd

# Lazily open a single model file to inspect its schema before combining.
# The path is built from output_directory so it follows the download location.
# assume_missing=True makes Dask read unspecified integer columns as floats,
# so missing values cannot break dtype inference in later partitions.
see = dd.read_csv(
    os.path.join(output_directory, "ACCESS-CM2_daily_rainfall_NSW.csv"),
    assume_missing=True,
)

see

Unnamed: 0_level_0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,object,float64,float64,float64,float64,float64
,...,...,...,...,...,...
,...,...,...,...,...,...


In [5]:
%%time
%%memit
combined_data = dd.from_pandas(pd.DataFrame({'time': [], "lat_min": [], "lat_max": [], "lon_min": [], "lon_max": [], "rain (mm/day)": [], "model": []}), npartitions=1)
for filename in os.listdir(output_directory):
    if filename[-4: ] == ".csv":
        model = filename.partition('_daily_rainfall')[0]
        ddf = dd.read_csv(output_directory + filename, assume_missing=True)
        if len(ddf.columns) == 2:
            ddf['lat_min'] = None
            ddf['lat_max'] = None
            ddf['lon_min'] = None
            ddf['lon_max'] = None
            ddf = ddf[['time', 'lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']]
        ddf["model"] = model
        combined_data = dd.concat([combined_data, ddf], axis=0)
        




peak memory: 113.21 MiB, increment: 9.02 MiB
CPU times: user 980 ms, sys: 102 ms, total: 1.08 s
Wall time: 2.54 s


In [6]:
%%time
%%memit
combined_data.to_csv(output_directory + "combined_data.csv")

peak memory: 1122.01 MiB, increment: 1010.62 MiB
CPU times: user 12min 19s, sys: 55.2 s, total: 13min 14s
Wall time: 12min 14s


In [7]:
# Reload the combined data from the files written above and discard the
# saved index column, which comes back as 'Unnamed: 0'.
combined_data = dd.read_csv(
    os.path.join(output_directory, "combined_data.csv/*")
).drop(columns=['Unnamed: 0'])

In [17]:
# Preview the first five rows, letting Dask look through up to 10 partitions to find them.
combined_data.head(n=5, npartitions=10)

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,0.0,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,0.0,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,0.0,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,0.0,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,0.0,MPI-ESM-1-2-HAM


### Discussion

**Run times and memory usage for combining the data with Dask, measured on each team member's machine:**

- peak memory: 1122.01 MiB, increment: 1010.62 MiB, CPU times: user 12min 19s, sys: 55.2 s, total: 13min 14s, Wall time: 12min 14s
- peak memory: 3246.38 MiB, increment: 3074.11 MiB, CPU times: user 13min 40s, sys: 1min 6s, total: 14min 47s, Wall time: 12min 48s
- peak memory: 986.83 MiB, increment: 816.16 MiB, CPU times: user 8min 22s, sys: 31 s, total: 8min 53s, Wall time: 8min 2s
- (TODO) peak memory:  MiB, increment:  MiB, CPU times: user min s, sys:  s, total: min s, Wall time: min s


In general, combining the data is expensive in both time and random access memory (RAM). As shown above, every machine needed more than 8 minutes of wall time, and peak memory ranged from roughly 1,000 MiB to more than 3,200 MiB.