In [1]:
import re
import os,sys,inspect
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
from memory_profiler import memory_usage
import dask.dataframe as dd

In [2]:
# Presumably enables the R magics from rpy2; no cell visible in this notebook
# uses them — NOTE(review): confirm this extension is still needed.
%load_ext rpy2.ipython
# Provides the %memit magic used in the Pandas combine cell below.
%load_ext memory_profiler



In [3]:
# Resolve the project folders relative to the notebook's working directory.
# inspect.getfile(inspect.currentframe()) is not reliable inside a notebook:
# the "file" of a cell is a kernel-generated pseudo-path (on recent kernels it
# lives in a temporary directory), so it does not point at the notebook folder.
# The kernel's working directory is used instead.
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)  # this refers to the project root folder
raw_folder = parentdir + "/data/raw/"
processed_folder = parentdir + "/data/processed/"

# exist_ok=True makes the cell safe to re-run without a separate existence check
for folder in (raw_folder, processed_folder):
    os.makedirs(folder, exist_ok=True)

combined_file = processed_folder + "combined_data.csv"

files_to_dl = ["data.zip"]  # need only this zip file

# avoid re-loading the data if the file already exists locally
force_download = False  # set to True to re-download and unzip the file

# Download the data
<hr>

In [4]:
# Fetch the figshare article metadata; its "files" entry lists every
# downloadable file together with its name and download URL.
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Fail here with a clear HTTP error instead of a confusing KeyError/JSON error below.
response.raise_for_status()
data = response.json()  # this contains all the articles data, feel free to check it out
files = data["files"]   # this is just the data about the files, which is what we want

In [5]:
%%time
for file in files:
    if file["name"] in files_to_dl:
        if (force_download or not os.path.exists(raw_folder + file["name"])):
            os.makedirs(raw_folder, exist_ok=True) # create the folder if not exists
            urlretrieve(file["download_url"], raw_folder + file["name"])

Wall time: 0 ns


In [6]:
%%time
# extract the zip file
n_files = len(os.listdir(raw_folder))
if (force_download or n_files != 31): # if we must unzip the latest downloaded file or the file was not unzipped
    with zipfile.ZipFile(os.path.join(raw_folder, "data.zip"), 'r') as f:
        f.extractall(raw_folder)

Wall time: 26.2 s


# Combine the data
<hr>

## Combine the data using Pandas

In [7]:
%%time
%memit
import pandas as pd
use_cols = ["time", "lat_min", "lat_max", "lon_min", "lon_max", "rain (mm/day)"]
files = glob.glob(raw_folder + '*NSW.csv') # exclude observed_daily_rainfall_SYD

df = pd.concat((pd.read_csv(file, index_col=0, usecols=use_cols)
                .assign(model=file[max(file.rfind('/'), file.rfind('\\'))+1:file.index("_daily")])
                for file in files)
              )
df.to_csv(combined_file)

peak memory: 149.07 MiB, increment: 0.00 MiB
Wall time: 9min 47s


In [8]:
# file size in bytes divided by 2**30
combined_size = os.path.getsize(combined_file) / (2**30)
print("Size of the combined file:", combined_size, "GB")

Size of the combined file: 5.618728716857731 GB


In [9]:
%%time
# Time how long Pandas takes to read the whole combined CSV back from disk.
df = pd.read_csv(combined_file)

Wall time: 1min 57s


In [10]:
# (number of rows, number of columns) of the reloaded combined data
print(df.shape)

(62467843, 7)


In [11]:
# Preview the first rows of the reloaded combined data.
df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13,ACCESS-CM2
1,1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
2,1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
3,1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
4,1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,0.01047658,ACCESS-CM2


## Combine the data using Dask

In [12]:
# NOTE: this cell is intentionally disabled — see the Observations section:
# there was not enough disk space for a second combined file, and combining
# with Dask took longer than with Pandas in a previous run.
# %%time
# %%memit
# dask_combined_file = processed_folder + "dask_combined_data.csv"
# ddf = dd.read_csv(raw_folder + '*NSW.csv', assume_missing=True, usecols=use_cols, include_path_column=True)
# ddf.to_csv(dask_combined_file, single_file=True)

In [13]:
# print("Size of the combined file:", os.path.getsize(dask_combined_file)/(2**30), "GB")

In [14]:
%%time
# Open the same combined CSV with Dask and preview it, to compare the wall
# time with the Pandas read of the full file above.
ddf = dd.read_csv(combined_file)
ddf.head()

Wall time: 1.63 s


Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13,ACCESS-CM2
1,1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
2,1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
3,1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
4,1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,0.01047658,ACCESS-CM2


# Observations
<hr>

Our team have **downloaded the data** and **executed the data combination script using `Pandas` on 4 laptops**. Below are the detailed results:

OS|CPU|RAM|Runtime|Peak memory (`%memit`)
--|---|------|-------|------
macOS BigSur|2.7 GHz Dual-Core Intel Core i5|16 GB|CPU times: user 7min 37s, sys: 24.5 s, total: 8min 1s. Wall time: 8min 35s|peak memory: 155.76 MiB, increment: 0.04 MiB
Windows 10 Education Insider Preview|Intel(R) Core(TM) i7-10510U CPU @ 1.80GHz 2.30 GHz|16 GB|Wall time: 8min 55s|peak memory: 138.60 MiB, increment: 0.25 MiB
macOS Catalina version 10.15.7|2.4 GHz Quad-Core Intel Core i5|8 GB|CPU times: user 5min 45s, sys: 21.5 s, total: 6min 7s. Wall time: 6min 15s|peak memory: 125.83 MiB, increment: 0.25 MiB
Windows 10 Education|Intel(R) Core(TM) i7-8550U CPU @ 1.80 GHz 1.99 GHz| 16 GB|Wall time: 10min 11s|peak memory: 140.70 MiB, increment: 0.20 MiB

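As we can see from the result table above, the **runtime varied across machines** depending on their configurations, but they **all took a considerable amount of time and memory**. Note, however, that the memory figures in the table were recorded with a bare `%memit` line, which profiles an empty statement (hence the near-zero increments), so they understate the memory actually needed to hold the combined data. It is noteworthy that: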

* Due to limited hard drive space, we had to comment out the script used for **combining the data** files with `Dask`; moreover, in a previous execution, using `Dask` to **combine these files actually took more time** than using `Pandas`.

* `Dask`'s `read_csv` function is, however, much **faster** than `Pandas`'s, because `Pandas` **loads the whole data object into memory** whilst `Dask` works lazily: it **loads data in chunks** (here, only what `head()` needs) and can process the chunks in parallel.

**In conclusion, merging csv files into one giant file and loading it every time is clearly not an efficient way to work with big data files.**