In [1]:
import re
import os,sys,inspect
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
from memory_profiler import memory_usage

In [9]:
%load_ext rpy2.ipython
%load_ext memory_profiler

# Download the data

In [2]:
article_id = 14096681  # unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces an HTTP error here instead of as a confusing
# JSON/KeyError failure on the lines below.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()  # the article's full metadata, feel free to check it out
files = data["files"]   # just the per-file metadata, which is what we want
files

[{'is_link_only': False,
  'name': 'daily_rainfall_2014.png',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'id': 26579150,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'size': 58863},
 {'is_link_only': False,
  'name': 'environment.yml',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'id': 26579171,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'size': 192},
 {'is_link_only': False,
  'name': 'README.md',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'id': 26586554,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'size': 5422},
 {'is_link_only': False,
  'name': 'data.zip',
  'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'id': 26766812,
  'download_url': 'https://

In [3]:
# get the download folder
# Use the kernel's working directory (the folder the notebook is run from).
# Asking `inspect` for the current frame's file is unreliable inside a notebook:
# the "file" is the kernel's synthetic cell name, which on some kernels is a
# temp-file path, so the project root would resolve to the temp directory.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir) # this refers to the project root folder
# Keep the trailing "/" – later cells build paths by plain string concatenation.
output_directory = parentdir + "/data/raw/"

In [4]:
%%time
files_to_dl = ["data.zip"] # need only this zip file
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True) # create the folder if not exists
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 5.57 s, sys: 5.74 s, total: 11.3 s
Wall time: 2min 43s


In [5]:
%%time
# extract the zip file
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: user 18.5 s, sys: 3.45 s, total: 22 s
Wall time: 22.3 s


# Combine the data

In [6]:
# Folder and file path for the combined output.
processed_folder = parentdir + "/data/processed/"
# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(processed_folder, exist_ok=True)
combined_file = processed_folder + "combined_data.csv"

## Combine the data using Pandas

In [12]:
%%time
%memit
import pandas as pd
use_cols = ["time", "lat_min", "lat_max", "lon_min", "lon_max", "rain (mm/day)"]
files = glob.glob(output_directory + '*NSW.csv') # exclude observed_daily_rainfall_SYD

df = pd.concat((pd.read_csv(file, index_col=0, usecols=use_cols)
                .assign(model=file[file.rfind("/")+1:file.index("_daily")])
                for file in files)
              )
df.to_csv(combined_file)

peak memory: 323.93 MiB, increment: 0.00 MiB
CPU times: user 7min 6s, sys: 22.6 s, total: 7min 29s
Wall time: 7min 47s


In [39]:
# Report the on-disk size of the combined CSV; bytes are divided by 2**30 before printing.
size_in_bytes = os.path.getsize(combined_file)
print("Size of the combined file:", size_in_bytes/(2**30), "GB")

Size of the combined file: 5.560551003552973 GB


In [31]:
%%time
# Read the combined CSV back into a pandas DataFrame; %%time reports how long the load takes.
df = pd.read_csv(combined_file)

CPU times: user 1min 4s, sys: 15.2 s, total: 1min 19s
Wall time: 1min 23s


In [32]:
# Show the (rows, columns) dimensions of the loaded frame.
print(df.shape)

(62467843, 7)


In [33]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


## Combine the data using Dask

In [40]:
import dask.dataframe as dd

In [41]:
%%time
%%memit

ddf = dd.read_csv(output_directory + '*NSW.csv',assume_missing=True,usecols=use_cols)
ddf.to_csv(combined_file, single_file=True)

peak memory: 8310.02 MiB, increment: 4564.62 MiB
CPU times: user 9min 16s, sys: 31.4 s, total: 9min 47s
Wall time: 9min 10s


In [42]:
%%time
# Point a Dask dataframe at the combined CSV.
# NOTE(review): the near-instant timing presumably reflects Dask deferring the actual read – confirm.
ddf = dd.read_csv(combined_file)

CPU times: user 17.1 ms, sys: 18.4 ms, total: 35.4 ms
Wall time: 33.8 ms
