# 1. Downloading the data

In [51]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import pyarrow.dataset as ds
import rpy2_arrow.pyarrow_rarrow as pyra
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [2]:
article_id = 14096681
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "rainfall/"

In [3]:
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)
files = data["files"]
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

In [4]:
%%time
files_to_dl = ["data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 4.73 s, sys: 6.15 s, total: 10.9 s
Wall time: 2min 26s


In [5]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: user 16.6 s, sys: 2.01 s, total: 18.6 s
Wall time: 18.9 s


# 2. Combining data CSVs

In [10]:
path = "rainfall/observed_daily_rainfall_SYD.csv"
if(os.path.exists(path) and os.path.isfile(path)):
    os.remove(path)

In [11]:
%%time
files = glob.glob("rainfall/*.csv")
df = pd.concat((pd.read_csv(file, index_col=0).assign(model=re.findall(r"[^\/]+(?=_daily)", file)[0]) for file in files))
df.to_csv("rainfall/combined_data.csv")

CPU times: user 5min 42s, sys: 11 s, total: 5min 53s
Wall time: 5min 54s


# 3. Loading the combined CSV to memory and performing a simple EDA

In [57]:
data = pd.read_csv("rainfall/combined_data.csv", index_col=0, parse_dates=True)
data.head()

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


## 3.1. Changing dtype of your data

In [29]:
print(f"Memory usage with float64: {data[['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']].memory_usage().sum() / 1e6:.2f} MB")
print(f"Memory usage with float32: {data[['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']].astype('float32', errors='ignore').memory_usage().sum() / 1e6:.2f} MB")

Memory usage with float64: 2998.46 MB
Memory usage with float32: 1749.10 MB


In [30]:
%%time
data["model"].value_counts()

CPU times: user 2.88 s, sys: 17 ms, total: 2.9 s
Wall time: 2.89 s


MPI-ESM1-2-HR       5154240
CMCC-CM2-HR4        3541230
CMCC-ESM2           3541230
CMCC-CM2-SR5        3541230
NorESM2-MM          3541230
TaiESM1             3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
GFDL-CM4            3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
FGOALS-g3           1287720
KIOST-ESM           1287720
AWI-ESM-1-1-LR       966420
MPI-ESM1-2-LR        966420
NESM3                966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64

In [31]:
data_dtype = data.copy()
data_dtype[['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']] = data_dtype[['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']].astype('float32', errors='ignore')

In [32]:
%%time
data_dtype["model"].value_counts()

CPU times: user 2.92 s, sys: 18 ms, total: 2.94 s
Wall time: 2.93 s


MPI-ESM1-2-HR       5154240
CMCC-CM2-HR4        3541230
CMCC-ESM2           3541230
CMCC-CM2-SR5        3541230
NorESM2-MM          3541230
TaiESM1             3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
GFDL-CM4            3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
FGOALS-g3           1287720
KIOST-ESM           1287720
AWI-ESM-1-1-LR       966420
MPI-ESM1-2-LR        966420
NESM3                966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64

## 3.2. Loading just columns we want

In [34]:
%%time 
use_cols = ["model"]
data_col = pd.read_csv("rainfall/combined_data.csv",usecols=use_cols)
data_col["model"].value_counts()

CPU times: user 28 s, sys: 2.03 s, total: 30 s
Wall time: 30.1 s


MPI-ESM1-2-HR       5154240
CMCC-CM2-HR4        3541230
CMCC-ESM2           3541230
CMCC-CM2-SR5        3541230
NorESM2-MM          3541230
TaiESM1             3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
GFDL-CM4            3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
FGOALS-g3           1287720
KIOST-ESM           1287720
AWI-ESM-1-1-LR       966420
MPI-ESM1-2-LR        966420
NESM3                966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64

## 3.3. Loading in chunks

In [22]:
%%time
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("rainfall/combined_data.csv", chunksize=10_000_000):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)
print(counts.astype(int))

ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
AWI-ESM-1-1-LR       966420
BCC-CSM2-MR         3035340
BCC-ESM1             551880
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
CanESM5              551880
EC-Earth3-Veg-LR    3037320
FGOALS-f3-L         3219300
FGOALS-g3           1287720
GFDL-CM4            3219300
GFDL-ESM4           3219300
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
MIROC6              2070900
MPI-ESM-1-2-HAM      966420
MPI-ESM1-2-HR       5154240
MPI-ESM1-2-LR        966420
MRI-ESM2-0          3037320
NESM3                966420
NorESM2-LM           919800
NorESM2-MM          3541230
SAM0-UNICON         3541153
TaiESM1             3541230
dtype: int64
CPU times: user 54.2 s, sys: 6.17 s, total: 1min
Wall time: 1min


# 4. Perform a simple EDA in R

## 4.1. Parquet file

In [54]:
%%time
data.to_parquet("rainfall/combined_data.parquet")

CPU times: user 13.6 s, sys: 4.7 s, total: 18.3 s
Wall time: 15 s


In [45]:
%%R 
start_time <- Sys.time()
suppressMessages(library(dplyr))
suppressMessages(library(arrow))
df_parquet <- open_dataset("rainfall/combined_data.parquet")
result <- df_parquet %>% count(model)
end_time <- Sys.time()
print(result %>% collect())
print(end_time - start_time)

# A tibble: 27 × 2
   model                  n
   <chr>              <int>
 1 MPI-ESM-1-2-HAM   966420
 2 AWI-ESM-1-1-LR    966420
 3 EC-Earth3-Veg-LR 3037320
 4 MPI-ESM1-2-HR    5154240
 5 CMCC-CM2-SR5     3541230
 6 SAM0-UNICON      3541153
 7 NorESM2-LM        919800
 8 GFDL-ESM4        3219300
 9 CanESM5           551880
10 ACCESS-CM2       1932840
# … with 17 more rows
Time difference of 0.735821 secs


## 4.2. Feather file

In [55]:
%%time
data.reset_index().to_feather("rainfall/combined_data.feather")

CPU times: user 6.85 s, sys: 2.69 s, total: 9.55 s
Wall time: 6.73 s


In [50]:
%%R 
start_time <- Sys.time()
suppressMessages(library(dplyr))
df_feather <- read_feather("rainfall/combined_data.feather")
result <- df_feather %>% count(model)
end_time <- Sys.time()
print(result %>% collect())
print(end_time - start_time)

# A tibble: 27 × 2
   model                  n
   <chr>              <int>
 1 ACCESS-CM2       1932840
 2 ACCESS-ESM1-5    1610700
 3 AWI-ESM-1-1-LR    966420
 4 BCC-CSM2-MR      3035340
 5 BCC-ESM1          551880
 6 CanESM5           551880
 7 CMCC-CM2-HR4     3541230
 8 CMCC-CM2-SR5     3541230
 9 CMCC-ESM2        3541230
10 EC-Earth3-Veg-LR 3037320
# … with 17 more rows
Time difference of 8.920077 secs


## 4.3. Arrow Exchange

In [56]:
%%time
df_arrow = ds.dataset("rainfall/combined_data.csv", format="csv")
table = df_arrow.to_table()
r_table = pyra.converter.py2rpy(table)

CPU times: user 51.4 s, sys: 5.61 s, total: 57.1 s
Wall time: 58.3 s


In [53]:
%%R -i r_table
start_time <- Sys.time()
suppressMessages(library(dplyr))
result <- r_table %>% count(model)
end_time <- Sys.time()
print(result %>% collect())
print(end_time - start_time)

# A tibble: 27 × 2
   model                  n
   <chr>              <int>
 1 MPI-ESM-1-2-HAM   966420
 2 AWI-ESM-1-1-LR    966420
 3 NorESM2-LM        919800
 4 ACCESS-CM2       1932840
 5 FGOALS-f3-L      3219300
 6 CMCC-CM2-HR4     3541230
 7 MRI-ESM2-0       3037320
 8 GFDL-CM4         3219300
 9 BCC-CSM2-MR      3035340
10 EC-Earth3-Veg-LR 3037320
# … with 17 more rows
Time difference of 0.01436996 secs


The parquet file was fast to create and exchange. The feather file was fast to create, but the exchange was a bit long. The arrow csv was slow to create but very fast to exchange. I did not use pandas exchange because it is very slow. Among all the three approaches, I would pick parquet because overall it was the fastest in terms of creating and exchanging the file. 