In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
from memory_profiler import memory_usage

In [None]:
# Jupyter Lab cell extensions
# rpy2.ipython provides the %%R cell magic used by the R cells later in this notebook.
%load_ext rpy2.ipython 
# memory_profiler provides the %memit / %%memit magics used to report memory usage.
%load_ext memory_profiler 

## 3. Downloading the data

In [2]:
# Which Figshare article to pull, and the local folder the download lands in.
article_id = 14096681
output_directory = "figsharedailyrain/"

# Request settings for the Figshare v2 article endpoint.
headers = {"Content-Type": "application/json"}
url = f"https://api.figshare.com/v2/articles/{article_id}"

In [3]:
# Fetch the article's metadata from the Figshare API.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports an HTTP error here rather than letting it
# surface later as a confusing KeyError on "files".
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()
data = response.json()  # parsed JSON body describing the article
files = data["files"]   # one dict per file; entries carry "name" and "download_url"
files

[{'is_link_only': False,
  'name': 'daily_rainfall_2014.png',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'id': 26579150,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'size': 58863},
 {'is_link_only': False,
  'name': 'environment.yml',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'id': 26579171,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'size': 192},
 {'is_link_only': False,
  'name': 'README.md',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'id': 26586554,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'size': 5422},
 {'is_link_only': False,
  'name': 'data.zip',
  'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'id': 26766812,
  'download_url': 'https://

In [4]:
# Download only the files listed in files_to_dl into output_directory.
files_to_dl = ["data.zip"]
# Creating the target directory does not depend on the loop, so do it once.
os.makedirs(output_directory, exist_ok=True)
for file in files:
    if file["name"] in files_to_dl:
        # os.path.join (as used when extracting the archive) does not rely on
        # output_directory ending with a path separator.
        urlretrieve(file["download_url"], os.path.join(output_directory, file["name"]))

In [5]:
# Unpack the downloaded archive into the same directory it was saved in.
zip_path = os.path.join(output_directory, "data.zip")
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=output_directory)

## 4. Combining data CSVs

In [6]:
# NOTE(review): memory_profiler is also loaded in the extensions cell near the top
# of the notebook; this second load is redundant when that cell has already run.
%load_ext memory_profiler

In [7]:
%%time
%memit
use_cols = ["time", "lat_min", "lat_max", "lon_min", "lon_max", "rain (mm/day)"]
files = glob.glob('figsharedailyrain/*.csv')
files[:] = [x for x in files if "observed" not in x]
df = pd.DataFrame(columns=use_cols)
df = pd.concat((pd.read_csv(file, index_col=0, usecols=use_cols)
                .assign(model=re.findall(r'[^\/|\\]+(?=\.)', file.replace('_daily_rainfall_NSW', ''))[0])
                for file in files)
              )  
df.to_csv("figsharedailyrain/combined_data.csv")

peak memory: 91.42 MiB, increment: 0.28 MiB
Wall time: 6min 17s


## 5. Load the combined CSV to memory and perform a simple EDA

In [8]:
%%sh
# Report the on-disk size of the combined CSV in human-readable units
du -sh figsharedailyrain/combined_data.csv

5.7G	figsharedailyrain/combined_data.csv


In [9]:
%%time
# Read the whole combined CSV back into memory as a pandas DataFrame (rebinds df)
df = pd.read_csv("figsharedailyrain/combined_data.csv")

Wall time: 1min 1s


In [10]:
# Show the (rows, columns) shape of the loaded frame
print(df.shape)

(62467843, 7)


In [11]:
# Preview the first few rows of the combined data
df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13,ACCESS-CM2
1,1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
2,1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
3,1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
4,1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,0.01047658,ACCESS-CM2


In [12]:
%%time
# Number of rows contributed by each model, timed for comparison with the R version
print(df["model"].value_counts())

MPI-ESM1-2-HR       5154240
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
NorESM2-MM          3541230
TaiESM1             3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-ESM4           3219300
GFDL-CM4            3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NESM3                966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
Wall time: 4.48 s


In [13]:
# Summarise column dtypes and the frame's in-memory size
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62467843 entries, 0 to 62467842
Data columns (total 7 columns):
 #   Column         Dtype  
---  ------         -----  
 0   time           object 
 1   lat_min        float64
 2   lat_max        float64
 3   lon_min        float64
 4   lon_max        float64
 5   rain (mm/day)  float64
 6   model          object 
dtypes: float64(5), object(2)
memory usage: 3.3+ GB


## 6. Perform a simple EDA in R

Pick an approach to transfer the dataframe from python to R.
* Parquet file
* Feather file (We chose this one)
* Pandas exchange
* Arrow exchange

### Reasons to choose **feather**

The team consulted the lecture notes and online resources to compare the four approaches above and ultimately chose **Feather**. The reasons are as follows:

- **Feather** is faster than `parquet file` and `arrow exchange` when writing files, since it stores the data with less serialization and deserialization, leading to a higher I/O speed.
- `parquet file` has the advantage of using less storage space. However, **Feather** is more appropriate here because this use case prioritizes speed over storage size.
- **Feather** fits well with the R programming language, since R has a well-integrated API for reading and writing Feather data.
- The team reviewed online articles that compare these four approaches using experiments and benchmarking results, and concluded that **Feather** is the ideal choice.
- **Feather** works well across Jupyter notebook sessions. It also speeds up data queries without taking much space on disk, and no unpacking is needed when loading the data back into RAM.

### 6.1 Use Feather to transfer the dataframe from Python to R 

In [None]:
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.parquet as pq
import rpy2.rinterface
import rpy2_arrow.pyarrow_rarrow as pyra
import pyarrow.feather as feather

In [None]:
%%R
library(arrow)
library(dplyr)
library(tidyr)

In [None]:
%%time
%%memit
# Open the combined CSV as a pyarrow dataset, then read it into an Arrow table
dataset = ds.dataset("figsharedailyrain/combined_data.csv", format="csv")
table = dataset.to_table()

In [None]:
%%time
# Write the Arrow table to disk in Feather format so the R cells can read it
feather.write_feather(table, 'figsharedailyrain/combined_data.feather')

### 6.2 Perform a simple EDA in R

In [None]:
%%time
%%R

# Time how long it takes to read the Feather file and count rows per model
# (the measured span below covers both the read and the count, not the read alone)
start_time <- Sys.time()
r_table <- arrow::read_feather("figsharedailyrain/combined_data.feather")
print(class(r_table))

# Count the number of rows for each model
result <- r_table %>% count(model) 
end_time <- Sys.time()

print(end_time - start_time)
print(result)

In [None]:
%%R

# Count the number of rows for each distinct value of the time column
result <- r_table %>% count(time) 
print(result)

### Discussion

TODO