In [1]:
# load libraries
import io
import os
import json
import glob
#import intake
import requests
import numpy as np
import pandas as pd
#import xarray as xr
from urllib.request import urlretrieve
#import proplot as pplot
#from joblib import Parallel, delayed
#import warnings
#warnings.filterwarnings("ignore")  # ignore some annoying matplotlib warnings
from memory_profiler import memory_usage
import zipfile

In [2]:
# Load the IPython extensions that later cells rely on:
#   rpy2.ipython     -> provides the %%R cell magic used in the R section
#   memory_profiler  -> provides the %memit / %%memit magics used for peak-memory measurements
%load_ext rpy2.ipython
%load_ext memory_profiler



### 3. Downloading the data

In [4]:
# Metadata needed to query the figshare API and to decide where downloads are stored
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"  # API endpoint queried for the article metadata
headers = {"Content-Type": "application/json"}  # sent with the metadata request
output_directory = "figshare/"  # relative directory that receives the downloaded and extracted files

In [5]:
# Fetch the article metadata from the figshare API.
# A timeout keeps the cell from hanging forever if the server does not answer.
response = requests.get(url, headers=headers, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error body and hitting a
# confusing KeyError on "files" below.
response.raise_for_status()
data = response.json()
files = data["files"]  # list of file records; the download cell reads their "name" and "download_url" keys

In [6]:
%%time
#download readme and data.zip files only
files_to_dl = ["README.md", "data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

Wall time: 4min 54s


In [7]:
%%time
#extract zip files to repo
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

Wall time: 17 s


### 4. Combining data CSVs

In [3]:
%%time
%memit

# Shows time that regular python takes to merge file
# Join all data together
import pandas as pd
use_cols = ["time",'lat_min','lat_max','lon_min', 'lon_max', 'rain (mm/day)']

files = glob.glob('./figshare/*.csv')
df_all = None

for file in files:
    filename = os.path.basename(file)
    
    if '_daily_rainfall_NSW.csv' in filename:
        print(f"Processing the file {filename}")
        model = filename.split('_daily_rainfall_NSW.csv')[0]

        df = pd.read_csv(file, usecols=use_cols, index_col=0)
        df['model'] = model    

        if df_all is None:
            df_all = df
        else:
            df_all = df_all.append(df)

peak memory: 137.07 MiB, increment: 0.23 MiB
Processing the file ACCESS-CM2_daily_rainfall_NSW.csv
Processing the file ACCESS-ESM1-5_daily_rainfall_NSW.csv
Processing the file AWI-ESM-1-1-LR_daily_rainfall_NSW.csv
Processing the file BCC-CSM2-MR_daily_rainfall_NSW.csv
Processing the file BCC-ESM1_daily_rainfall_NSW.csv
Processing the file CanESM5_daily_rainfall_NSW.csv
Processing the file CMCC-CM2-HR4_daily_rainfall_NSW.csv
Processing the file CMCC-CM2-SR5_daily_rainfall_NSW.csv
Processing the file CMCC-ESM2_daily_rainfall_NSW.csv
Processing the file EC-Earth3-Veg-LR_daily_rainfall_NSW.csv
Processing the file FGOALS-f3-L_daily_rainfall_NSW.csv
Processing the file FGOALS-g3_daily_rainfall_NSW.csv
Processing the file GFDL-CM4_daily_rainfall_NSW.csv
Processing the file GFDL-ESM4_daily_rainfall_NSW.csv
Processing the file INM-CM4-8_daily_rainfall_NSW.csv
Processing the file INM-CM5-0_daily_rainfall_NSW.csv
Processing the file KIOST-ESM_daily_rainfall_NSW.csv
Processing the file MIROC6_dail

In [4]:
# Write the combined DataFrame (all models) to a single CSV next to the source files.
# The DataFrame index is written as the first column (pandas' to_csv default).
df_all.to_csv('./figshare/combined_data.csv')

In [5]:
%%sh
# report the on-disk size of the combined CSV in human-readable units
du -hs figshare/combined_data.csv

5.7G	figshare/combined_data.csv


**Observations**

Our team members had the following computer specs:  

| Team Member       | RAM     | Processor     |
| :------------- | :----------: | -----------: |
|  Cal | 16GB   | AMD Ryzen 5 3600 6-core    |
| Justin  | 32GB  | Intel i5 | 
| Anita   |  x |x  | 
| Yuan  |  x |x  | 

We used the default pandas writing method and found that processing time and peak memory usage varied by team member as follows:

| Team Member       |  Processing Time     | Peak Memory Usage |
| :------------- | :---------- | :-----------: |
|  Cal |  1min 13sec  | 137 MiB |
|  Justin |  1min 07sec  | 1593 MiB |
|  Anita |  xmin xsec  | x MiB |
|  Yuan |  xmin xsec  | x MiB |

### 5. Load the combined CSV to memory and perform a simple EDA

In [6]:
%%time
%%memit
df = pd.read_csv("figshare/combined_data.csv")
print(df["model"].value_counts())

MPI-ESM1-2-HR       5154240
CMCC-ESM2           3541230
NorESM2-MM          3541230
CMCC-CM2-SR5        3541230
CMCC-CM2-HR4        3541230
TaiESM1             3541230
SAM0-UNICON         3541153
GFDL-CM4            3219300
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
MPI-ESM-1-2-HAM      966420
AWI-ESM-1-1-LR       966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
peak memory: 11281.64 MiB, increment: 7587.66 MiB
Wall time: 1min 11s


In [7]:
%%time
%%memit
use_cols = ["time", "rain (mm/day)", "model"]
df = pd.read_csv("figshare/combined_data.csv", usecols = use_cols)
print(df["model"].value_counts())

MPI-ESM1-2-HR       5154240
CMCC-ESM2           3541230
NorESM2-MM          3541230
CMCC-CM2-SR5        3541230
CMCC-CM2-HR4        3541230
TaiESM1             3541230
SAM0-UNICON         3541153
GFDL-CM4            3219300
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
MPI-ESM-1-2-HAM      966420
AWI-ESM-1-1-LR       966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
peak memory: 11356.05 MiB, increment: 4572.77 MiB
Wall time: 51 s


*observations*

By loading only 3 of the original 7 columns of our combined CSV, we obtained the following results: 
- peak memory was not improved: 11,356 MiB (reduced columns) versus 11,282 MiB (all columns)
- incremental memory usage was reduced by about 40%: 4,573 MiB (reduced columns) versus 7,588 MiB (all columns)
- run time was moderately improved: 51 seconds (reduced columns) versus 1 min 11 seconds (all columns)

### 6. Perform a simple EDA in R

In [3]:
import pandas as pd
# pyarrow installation instructions: https://arrow.apache.org/docs/python/install.html
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.parquet as pq
# rpy2 installation instructions (conda-forge): https://anaconda.org/conda-forge/rpy2
import rpy2.rinterface
# rpy2-arrow: `pip install rpy2-arrow` (see https://pypi.org/project/rpy2-arrow/#description)
# The R arrow package is required as well: `conda install -c conda-forge r-arrow`
import rpy2_arrow.pyarrow_rarrow as pyra
# Feather reader/writer that ships with pyarrow
import pyarrow.feather as feather

In [4]:
%%R
# Confirm that the R packages used below (arrow, dplyr) are installed and can be attached
library(arrow)
library(dplyr)

R[write to console]: 
Attaching package: 'dplyr'


R[write to console]: The following objects are masked from 'package:stats':

    filter, lag


R[write to console]: The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




In [5]:
%%time
%%memit
# Open the combined CSV as a pyarrow dataset
# (dataset documentation: https://arrow.apache.org/docs/python/dataset.html)
dataset = ds.dataset("figshare/combined_data.csv", format="csv")
# Materialise the dataset as an in-memory Arrow table; the next cell writes it out as Feather
table = dataset.to_table()

peak memory: 4137.67 MiB, increment: 3874.96 MiB
Wall time: 29.3 s


In [6]:
%%time
# Experiment: write the Arrow table to disk in Feather format
feather.write_feather(table, 'figshare/feather')
# NOTE: this call failed on the author's Windows machine with WinError 1224
# ("file with a user-mapped section open"), as shown in the output below.
# NOTE(review): presumably an existing 'figshare/feather' was still memory-mapped
# by another reader when the write was attempted -- confirm before relying on this cell.

OSError: [WinError 1224] Failed to open local file 'figshare/feather'. Detail: [Windows error 1224] The requested operation cannot be performed on a file with a user-mapped section open.


In [None]:
%%time
%%R
# Measure how long R takes to read the Feather file written from Python and to
# count the number of rows per model.
# NOTE: this cannot run until 'figshare/feather' exists, i.e. until the Feather
# write in the previous code chunk succeeds.

# Attach packages before starting the timer so that package loading is not
# counted as part of the read + count time.
library(arrow)
library(dplyr)

start_time <- Sys.time()
r_table <- arrow::read_feather("figshare/feather")
print(class(r_table))
result <- r_table %>% count(model)
end_time <- Sys.time()

print(result)
print(end_time - start_time)

**observation**

The following table shows how the time and peak memory needed to read the data and perform a simple EDA (i.e., counting the number of observations per model) varied across team members when using default pandas versus Feather with R:

| Team Member       | Processing Time (Feather vs Pandas)  | Peak Memory Usage (Feather vs Pandas)  |
| :------------- | :----------: | -----------: |
|  Cal |  x  | x    |
| Justin  | x  | x | 
| Anita   |  x |x  | 
| Yuan  |  x |x  | 