In [25]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
from memory_profiler import memory_usage

import pyarrow.dataset as ds
import rpy2_arrow.pyarrow_rarrow as pyra
import pyarrow.feather as feather
import pyarrow.parquet as pq

In [26]:
%load_ext rpy2.ipython
%load_ext memory_profiler

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


### Loading Datasets into Different Formats

In [27]:
%%time
%%memit

#loading datasets

dataset = ds.dataset("figshare_rainfall/combined_data.csv", format="csv")
## this is of arrow table format
table = dataset.to_table()

peak memory: 851.74 MiB, increment: 772.18 MiB
CPU times: user 26.7 s, sys: 23.6 s, total: 50.4 s
Wall time: 57.4 s


In [28]:
%%time
# writing to feather format
feather.write_feather(table, 'figshare_rainfall/combined.feather')

CPU times: user 6.21 s, sys: 15.6 s, total: 21.8 s
Wall time: 15.9 s


In [29]:
%%time
## writing as a single parquet 
pq.write_table(table, 'figshare_rainfall/combined.parquet')

CPU times: user 12.9 s, sys: 2.36 s, total: 15.3 s
Wall time: 17.6 s


### Size of Files

In [30]:
%%sh
# I am just seeing the size of the csv data
du -sh figshare_rainfall/combined_data.csv
# I am just seeing the size of the feather data
du -sh figshare_rainfall/combined.feather
# I am just seeing the size of the parquet data
du -sh figshare_rainfall/combined.parquet

5.6G	figshare_rainfall/combined_data.csv
1.0G	figshare_rainfall/combined.feather
544M	figshare_rainfall/combined.parquet


### Transfering to R and trying to do simple count(model)

In [31]:
%%time
%%memit
## Here we are loading the arrow dataframe that we have loaded previously

r_table = pyra.converter.py2rpy(table)

5695
rarrow.ChunkedArray: 0.0647420883178711
5695
rarrow.ChunkedArray: 0.03989720344543457
5695
rarrow.ChunkedArray: 0.03466081619262695
5695
rarrow.ChunkedArray: 0.04118704795837402
5695
rarrow.ChunkedArray: 0.02564525604248047
5695
rarrow.ChunkedArray: 0.02684497833251953
5695
rarrow.ChunkedArray: 0.03167581558227539
peak memory: 4089.88 MiB, increment: 107.16 MiB
CPU times: user 29.9 s, sys: 3.54 s, total: 33.4 s
Wall time: 37.8 s


In [32]:
%%time
%%R -i r_table

## arrow Speed
start_time <- Sys.time()
print(class(r_table))
##add details on collect here
library(dplyr)
# Arrow speed
result <- r_table %>% collect() %>% count(model)
print(class(r_table %>% collect()))
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "Table"       "ArrowObject" "R6"         
[1] "tbl_df"     "tbl"        "data.frame"
[90m# A tibble: 27 x 2[39m
   model                  n
   [3m[90m<chr>[39m[23m              [3m[90m<int>[39m[23m
[90m 1[39m ACCESS-CM2       1[4m9[24m[4m3[24m[4m2[24m840
[90m 2[39m ACCESS-ESM1-5    1[4m6[24m[4m1[24m[4m0[24m700
[90m 3[39m AWI-ESM-1-1-LR    [4m9[24m[4m6[24m[4m6[24m420
[90m 4[39m BCC-CSM2-MR      3[4m0[24m[4m3[24m[4m5[24m340
[90m 5[39m BCC-ESM1          [4m5[24m[4m5[24m[4m1[24m880
[90m 6[39m CanESM5           [4m5[24m[4m5[24m[4m1[24m880
[90m 7[39m CMCC-CM2-HR4     3[4m5[24m[4m4[24m[4m1[24m230
[90m 8[39m CMCC-CM2-SR5     3[4m5[24m[4m4[24m[4m1[24m230
[90m 9[39m CMCC-ESM2        3[4m5[24m[4m4[24m[4m1[24m230
[90m10[39m EC-Earth3-Veg-LR 3[4m0[24m[4m3[24m[4m7[24m320
[90m# … with 17 more rows[39m
Time difference of 39.74731 secs
CPU times: user 14.2 s, sys: 23.9 s, total: 38.1 s
Wall time: 40.6 s


In [33]:
%%time
%%R

### Feather speed

library(arrow)
start_time <- Sys.time()
r_table <- arrow::read_feather("figshare_rainfall/combined.feather")
print(class(r_table))
library(dplyr)
result <- r_table %>% count(model)
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "tbl_df"     "tbl"        "data.frame"
[90m# A tibble: 27 x 2[39m
   model                  n
   [3m[90m<chr>[39m[23m              [3m[90m<int>[39m[23m
[90m 1[39m ACCESS-CM2       1[4m9[24m[4m3[24m[4m2[24m840
[90m 2[39m ACCESS-ESM1-5    1[4m6[24m[4m1[24m[4m0[24m700
[90m 3[39m AWI-ESM-1-1-LR    [4m9[24m[4m6[24m[4m6[24m420
[90m 4[39m BCC-CSM2-MR      3[4m0[24m[4m3[24m[4m5[24m340
[90m 5[39m BCC-ESM1          [4m5[24m[4m5[24m[4m1[24m880
[90m 6[39m CanESM5           [4m5[24m[4m5[24m[4m1[24m880
[90m 7[39m CMCC-CM2-HR4     3[4m5[24m[4m4[24m[4m1[24m230
[90m 8[39m CMCC-CM2-SR5     3[4m5[24m[4m4[24m[4m1[24m230
[90m 9[39m CMCC-ESM2        3[4m5[24m[4m4[24m[4m1[24m230
[90m10[39m EC-Earth3-Veg-LR 3[4m0[24m[4m3[24m[4m7[24m320
[90m# … with 17 more rows[39m
Time difference of 1.391523 mins
CPU times: user 14.4 s, sys: 35.4 s, total: 49.9 s
Wall time: 1min 23s


In [34]:
%%time
%%R

### Parquet speed

library(arrow)
start_time <- Sys.time()
r_table <- arrow::read_parquet("figshare_rainfall/combined.parquet")
print(class(r_table))
library(dplyr)
result <- r_table %>% count(model)
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "tbl_df"     "tbl"        "data.frame"
[90m# A tibble: 27 x 2[39m
   model                  n
   [3m[90m<chr>[39m[23m              [3m[90m<int>[39m[23m
[90m 1[39m ACCESS-CM2       1[4m9[24m[4m3[24m[4m2[24m840
[90m 2[39m ACCESS-ESM1-5    1[4m6[24m[4m1[24m[4m0[24m700
[90m 3[39m AWI-ESM-1-1-LR    [4m9[24m[4m6[24m[4m6[24m420
[90m 4[39m BCC-CSM2-MR      3[4m0[24m[4m3[24m[4m5[24m340
[90m 5[39m BCC-ESM1          [4m5[24m[4m5[24m[4m1[24m880
[90m 6[39m CanESM5           [4m5[24m[4m5[24m[4m1[24m880
[90m 7[39m CMCC-CM2-HR4     3[4m5[24m[4m4[24m[4m1[24m230
[90m 8[39m CMCC-CM2-SR5     3[4m5[24m[4m4[24m[4m1[24m230
[90m 9[39m CMCC-ESM2        3[4m5[24m[4m4[24m[4m1[24m230
[90m10[39m EC-Earth3-Veg-LR 3[4m0[24m[4m3[24m[4m7[24m320
[90m# … with 17 more rows[39m
Time difference of 38.89887 secs
CPU times: user 14.5 s, sys: 21.8 s, total: 36.3 s
Wall time: 39 s


### Decision

Our group chose to go with parquet to transfer the dataframe from Python to R. The file size for our combined data in parquet format was 544mb compared to 5.6GB and 1.0GB for csv and feather formats respectively. Meanwhile in the task of reading the file in R and running a simple `count(model)`, in parquet it had a wall time of 39 seconds which was compared to 40.6 seconds for arrow format and 1 minute and 23 seconds with feather format. 

While speed wise it seems that parquet and arrow do not differ too much, when we consider the file size of a parquet file compared to a csv it becomes the parquet format seems to be the better option.  Thus we decided to go with the parquet format to transfer the dataframe in R. 

We did not consider simply importing the pandas dataframe through `%%R -i` because of the amount of serialization and deserialization that would have to be done when transferring it from Python to R.

In [35]:
%%time
%%R -i r_table

## arrow Speed - running summary()
start_time <- Sys.time()
print(class(r_table))
##add details on collect here
library(dplyr)
# Arrow speed
result <- r_table %>% collect() %>% summary()
print(class(r_table %>% collect()))
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "Table"       "ArrowObject" "R6"         
[1] "tbl_df"     "tbl"        "data.frame"
      time                        lat_min           lat_max      
 Min.   :1888-12-31 16:00:00   Min.   :-36       Min.   :-36.00  
 1st Qu.:1920-07-02 04:00:00   1st Qu.:-35       1st Qu.:-33.66  
 Median :1952-01-01 04:00:00   Median :-33       Median :-32.04  
 Mean   :1952-01-01 08:32:08   Mean   :-33       Mean   :-31.98  
 3rd Qu.:1983-07-02 05:00:00   3rd Qu.:-31       3rd Qu.:-30.16  
 Max.   :2014-12-31 04:00:00   Max.   :-30       Max.   :-27.91  
                               NA's   :3219300                   
    lon_min           lon_max      rain (mm/day)        model          
 Min.   :141       Min.   :141.2   Min.   :  0       Length:62467843   
 1st Qu.:143       1st Qu.:145.0   1st Qu.:  0       Class :character  
 Median :147       Median :148.1   Median :  0       Mode  :character  
 Mean   :147       Mean   :148.2   Mean   :  2                         
 3rd Qu.:150       3rd 

In [36]:
%%time
%%R

### Feather speed - running summary()

library(arrow)
start_time <- Sys.time()
r_table <- arrow::read_feather("figshare_rainfall/combined.feather")
print(class(r_table))
library(dplyr)
result <- r_table %>% summary()
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "tbl_df"     "tbl"        "data.frame"
      time                        lat_min           lat_max      
 Min.   :1888-12-31 16:00:00   Min.   :-36       Min.   :-36.00  
 1st Qu.:1920-07-02 04:00:00   1st Qu.:-35       1st Qu.:-33.66  
 Median :1952-01-01 04:00:00   Median :-33       Median :-32.04  
 Mean   :1952-01-01 08:32:08   Mean   :-33       Mean   :-31.98  
 3rd Qu.:1983-07-02 05:00:00   3rd Qu.:-31       3rd Qu.:-30.16  
 Max.   :2014-12-31 04:00:00   Max.   :-30       Max.   :-27.91  
                               NA's   :3219300                   
    lon_min           lon_max      rain (mm/day)        model          
 Min.   :141       Min.   :141.2   Min.   :  0       Length:62467843   
 1st Qu.:143       1st Qu.:145.0   1st Qu.:  0       Class :character  
 Median :147       Median :148.1   Median :  0       Mode  :character  
 Mean   :147       Mean   :148.2   Mean   :  2                         
 3rd Qu.:150       3rd Qu.:151.3   3rd Qu.:  1                       

In [37]:
%%time
%%R

### Parquet speed - running summary()

library(arrow)
start_time <- Sys.time()
r_table <- arrow::read_parquet("figshare_rainfall/combined.parquet")
print(class(r_table))
library(dplyr)
result <- r_table %>% summary()
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "tbl_df"     "tbl"        "data.frame"
      time                        lat_min           lat_max      
 Min.   :1888-12-31 16:00:00   Min.   :-36       Min.   :-36.00  
 1st Qu.:1920-07-02 04:00:00   1st Qu.:-35       1st Qu.:-33.66  
 Median :1952-01-01 04:00:00   Median :-33       Median :-32.04  
 Mean   :1952-01-01 08:32:08   Mean   :-33       Mean   :-31.98  
 3rd Qu.:1983-07-02 05:00:00   3rd Qu.:-31       3rd Qu.:-30.16  
 Max.   :2014-12-31 04:00:00   Max.   :-30       Max.   :-27.91  
                               NA's   :3219300                   
    lon_min           lon_max      rain (mm/day)        model          
 Min.   :141       Min.   :141.2   Min.   :  0       Length:62467843   
 1st Qu.:143       1st Qu.:145.0   1st Qu.:  0       Class :character  
 Median :147       Median :148.1   Median :  0       Mode  :character  
 Mean   :147       Mean   :148.2   Mean   :  2                         
 3rd Qu.:150       3rd Qu.:151.3   3rd Qu.:  1                       