In [1]:
%load_ext rpy2.ipython

In [269]:
import pandas as pd
import pyarrow.feather as feather
import pyarrow as pa
import pyarrow.parquet as pq
import rpy2_arrow.pyarrow_rarrow as pyra

In [3]:
# replace this with the combined dataframe

df = pd.read_csv("data/14096681/data/ACCESS-CM2_daily_rainfall_NSW.csv")
df2 = pd.read_csv("data/14096681/data/GFDL-CM4_daily_rainfall_NSW.csv")
df3 = pd.read_csv("data/14096681/data/observed_daily_rainfall_SYD.csv")

In [4]:
# add a column called model

df["model"] = "ACCESS-CM2_daily_rainfall_NSW"
df2["model"] = "GFDL-CM4_daily_rainfall_NSW"
df3["model"] = "observed_daily_rainfall_SYD"

In [5]:
df3

Unnamed: 0,time,rain (mm/day),model
0,1889-01-01,0.006612,observed_daily_rainfall_SYD
1,1889-01-02,0.090422,observed_daily_rainfall_SYD
2,1889-01-03,1.401452,observed_daily_rainfall_SYD
3,1889-01-04,14.869798,observed_daily_rainfall_SYD
4,1889-01-05,0.467628,observed_daily_rainfall_SYD
...,...,...,...
46015,2014-12-27,0.037472,observed_daily_rainfall_SYD
46016,2014-12-28,0.158061,observed_daily_rainfall_SYD
46017,2014-12-29,0.025719,observed_daily_rainfall_SYD
46018,2014-12-30,0.729390,observed_daily_rainfall_SYD


In [6]:
# merge dataframes 

merge = pd.concat([df, df2, df3])

In [7]:
# final merged dataframe 

merge

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13,ACCESS-CM2_daily_rainfall_NSW
1,1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.000000e+00,ACCESS-CM2_daily_rainfall_NSW
2,1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.000000e+00,ACCESS-CM2_daily_rainfall_NSW
3,1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.000000e+00,ACCESS-CM2_daily_rainfall_NSW
4,1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,1.047658e-02,ACCESS-CM2_daily_rainfall_NSW
...,...,...,...,...,...,...,...
46015,2014-12-27,,,,,3.747200e-02,observed_daily_rainfall_SYD
46016,2014-12-28,,,,,1.580613e-01,observed_daily_rainfall_SYD
46017,2014-12-29,,,,,2.571914e-02,observed_daily_rainfall_SYD
46018,2014-12-30,,,,,7.293899e-01,observed_daily_rainfall_SYD


In [258]:
%%time
%%R -i merge # change name for the actual df

# transfer dataframe to R as python dataframe

library(tidyverse)
library(here)
library(feather)
library(arrow)

start_time <- Sys.time()
result <- merge %>% count(model)
print(result)
end_time <- Sys.time()
print(end_time - start_time)

                          model       n
1 ACCESS-CM2_daily_rainfall_NSW 1932840
2   GFDL-CM4_daily_rainfall_NSW 3219300
3   observed_daily_rainfall_SYD   46020
Time difference of 0.213279 secs
CPU times: user 2min 9s, sys: 8.04 s, total: 2min 17s
Wall time: 2min 31s


In [8]:
%%time
# write the dataframe to feather format 

feather.write_feather(merge, 'data/final_data.feather')

In [265]:
%%time

# write the dataframe to arrow and then parquet format
# code adapted from source 1

table = pa.Table.from_pandas(merge)
pq.write_table(table, 'data/final_data.parquet')

CPU times: user 1.77 s, sys: 376 ms, total: 2.14 s
Wall time: 2.02 s


In [271]:
%%time

# write the dataframe to arrow format 
# code adapted from source 4 

final_table = pyra.converter.py2rpy(table)

1
rarrow.ChunkedArray: 0.025060176849365234
1
rarrow.ChunkedArray: 0.002795696258544922
1
rarrow.ChunkedArray: 0.0016739368438720703
1
rarrow.ChunkedArray: 0.0016710758209228516
1
rarrow.ChunkedArray: 0.0033178329467773438
1
rarrow.ChunkedArray: 0.002190113067626953
1
rarrow.ChunkedArray: 0.0043811798095703125
1
rarrow.ChunkedArray: 0.0029799938201904297
CPU times: user 18.9 ms, sys: 69.3 ms, total: 88.2 ms
Wall time: 307 ms


In [257]:
%%time
%%R 

# transfer dataframe to R as a feather

file_path = here("data", "final_data.feather")
start_time <- Sys.time()
df <- read_feather(file_path)
result <- df %>% count(model)
print(result)
end_time <- Sys.time()
print(end_time - start_time)

[90m# A tibble: 3 x 2[39m
  model                               n
[90m*[39m [3m[90m<chr>[39m[23m                           [3m[90m<int>[39m[23m
[90m1[39m ACCESS-CM2_daily_rainfall_NSW 1[4m9[24m[4m3[24m[4m2[24m840
[90m2[39m GFDL-CM4_daily_rainfall_NSW   3[4m2[24m[4m1[24m[4m9[24m300
[90m3[39m observed_daily_rainfall_SYD     [4m4[24m[4m6[24m020
Time difference of 2.220245 secs
CPU times: user 1.22 s, sys: 940 ms, total: 2.16 s
Wall time: 2.32 s


In [267]:
%%time
%%R 

# transfer dataframe to R as a parquet 

file_path = here("data", "final_data.parquet")
start_time <- Sys.time()
df <- read_parquet(file_path)
result <- df %>% count(model)
print(result)
end_time <- Sys.time()
print(end_time - start_time)

R[write to console]: 
Attaching package: ‘arrow’


R[write to console]: The following objects are masked from ‘package:feather’:

    read_feather, write_feather


R[write to console]: The following object is masked from ‘package:utils’:

    timestamp




[90m# A tibble: 3 x 2[39m
  model                               n
[90m*[39m [3m[90m<chr>[39m[23m                           [3m[90m<int>[39m[23m
[90m1[39m ACCESS-CM2_daily_rainfall_NSW 1[4m9[24m[4m3[24m[4m2[24m840
[90m2[39m GFDL-CM4_daily_rainfall_NSW   3[4m2[24m[4m1[24m[4m9[24m300
[90m3[39m observed_daily_rainfall_SYD     [4m4[24m[4m6[24m020
Time difference of 4.098216 secs
CPU times: user 2.26 s, sys: 1.38 s, total: 3.64 s
Wall time: 5.17 s


In [274]:
%%time
%%R -i final_table

# transfer dataframe to R from arrow 
# code adapted from source 4

start_time <- Sys.time()
result <- final_table %>% collect() %>% count(model)
print(result)
end_time <- Sys.time()
print(end_time - start_time)

[90m# A tibble: 3 x 2[39m
  model                               n
[90m*[39m [3m[90m<chr>[39m[23m                           [3m[90m<int>[39m[23m
[90m1[39m ACCESS-CM2_daily_rainfall_NSW 1[4m9[24m[4m3[24m[4m2[24m840
[90m2[39m GFDL-CM4_daily_rainfall_NSW   3[4m2[24m[4m1[24m[4m9[24m300
[90m3[39m observed_daily_rainfall_SYD     [4m4[24m[4m6[24m020
Time difference of 3.812246 secs
CPU times: user 1.79 s, sys: 731 ms, total: 2.53 s
Wall time: 3.87 s


In [268]:
%%sh

du -sh data/final_data.feather
du -sh data/final_data.parquet

112M	data/final_data.feather
 80M	data/final_data.parquet


**Reasoning**

From the exploration of the different memory and time usage, we can see that parquet store less memory then the feather files as inferred from the shell command above that tells us the directories' usage. From our research, we infer that this is due to the use of dictionary encoding and certain compressions that make this possible (2). Furthermore, we noticed the time it takes to do a basic query is much faster for parquet and feather files than passing the pandas dataframe through pandas exchange because it may be reading all the rows of the file to get the answer to our simple count query. However, parquet files store the metadata of the file and can easily access the columns since they are stored in a columnar format (3) and read the files without having to loop through everything. However, the time difference between feather and parquet files is minimal. We hypothesize that this may be due to the fact that our files are not large enough for us to see the difference. We have also considered using an arrow exchange to go from Python to R. In general, arrow is great for in-memory computing (5) and we noticed it was faster than parquet and feather files. It is also less expensive to write than the parquet file format (5). We have also found that parquet and arrow files are used together as a way of performing many operations in the arrow format and then storing the file as a parquet for long term archival storage (5). Both parquet and arrow files are easily integratable to Spark which will be used a tool for later milestones (2 & 5). But this arrow exchange only has select operations that can be performed in R and this process is still in development as mentioned in our 525 lecture 2 (4). Overall, the conversion to parquet file was easy and simple and the fastest while allowing us to stick to the normal operations in R to read the data into a dataframe again in R so because of this andthe fact that we don't anticipate no additional data that will be added at a later date, we will be using parquet as our final choice.

**References**

1. https://stackoverflow.com/questions/41066582/python-save-pandas-data-frame-to-parquet-file
2. https://stackoverflow.com/questions/48083405/what-are-the-differences-between-feather-and-parquet
3. https://luminousmen.com/post/big-data-file-formats
4. https://github.ubc.ca/MDS-2020-21/DSCI_525_web-cloud-comp_students
5. https://stackoverflow.com/questions/56472727/difference-between-apache-parquet-and-arrow
