In [1]:
%load_ext rpy2.ipython

In [261]:
import pandas as pd
import pyarrow.feather as feather
import pyarrow as pa
import pyarrow.parquet as pq

In [3]:
# replace this with the combined dataframe

df = pd.read_csv("data/14096681/data/ACCESS-CM2_daily_rainfall_NSW.csv")
df2 = pd.read_csv("data/14096681/data/GFDL-CM4_daily_rainfall_NSW.csv")
df3 = pd.read_csv("data/14096681/data/observed_daily_rainfall_SYD.csv")

In [4]:
# add a column called model

df["model"] = "ACCESS-CM2_daily_rainfall_NSW"
df2["model"] = "GFDL-CM4_daily_rainfall_NSW"
df3["model"] = "observed_daily_rainfall_SYD"

In [5]:
df3

Unnamed: 0,time,rain (mm/day),model
0,1889-01-01,0.006612,observed_daily_rainfall_SYD
1,1889-01-02,0.090422,observed_daily_rainfall_SYD
2,1889-01-03,1.401452,observed_daily_rainfall_SYD
3,1889-01-04,14.869798,observed_daily_rainfall_SYD
4,1889-01-05,0.467628,observed_daily_rainfall_SYD
...,...,...,...
46015,2014-12-27,0.037472,observed_daily_rainfall_SYD
46016,2014-12-28,0.158061,observed_daily_rainfall_SYD
46017,2014-12-29,0.025719,observed_daily_rainfall_SYD
46018,2014-12-30,0.729390,observed_daily_rainfall_SYD


In [6]:
# merge dataframes 

merge = pd.concat([df, df2, df3])

In [7]:
# final merged dataframe 

merge

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13,ACCESS-CM2_daily_rainfall_NSW
1,1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.000000e+00,ACCESS-CM2_daily_rainfall_NSW
2,1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.000000e+00,ACCESS-CM2_daily_rainfall_NSW
3,1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.000000e+00,ACCESS-CM2_daily_rainfall_NSW
4,1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,1.047658e-02,ACCESS-CM2_daily_rainfall_NSW
...,...,...,...,...,...,...,...
46015,2014-12-27,,,,,3.747200e-02,observed_daily_rainfall_SYD
46016,2014-12-28,,,,,1.580613e-01,observed_daily_rainfall_SYD
46017,2014-12-29,,,,,2.571914e-02,observed_daily_rainfall_SYD
46018,2014-12-30,,,,,7.293899e-01,observed_daily_rainfall_SYD


In [258]:
%%time
%%R -i merge

# transfer dataframe to R as a csv

library(tidyverse)
library(here)
library(feather)

start_time <- Sys.time()
result <- merge %>% count(model)
print(result)
end_time <- Sys.time()
print(end_time - start_time)

                          model       n
1 ACCESS-CM2_daily_rainfall_NSW 1932840
2   GFDL-CM4_daily_rainfall_NSW 3219300
3   observed_daily_rainfall_SYD   46020
Time difference of 0.213279 secs
CPU times: user 2min 9s, sys: 8.04 s, total: 2min 17s
Wall time: 2min 31s


In [8]:
%%time
# write the dataframe to feather format 

feather.write_feather(merge, 'data/final_data.feather')

In [265]:
%%time

# write the dataframe to parquet format
# source - https://stackoverflow.com/questions/41066582/python-save-pandas-data-frame-to-parquet-file

table = pa.Table.from_pandas(merge)
pq.write_table(table, 'data/final_data.parquet')

CPU times: user 1.77 s, sys: 376 ms, total: 2.14 s
Wall time: 2.02 s


In [257]:
%%time
%%R 

# transfer dataframe to R as a feather

library(tidyverse)
library(here)
library(feather)

file_path = here("data", "final_data.feather")
start_time <- Sys.time()
df <- arrow::read_feather(file_path)
result <- df %>% count(model)
print(result)
end_time <- Sys.time()
print(end_time - start_time)

[90m# A tibble: 3 x 2[39m
  model                               n
[90m*[39m [3m[90m<chr>[39m[23m                           [3m[90m<int>[39m[23m
[90m1[39m ACCESS-CM2_daily_rainfall_NSW 1[4m9[24m[4m3[24m[4m2[24m840
[90m2[39m GFDL-CM4_daily_rainfall_NSW   3[4m2[24m[4m1[24m[4m9[24m300
[90m3[39m observed_daily_rainfall_SYD     [4m4[24m[4m6[24m020
Time difference of 2.220245 secs
CPU times: user 1.22 s, sys: 940 ms, total: 2.16 s
Wall time: 2.32 s


In [267]:
%%time
%%R 

# transfer dataframe to R as a parquet 

library(arrow)

file_path = here("data", "final_data.parquet")
start_time <- Sys.time()
df <- read_parquet(file_path)
result <- df %>% count(model)
print(result)
end_time <- Sys.time()
print(end_time - start_time)

R[write to console]: 
Attaching package: ‘arrow’


R[write to console]: The following objects are masked from ‘package:feather’:

    read_feather, write_feather


R[write to console]: The following object is masked from ‘package:utils’:

    timestamp




[90m# A tibble: 3 x 2[39m
  model                               n
[90m*[39m [3m[90m<chr>[39m[23m                           [3m[90m<int>[39m[23m
[90m1[39m ACCESS-CM2_daily_rainfall_NSW 1[4m9[24m[4m3[24m[4m2[24m840
[90m2[39m GFDL-CM4_daily_rainfall_NSW   3[4m2[24m[4m1[24m[4m9[24m300
[90m3[39m observed_daily_rainfall_SYD     [4m4[24m[4m6[24m020
Time difference of 4.098216 secs
CPU times: user 2.26 s, sys: 1.38 s, total: 3.64 s
Wall time: 5.17 s


In [268]:
%%sh

du -sh data/final_data.feather
du -sh data/final_data.parquet

112M	data/final_data.feather
 80M	data/final_data.parquet


**Reasoning**

1 - https://luminousmen.com/post/big-data-file-formats
2 - https://stackoverflow.com/questions/48083405/what-are-the-differences-between-feather-and-parquet

From the exploration of the different memory and time usage, we can see that parquet files store less memory then the feather files. This is due to the use of dictionary encoding and certain compressions that make this possible (2). Furthermore, the time it takes to do a basic reading action is much faster for parquet and feather files than csv files because in csv files, it is reading the entire dataframe to get a simple answer to our count query. However, parquet files store the metadata of the file and can easily access the columns and read the files without having to go through all the columns. This is why it is much faster than csv files. However, on my computer, the feather and parquet files take a similar amount for a simple count query. We hypothesize that this may be due to the fact that our files are not large enough for us to see the difference. But we have to consider that we will be using Spark as tool on the cloud and Parquet files are easily transferable to Spark (2) and because of this reason, we will be choosing the parquet files as our final choice. 