In [1]:
import pandas as pd

In [2]:
# When True, the cells below re-read the CSV and rewrite both parquet outputs;
# when False those cells are no-ops and the existing files on disk are used.
rerun = False

# Directory that holds the combined CSV and the parquet outputs.
output_directory = "../data"

Read the CSV into Python

In [3]:
if rerun:
    # Load the combined CSV: the first column is used as the index, and
    # parse_dates=True asks pandas to try parsing that index as dates.
    csv_path = f"{output_directory}/combined_data.csv"
    df = pd.read_csv(csv_path, index_col=0, parse_dates=True)

Save to parquet (as one file)

In [4]:
%%time

if rerun:
    # Write the whole frame as one parquet file in the data directory.
    single_file_path = f"{output_directory}/combined_data.parquet"
    df.to_parquet(single_file_path)

CPU times: total: 0 ns
Wall time: 0 ns


Save to parquet (partition by model)

In [5]:
%%time

if rerun:
    # Write a parquet dataset partitioned on the "model" column; with
    # partition_cols the path is used as the dataset's root directory.
    # NOTE(review): re-running this against an existing output directory may
    # add new files instead of replacing the old ones (duplicating rows) —
    # confirm, and clear the directory first if so.
    partitioned_path = f"{output_directory}/combined_data_partition.parquet"
    df.to_parquet(partitioned_path, partition_cols=["model"])

CPU times: total: 0 ns
Wall time: 0 ns


Compare file sizes:

In [6]:
%%sh
# Report the on-disk size of the CSV and of both parquet outputs, one line each.
for name in combined_data.csv combined_data.parquet combined_data_partition.parquet; do
    du -sh "../data/$name"
done

5.7G	../data/combined_data.csv
542M	../data/combined_data.parquet
550M	../data/combined_data_partition.parquet


Time how long Python takes to read the parquet file and count rows per model:

In [7]:
%%time

# Load the single-file parquet into a DataFrame, then print the number of
# rows per model (value_counts orders the result from most to fewest rows).
parquet_path = f"{output_directory}/combined_data.parquet"
df = pd.read_parquet(parquet_path)
model_counts = df["model"].value_counts()
print(model_counts)

MPI-ESM1-2-HR       5154240
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-CM4            3219300
GFDL-ESM4           3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
CPU times: total: 19.8 s
Wall time: 7.39 s


Time how long R takes to read the parquet dataset and count rows per model:

In [8]:
%load_ext rpy2.ipython



In [9]:
%%time
%%R

# Attach dplyr without its startup chatter.
suppressPackageStartupMessages(library(dplyr, quietly = TRUE))

# Location of the parquet dataset written with partition_cols=['model'].
output_directory <- "../data"
partitioned_path <- paste0(output_directory, "/", "combined_data_partition.parquet")

# Open the directory as an arrow dataset partitioned on `model`.
df <- arrow::open_dataset(partitioned_path,
                          format = "parquet",
                          partitioning = c("model"))

# Build the rows-per-model query, then collect the result into R for display.
model_counts <- df %>%
    group_by(model) %>%
    summarize(cnt = n()) %>%
    ungroup()
collect(model_counts)

[38;5;246m# A tibble: 27 x 2[39m
   model                cnt
   [3m[38;5;246m<chr>[39m[23m              [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m ACCESS-CM2       1[4m9[24m[4m3[24m[4m2[24m840
[38;5;250m 2[39m ACCESS-ESM1-5    1[4m6[24m[4m1[24m[4m0[24m700
[38;5;250m 3[39m AWI-ESM-1-1-LR    [4m9[24m[4m6[24m[4m6[24m420
[38;5;250m 4[39m BCC-CSM2-MR      3[4m0[24m[4m3[24m[4m5[24m340
[38;5;250m 5[39m BCC-ESM1          [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 6[39m CMCC-CM2-HR4     3[4m5[24m[4m4[24m[4m1[24m230
[38;5;250m 7[39m CMCC-CM2-SR5     3[4m5[24m[4m4[24m[4m1[24m230
[38;5;250m 8[39m CMCC-ESM2        3[4m5[24m[4m4[24m[4m1[24m230
[38;5;250m 9[39m EC-Earth3-Veg-LR 3[4m0[24m[4m3[24m[4m7[24m320
[38;5;250m10[39m CanESM5           [4m5[24m[4m5[24m[4m1[24m880
[38;5;246m# ... with 17 more rows[39m
CPU times: total: 2.17 s
Wall time: 2.18 s


## Discussion

- Pandas' built-in functions can read and write parquet files easily (as can R, via the arrow package)
- As expected, the parquet format significantly reduces storage size (the 5.7 GB CSV shrinks to 542 MB without partitioning and 550 MB with partitioning)
    - No partition = slightly smaller in total size
    - Partition = slightly more in total size but probably best for I/O since you can choose easily what to read in
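- Python took significantly more time to read and count rows than R (7.39 s vs 2.18 s wall time)
    - could be a pandas vs arrow optimization problem
    - note that the two timings are not like-for-like: the Python cell loads the whole single-file parquet into a DataFrame with `pd.read_parquet`, while the R cell opens the partitioned dataset with `arrow::open_dataset` and only collects the per-model counts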