In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
from memory_profiler import memory_usage


import dask.dataframe as dd


import pyarrow.dataset as ds
import rpy2_arrow.pyarrow_rarrow as pyra
import pyarrow.feather as feather
import pyarrow.parquet as pq

In [2]:
%load_ext rpy2.ipython
%load_ext memory_profiler

# Milestone 1: Tackling big data on your laptop


---

## 1. Downloading the data

1). Download the data from [figshare](https://figshare.com/articles/dataset/Daily_rainfall_over_NSW_Australia/14096681) to the local computer using the [figshare API](https://docs.figshare.com) using the `requests` library.

In [3]:
# article_id = 14096681
# url = f"https://api.figshare.com/v2/articles/{article_id}"
# headers = {"Content-Type": "application/json"}
# output_directory = "figshare_rainfall"

In [4]:
# response = requests.request("GET", url, headers=headers)
# data = json.loads(response.text)
# files = data["files"]
# files

In [5]:
# %%time
# files_to_dl = ["data.zip"]
# for file in files:
#     if file["name"] in files_to_dl:
#         os.makedirs(output_directory, exist_ok=True)
#         urlretrieve(file["download_url"], os.path.join(output_directory, file["name"]))

2). Extract the zip file programmatically

In [6]:
# %%time
# with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
#     f.extractall(output_directory)

---

## 2. Combining data CSVs

- We used `pandas` to combine data CSVs into a single CSV.
- An extra column called "model" is added which identifies the model eg: for file name "SAM0-UNICON_daily_rainfall_NSW.csv", the model name is SAM0-UNICON
- The comparison of run times and memory usages of these options on different machines within our team is documented in the GitHub [issue](](https://github.com/UBC-MDS/grp15_rainfall_analysis/issues/5).
- We observed that memory increment is quite low among different machines, which means combining entire dataset consumes small memory
    - *Junghoo*
        - peak memory: 359.83 MiB, increment: 0.09 MiB
        - CPU times: user 4min 52s, sys: 9.55 s, total: 5min 22s
        - Wall time: 5min 4s
    - *Micah*
         - peak memory: 347.55 MiB, increment: 0.18 MiB
         - CPU times: user 7min 14s, sys: 26 s, total: 7min 40s
         - Wall time: 7min 57s
    - *Chuang*
         - peak memory: 464.66 MiB, increment: 0.10 MiB
         - CPU times: user 6min 19s, sys: 17 s, total: 6min 36s
         - Wall time: 6min 51s 
    - *Pan*
         - peak memory: 328.88 MiB, increment: 0.25 MiB
         - CPU times: user 5min 31s, sys: 21.3 s, total: 5min 52s
         - Wall time: 5min 59s  



In [7]:
# ### just listing to get an idea how individual file looks like 
# sample_df = pd.read_csv(os.path.join(output_directory, "ACCESS-CM2_daily_rainfall_NSW.csv"))
# sample_df.columns

In [8]:
# ### making sure that all files have the same colums
# files = glob.glob(os.path.join(output_directory, "*.csv"))
# use_cols = list(sample_df.columns)
# for file in files:
#     try:
#         pd.read_csv(file, index_col=0, usecols=use_cols)
#     except:
#         df = pd.read_csv(file, index_col=0)
#         print(f"{os.path.basename(file)} does not have all columns. {os.path.basename(file)} only has {df.columns.to_list()} columns.")

In [9]:
# %%time
# %memit
# # "figshare_rainfall/observed_daily_rainfall_SYD.csv" is missing 'lat_min', 'lat_max', 'lon_min', 'lon_max' columns
# files = glob.glob(os.path.join(output_directory, "*_NSW.csv"))

# # combining using pandas method
# df = pd.concat((pd.read_csv(file, index_col=0, usecols=use_cols)
#                 .assign(model = os.path.basename(file).split("_")[0])
#                 for file in files)
#               )
# df.to_csv(os.path.join(output_directory, "combined_data.csv"))

---

## 3. Load the combined CSV to memory and perform a simple EDA


#### 1). simple pandas - loading the entire data to the memory


#### ***Observation***
- We can see that the CPU time and Wall time are really close to each other, which might mean that there might be only one CPU processing the work.
- We can also observe that the memory increment 6049.38 MiB is quite high, which means loading the entire dataset consumes a lot of memory.

In [10]:
%%time
%%memit
df = pd.read_csv("figshare_rainfall/combined_data.csv")
print(df["model"].value_counts())

MPI-ESM1-2-HR       5154240
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
TaiESM1             3541230
CMCC-ESM2           3541230
NorESM2-MM          3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
GFDL-CM4            3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM-1-2-HAM      966420
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
NorESM2-LM           919800
CanESM5              551880
BCC-ESM1             551880
Name: model, dtype: int64
peak memory: 6532.96 MiB, increment: 6295.71 MiB
CPU times: user 1min 1s, sys: 11.3 s, total: 1min 12s
Wall time: 1min 17s


In [11]:
# see how many rows in the dataset
len(df.index)

62467843

In [12]:
# have a look at the first couple of rows of the data
df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


In [13]:
df.dtypes

time              object
lat_min          float64
lat_max          float64
lon_min          float64
lon_max          float64
rain (mm/day)    float64
model             object
dtype: object

#### 2). Changing dtype of the data to reduce memory usage while performing

#### ***Observation***
- By changing the dtype of the 5 out of 7 columns from `float64` to `float32`, we saved half space.

In [14]:
df_reduced_dtype = df[['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']].astype('float32', errors='ignore')

In [15]:
print(f"Memory usage with float64: {df[['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']].memory_usage().sum() / 1e6:.2f} MB")
print(f"Memory usage with float32: {df_reduced_dtype.memory_usage().sum() / 1e6:.2f} MB")

Memory usage with float64: 2498.71 MB
Memory usage with float32: 1249.36 MB


#### 3). Loading in chunks
#### ***Observation***
- memory increment: 1116.05 MiB is dramatically decreased (it was 6049.38 MiB using pandas) when we load data in small chunks.
- But the CPU time and Wall time are still close to each other, which means the work is still ***not*** executed parallely.

In [16]:
%%time
%%memit
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("figshare_rainfall/combined_data.csv", chunksize=10_000_000):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)
print(counts.astype(int))

ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
AWI-ESM-1-1-LR       966420
BCC-CSM2-MR         3035340
BCC-ESM1             551880
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
CanESM5              551880
EC-Earth3-Veg-LR    3037320
FGOALS-f3-L         3219300
FGOALS-g3           1287720
GFDL-CM4            3219300
GFDL-ESM4           3219300
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
MIROC6              2070900
MPI-ESM-1-2-HAM      966420
MPI-ESM1-2-HR       5154240
MPI-ESM1-2-LR        966420
MRI-ESM2-0          3037320
NESM3                966420
NorESM2-LM           919800
NorESM2-MM          3541230
SAM0-UNICON         3541153
TaiESM1             3541230
dtype: int64
peak memory: 7696.09 MiB, increment: 1344.12 MiB
CPU times: user 1min 1s, sys: 7.4 s, total: 1min 8s
Wall time: 1min 10s


#### 4). Dask Way

> using `dask` to read that csv file. Internally its loading chunks and doing it parallely.

#### ***Observation***
- memory increment: 1291.08 MiB is dramatically decreased (it was 1116.05 MiB using pandas) when we load data in small chunks.
- However, the CPU time is much greater than the Wall time now, which means the work was done by several processors concurrently.

In [17]:
%%time
%%memit
ddf = dd.read_csv('figshare_rainfall/combined_data.csv')
print(ddf["model"].value_counts().compute())

MPI-ESM1-2-HR       5154240
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-CM4            3219300
GFDL-ESM4           3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
peak memory: 5125.86 MiB, increment: 1338.71 MiB
CPU times: user 1min 27s, sys: 17.8 s, total: 1min 45s
Wall time: 41.9 s


---

## 4. Perform a simple EDA in R

### 1). Loading Datasets into Different Formats

#### a). pandas

In [18]:
%%time
%%memit
df = pd.read_csv("figshare_rainfall/combined_data.csv", nrows=1_000_000)

peak memory: 3702.75 MiB, increment: 16.93 MiB
CPU times: user 1.02 s, sys: 127 ms, total: 1.15 s
Wall time: 2.11 s


#### b). arrow table format

In [19]:
%%time
%%memit

#loading datasets

dataset = ds.dataset("figshare_rainfall/combined_data.csv", format="csv")
table = dataset.to_table()

peak memory: 4854.95 MiB, increment: 1236.44 MiB
CPU times: user 20.3 s, sys: 10.8 s, total: 31.1 s
Wall time: 27.5 s


#### c). feather format

In [20]:
%%time
%%memit
# writing to feather format
feather.write_feather(table, 'figshare_rainfall/combined.feather')

peak memory: 5319.28 MiB, increment: 1758.34 MiB
CPU times: user 4.92 s, sys: 12.3 s, total: 17.2 s
Wall time: 9.47 s


#### d). parquet format

In [21]:
%%time
%%memit
## writing as a single parquet 
pq.write_table(table, 'figshare_rainfall/combined.parquet')

peak memory: 5459.55 MiB, increment: 138.73 MiB
CPU times: user 9.63 s, sys: 1.77 s, total: 11.4 s
Wall time: 13.2 s


#### Compare the size of the data in 3 different format.

In [22]:
%%sh
# I am just seeing the size of the csv data
du -sh figshare_rainfall/combined_data.csv
# I am just seeing the size of the feather data
du -sh figshare_rainfall/combined.feather
# I am just seeing the size of the parquet data
du -sh figshare_rainfall/combined.parquet

5.6G	figshare_rainfall/combined_data.csv
1.0G	figshare_rainfall/combined.feather
544M	figshare_rainfall/combined.parquet


### 2). Transfering to R and trying to do simple EDA - `count(model)`

#### a). pandas

> Note: we tried to load the entire dataset into R from python pandas object, but it takes ages :| Therefore, we decided to load just 1 million rows of the data, which is just 1.6% of the whole dataset. However, loading the 1.6% of the whole dataset and performing `count(model)` took same amount of time as it does for the whole dataset in arrow table format. This indicates that the exchange of data between python pandas object and R is a really expensive operation because of the serialization and deserialization.

In [23]:
%%time
%%R -i df
## transferring the python dataframe to R
start_time <- Sys.time()
library(dplyr)
print(class(df))
result <- df %>% count(model)
print(result)
end_time <- Sys.time()
print(end_time - start_time)

R[write to console]: 
Attaching package: ‘dplyr’


R[write to console]: The following objects are masked from ‘package:stats’:

    filter, lag


R[write to console]: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




[1] "data.frame"
            model      n
1  AWI-ESM-1-1-LR  33580
2 MPI-ESM-1-2-HAM 966420
Time difference of 0.7901061 secs
CPU times: user 27.6 s, sys: 1.13 s, total: 28.7 s
Wall time: 29.4 s


#### b). in arrow table format

In [24]:
%%time
%%memit
## Here we are loading the arrow dataframe that we have loaded previously

r_table = pyra.converter.py2rpy(table)

5695
rarrow.ChunkedArray: 0.024595975875854492
5695
rarrow.ChunkedArray: 0.02149200439453125
5695
rarrow.ChunkedArray: 0.022471189498901367
5695
rarrow.ChunkedArray: 0.027393579483032227
5695
rarrow.ChunkedArray: 0.029187679290771484
5695
rarrow.ChunkedArray: 0.027768850326538086
5695
rarrow.ChunkedArray: 0.023662090301513672
peak memory: 6099.84 MiB, increment: 61.80 MiB
CPU times: user 24.8 s, sys: 639 ms, total: 25.4 s
Wall time: 26.6 s


In [25]:
%%time
%%R -i r_table

## arrow Speed
start_time <- Sys.time()
print(class(r_table))
##add details on collect here
library(dplyr)
# Arrow speed
result <- r_table %>% collect() %>% count(model)
print(class(r_table %>% collect()))
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "Table"       "ArrowObject" "R6"         
[1] "tbl_df"     "tbl"        "data.frame"
[90m# A tibble: 27 x 2[39m
   model                  n
   [3m[90m<chr>[39m[23m              [3m[90m<int>[39m[23m
[90m 1[39m ACCESS-CM2       1[4m9[24m[4m3[24m[4m2[24m840
[90m 2[39m ACCESS-ESM1-5    1[4m6[24m[4m1[24m[4m0[24m700
[90m 3[39m AWI-ESM-1-1-LR    [4m9[24m[4m6[24m[4m6[24m420
[90m 4[39m BCC-CSM2-MR      3[4m0[24m[4m3[24m[4m5[24m340
[90m 5[39m BCC-ESM1          [4m5[24m[4m5[24m[4m1[24m880
[90m 6[39m CanESM5           [4m5[24m[4m5[24m[4m1[24m880
[90m 7[39m CMCC-CM2-HR4     3[4m5[24m[4m4[24m[4m1[24m230
[90m 8[39m CMCC-CM2-SR5     3[4m5[24m[4m4[24m[4m1[24m230
[90m 9[39m CMCC-ESM2        3[4m5[24m[4m4[24m[4m1[24m230
[90m10[39m EC-Earth3-Veg-LR 3[4m0[24m[4m3[24m[4m7[24m320
[90m# … with 17 more rows[39m
Time difference of 10.11736 secs
CPU times: user 10.3 s, sys: 12.4 s, total: 22.8 s
Wall time: 10.2 s


#### c). in Feather format

In [26]:
%%time
%%R

### Feather speed

library(arrow)
start_time <- Sys.time()
r_table <- arrow::read_feather("figshare_rainfall/combined.feather")
print(class(r_table))
library(dplyr)
result <- r_table %>% count(model)
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "tbl_df"     "tbl"        "data.frame"
[90m# A tibble: 27 x 2[39m
   model                  n
   [3m[90m<chr>[39m[23m              [3m[90m<int>[39m[23m
[90m 1[39m ACCESS-CM2       1[4m9[24m[4m3[24m[4m2[24m840
[90m 2[39m ACCESS-ESM1-5    1[4m6[24m[4m1[24m[4m0[24m700
[90m 3[39m AWI-ESM-1-1-LR    [4m9[24m[4m6[24m[4m6[24m420
[90m 4[39m BCC-CSM2-MR      3[4m0[24m[4m3[24m[4m5[24m340
[90m 5[39m BCC-ESM1          [4m5[24m[4m5[24m[4m1[24m880
[90m 6[39m CanESM5           [4m5[24m[4m5[24m[4m1[24m880
[90m 7[39m CMCC-CM2-HR4     3[4m5[24m[4m4[24m[4m1[24m230
[90m 8[39m CMCC-CM2-SR5     3[4m5[24m[4m4[24m[4m1[24m230
[90m 9[39m CMCC-ESM2        3[4m5[24m[4m4[24m[4m1[24m230
[90m10[39m EC-Earth3-Veg-LR 3[4m0[24m[4m3[24m[4m7[24m320
[90m# … with 17 more rows[39m
Time difference of 19.67986 secs
CPU times: user 11 s, sys: 20.7 s, total: 31.8 s
Wall time: 19.7 s


#### d). in Parquet format

In [27]:
%%time
%%R

### Parquet speed

library(arrow)
start_time <- Sys.time()
r_table <- arrow::read_parquet("figshare_rainfall/combined.parquet")
print(class(r_table))
library(dplyr)
result <- r_table %>% count(model)
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "tbl_df"     "tbl"        "data.frame"
[90m# A tibble: 27 x 2[39m
   model                  n
   [3m[90m<chr>[39m[23m              [3m[90m<int>[39m[23m
[90m 1[39m ACCESS-CM2       1[4m9[24m[4m3[24m[4m2[24m840
[90m 2[39m ACCESS-ESM1-5    1[4m6[24m[4m1[24m[4m0[24m700
[90m 3[39m AWI-ESM-1-1-LR    [4m9[24m[4m6[24m[4m6[24m420
[90m 4[39m BCC-CSM2-MR      3[4m0[24m[4m3[24m[4m5[24m340
[90m 5[39m BCC-ESM1          [4m5[24m[4m5[24m[4m1[24m880
[90m 6[39m CanESM5           [4m5[24m[4m5[24m[4m1[24m880
[90m 7[39m CMCC-CM2-HR4     3[4m5[24m[4m4[24m[4m1[24m230
[90m 8[39m CMCC-CM2-SR5     3[4m5[24m[4m4[24m[4m1[24m230
[90m 9[39m CMCC-ESM2        3[4m5[24m[4m4[24m[4m1[24m230
[90m10[39m EC-Earth3-Veg-LR 3[4m0[24m[4m3[24m[4m7[24m320
[90m# … with 17 more rows[39m
Time difference of 10.07803 secs
CPU times: user 10.1 s, sys: 5.41 s, total: 15.5 s
Wall time: 10.1 s


### 3). Transfering to R and trying to do simple EDA - `summary()`

#### a). pandas

> Note: Again, we tried to load the entire dataset into R from python pandas object, but it takes ages :| Therefore, we decided to load just 1 million rows of the data, which is just 1.6% of the whole dataset. However, loading the 1.6% of the whole dataset and performing `summary()` took same amount of time as it does for the whole dataset in arrow table format. This indicates that the exchange of data between python pandas object and R is a really expensive operation because of the serialization and deserialization.

In [28]:
%%time
%%R -i df

start_time <- Sys.time()
library(dplyr)
print(class(df))
result <- df %>% collect() %>% summary()
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "data.frame"
     time              lat_min          lat_max          lon_min     
 Length:1000000     Min.   :-35.44   Min.   :-33.57   Min.   :141.6  
 Class :character   1st Qu.:-35.44   1st Qu.:-33.57   1st Qu.:143.4  
 Mode  :character   Median :-33.57   Median :-31.71   Median :147.2  
                    Mean   :-33.64   Mean   :-31.77   Mean   :147.0  
                    3rd Qu.:-31.71   3rd Qu.:-29.84   3rd Qu.:150.9  
                    Max.   :-31.71   Max.   :-29.84   Max.   :152.8  
    lon_max      rain (mm/day)        model          
 Min.   :143.4   Min.   : 0.0000   Length:1000000    
 1st Qu.:145.3   1st Qu.: 0.0000   Class :character  
 Median :149.1   Median : 0.0000   Mode  :character  
 Mean   :148.9   Mean   : 1.5912                     
 3rd Qu.:152.8   3rd Qu.: 0.3206                     
 Max.   :154.7   Max.   :94.2708                     
Time difference of 0.1981711 secs
CPU times: user 26.7 s, sys: 841 ms, total: 27.6 s
Wall time: 27.8 s


#### b). in arrow table format

In [29]:
%%time
%%R -i r_table

## arrow Speed - running summary()
start_time <- Sys.time()
print(class(r_table))
##add details on collect here
library(dplyr)
# Arrow speed
result <- r_table %>% collect() %>% summary()
print(class(r_table %>% collect()))
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "Table"       "ArrowObject" "R6"         
[1] "tbl_df"     "tbl"        "data.frame"
      time                        lat_min           lat_max      
 Min.   :1888-12-31 16:00:00   Min.   :-36       Min.   :-36.00  
 1st Qu.:1920-07-02 04:00:00   1st Qu.:-35       1st Qu.:-33.66  
 Median :1952-01-01 04:00:00   Median :-33       Median :-32.04  
 Mean   :1952-01-01 08:32:08   Mean   :-33       Mean   :-31.98  
 3rd Qu.:1983-07-02 05:00:00   3rd Qu.:-31       3rd Qu.:-30.16  
 Max.   :2014-12-31 04:00:00   Max.   :-30       Max.   :-27.91  
                               NA's   :3219300                   
    lon_min           lon_max      rain (mm/day)        model          
 Min.   :141       Min.   :141.2   Min.   :  0       Length:62467843   
 1st Qu.:143       1st Qu.:145.0   1st Qu.:  0       Class :character  
 Median :147       Median :148.1   Median :  0       Mode  :character  
 Mean   :147       Mean   :148.2   Mean   :  2                         
 3rd Qu.:150       3rd 

#### c). in Feather format

In [30]:
%%time
%%R

### Feather speed - running summary()

library(arrow)
start_time <- Sys.time()
r_table <- arrow::read_feather("figshare_rainfall/combined.feather")
print(class(r_table))
library(dplyr)
result <- r_table %>% summary()
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "tbl_df"     "tbl"        "data.frame"
      time                        lat_min           lat_max      
 Min.   :1888-12-31 16:00:00   Min.   :-36       Min.   :-36.00  
 1st Qu.:1920-07-02 04:00:00   1st Qu.:-35       1st Qu.:-33.66  
 Median :1952-01-01 04:00:00   Median :-33       Median :-32.04  
 Mean   :1952-01-01 08:32:08   Mean   :-33       Mean   :-31.98  
 3rd Qu.:1983-07-02 05:00:00   3rd Qu.:-31       3rd Qu.:-30.16  
 Max.   :2014-12-31 04:00:00   Max.   :-30       Max.   :-27.91  
                               NA's   :3219300                   
    lon_min           lon_max      rain (mm/day)        model          
 Min.   :141       Min.   :141.2   Min.   :  0       Length:62467843   
 1st Qu.:143       1st Qu.:145.0   1st Qu.:  0       Class :character  
 Median :147       Median :148.1   Median :  0       Mode  :character  
 Mean   :147       Mean   :148.2   Mean   :  2                         
 3rd Qu.:150       3rd Qu.:151.3   3rd Qu.:  1                       

#### d). in Parquet format

In [31]:
%%time
%%R

### Parquet speed - running summary()

library(arrow)
start_time <- Sys.time()
r_table <- arrow::read_parquet("figshare_rainfall/combined.parquet")
print(class(r_table))
library(dplyr)
result <- r_table %>% summary()
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "tbl_df"     "tbl"        "data.frame"
      time                        lat_min           lat_max      
 Min.   :1888-12-31 16:00:00   Min.   :-36       Min.   :-36.00  
 1st Qu.:1920-07-02 04:00:00   1st Qu.:-35       1st Qu.:-33.66  
 Median :1952-01-01 04:00:00   Median :-33       Median :-32.04  
 Mean   :1952-01-01 08:32:08   Mean   :-33       Mean   :-31.98  
 3rd Qu.:1983-07-02 05:00:00   3rd Qu.:-31       3rd Qu.:-30.16  
 Max.   :2014-12-31 04:00:00   Max.   :-30       Max.   :-27.91  
                               NA's   :3219300                   
    lon_min           lon_max      rain (mm/day)        model          
 Min.   :141       Min.   :141.2   Min.   :  0       Length:62467843   
 1st Qu.:143       1st Qu.:145.0   1st Qu.:  0       Class :character  
 Median :147       Median :148.1   Median :  0       Mode  :character  
 Mean   :147       Mean   :148.2   Mean   :  2                         
 3rd Qu.:150       3rd Qu.:151.3   3rd Qu.:  1                       

### **4). Decision**

Our group chose to go with parquet to transfer the dataframe from Python to R. The file size for our combined data in parquet format was 544mb compared to 5.6GB and 1.0GB for csv and feather formats respectively. Meanwhile in the task of reading the file in R and running a simple `count(model)`, in parquet it had a wall time of 39 seconds which was compared to 40.6 seconds for arrow format and 1 minute and 23 seconds with feather format. 

While speed wise it seems that parquet and arrow do not differ too much, when we consider the file size of a parquet file compared to a csv it becomes the parquet format seems to be the better option.  Thus we decided to go with the parquet format to transfer the dataframe in R. 

We did not consider simply importing the pandas dataframe through `%%R -i` because of the amount of serialization and deserialization that would have to be done when transferring it from Python to R.