In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import numpy as np

In [2]:
import pyarrow.feather as feather
import dask.dataframe as dd
import rpy2.rinterface
import rpy2_arrow.pyarrow_rarrow as pyra
import rpy2.robjects.packages as rpackages

In [3]:
# Register the memory_profiler IPython extension; it supplies the %%memit
# cell magic used by the load and export cells below.
%load_ext memory_profiler

# Importing CSV and converting dataset to feather file

In [4]:
%%time
%%memit
output_directory = "../data/" # This notebook should be ran mannually
combined_data = dd.read_csv(output_directory + "combined_data.csv/*")
combined_data = combined_data.drop(['Unnamed: 0'], axis = 1)

peak memory: 263.00 MiB, increment: 1.61 MiB
CPU times: user 94.8 ms, sys: 49 ms, total: 144 ms
Wall time: 1.01 s


In [5]:
# Preview the first rows; npartitions=10 lets head() draw from up to ten
# partitions instead of only the first one.
combined_data.head(npartitions=10)

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,0.0,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,0.0,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,0.0,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,0.0,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,0.0,MPI-ESM-1-2-HAM


In [6]:
%%time
%%memit

combined_data.compute().reset_index().to_feather('../data/combined_data_feather.feather')

peak memory: 9008.76 MiB, increment: 8493.55 MiB
CPU times: user 2min 42s, sys: 2min 28s, total: 5min 11s
Wall time: 4min 39s


In [26]:
%%sh
# Report the on-disk size of the CSV directory and of the feather file.
for path in ../data/combined_data.csv ../data/combined_data_feather.feather; do
    du -sh "$path"
done

6.5G	../data/combined_data.csv
1.4G	../data/combined_data_feather.feather


- The feather file is much smaller than the original CSV output (1.4G vs 6.5G).

# Setting up R environment + importing feather file

In [7]:
# Register the rpy2 IPython extension; it supplies the %%R cell magic used by
# every R cell below.
%load_ext rpy2.ipython

In [8]:
%%R

# arrow provides read_feather(); dplyr provides %>% and count() used below.
library(arrow)
library(dplyr)

R[write to console]: 
Attaching package: ‘dplyr’


R[write to console]: The following objects are masked from ‘package:stats’:

    filter, lag


R[write to console]: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [9]:
%%time
%%R
# Time the feather load together with a row count per distinct `time` value.
start_time <- Sys.time()

# Read the feather file written by the Python export cell into R.
combined_data_r <- arrow::read_feather("../data/combined_data_feather.feather")
print(class(combined_data_r))

# Same aggregation as `combined_data_r %>% count(time)`, written as a direct call.
result <- dplyr::count(combined_data_r, time)

# Stop the clock before printing so the display cost is not included.
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "tbl_df"     "tbl"        "data.frame"
[90m# A tibble: 92,040 x 2[39m
   time                    n
   [3m[90m<chr>[39m[23m               [3m[90m<int>[39m[23m
[90m 1[39m 1889-01-01             29
[90m 2[39m 1889-01-01 12:00:00  [4m1[24m330
[90m 3[39m 1889-01-02             29
[90m 4[39m 1889-01-02 12:00:00  [4m1[24m330
[90m 5[39m 1889-01-03             29
[90m 6[39m 1889-01-03 12:00:00  [4m1[24m330
[90m 7[39m 1889-01-04             29
[90m 8[39m 1889-01-04 12:00:00  [4m1[24m330
[90m 9[39m 1889-01-05             29
[90m10[39m 1889-01-05 12:00:00  [4m1[24m330
[90m# … with 92,030 more rows[39m
Time difference of 23.84748 secs
CPU times: user 21.5 s, sys: 10.8 s, total: 32.2 s
Wall time: 24 s


# EDA in R

In [17]:
%%time
%%R

# Basic shape of the loaded data frame, followed by its column names.
print(sprintf("Number of rows:%d", nrow(combined_data_r)))
print(sprintf("Number of cols:%d", ncol(combined_data_r)))
print("Column names are...")
# colnames() already returns a character vector, so it is printed directly.
print(colnames(combined_data_r))

[1] "Number of rows:62513863"
[1] "Number of cols:8"
[1] "Column names are..."
[1] "index"         "time"          "lat_min"       "lat_max"      
[5] "lon_min"       "lon_max"       "rain (mm/day)" "model"        
CPU times: user 20.5 ms, sys: 9.43 ms, total: 30 ms
Wall time: 24.5 ms


In [11]:
%%time
%%R
# Show the first rows of the data frame loaded from the feather file.
head(combined_data_r)

[90m# A tibble: 6 x 8[39m
  index time          lat_min lat_max lon_min lon_max `rain (mm/day)` model     
  [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m           [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m           [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m     
[90m1[39m     0 1889-01-01 1…   -[31m35[39m[31m.[39m[31m4[39m   -[31m33[39m[31m.[39m[31m6[39m    142.    143.        4.24[90me[39m[31m-13[39m MPI-ESM-1…
[90m2[39m     1 1889-01-02 1…   -[31m35[39m[31m.[39m[31m4[39m   -[31m33[39m[31m.[39m[31m6[39m    142.    143.        4.22[90me[39m[31m-13[39m MPI-ESM-1…
[90m3[39m     2 1889-01-03 1…   -[31m35[39m[31m.[39m[31m4[39m   -[31m33[39m[31m.[39m[31m6[39m    142.    143.        4.50[90me[39m[31m-13[39m MPI-ESM-1…
[90m4[39m     3 1889-01-04 1…   -[31m35[39m[31m.[39m[31m4[39m   -[31m33[39m[31m.[39m[31m6[39m    142.    143.        4.25[90me[39

In [12]:
%%time
%%R
# Show the last rows of the data frame loaded from the feather file.
tail(combined_data_r)

[90m# A tibble: 6 x 8[39m
   index time           lat_min lat_max lon_min lon_max `rain (mm/day)` model   
   [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m            [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m           [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m   
[90m1[39m [4m1[24m[4m4[24m[4m1[24m958 2014-12-26 12…   -[31m30[39m[31m.[39m[31m2[39m   -[31m29[39m[31m.[39m[31m2[39m    153.    154.            4.44 SAM0-UN…
[90m2[39m [4m1[24m[4m4[24m[4m1[24m959 2014-12-27 12…   -[31m30[39m[31m.[39m[31m2[39m   -[31m29[39m[31m.[39m[31m2[39m    153.    154.            6.69 SAM0-UN…
[90m3[39m [4m1[24m[4m4[24m[4m1[24m960 2014-12-28 12…   -[31m30[39m[31m.[39m[31m2[39m   -[31m29[39m[31m.[39m[31m2[39m    153.    154.            7.86 SAM0-UN…
[90m4[39m [4m1[24m[4m4[24m[4m1[24m961 2014-12-29 12…   -[31m30[39m[31m.[39m[31m2[39m   -[31m29[39m[31m.[39

In [16]:
%%time
%%R
# Print min / max / mean for each latitude bound, then the mean rainfall.
# Missing values are skipped. TRUE is spelled out because `T` is an ordinary
# variable that can be reassigned, whereas TRUE is a reserved word.
for (column in c("lat_min", "lat_max")) {
    values <- combined_data_r[[column]]
    print(paste0(column, " Minimum: ", min(values, na.rm = TRUE)))
    print(paste0(column, " Maximum: ", max(values, na.rm = TRUE)))
    print(paste0(column, " Mean: ", mean(values, na.rm = TRUE)))
}
print(paste0("Rainfall Mean: ", mean(combined_data_r[["rain (mm/day)"]], na.rm = TRUE)))

[1] "lat_min Minimum: -36.46738961176"
[1] "lat_min Maximum: -29.9"
[1] "lat_min Mean: -33.1048166975557"
[1] "lat_max Minimum: -36"
[1] "lat_max Maximum: -27.9060644734869"
[1] "lat_max Mean: -31.9775662186059"
[1] "Rainfall Mean: 1.90182700665884"
CPU times: user 2.47 s, sys: 1.3 s, total: 3.77 s
Wall time: 3.77 s


# Why we used a .feather file

A .feather file was used mainly due to its lightweight/portable nature.
- The format was quick for saving the data (less than 5 minutes to save the entire dataset) and loading the data (less than 24 seconds).
- The format was able to store a very large dataframe in a significantly smaller file than our CSV output (1.4G vs 6.5G).
- The format is language agnostic, so we are able to read the dataframe into either Python or R.
- The format was something that we were familiar with from previous projects.

We chose feather instead of Parquet because...
- Technically a Parquet file could have compressed the data further but it would have been more computationally expensive and thus slower. 1.4G is a very reasonable size for a dataset this big.