In [12]:
# Installations
# NOTE(review): none of the installs below pin a version, so a later re-run may resolve
# different package versions than the ones this notebook was recorded with.
import sys
# Install seaborn with conda into the prefix of the interpreter running this kernel (sys.prefix).
%conda install --yes --prefix {sys.prefix} seaborn

# our package from 524, used to pretty-print file sizes during download. Not super necessary but fun to use.
# TestPyPI is given as the primary index, with the regular PyPI index added as an extra index.
%pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple nicenumber

# NOTE(review): presumably used for the progress bar shown while downloading — confirm in src.download.
%pip install tqdm

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.9.2
  latest version: 4.10.0

Please update conda by running

    $ conda update -n base conda



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://test.pypi.org/simple/, https://pypi.org/simple
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [13]:
from src import download as dl
from src import functions as f

import pandas as pd
import numpy as np

import rpy2.rinterface
import dask.dataframe as dd

# pyarrow installation instructions: https://arrow.apache.org/docs/python/install.html
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.parquet as pq

# Installation instructions for rpy2: https://anaconda.org/conda-forge/rpy2
import rpy2.rinterface

# Requires rpy2-arrow (`pip install rpy2-arrow`): https://pypi.org/project/rpy2-arrow/#description
# and also r-arrow (`conda install -c conda-forge r-arrow`)
import rpy2_arrow.pyarrow_rarrow as pyra

import pyarrow.feather as feather

In [2]:
# %load_ext rpy2.ipython
%load_ext memory_profiler

## 1. Download Data

In [4]:
%%time
%%memit

# download and unzip data files
# chunk_size=10: the recorded log reports the download running "in 10MB chunks".
files = dl.download_files('data.zip', chunk_size=10)

# Only unpack when download_files returned something truthy
# (NOTE(review): presumably an empty result means nothing was downloaded — confirm in src.download).
if files:
    # Unpack the first returned file into the 'csv' destination
    # (NOTE(review): delete=False presumably keeps the zip archive — confirm in src.download).
    dl.unzip(p=files[0], p_dst='csv', delete=False)

INFO    58   src.download               Downloading 814MB file in 10MB chunks.
100%|██████████| 814M/814M [04:19<00:00, 3.14MiB/s]
INFO    69   src.download               File downloaded to: /Users/Jayme/OneDrive/MDS/525/rainfall_group22/data/data.zip
INFO    126  src.download               Unpacking zip to: /Users/Jayme/OneDrive/MDS/525/rainfall_group22/data/csv


peak memory: 936.94 MiB, increment: 660.67 MiB
CPU times: user 15.8 s, sys: 7.58 s, total: 23.4 s
Wall time: 4min 40s


## 2. Combine CSVs

In [3]:
# Locations used throughout the notebook, all rooted at the project's data directory
p_data = dl.p_data  # top level data dir
p_csv = p_data.joinpath('csv')  # sub dir holding the loose per-model csvs
p_combined = p_data.joinpath('rainfall.csv')  # single combined csv used by every later step

In [5]:
%%time
%%memit

# combine csvs with pandas
csvs = [p for p in p_csv.glob('*.csv')]
dfs = []

# load individual dfs and save to list
for p in csvs:
    model_name = p.name.split('_')[0]

    df = pd.read_csv(p) \
        .assign(model=model_name)

    dfs.append(df)

# concat all dfs
df = pd.concat(dfs) \
    .rename(columns={'rain (mm/day)': 'rain'})

peak memory: 13064.04 MiB, increment: 12683.50 MiB
CPU times: user 54.9 s, sys: 6.91 s, total: 1min 1s
Wall time: 1min 2s


#### Runtimes

Times to combine dataframe csvs for each team member:

|User|OS|Processor|RAM|Load Time|
|:--|:--|:--|:--|:--|
|Jayme|Mac OS|2.4 GHz 8-Core Intel Core i9|32 GB 2667 MHz DDR4| 62s|
|Zhiyong| | | | |
|Marc| | | | |

In [6]:
# Print the (rows, columns) shape, then display the first rows of the combined frame
print(df.shape)
df.head()

(62513863, 7)


Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain,model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


In [7]:
# Column dtypes and in-memory size of the combined frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62513863 entries, 0 to 3541152
Data columns (total 7 columns):
 #   Column   Dtype  
---  ------   -----  
 0   time     object 
 1   lat_min  float64
 2   lat_max  float64
 3   lon_min  float64
 4   lon_max  float64
 5   rain     float64
 6   model    object 
dtypes: float64(5), object(2)
memory usage: 3.7+ GB


In [19]:
# save combined data back to csv
# index=False: the row index is not written out as a column
df.to_csv(p_combined, index=False)

In [20]:
%%sh
# Report the on-disk size of the combined csv (path is relative to the working directory)
du -sh data/rainfall.csv

5.6G	data/rainfall.csv


In [22]:
%%time
%%memit
# Load csv with dask
# The recorded memory increment for this cell is under 1 MiB, i.e. the file contents are
# not pulled into memory at this point.
# NOTE(review): assume_missing=True should make dask read unspecified integer columns as
# floats — confirm against the dask read_csv docs.

ddf = dd.read_csv(p_combined, assume_missing=True)

peak memory: 16101.11 MiB, increment: 0.73 MiB
CPU times: user 79.4 ms, sys: 66.1 ms, total: 145 ms
Wall time: 1.63 s


In [23]:
# Preview the first rows of the dask dataframe
ddf.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain,model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


## 3. Python EDA

Here we will investigate and summarize the following approaches to reduce memory usage while performing a simple EDA:

1. Baseline
2. Load data in chunks
3. Load only columns of interest
4. Dask

### 3.1 Baseline

Naive approach, read all columns with pandas `read_csv`

In [4]:
def print_max_rain(x):
    """Print the maximum rainfall `x`, formatted to two decimals in mm/day.

    Args:
        x: numeric rainfall value (anything supporting the `.2f` format spec).

    Returns:
        None. The formatted line is written to stdout.
    """
    print(f'Max rainfall: {x:.2f} mm/day')

In [28]:
%%time
%%memit

max_rain_baseline = pd.read_csv(p_combined).rain.max()
print_max_rain(max_rain_baseline)

Max rainfall: 432.94 mm/day
peak memory: 14973.23 MiB, increment: 2598.79 MiB
CPU times: user 54.6 s, sys: 4.27 s, total: 58.9 s
Wall time: 59.7 s


### 3.2 Load data in chunks

In [5]:
%%time
%%memit

max_rain_chunks = np.finfo('float64').min

for df_chunk in pd.read_csv(p_combined, chunksize=1_000_000):
    cur_max = df_chunk.rain.max()
    if cur_max > max_rain_chunks:
        max_rain_chunks = cur_max

print_max_rain(max_rain_chunks)

Max rainfall: 432.94 mm/day
peak memory: 1322.71 MiB, increment: 1035.25 MiB
CPU times: user 52.1 s, sys: 2.77 s, total: 54.8 s
Wall time: 55.3 s


### 3.3 Load only columns of interest

In [9]:
%%time
%%memit

df_one_col = pd.read_csv(p_combined, usecols=['rain'])
max_rain_one = df_one_col.rain.max()
print_max_rain(max_rain_one)

Max rainfall: 432.94 mm/day
peak memory: 5452.88 MiB, increment: 890.59 MiB
CPU times: user 25.8 s, sys: 1.14 s, total: 27 s
Wall time: 27.4 s


### 3.4 Dask

In [None]:
%%time
%%memit

ddf = dd.read_csv(p_combined)
max_rain_dask = ddf.rain.max().compute()

print_max_rain(max_rain_dask)

Max rainfall: 432.94 mm/day
peak memory: 5555.89 MiB, increment: 920.82 MiB
CPU times: user 1min 15s, sys: 12.1 s, total: 1min 28s
Wall time: 24.8 s
Max rainfall: 432.94 mm/day
peak memory: 5518.82 MiB, increment: 995.95 MiB
CPU times: user 1min 16s, sys: 12.5 s, total: 1min 28s
Wall time: 25 s


#### Summary
The following table summarizes peak memory usage and execution time while loading a csv and performing a simple EDA (finding the maximum rainfall):

In [19]:
# Hard-coded [peak memory, execution time] per method, shown as a styled table
m_results = {
    'baseline': [14973, 65],
    'chunks': [1323, 55],
    'single_column': [5452, 27],
    'dask': [5529, 25],
}

(
    pd.DataFrame
    .from_dict(
        m_results,
        orient='index',
        columns=['Peak Memory Usage (MB)', 'Execution Time (S)'])
    .rename_axis('Method')
    .style.pipe(f.bg, rev=False)
)

Unnamed: 0_level_0,Peak Memory Usage (MB),Execution Time (S)
Method,Unnamed: 1_level_1,Unnamed: 2_level_1
baseline,14973,65
chunks,1323,55
single_column,5452,27
dask,5529,25


#### Observations
- To find the maximum rainfall we only needed one column of the table, therefore loading all columns was redundant.
- Loading data in chunks reduced our execution time slightly, and greatly reduced peak memory usage.
- Both loading only a single column and Dask had similar memory usage, with Dask executing slightly faster (25s).
- Overall, Dask reduced our peak memory usage to ~1/3 of the baseline (a ~63% reduction) and our execution time by ~60%.

### Save data to multiple formats

In [7]:
# Open the combined csv as a pyarrow dataset, then read it into an Arrow table
dataset = ds.dataset(p_combined, format='csv')
arrow_table = dataset.to_table()

In [8]:
%%time
%%memit

# Write the Arrow table to a Feather file; the R EDA section reads this same path back
feather.write_feather(arrow_table, 'figshare/combined_data.feather')

peak memory: 3242.37 MiB, increment: 2930.09 MiB
CPU times: user 5.07 s, sys: 8.96 s, total: 14 s
Wall time: 8.4 s


In [9]:
%%time
%%memit

# Write the Arrow table as a parquet dataset partitioned by the `model` column
# NOTE(review): re-running this cell may add new files next to the existing ones in the
# target directory rather than replacing them — confirm before re-running.
pq.write_to_dataset(arrow_table, 'figshare/combined_data.parquet', partition_cols=['model'])

peak memory: 4656.05 MiB, increment: 1407.00 MiB
CPU times: user 23.5 s, sys: 20.9 s, total: 44.4 s
Wall time: 51.4 s


## 4. R EDA

In [11]:
%%time
%%R

library(arrow)
library(dplyr)

# Read feather file from python
start_time <- Sys.time()
r_table <- arrow::read_feather("figshare/combined_data.feather")
print(class(r_table))
# The combined csv renames 'rain (mm/day)' to 'rain' before it is exported to Feather, so
# select the rainfall column by prefix; this works for either spelling of the column name.
result <- r_table %>% select(matches("^rain")) %>% max()
end_time <- Sys.time()
print(result)
print(end_time - start_time)

[1] "tbl_df"     "tbl"        "data.frame"
[1] 432.9395
Time difference of 26.65761 secs
CPU times: user 10 s, sys: 19.2 s, total: 29.3 s
Wall time: 26.8 s


#### Discussion
- Our team used the Feather file approach to transfer the dataframe from Python to R.
- Our EDA (finding the maximum rainfall) only needs a single column, so in Python we use PyArrow to load the data, for both space and time efficiency.
- Feather is the on-disk format we use to persist the in-memory Arrow table.
- R has good support for reading Feather-format data.