# Group 16 Milestone 1

In [1]:
import os
import os.path
import zipfile
import requests
import json
import pandas as pd
from urllib.request import urlretrieve
import glob
import re

In [2]:
# Necessary metadata
url = "https://api.figshare.com/v2/articles/14096681"
headers = {"Content-Type": "application/json"}
output_directory = "../data/"
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)
files = data["files"]

In [3]:
%%time
files_to_dl = "data.zip"
if not os.path.isfile(output_directory + files_to_dl):
    for file in files:
        if file["name"] == files_to_dl:
            os.makedirs(output_directory, exist_ok=True)
            urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 34 µs, sys: 62 µs, total: 96 µs
Wall time: 98.2 µs


In [4]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, files_to_dl), "r") as f:
    f.extractall(output_directory)
os.remove("../data/observed_daily_rainfall_SYD.csv")

CPU times: user 14.7 s, sys: 996 ms, total: 15.7 s
Wall time: 16.4 s


In [5]:
%%time
files = glob.glob("../data/*.csv")
columns = ["time", "lat_min", "lat_max", "lon_min", "lon_max", "rain (mm/day)"]
df = pd.concat((pd.read_csv(file, index_col=0, usecols=columns)
                .assign(model=re.findall(r"[^\/]+(?=\_daily)", file)[0])
                for file in files)
              )

CPU times: user 39.3 s, sys: 5.68 s, total: 45 s
Wall time: 48.6 s


In [6]:
df

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM
...,...,...,...,...,...,...
2014-12-27 12:00:00,-30.157068,-29.214660,153.1250,154.3750,6.689683e+00,SAM0-UNICON
2014-12-28 12:00:00,-30.157068,-29.214660,153.1250,154.3750,7.862555e+00,SAM0-UNICON
2014-12-29 12:00:00,-30.157068,-29.214660,153.1250,154.3750,1.000503e+01,SAM0-UNICON
2014-12-30 12:00:00,-30.157068,-29.214660,153.1250,154.3750,8.541592e+00,SAM0-UNICON


## Runtimes of CSV compilation


| Team Member | OS           | RAM | Processor        | Is SSD | Wall Time Taken |
|-------------|--------------|-----|------------------|--------|-----------------|
| Nikita      | Ubuntu 20.04 | 8GB | 8th Gen Core i7  | Yes    | 1min 32s        |
| Margot      |              |     |                  |        |                 |
| Thea        |              |     |                  |        |                 |
| Kiran       |    MacOS Big Sur          |   8GB  |     Apple M1 chip            |    Yes    |        1min 40s         |

## EDA for R

In [7]:
%load_ext rpy2.ipython

**In order to select which method is the most appropriate to transfer the dataframe from python to R, we chose to try all methods and observe which method had a better performance:**

### Parquet Method

In [34]:
%%time
df.to_parquet("../data/rainfall.parquet")

CPU times: user 15.6 s, sys: 5.27 s, total: 20.9 s
Wall time: 26.2 s


In [37]:
%%time
%%R
library(dplyr)
library(arrow)
parquet_rdf <- read_parquet("../data/rainfall.parquet") |> collect()

CPU times: user 3.96 s, sys: 3.72 s, total: 7.69 s
Wall time: 7.96 s


### Feather Method

In [10]:
%%time
import pyarrow.feather as feather
feather.write_feather(df, '../data/rainfall.feather')

CPU times: user 4.29 s, sys: 4.13 s, total: 8.42 s
Wall time: 5.78 s


In [38]:
%%time
%%R
feather_rdf <- read_feather("../data/rainfall.feather") |> collect()

CPU times: user 1.09 s, sys: 1.74 s, total: 2.83 s
Wall time: 4.65 s


### Arrow Exchange Method

In [41]:
import rpy2.robjects.conversion
import pyarrow
import rpy2.rinterface
import rpy2_arrow.pyarrow_rarrow as pyra
from rpy2.robjects.packages import importr

In [42]:
%%time
arrow_rframe = pyra.converter.py2rpy(pyarrow.Table.from_pandas(df))

CPU times: user 4.6 s, sys: 3.2 s, total: 7.8 s
Wall time: 11.2 s


In [46]:
%%time
%%R -i arrow_rframe
library(dplyr)
arrow_rframe <- arrow_rframe |> collect()
arrow_rframe

# A tibble: 62,467,843 × 7
   lat_min lat_max lon_min lon_max `rain (mm/day)` model           time         
     <dbl>   <dbl>   <dbl>   <dbl>           <dbl> <chr>           <chr>        
 1   -35.4   -33.6    142.    143.        4.24e-13 MPI-ESM-1-2-HAM 1889-01-01 1…
 2   -35.4   -33.6    142.    143.        4.22e-13 MPI-ESM-1-2-HAM 1889-01-02 1…
 3   -35.4   -33.6    142.    143.        4.50e-13 MPI-ESM-1-2-HAM 1889-01-03 1…
 4   -35.4   -33.6    142.    143.        4.25e-13 MPI-ESM-1-2-HAM 1889-01-04 1…
 5   -35.4   -33.6    142.    143.        4.27e-13 MPI-ESM-1-2-HAM 1889-01-05 1…
 6   -35.4   -33.6    142.    143.        4.20e-13 MPI-ESM-1-2-HAM 1889-01-06 1…
 7   -35.4   -33.6    142.    143.        4.19e-13 MPI-ESM-1-2-HAM 1889-01-07 1…
 8   -35.4   -33.6    142.    143.        4.56e-13 MPI-ESM-1-2-HAM 1889-01-08 1…
 9   -35.4   -33.6    142.    143.        2.53e+ 0 MPI-ESM-1-2-HAM 1889-01-09 1…
10   -35.4   -33.6    142.    143.        4.12e- 2 MPI-ESM-1-2-HAM 1889-01-10 1…
#

In [48]:
%%R
arrow_rframe |> group_by(model) |> summarize(mean_rainfall = mean(`rain (mm/day)`))
summary(arrow_rframe)

    lat_min           lat_max          lon_min           lon_max     
 Min.   :-36       Min.   :-36.00   Min.   :141       Min.   :141.2  
 1st Qu.:-35       1st Qu.:-33.66   1st Qu.:143       1st Qu.:145.0  
 Median :-33       Median :-32.04   Median :147       Median :148.1  
 Mean   :-33       Mean   :-31.98   Mean   :147       Mean   :148.2  
 3rd Qu.:-31       3rd Qu.:-30.16   3rd Qu.:150       3rd Qu.:151.3  
 Max.   :-30       Max.   :-27.91   Max.   :154       Max.   :155.6  
 NA's   :3219300                    NA's   :3219300                  
 rain (mm/day)        model               time          
 Min.   :  0       Length:62467843    Length:62467843   
 1st Qu.:  0       Class :character   Class :character  
 Median :  0       Mode  :character   Mode  :character  
 Mean   :  2                                            
 3rd Qu.:  1                                            
 Max.   :433                                            
 NA's   :3219300                         