### 1. Import libraries

In [1]:
import pandas as pd
import requests
import json
import os
import shutil
from tqdm.auto import tqdm
import zipfile
import glob
import re
import pyarrow as pa
import rpy2_arrow.pyarrow_rarrow as pyra

import gc

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


### 2. Download the data

In [2]:
# Metadata
article_id = 14096681
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
out_dir = os.path.join(os.getcwd(), "..", "data", "raw", "figshare")
file_to_download = "data.zip"

# Get file url
file_url = [
    item_["download_url"]
    for item_ in requests.get(url, headers=headers).json()["files"]
    if item_["name"] == file_to_download
][0]

# Check if file has already been downloaded
if os.listdir(out_dir):
    print("File already exists. Skipping.")
else:
    print(f"Writing file file {file_to_download} to directory {out_dir}")

    # Create an HTTP request
    with requests.get(file_url, stream=True) as r:

        # Check content length
        content_length = int(r.headers.get("Content-Length"))

        # SDisplay progress bar
        with tqdm.wrapattr(r.raw, "read", total=content_length, desc="") as raw:

            # Save file
            os.makedirs(out_dir)
            with open(os.path.join(out_dir, 
                                   file_to_download), "wb") as path:
                shutil.copyfileobj(raw, path)

    print("Download complete.")

    # Unzip file with python
    print("Unzipping file...")
    with zipfile.ZipFile(os.path.join(out_dir, file_to_download), "r") as zip_ref:
        zip_ref.extractall(out_dir) # Extract all files to directory
        zip_ref.close()
    print("Unzipping complete.")


File already exists. Skipping.


### 3. Combining data csv

In [3]:
out_processed_dir = os.path.join(os.getcwd(), "..", "data", "processed", "figshare")
file_to_exclude = "observed_daily_rainfall_SYD.csv"
files = glob.glob(out_dir + "/*.csv")

In [4]:
%%timeit -r 1

# Combine data
df = pd.concat(
    (
        pd.read_csv(file, index_col=0).assign(model=re.findall(r"[^\/]+(?=\_daily)", file)[0])
        for file in files
        if file_to_exclude not in file
    )
)

# Write to file
os.makedirs(out_processed_dir, exist_ok=True)  
df.to_csv(os.path.join(out_processed_dir, "processed_rainfall.csv"))


4min 51s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


#### Compare run times on different machines

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 4min 51s   |
| Mahsa Sarafrazi    |                  |      |                        |        |            |
| Gabe Fairbrother   |  Windows 10      | 32GB | Intel® Core™ i7-10875H | Yes    |     6min 40s       |
| Michelle Wang      |                  |      |                        |        |            |

### 4. Load the combined CSV to memory and perform a simple EDA

#### A. Load all columns

In [5]:
gc.collect()

0

In [6]:
%%timeit -r 1

# Load the data
df = pd.read_csv(os.path.join(out_processed_dir, "processed_rainfall.csv"), index_col=0)

# Get the model counts
print("Model counts:")
print(df.model.value_counts())

# Describe the data
print("Data description:")  
print(df.describe())



Model counts:
MPI-ESM1-2-HR       5154240
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
SAM0-UNICON         3541153
GFDL-CM4            3219300
FGOALS-f3-L         3219300
GFDL-ESM4           3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
Data description:
            lat_min       lat_max       lon_min       lon_max  rain (mm/day)
count  5.924854e+07  6.246784e+07  5.924854e+07  6.246784e+07   5.924854e+07
mean  -3.310482e+01 -3.197757e+0

#### Compare run times on different machines

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 1min 0s   |
| Mahsa Sarafrazi    |                  |      |                        |        |            |
| Gabe Fairbrother   |  Windows 10      | 32GB | Intel® Core™ i7-10875H | Yes    |   1min 18s       |
| Michelle Wang      |                  |      |                        |        |            |

#### B. Load only required columns


In [7]:
gc.collect()

0

In [8]:
%%timeit -r 1
use_cols = ["time", "rain (mm/day)", "model"]
df = pd.read_csv(
    os.path.join(out_processed_dir, "processed_rainfall.csv"),
    index_col=0,
    parse_dates=True,
    usecols=use_cols,
)

# Get the model counts
print("Model counts:")
print(df.model.value_counts())

# Describe the data
print("Data description:")
print(df.describe())


Model counts:
MPI-ESM1-2-HR       5154240
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
SAM0-UNICON         3541153
GFDL-CM4            3219300
FGOALS-f3-L         3219300
GFDL-ESM4           3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
Data description:
       rain (mm/day)
count   5.924854e+07
mean    1.901170e+00
std     5.585735e+00
min    -3.807373e-12
25%     3.838413e-06
50%     6.154947e-02
75%     1.020918e+00
max     4.329395e+

#### Compare run times on different machines

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 46.8s      |
| Mahsa Sarafrazi    |                  |      |                        |        |            |
| Gabe Fairbrother   |  Windows 10      | 32GB | Intel® Core™ i7-10875H | Yes    |    1min 26s      |
| Michelle Wang      |                  |      |                        |        |            |

#### C. Change dtype and use only required columns

In [9]:
%%timeit -r 1

use_cols = ["time", "rain (mm/day)", "model"]
dtypes = {"rain (mm/day)": "float32", "model": "str"}

df = pd.read_csv(
    os.path.join(out_processed_dir, "processed_rainfall.csv"),
    index_col=0,
    parse_dates=True,
    usecols=use_cols,
    dtype=dtypes,
)

# Get the model counts
print("Model counts:")
print(df.model.value_counts())

# Describe the data
print("Data description:")
print(df.describe())



Model counts:
MPI-ESM1-2-HR       5154240
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
SAM0-UNICON         3541153
GFDL-CM4            3219300
FGOALS-f3-L         3219300
GFDL-ESM4           3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
Data description:
       rain (mm/day)
count   5.924854e+07
mean    1.901173e+00
std     5.585735e+00
min    -3.807373e-12
25%     3.838413e-06
50%     6.154947e-02
75%     1.020918e+00
max     4.329395e+

#### Compare run times on different machines

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 46.1s      |
| Mahsa Sarafrazi    |                  |      |                        |        |            |
| Gabe Fairbrother   |  Windows 10      | 32GB | Intel® Core™ i7-10875H | Yes    |    1min 21s|
| Michelle Wang      |                  |      |                        |        |            |

#### D. Use chunks

In [10]:
gc.collect()

3

In [11]:
%%timeit -r 1

df = pd.DataFrame()
for chunk in pd.read_csv(
    os.path.join(out_processed_dir, "processed_rainfall.csv"),
    index_col=0,
    parse_dates=True, 
    chunksize=1_000_000):
    df = pd.concat([df, chunk])

# Get the model counts
print("Model counts:")
print(df.model.value_counts())

# Describe the data
print("Data description:")
print(df.describe())

Model counts:
MPI-ESM1-2-HR       5154240
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
SAM0-UNICON         3541153
GFDL-CM4            3219300
FGOALS-f3-L         3219300
GFDL-ESM4           3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
Data description:
            lat_min       lat_max       lon_min       lon_max  rain (mm/day)
count  5.924854e+07  6.246784e+07  5.924854e+07  6.246784e+07   5.924854e+07
mean  -3.310482e+01 -3.197757e+0

#### Compare run times on different machines

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 1min 34s   |
| Mahsa Sarafrazi    |                  |      |                        |        |            |
| Gabe Fairbrother   |  Windows 10      | 32GB | Intel® Core™ i7-10875H | Yes    |     2min 12s      |
| Michelle Wang      |                  |      |                        |        |            |

### 5. Perform a simple EDA in R

**Approach to transfer data from python to R**

We are more inclined to use the 'Arrow Exchange' method. Using 'Apache Arrow' as an intermediate step can speed up the conversion of a 'pandas. DataFrame'. The pyarrow package uses compiled code to efficiently convert a 'pandas. DataFrame' to a 'Arrow' data structure, and the R package arrow can do the same from a 'Arrow' data structure to a 'R data.frame'.

Time spent on this serialization/deserialization process is very less and is also a zero-copy process.

In [16]:
df = pd.DataFrame()
for chunk in pd.read_csv(
    os.path.join(out_processed_dir, "processed_rainfall.csv"),
    index_col=0,
    parse_dates=True, 
    chunksize=1_000_000):
    df = pd.concat([df, chunk])

In [17]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [18]:
%%time
rdf = pyra.converter.py2rpy(pa.Table.from_pandas(df))

CPU times: user 2.41 s, sys: 429 ms, total: 2.84 s
Wall time: 1.71 s


In [21]:
%%time
%%R -i rdf
library(dplyr)

# Get the model counts
print("Model counts:")
print(count(rdf, model, sort = TRUE))

# Describe the data
print("Data description:")
print(summary(rdf))


[1] "Model counts:"
InMemoryDataset (query)
model: string
n: int32

* Sorted by n [desc]
See $.data for the source Arrow object
[1] "Data description:"
              Length   Class        Mode       
lat_min       62467843 ChunkedArray environment
lat_max       62467843 ChunkedArray environment
lon_min       62467843 ChunkedArray environment
lon_max       62467843 ChunkedArray environment
rain (mm/day) 62467843 ChunkedArray environment
model         62467843 ChunkedArray environment
time          62467843 ChunkedArray environment
CPU times: user 45.2 ms, sys: 616 µs, total: 45.8 ms
Wall time: 38.4 ms


### 