### 1. Import libraries

In [6]:
import pandas as pd
import requests
import json
import os
import shutil
from tqdm.auto import tqdm
import zipfile
import glob
import re
import pyarrow as pa
import rpy2_arrow.pyarrow_rarrow as pyra
import pyarrow.dataset as ds
import gc

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


  from .autonotebook import tqdm as notebook_tqdm


### 2. Download the data

In [2]:
# Metadata
article_id = 14096681
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
out_dir = os.path.join(os.getcwd(), "..", "data", "raw", "figshare")
file_to_download = "data.zip"
zip_path = os.path.join(out_dir, file_to_download)

# Get file url
response = requests.get(url, headers=headers)
response.raise_for_status()  # surface an HTTP error here instead of a KeyError on "files"
file_url = next(
    (
        item_["download_url"]
        for item_ in response.json()["files"]
        if item_["name"] == file_to_download
    ),
    None,
)
if file_url is None:
    raise IndexError(
        f"{file_to_download} is not listed in the files of figshare article {article_id}"
    )

# Check if file has already been downloaded
if os.path.exists(zip_path):
    print("File already exists. Skipping.")
else:
    print(f"Writing file file {file_to_download} to directory {out_dir}")

    # exist_ok: the directory may already exist from an earlier, incomplete run
    os.makedirs(out_dir, exist_ok=True)

    # Download under a temporary name so that an interrupted transfer is never
    # mistaken for a complete archive by the exists-check above
    partial_path = zip_path + ".part"

    # Create an HTTP request
    with requests.get(file_url, stream=True) as r:
        r.raise_for_status()

        # Check content length (the header is optional; tqdm accepts total=None)
        length_header = r.headers.get("Content-Length")
        content_length = int(length_header) if length_header is not None else None

        # Display progress bar
        with tqdm.wrapattr(r.raw, "read", total=content_length, desc="") as raw:

            # Save file
            with open(partial_path, "wb") as path:
                shutil.copyfileobj(raw, path)

    # Only a fully written file gets the final name
    os.replace(partial_path, zip_path)

    print("Download complete.")

# Look for extracted CSVs in out_dir (the extraction target), so the archive is
# not extracted again on every run
if not any(fname.endswith('.csv') for fname in os.listdir(out_dir)):
    # Unzip file with python
    print("Unzipping file...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(out_dir) # Extract all files to directory
    print("Unzipping complete.")


File already exists. Skipping.
Unzipping file...
Unzipping complete.


### 3. Combining data csv

In [3]:
# File left out of the combined data set
file_to_exclude = "observed_daily_rainfall_SYD.csv"

# Destination directory for the combined CSV
out_processed_dir = os.path.join(os.getcwd(), "..", "data", "processed", "figshare")

# Every CSV sitting directly in the raw figshare directory
files = glob.glob(os.path.join(out_dir, "*.csv"))

In [4]:
%%timeit -r 1

# Combine data
df = pd.concat(
    (
        pd.read_csv(file, index_col=0).assign(model=re.findall(r"[^\/]+(?=\_daily)", file)[0])
        for file in files
        if file_to_exclude not in file
    )
)

# Write to file
os.makedirs(out_processed_dir, exist_ok=True)  
df.to_csv(os.path.join(out_processed_dir, "processed_rainfall.csv"))


15min 25s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


#### Compare run times on different machines - Combining data

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 4min 51s   |
| Mahsa Sarafrazi    | Windows 11 64-bit| 8 GB | Intel® Core™ i5-1035G4 |Yes     | 17min 4s   |
| Gabe Fairbrother   |  Windows 10      | 32GB | Intel® Core™ i7-10875H | Yes    |     6min 40s       |
| Michelle Wang      | Windows 10       | 16GB | Intel® Core™ i5-11300H | Yes    | 15min 25s  |

### 4. Load the combined CSV to memory and perform a simple EDA

#### A. Load all columns

In [5]:
# Force a garbage-collection pass before the timed load below; the cell output
# is gc.collect()'s return value.
gc.collect()

27

In [6]:
%%timeit -r 1

# Load the data
df = pd.read_csv(os.path.join(out_processed_dir, "processed_rainfall.csv"), index_col=0)

# Get the model counts
print("Model counts:")
print(df.model.value_counts())

# Describe the data
print("Data description:")  
print(df.describe())



Model counts:
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\MPI-ESM1-2-HR       5154240
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\TaiESM1             3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\NorESM2-MM          3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\CMCC-CM2-HR4        3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\CMCC-CM2-SR5        3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\CMCC-ESM2           3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\SAM0-UNICON         3541153
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\FGOALS

#### Compare run times on different machines - Load all columns

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 1min 0s   |
| Mahsa Sarafrazi    | Windows 11 64-bit   | 8 GB | Intel® Core™ i5-1035G4 | Yes    | 3min 37s  |
| Gabe Fairbrother   |  Windows 10      | 32GB | Intel® Core™ i7-10875H | Yes    |   1min 18s       |
| Michelle Wang      |  Windows 10        | 16GB | Intel® Core™ i5-11300H  |   Yes |  3min 29s          |

#### B. Load only required columns


In [7]:
# Force a garbage-collection pass before the timed load below; the cell output
# is gc.collect()'s return value.
gc.collect()

18

In [8]:
%%timeit -r 1
use_cols = ["time", "rain (mm/day)", "model"]
df = pd.read_csv(
    os.path.join(out_processed_dir, "processed_rainfall.csv"),
    index_col=0,
    parse_dates=True,
    usecols=use_cols,
)

# Get the model counts
print("Model counts:")
print(df.model.value_counts())

# Describe the data
print("Data description:")
print(df.describe())


Model counts:
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\MPI-ESM1-2-HR       5154240
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\TaiESM1             3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\NorESM2-MM          3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\CMCC-CM2-HR4        3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\CMCC-CM2-SR5        3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\CMCC-ESM2           3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\SAM0-UNICON         3541153
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\FGOALS

#### Compare run times on different machines - Load only required cols

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 46.8s      |
| Mahsa Sarafrazi    | Windows 64-bit   | 8 GB | Intel® Core™ i5-1035G4 | Yes    | 7min 34s  |
| Gabe Fairbrother   |  Windows 10      | 32GB | Intel® Core™ i7-10875H | Yes    |    1min 26s      |
| Michelle Wang      | Windows 10        | 16GB | Intel® Core™ i5-11300H  |   Yes |   3min 15s         |

> Loading only the required columns reduced the load time for two of the four team members (for example, from 1min 0s to 46.8s on the fastest machine); the other two machines were slower on this run.

#### C. Change dtype and use only required columns

In [9]:
%%timeit -r 1

use_cols = ["time", "rain (mm/day)", "model"]
dtypes = {"rain (mm/day)": "float32", "model": "str"}

df = pd.read_csv(
    os.path.join(out_processed_dir, "processed_rainfall.csv"),
    index_col=0,
    parse_dates=True,
    usecols=use_cols,
    dtype=dtypes,
)

# Get the model counts
print("Model counts:")
print(df.model.value_counts())

# Describe the data
print("Data description:")
print(df.describe())



Model counts:
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\MPI-ESM1-2-HR       5154240
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\TaiESM1             3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\NorESM2-MM          3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\CMCC-CM2-HR4        3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\CMCC-CM2-SR5        3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\CMCC-ESM2           3541230
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\SAM0-UNICON         3541153
C:\Users\macyt\Documents\UBC_Class\Labs\525_proj\dsci_525_group_22\notebooks\..\data\raw\figshare\FGOALS

#### Compare run times on different machines - Change dtype and load required cols

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 46.1s      |
| Mahsa Sarafrazi    | Windows 11 64-bit   | 8 GB | Intel® Core™ i5-1035G4 | Yes    | 9min 55s   |
| Gabe Fairbrother   |  Windows 10      | 32GB | Intel® Core™ i7-10875H | Yes    |    1min 21s|
| Michelle Wang      |  Windows 10        | 16GB | Intel® Core™ i5-11300H  |   Yes |   2min 58s         |

> Building on the above, changing the dtype slightly reduced the load time further for most of us (3 out of 4 members).

#### D. Use chunks

In [10]:
# Force a garbage-collection pass before the timed load below; the cell output
# is gc.collect()'s return value.
gc.collect()

39

In [17]:
%%timeit -r 1

df = pd.DataFrame()
counts = pd.Series(dtype=int)

for chunk in pd.read_csv(
    os.path.join(out_processed_dir, "processed_rainfall.csv"),
    index_col=0,
    parse_dates=True, 
    chunksize=1_000_000):
    df = pd.concat([df, chunk])
    

# Get the model counts
print("Model counts:")
print(df.model.value_counts())

# Describe the data   
print("Data description:")
print(df.describe())

1min 49s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


#### Compare run times on different machines - Chunking

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 1min 34s   |
| Mahsa Sarafrazi    | Windows 64-bit   | 8 GB | Intel® Core™ i5-1035G4 | Yes    | 5min 44s   | 
| Gabe Fairbrother   |  Windows 10      | 32GB | Intel® Core™ i7-10875H | Yes    |     2min 12s      |
| Michelle Wang      |  Windows 10        | 16GB | Intel® Core™ i5-11300H  |   Yes  |    3min 50s        |

> For chunking, it seems like there is not much improvement for most of us.

**EDA Python Conclusion:**
After trying out a few techniques, we can conclude that both loading only the required columns and changing datatypes are effective ways of reducing runtime.

**Plotting**


##### Reading the Dataframe:

In [14]:
# df = pd.read_csv(
#     os.path.join(out_processed_dir, "processed_rainfall.csv"),
#     index_col=0,
#     usecols=["time", "rain (mm/day)", "model"],
#     parse_dates=True,
# )
# df.head()

Unnamed: 0_level_0,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1
1889-01-01 12:00:00,3.293256e-13,C:\Users\macyt\Documents\UBC_Class\Labs\525_pr...
1889-01-02 12:00:00,0.0,C:\Users\macyt\Documents\UBC_Class\Labs\525_pr...
1889-01-03 12:00:00,0.0,C:\Users\macyt\Documents\UBC_Class\Labs\525_pr...
1889-01-04 12:00:00,0.0,C:\Users\macyt\Documents\UBC_Class\Labs\525_pr...
1889-01-05 12:00:00,0.01047658,C:\Users\macyt\Documents\UBC_Class\Labs\525_pr...


##### Sampling 1_000_000 rows

In [None]:
# df_sample = df.sample(n=1000000, random_state=42)
# df_sample.to_csv(os.path.join(out_processed_dir, "EDA.csv"))

##### Rainfall distribution:

In [None]:
# plot = (
#     alt.Chart(df_sample, title="Total rain distribution")
#     .mark_boxplot(extent="min-max")
#     .encode(alt.X("rain (mm/day)"))
# )
# plot

##### Rainfall distribution based on model:

In [None]:
# plot = (
#     alt.Chart(df_sample, title="Rain distribution based on model")
#     .mark_boxplot(extent="min-max")
#     .encode(
#         alt.X("rain (mm/day)"),
#         alt.Y(
#             "model",
#             sort=alt.EncodingSortField(
#                 field="rain (mm/day)", op="median", order="descending"
#             ),
#         ),
#         color="model",
#     )
# )
# plot

#### Rainfall histogram based on model

In [None]:
# alt.data_transformers.disable_max_rows()
# plot_hist = (
#     alt.Chart(df_sample, title="Rain fall histogram based on model")
#     .mark_bar()
#     .encode(alt.X("rain (mm/day)"), alt.Y("count():Q"), color="model")
#     .properties(width=180, height=180)
#     .facet(facet="model", columns=9)
# )
# plot_hist

### 6. Perform a simple EDA in R

We shall try out a few methods with simple EDA to test the efficiency of each method to convert data into R formats: Parquet file, Feather and Arrow.

In [13]:
# Create df with only the model column

out_processed_dir = os.path.join(os.getcwd(), "..", "data", "processed", "figshare")
use_cols = ["model"]
dtypes = {"model": "str"}

# `model` is the only column read and it is used as the index; the feather cell
# below relies on `reset_index()` to turn it back into a regular column.
# The index is named explicitly because a positional index_col is counted
# within `usecols`, which is easy to misread as "the first CSV column (time)".
# No parse_dates here: the index holds model names, not timestamps.
df = pd.read_csv(
    os.path.join(out_processed_dir, "processed_rainfall.csv"),
    index_col="model",
    usecols=use_cols,
    dtype=dtypes,
)

#### a) Parquet file method

In [14]:
%%time

if os.path.exists(os.path.join(out_processed_dir, "rainfall.parquet")):
    print("Parquet File already exists. Skipping.")
else:
    df.to_parquet(os.path.join(out_processed_dir, "rainfall.parquet"))

CPU times: total: 21.2 s
Wall time: 22.6 s


In [2]:
# (Re)load the rpy2 IPython extension, which provides the %%R cell magic used in the cells below
%reload_ext rpy2.ipython



In [17]:
%%time
%%R
# Attach arrow and dplyr with their start-up messages suppressed; `here` is
# used to build the file path below
suppressMessages(library(arrow, warn.conflicts = FALSE))
suppressMessages(library(dplyr, warn.conflicts = FALSE))
library(here)

# Open the parquet file as an Arrow dataset and define the per-model row count.
# NOTE(review): the result is never collected in this cell, so the reported
# time presumably covers only opening the dataset and building the query —
# confirm before comparing it with the other methods.
ds <- open_dataset(here("data/processed/figshare/rainfall.parquet"))
result <- ds %>% count(model, sort=TRUE)

# My windows comp crashes for this line! 
# print(result %>% collect)

CPU times: total: 62.5 ms
Wall time: 85.8 ms


> To convert pandas df to parquet, it took 22.6s and then loading the 'ds' parquet file was super fast because it hasn't processed anything. 

#### b) Feather method

In [17]:
%%time

# Create feather file from pandas df
if os.path.exists(os.path.join(out_processed_dir, "rainfall.feather")):
    print("Feather File already exists. Skipping.")
else:
    df.reset_index().to_feather(os.path.join(out_processed_dir, "rainfall.feather"))

CPU times: total: 8.55 s
Wall time: 8.56 s


In [18]:
%%R
# Attach arrow and dplyr with their start-up messages suppressed; `here` is
# used to build the file path below
suppressMessages(library(arrow, warn.conflicts = FALSE))
suppressMessages(library(dplyr, warn.conflicts = FALSE))
library(here)

# Can't seem to read this!
# NOTE(review): the path below points at rainfall.parquet rather than the
# rainfall.feather file written by the previous cell — likely the reason
# read_feather() fails; confirm by retrying with the .feather path.
# f_df <- read_feather(here("data/processed/figshare/rainfall.parquet"))

> Converting pandas df to feather file took 8.5s. 

#### c) Arrow method

In [None]:
# NOTE: This code crashes for my windows comp! Using the next cell instead.
# %%time

# dataset = ds.dataset(os.path.join(out_processed_dir, "processed_rainfall.csv"), format="csv")
# table = dataset.to_table()
# r_table = pyra.converter.py2rpy(table)

In [5]:
%%time
# Build a pyarrow Table from the pandas DataFrame, then pass it through the
# rpy2_arrow converter so it can be handed to the R cell below
rdf = pyra.converter.py2rpy(pa.Table.from_pandas(df))

CPU times: total: 7.47 s
Wall time: 7.51 s


In [21]:
%%time
%%R -i rdf
# `rdf` is passed in from the Python namespace via the -i flag above
library(dplyr)

# Get the model counts (rows per model, sorted by count)
result <- rdf %>% count(model, sort=TRUE)
print(result %>% collect())

# Describe the data
print("Data description:")
print(summary(rdf))


[38;5;246m# A tibble: 27 x 2[39m
   model                                                                       n
   [3m[38;5;246m<chr>[39m[23m                                                                   [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m [38;5;246m"[39mC:\\Users\\macyt\\Documents\\UBC_Class\\Labs\\525_proj\\dsci_525_gro~ 5.15[38;5;246me[39m6
[38;5;250m 2[39m [38;5;246m"[39mC:\\Users\\macyt\\Documents\\UBC_Class\\Labs\\525_proj\\dsci_525_gro~ 3.54[38;5;246me[39m6
[38;5;250m 3[39m [38;5;246m"[39mC:\\Users\\macyt\\Documents\\UBC_Class\\Labs\\525_proj\\dsci_525_gro~ 3.54[38;5;246me[39m6
[38;5;250m 4[39m [38;5;246m"[39mC:\\Users\\macyt\\Documents\\UBC_Class\\Labs\\525_proj\\dsci_525_gro~ 3.54[38;5;246me[39m6
[38;5;250m 5[39m [38;5;246m"[39mC:\\Users\\macyt\\Documents\\UBC_Class\\Labs\\525_proj\\dsci_525_gro~ 3.54[38;5;246me[39m6
[38;5;250m 6[39m [38;5;246m"[39mC:\\Users\\macyt\\Documents\\UBC_Class\\Labs\\525_proj\\dsci_525_gro~ 3.

> Converting pandas df to arrow table object is fast: only 7.5s. Then printing the results of count by models and summary of dataset was only around 2.5s.

**Final chosen approach: Arrow exchange**

- After experimenting with different conversion methods to R, we concluded that the 'Arrow Exchange' method works best. 
- With the parquet method, it took 22.6s just to convert the pandas DataFrame to parquet, and conversion to the feather file format took 8.5s. The fastest was Arrow, which took around 7.5s. 
- The pyarrow package uses compiled code to efficiently convert a `pandas DataFrame` to an `Arrow` data structure, and the R arrow package can do the same from an `Arrow` data structure to an `R data.frame`.
- The `arrow` table structure is also well-integrated with R's Dplyr package functionalities and makes EDA extremely fast and convenient, as exemplified in the code above where the printing of EDA results only took 2s.
- Time spent on arrow's serialization/deserialization process is minimal and is also a zero-copy process.

### 