### 1. Import libraries

In [None]:
import pandas as pd
import requests
import json
import os
import shutil
from tqdm.auto import tqdm
import zipfile
import glob
import re
import pyarrow as pa
import rpy2_arrow.pyarrow_rarrow as pyra


### 2. Download the data

In [None]:
# Metadata
article_id = 14096681
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
out_dir = os.path.join("../data", "raw", "figshare")
file_to_download = "data.zip"
zip_path = os.path.join(out_dir, file_to_download)

# Get file url (fail loudly on an HTTP error or when the file is not listed)
response = requests.get(url, headers=headers)
response.raise_for_status()
file_url = next(
    (
        item_["download_url"]
        for item_ in response.json()["files"]
        if item_["name"] == file_to_download
    ),
    None,
)
if file_url is None:
    raise ValueError(f"{file_to_download} is not listed in figshare article {article_id}")

# The directory must exist before it can be listed or written to;
# on a fresh checkout os.listdir() would otherwise raise FileNotFoundError
os.makedirs(out_dir, exist_ok=True)

# Check if file has already been downloaded
if os.listdir(out_dir):
    print("File already exists. Skipping.")
else:
    print(f"Writing file {file_to_download} to directory {out_dir}")

    try:
        # Create an HTTP request
        with requests.get(file_url, stream=True) as r:
            # Do not save an error page as if it were the archive
            r.raise_for_status()

            # Check content length; the header is optional, and total=None
            # makes tqdm show a running byte count without a percentage
            content_length = r.headers.get("Content-Length")
            total_bytes = int(content_length) if content_length is not None else None

            # Display progress bar
            with tqdm.wrapattr(r.raw, "read", total=total_bytes, desc="") as raw:

                # Save file
                with open(zip_path, "wb") as path:
                    shutil.copyfileobj(raw, path)
    except BaseException:
        # A partial archive would make the "already exists" check above pass
        # on the next run, so remove it before re-raising
        if os.path.exists(zip_path):
            os.remove(zip_path)
        raise

    print("Download complete.")

    # Unzip file with python
    print("Unzipping file...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(out_dir)  # Extract all files to directory
    print("Unzipping complete.")


### 3. Combine the data CSVs

In [None]:
# Name of the CSV that should be left out of the combined data set
file_to_exclude = "observed_daily_rainfall_SYD.csv"

# Every CSV found directly inside the raw-data directory
files = glob.glob(f"{out_dir}/*.csv")

# Directory where the combined CSV is written
out_processed_dir = os.path.join("../data", "processed", "figshare")

In [None]:
%%timeit -r 1

# Combine data
df = pd.concat(
    (
        pd.read_csv(file, index_col=0).assign(model=re.findall(r"[A-Z][^_]+", file)[0])
        for file in files
        if file != file_to_exclude
    )
)

# Write to file
os.makedirs(out_processed_dir, exist_ok=True)  
df.to_csv(os.path.join(out_processed_dir, "processed_rainfall.csv"))


#### Compare run times on different machines

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 4min 50s   |
| Mahsa Sarafrazi    |                  |      |                        |        |            |
| Gabe Fairbrother   |                  |      |                        |        |            |
| Michelle Wang      |                  |      |                        |        |            |

### 4. Load the combined CSV to memory and perform a simple EDA

#### A. Load all columns

In [None]:
%%timeit -r 1

# Load the data
df = pd.read_csv(os.path.join(out_processed_dir, "processed_rainfall.csv"), index_col=0)

# Get the model counts
print("Model counts:")
print(df.model.value_counts())

# Describe the data
print("Data description:")  
print(df.describe())



#### Compare run times on different machines

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 6min 39s   |
| Mahsa Sarafrazi    |                  |      |                        |        |            |
| Gabe Fairbrother   |                  |      |                        |        |            |
| Michelle Wang      |                  |      |                        |        |            |

#### B. Load only required columns


In [None]:
%%timeit -r 1
use_cols = ["time", "rain (mm/day)", "model"]
df = pd.read_csv(
    os.path.join(out_processed_dir, "processed_rainfall.csv"),
    index_col=0,
    parse_dates=True,
    usecols=use_cols,
)

# Get the model counts
print("Model counts:")
print(df.model.value_counts())

# Describe the data
print("Data description:")
print(df.describe())


#### Compare run times on different machines

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 6min 39s   |
| Mahsa Sarafrazi    |                  |      |                        |        |            |
| Gabe Fairbrother   |                  |      |                        |        |            |
| Michelle Wang      |                  |      |                        |        |            |

#### C. Change dtype and use only required columns

In [None]:
%%timeit -r 1

use_cols = ["time", "rain (mm/day)", "model"]
dtypes = {"rain (mm/day)": "float32", "model": "str"}

df = pd.read_csv(
    os.path.join(out_processed_dir, "processed_rainfall.csv"),
    index_col=0,
    parse_dates=True,
    usecols=use_cols,
    dtype=dtypes,
)

# Get the model counts
print("Model counts:")
print(df.model.value_counts())

# Describe the data
print("Data description:")
print(df.describe())



#### Compare run times on different machines

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 6min 39s   |
| Mahsa Sarafrazi    |                  |      |                        |        |            |
| Gabe Fairbrother   |                  |      |                        |        |            |
| Michelle Wang      |                  |      |                        |        |            |

#### D. Use chunks

In [None]:
%%timeit -r 1

df = pd.DataFrame()
for chunk in pd.read_csv(
    os.path.join(out_processed_dir, "processed_rainfall.csv"),
    index_col=0,
    parse_dates=True, 
    chunksize=1_000_000):
    df = df.append(chunk)

# Get the model counts
print("Model counts:")
print(df.model.value_counts())

# Describe the data
print("Data description:")
print(df.describe())

#### Compare run times on different machines

| Team Member        | Operating System | RAM  | Processor              | Is SSD | Time taken |
|:------------------:|:----------------:|:----:|:----------------------:|:------:|:----------:|
| Rakesh Pandey      | Ubuntu 20.04     | 32GB | Intel® Core™ i7-10870H | Yes    | 6min 39s   |
| Mahsa Sarafrazi    |                  |      |                        |        |            |
| Gabe Fairbrother   |                  |      |                        |        |            |
| Michelle Wang      |                  |      |                        |        |            |

### 5. Perform a simple EDA in R

**Approach to transfer data from python to R**

We chose the 'Arrow exchange' method. Using Apache Arrow as an intermediate step can speed up the conversion of a `pandas.DataFrame`. The pyarrow package uses compiled code to efficiently convert a `pandas.DataFrame` to an Arrow data structure, and the R package arrow can do the same from an Arrow data structure to an R `data.frame`.

Very little time is spent on this serialization/deserialization process, and it is also a zero-copy process.

In [None]:
# Load the rpy2 IPython extension; the %%R cell magic is used further down
%load_ext rpy2.ipython

In [None]:
%%time
rdf = pyra.converter.py2rpy(pa.Table.from_pandas(df))

In [None]:
%%time
%%R -i rdf
library(dplyr)

# Tally the rows per model, largest count first
print("Model counts:")
rdf %>% count(model, sort = TRUE) %>% print()

# Summarise the transferred data
print("Data description:")
rdf %>% summary() %>% print()


### 