### Import libraries

In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

### Global variables

In [2]:
# Figshare article that hosts the dataset, and the API endpoint describing it.
article_id = 14096681
url = "https://api.figshare.com/v2/articles/" + str(article_id)
headers = {"Content-Type": "application/json"}

# Working directories, relative to the notebook, one per pipeline stage.
raw_data_directory = "../data/raw/"  # note: this one carries a trailing slash
unzip_directory = "../data/unzip"
combined_directory = "../data/combined"

# Names of the files attached to the article that should be downloaded.
files_to_dl = ["data.zip"]

# Column dtypes applied when the combined CSV is read back in.
# NOTE(review): float16 keeps only ~3 significant decimal digits - confirm
# that this precision is acceptable for the coordinate and rainfall columns.
dtypes = {
    **dict.fromkeys(
        ["lat_min", "lat_max", "lon_min", "lon_max", "rain (mm/day)"],
        "float16",
    ),
    "model": "str",
}

### Check files from API

In [3]:
# Fetch the article metadata from the Figshare API and list its files.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting it surface
# as a confusing KeyError on "files" below.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()
files = data["files"]
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

### 3.1 Download file

In [4]:
%%time
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(raw_data_directory, exist_ok=True)
        urlretrieve(file["download_url"], raw_data_directory + file["name"])

CPU times: user 5.37 s, sys: 4.88 s, total: 10.3 s
Wall time: 2min 10s


### 3.2 Unzip the file

In [5]:
%%time
os.makedirs(unzip_directory, exist_ok=True)
with zipfile.ZipFile(os.path.join(raw_data_directory, "data.zip"), 'r') as f:
    f.extractall(unzip_directory)

CPU times: user 22.9 s, sys: 5.19 s, total: 28.1 s
Wall time: 32.1 s


### 4.0 Remove unnecessary csv

In [6]:
# Delete the observed-rainfall CSV from the unzip directory (the merge step
# globs every CSV found there). The existence check keeps this cell
# re-runnable: a second run would otherwise raise FileNotFoundError.
observed_csv = os.path.join(unzip_directory, "observed_daily_rainfall_SYD.csv")
if os.path.exists(observed_csv):
    os.remove(observed_csv)

### 4.1 Merge files

In [7]:
%%time
files = glob.glob(unzip_directory + '/*.csv')
df = pd.concat((pd.read_csv(file, index_col=0)
                .assign(model=re.findall(r'(?<=unzip\/).+(?=_daily)', file)[0])
                for file in files)
              )
os.makedirs(combined_directory, exist_ok=True)
df.to_csv(combined_directory + "/combined_data.csv")

CPU times: user 8min 39s, sys: 29.7 s, total: 9min 8s
Wall time: 9min 36s


In [9]:
%%sh
# Report the on-disk size of the combined CSV in human-readable units.
du -sh ../data/combined/combined_data.csv

5.6G	../data/combined/combined_data.csv


### 4.2 Time comparison of merging files

| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Arushi Ahuja| MacOS | 8GB | Intel Core i5 | Yes| 9 min 36s |
| Dongxiao Li |                  |     |           |        |            |
| Simon Guo   |                  |     |           |        |            |
| Thomas Siu  | MacOS | 16GB | Apple M1 | Yes |    6min 30s        |


**TODO:** Summarize the observations from the timing comparison above.

### 5.1.1 Load the combined csv (changing dtype) and EDA

In [16]:
%%time
df = pd.read_csv(combined_directory + "/combined_data.csv", parse_dates=['time'], dtype=dtypes)
print(df["model"].value_counts())

MPI-ESM1-2-HR       5154240
CMCC-CM2-HR4        3541230
CMCC-ESM2           3541230
CMCC-CM2-SR5        3541230
NorESM2-MM          3541230
TaiESM1             3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
GFDL-CM4            3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
FGOALS-g3           1287720
KIOST-ESM           1287720
AWI-ESM-1-1-LR       966420
MPI-ESM1-2-LR        966420
NESM3                966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
CPU times: user 1min 30s, sys: 13.6 s, total: 1min 44s
Wall time: 1min 57s


### 5.1.3 Time comparison

| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Arushi Ahuja| MacOS | 8GB | Intel Core i5 | Yes|  1 min 57s          |
| Dongxiao Li |                  |     |           |        |            |
| Simon Guo   |                  |     |           |        |            |
| Thomas Siu  | MacOS | 16GB | Apple M1 | Yes |           |

### 5.2.1 Load the combined CSV (loading only the columns we want) and EDA

In [18]:
%%time
df = df.drop(columns = ["lat_min", "lat_max", "lon_min", "lon_max"])
print(df["model"].value_counts())

MPI-ESM1-2-HR       5154240
CMCC-CM2-HR4        3541230
CMCC-ESM2           3541230
CMCC-CM2-SR5        3541230
NorESM2-MM          3541230
TaiESM1             3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
GFDL-CM4            3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
FGOALS-g3           1287720
KIOST-ESM           1287720
AWI-ESM-1-1-LR       966420
MPI-ESM1-2-LR        966420
NESM3                966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
CPU times: user 4.99 s, sys: 413 ms, total: 5.41 s
Wall time: 5.53 s


### 5.2.3 Time comparison

| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Arushi Ahuja| MacOS | 8GB | Intel Core i5 | Yes| 5.41 s           |
| Dongxiao Li |                  |     |           |        |            |
| Simon Guo   |                  |     |           |        |            |
| Thomas Siu  | MacOS | 16GB | Apple M1 | Yes |          |

### 5.3 Summary and observations

**TODO:** Summarize the observations.

### 6.1 Import R libraries

In [11]:
# Load the rpy2 IPython extension, which the %%R cell below relies on.
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


Run the following in a terminal, inside the `525` conda environment:

`conda install -c conda-forge r-dplyr`

In [47]:
%%R
# Attach arrow and dplyr. suppressMessages() hides the messages emitted while
# attaching, and warn.conflicts = FALSE silences the masked-object notices.
suppressMessages(library(arrow, warn.conflicts = FALSE))
suppressMessages(library(dplyr, warn.conflicts = FALSE))

### 6.2 Transfer data to R

### 6.3 EDA in R

### 6.4 Discussions

Discuss why you chose this approach over others.

### 7. Challenges and Difficulties

Discuss any challenges or difficulties you faced when dealing with this large amount of data on your laptops. Briefly explain your approach to overcome the challenges or reasons why you could not overcome them.