## 1. Download And Extract

In [1]:
import glob
import json
import os
import re
import zipfile
from urllib.request import urlretrieve

import pandas as pd
import requests

In [2]:
# Configuration for the figshare download.
# Local folder that the downloaded and extracted data is written to.
output_directory = "../data/"

# Unique figshare identifier of the article, and the API URL built from it.
article_id = 14_096_681
url = f"https://api.figshare.com/v2/articles/{article_id}"

# Headers sent along with the API request.
headers = {"Content-Type": "application/json"}

In [3]:
# Get the files data
# Ask the figshare API for the article's metadata. The timeout keeps the cell
# from hanging indefinitely on a stalled connection, and raise_for_status()
# reports an HTTP error here instead of as a KeyError on "files" below.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()
# Each entry describes one file attached to the article (name, size, download_url, ...).
files = data["files"]
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

In [5]:
%%time
# Downlaod the file
files_to_dl = ["data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Vanessa Yuen  | Windows 10 Pro | 16GB  | AMD Ryzen 7  |  Y      |  1min 13s |
| Zheren Xu   | macOS Big Sur Version 11.6  |   16GB |  Apple M1 |Y|57.2s|
| Member 3    |                  |     |           |        |            |
| Member 4    |                  |     |           |        |            |

In [20]:
%%time
# Extract the zip file
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), "r") as f:
    f.extractall(output_directory)

CPU times: user 14.5 s, sys: 1.11 s, total: 15.6 s
Wall time: 15.7 s


| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Vanessa Yuen  | Windows 10 Pro | 16GB  | AMD Ryzen 7  |  Y      | 23.5 s |
| Zheren Xu   | macOS Big Sur Version 11.6  |   16GB |  Apple M1 |Y|15.9 s|
| Member 3    |                  |     |           |        |            |
| Member 4    |                  |     |           |        |            |

## 2. Combining data CSVs

In [21]:
%%time
# Combine the CSV
# Remember to remove 'observed_daily_rainfall_SYD.csv' before running this cell
use_cols = [
    "time",
    "lat_min",
    "lat_max",
    "lon_min",
    "lon_max",
    "rain (mm/day)",
]

files = glob.glob("../data/*.csv")

df = pd.concat(
    (
        pd.read_csv(file, index_col=0, usecols=use_cols).assign(
            model=re.findall(r"[^\\]+(?=\_daily_rainfall)", file)[0]
        )
        for file in files
    )
)
df.to_csv("../data/combined_data.csv")

| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Vanessa Yuen  | Windows 10 Pro | 16GB  | AMD Ryzen 7  |  Y      |   7min 17s |
| Zheren Xu   | macOS Big Sur  |   16GB |  Apple M1 |Y|6min 14s|
| Member 3    |                  |     |           |        |            |
| Member 4    |                  |     |           |        |            |

### **Add observations here**

## 3. Simple EDA

### 3.1 Sanity Check

In [18]:
# Show the frame's dimensions first, then its first five rows
for summary in (df.shape, df.head()):
    print(summary)

(62467843, 4)
     lat_min   lon_min  rain (mm/day)                    model
0 -35.439867  141.5625   4.244226e-13  ../data/MPI-ESM-1-2-HAM
1 -35.439867  141.5625   4.217326e-13  ../data/MPI-ESM-1-2-HAM
2 -35.439867  141.5625   4.498125e-13  ../data/MPI-ESM-1-2-HAM
3 -35.439867  141.5625   4.251282e-13  ../data/MPI-ESM-1-2-HAM
4 -35.439867  141.5625   4.270161e-13  ../data/MPI-ESM-1-2-HAM


In [9]:
# Print the index range, column dtypes and memory usage of df
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62467843 entries, 1889-01-01 12:00:00 to 2014-12-31 12:00:00
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   lat_min        float64
 1   lat_max        float64
 2   lon_min        float64
 3   lon_max        float64
 4   rain (mm/day)  float64
 5   model          object 
dtypes: float64(5), object(1)
memory usage: 3.3+ GB


### 3.2 Load just the columns we want

#### Load all columns

In [14]:
%%time
df_all_col = pd.read_csv("../data/combined_data.csv")
print(df_all_col["model"].value_counts())

../data/MPI-ESM1-2-HR       5154240
../data/CMCC-CM2-HR4        3541230
../data/CMCC-ESM2           3541230
../data/CMCC-CM2-SR5        3541230
../data/NorESM2-MM          3541230
../data/TaiESM1             3541230
../data/SAM0-UNICON         3541153
../data/GFDL-ESM4           3219300
../data/FGOALS-f3-L         3219300
../data/GFDL-CM4            3219300
../data/MRI-ESM2-0          3037320
../data/EC-Earth3-Veg-LR    3037320
../data/BCC-CSM2-MR         3035340
../data/MIROC6              2070900
../data/ACCESS-CM2          1932840
../data/ACCESS-ESM1-5       1610700
../data/INM-CM4-8           1609650
../data/INM-CM5-0           1609650
../data/FGOALS-g3           1287720
../data/KIOST-ESM           1287720
../data/AWI-ESM-1-1-LR       966420
../data/MPI-ESM1-2-LR        966420
../data/NESM3                966420
../data/MPI-ESM-1-2-HAM      966420
../data/NorESM2-LM           919800
../data/BCC-ESM1             551880
../data/CanESM5              551880
Name: model, dtype: int64
CP

#### Load just what is needed

In [15]:
%%time
use_cols = ['lat_min','lon_min','rain (mm/day)','model']
df = pd.read_csv("../data/combined_data.csv",usecols=use_cols)
print(df["model"].value_counts())

../data/MPI-ESM1-2-HR       5154240
../data/CMCC-CM2-HR4        3541230
../data/CMCC-ESM2           3541230
../data/CMCC-CM2-SR5        3541230
../data/NorESM2-MM          3541230
../data/TaiESM1             3541230
../data/SAM0-UNICON         3541153
../data/GFDL-ESM4           3219300
../data/FGOALS-f3-L         3219300
../data/GFDL-CM4            3219300
../data/MRI-ESM2-0          3037320
../data/EC-Earth3-Veg-LR    3037320
../data/BCC-CSM2-MR         3035340
../data/MIROC6              2070900
../data/ACCESS-CM2          1932840
../data/ACCESS-ESM1-5       1610700
../data/INM-CM4-8           1609650
../data/INM-CM5-0           1609650
../data/FGOALS-g3           1287720
../data/KIOST-ESM           1287720
../data/AWI-ESM-1-1-LR       966420
../data/MPI-ESM1-2-LR        966420
../data/NESM3                966420
../data/MPI-ESM-1-2-HAM      966420
../data/NorESM2-LM           919800
../data/BCC-ESM1             551880
../data/CanESM5              551880
Name: model, dtype: int64
CP

#### Compare

| Team Member | Operating System | RAM | Processor | Is SSD | Load All Cols| Load what is needed |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|:----------:|
| Vanessa Yuen  | Windows 10 Pro | 16GB  | AMD Ryzen 7  |  Y      |   ||
| Zheren Xu   | macOS Big Sur |   16GB |  Apple M1 |Y|57.8s|41.2s|
| Member 3    |                  |     |           |        |            ||
| Member 4    |                  |     |           |        |            ||

### **Add observations here**

## 4. Simple EDA in R

In [18]:
# Load the rpy2 IPython extension so that the %%R cell magic used below is available
%load_ext rpy2.ipython



In [19]:
%%R

UsageError: %%R is a cell magic, but the cell body is empty. Did you mean the line magic %R (single %)?
