# Milestone 1

In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

## Download data and unzip the file

In [2]:
# Adapted from the course lecture notes.

# figshare API endpoint that returns the metadata of the article below.
article_id = 14096681
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}

# Folder, one level above the working directory, where the archive is saved
# and extracted.
output_directory = os.path.join(os.pardir, "data/rainfall/")

In [3]:
# Fetch the article metadata from the figshare API; its "files" entry lists
# every file attached to the article with a name and a download URL.
response = requests.get(url, headers=headers, timeout=60)
# Fail here with a clear HTTP error rather than later with a confusing
# KeyError / JSON error when the API call did not succeed.
response.raise_for_status()
data = response.json()
files = data["files"]
files_to_dl = ["data.zip"]

# Create the target folder once, then download only the requested files.
os.makedirs(output_directory, exist_ok=True)
for file in files:
    if file["name"] in files_to_dl:
        # os.path.join works whether or not output_directory ends with a separator.
        urlretrieve(file["download_url"], os.path.join(output_directory, file["name"]))

# Unpack the downloaded archive into the same folder.
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

## Combine CSVs into 1 (written to data/combined_data.csv)

In [49]:
%%time
# Gather the CSVs to combine. The observed-rainfall file sits in the same folder
# but is left out of the combined file. It is filtered by file name, so the cell
# neither depends on an exact path-string match nor fails when that file is
# absent. The list is sorted because glob returns matches in arbitrary order,
# which would otherwise make the row order of the output non-reproducible.
rainfall_dir = os.path.join(os.path.pardir, "data/rainfall")
observed_name = "observed_daily_rainfall_SYD.csv"
files = sorted(
    path
    for path in glob.glob(os.path.join(rainfall_dir, "*.csv"))
    if os.path.basename(path) != observed_name
)

# The part of each file name before the first "_" is stored in a new `model`
# column, so rows stay attributable to their source file after stacking.
df = pd.concat(
    pd.read_csv(path, index_col=0).assign(model=os.path.basename(path).split("_")[0])
    for path in files
)
df.to_csv(os.path.join(os.path.pardir, "data/combined_data.csv"))

CPU times: total: 5min 46s
Wall time: 5min 47s


## EDA using pandas

Reducing memory usage: load only the required columns (time, model, rain) and cast rain to float32.

In [50]:
%%time
# Load only the columns needed for the EDA and have read_csv produce the
# rainfall column as float32 directly, instead of loading it at the default
# dtype and converting afterwards. Unlike astype(..., errors='ignore'), a value
# that cannot be parsed as a float raises here rather than silently leaving the
# column at a larger dtype.
use_cols = ['time', 'rain (mm/day)', 'model']
df = pd.read_csv(
    os.path.join(os.path.pardir, "data/combined_data.csv"),
    usecols=use_cols,
    dtype={'rain (mm/day)': 'float32'},
)
# Row count per model, shown as the cell output.
df['model'].value_counts()

CPU times: total: 44.1 s
Wall time: 44.9 s


MPI-ESM1-2-HR       5154240
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-CM4            3219300
GFDL-ESM4           3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64

| Team Member | Operating System | RAM          | Processor             | Is SSD | CPU time   | Wall time  |
|:-----------:|:----------------:|:------------:|:---------------------:|:------:|:----------:|:----------:|
| Rev         |MacOS             |8GB 3733MHz   |Intel i3 1.1GHz        |Yes     |6m56s       |7m47s       |
| Caroline    |Windows 10        |16GB 3200MHz  |Intel i7-11800H 2.3GHz |Yes     |5m47s       |5m49s       |
| Sneha       |Windows 11        |16GB 4800MHz  |Intel i7-12700H 2.3GHz |Yes     |5m20s       |5m39s       |
| Renzo       |Windows 10        |8GB 2400MHz   |Intel i5-7300HQ 2.5GHz |Yes     |12m8s       |13m52s      |