In [1]:
##import libraries
#import re
#import os
#import glob
#import zipfile
#import requests
#from urllib.request import urlretrieve
#import json
#import pandas as pd
#from memory_profiler import memory_usage

In [13]:
import io
import os
import json
import glob
#import intake
import requests
import numpy as np
import pandas as pd
#import xarray as xr
from urllib.request import urlretrieve
#import proplot as pplot
#from joblib import Parallel, delayed
#import warnings
#warnings.filterwarnings("ignore")  # ignore some annoying matplotlib warnings
from memory_profiler import memory_usage
import zipfile

In [14]:
# Load the IPython extensions used by later cells:
#   - rpy2.ipython: presumably for the R-based EDA in section 6 (confirm)
#   - memory_profiler: supplies the %memit / %%memit magics used for memory measurements
%load_ext rpy2.ipython
%load_ext memory_profiler

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


### 3. Downloading the data

In [9]:
# Settings for the figshare download: which article to fetch and where to store it
article_id = 14096681  # figshare's unique identifier for the article
output_directory = "figshare/"  # local folder that receives the downloaded files
headers = {
    "Content-Type": "application/json",
}
# Metadata endpoint of the figshare v2 API for this article
url = f"https://api.figshare.com/v2/articles/{article_id}"

In [10]:
# Fetch the article's metadata record from the figshare API.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of failing later with
# a confusing JSON/KeyError.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()  # the full article record, feel free to check it out
files = data["files"]  # one metadata dict per file attached to the article

[{'is_link_only': False,
  'name': 'daily_rainfall_2014.png',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'id': 26579150,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'size': 58863},
 {'is_link_only': False,
  'name': 'environment.yml',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'id': 26579171,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'size': 192},
 {'is_link_only': False,
  'name': 'README.md',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'id': 26586554,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'size': 5422},
 {'is_link_only': False,
  'name': 'data.zip',
  'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'id': 26766812,
  'download_url': 'https://

In [None]:
%%time
#download readme and data.zip files
files_to_dl = ["README.md", "data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

In [None]:
%%time
#extract zip files to repo
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

### 4. Combining data CSVs

In [24]:
%%time
%memit

#4

# Shows time that regular python takes to merge file
# Join all data together
import pandas as pd
use_cols = ["time",'lat_min','lat_max','lon_min', 'lon_max', 'rain (mm/day)']

files = glob.glob('./figshare/*.csv')

df_all = None

for file in files:
    
    filename = os.path.basename(file)
    
    if '_daily_rainfall_NSW.csv' in filename:
        print(f"Processing the file {filename}")
        model = filename.split('_daily_rainfall_NSW.csv')[0]

        df = pd.read_csv(file, usecols=use_cols, index_col=0)
        df['model'] = model    

        if df_all is None:
            df_all = df
        else:
            df_all = df_all.append(df)

peak memory: 4924.14 MiB, increment: 0.14 MiB
Processing the file ACCESS-CM2_daily_rainfall_NSW.csv
Processing the file ACCESS-ESM1-5_daily_rainfall_NSW.csv
Processing the file AWI-ESM-1-1-LR_daily_rainfall_NSW.csv
Processing the file BCC-CSM2-MR_daily_rainfall_NSW.csv
Processing the file BCC-ESM1_daily_rainfall_NSW.csv
Processing the file CanESM5_daily_rainfall_NSW.csv
Processing the file CMCC-CM2-HR4_daily_rainfall_NSW.csv
Processing the file CMCC-CM2-SR5_daily_rainfall_NSW.csv
Processing the file CMCC-ESM2_daily_rainfall_NSW.csv
Processing the file EC-Earth3-Veg-LR_daily_rainfall_NSW.csv
Processing the file FGOALS-f3-L_daily_rainfall_NSW.csv
Processing the file FGOALS-g3_daily_rainfall_NSW.csv
Processing the file GFDL-CM4_daily_rainfall_NSW.csv
Processing the file GFDL-ESM4_daily_rainfall_NSW.csv
Processing the file INM-CM4-8_daily_rainfall_NSW.csv
Processing the file INM-CM5-0_daily_rainfall_NSW.csv
Processing the file KIOST-ESM_daily_rainfall_NSW.csv
Processing the file MIROC6_dai

In [25]:
# Persist the combined DataFrame (index included) as one CSV alongside the source files
df_all.to_csv(path_or_buf='./figshare/combined_data.csv')

In [29]:
%%sh 
# Report the on-disk size of the combined CSV as a single human-readable total
du -sh figshare/combined_data.csv

5.7G	figshare/combined_data.csv


**Observations**

Our team members had the following computer specs:  

| Team Member       | Ram     | Processor     |
| :------------- | :----------: | -----------: |
|  Cal | 16GB   | AMD Ryzen 5 3600 6-core    |
| Justin  | 32GB  | Intel i5 | 
| Anita   |  x |x  | 
| Yuan  |  x |x  | 

We found that writing this combined CSV file, depending on our computer specs and write method, resulted in the following processing times and peak memory usage: 

| Team Member       | Write Method     | Processing Time     | Peak Memory Usage |
| :------------- | :----------: | -----------: | -----------: |
|  Cal | Pandas   | 1min 15sec  | 4924 MB |
|  Justin | x   | 1min 07sec  | 1593 MB |
|  Anita | x   | xmin xsec  | x MB |
|  Yuan | x   | xmin xsec  | x MB |

### 5. Load the combined CSV to memory and perform a simple EDA

In [31]:
%%time
%%memit
# Baseline load: read every column of the combined CSV while measuring wall time
# and memory, then print the number of rows contributed by each model.
df = pd.read_csv("figshare/combined_data.csv")
print(df["model"].value_counts())

# Recorded run: peak memory 10787 MiB, wall time 1 min 3 s

MPI-ESM1-2-HR       5154240
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
TaiESM1             3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-ESM4           3219300
GFDL-CM4            3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM-1-2-HAM      966420
NESM3                966420
MPI-ESM1-2-LR        966420
AWI-ESM-1-1-LR       966420
NorESM2-LM           919800
CanESM5              551880
BCC-ESM1             551880
Name: model, dtype: int64
peak memory: 10787.64 MiB, increment: 4435.55 MiB
Wall time: 1min 3s


In [39]:
%%time
%%memit
# Reduced load: read only the columns in use_cols so the time and memory can be
# compared with the full read of the same file.
use_cols = ["time", "rain (mm/day)", "model"]  # TODO: decide whether other columns are needed
df = pd.read_csv("figshare/combined_data.csv", usecols = use_cols)
print(df["model"].value_counts())

# NOTE(review): `df` presumably still holds the full frame from the previous read
# while this one runs, which would inflate the reported peak — confirm before
# comparing peak values between the two cells.
# Recorded run: peak memory 10946 MiB, wall time 53.1 s

MPI-ESM1-2-HR       5154240
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
TaiESM1             3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-ESM4           3219300
GFDL-CM4            3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM-1-2-HAM      966420
NESM3                966420
MPI-ESM1-2-LR        966420
AWI-ESM-1-1-LR       966420
NorESM2-LM           919800
CanESM5              551880
BCC-ESM1             551880
Name: model, dtype: int64
peak memory: 10946.77 MiB, increment: 2129.70 MiB
Wall time: 53.1 s


**Observations**

By loading only 3 of the original 7 columns of our combined CSV, we obtained the following results: 
- peak memory was not improved: 10.9 GB (reduced columns) versus 10.8 GB (all columns)
- incremental memory usage was cut roughly in half: 2.1 GB (reduced columns) versus 4.4 GB (all columns)
- processing run time was marginally improved: 53.1 seconds (reduced columns) versus 1 min 3 seconds (all columns)

### 6. Perform a simple EDA in R