In [5]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import rpy2.rinterface
from memory_profiler import memory_usage

In [6]:
# The rpy2 IPython extension is left disabled (commented out) in this notebook.
# %load_ext rpy2.ipython
# Register memory_profiler's IPython magics (%memit / %%memit) used by the profiling cells below.
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


## 1. Download the data

In [7]:
# Local directory that receives the files downloaded from figshare.
output_directory = "../figshareausraincloud/raw_data/"

# Unique identifier of the article on figshare, interpolated into the v2 API endpoint.
article_id = 14096681
url = f"https://api.figshare.com/v2/articles/{article_id}"

# Headers sent along with the article metadata request.
headers = {"Content-Type": "application/json"}

In [8]:
# Query the figshare API for the article's metadata.
# The timeout keeps the cell from hanging indefinitely if the API is unreachable.
response = requests.get(url, headers=headers, timeout=60)
# Surface HTTP failures here instead of as a confusing KeyError on "files" below.
response.raise_for_status()
data = response.json()   # this contains all the article's data, feel free to check it out
files = data["files"]    # this is just the data about the files, which is what we want
files

[{'is_link_only': False,
  'name': 'daily_rainfall_2014.png',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'id': 26579150,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'size': 58863},
 {'is_link_only': False,
  'name': 'environment.yml',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'id': 26579171,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'size': 192},
 {'is_link_only': False,
  'name': 'README.md',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'id': 26586554,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'size': 5422},
 {'is_link_only': False,
  'name': 'data.zip',
  'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'id': 26766812,
  'download_url': 'https://

## 2. Download and unzip the data

In [9]:
%%time
files_to_dl = ["data.zip"]  
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

Wall time: 1min 32s


In [10]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

Wall time: 27.2 s


## 3. Combining data CSVs

In [11]:
%%time
%memit
 
os.makedirs("../figshareausraincloud/processed_data/", exist_ok=True)
files = glob.glob('../figshareausraincloud/raw_data/*.csv')
df = pd.concat((pd.read_csv(file, index_col=0).assign(model=re.findall(r'^.*?(?=_)', os.path.basename(file))[0])
                for file in files))
df.to_csv("../figshareausraincloud/processed_data/combined_data.csv")

peak memory: 82.52 MiB, increment: 0.82 MiB
Wall time: 10min


In [12]:
%%sh
# Report the on-disk size of the combined CSV in human-readable units.

du -sh ../figshareausraincloud/processed_data/combined_data.csv

5.7G	../figshareausraincloud/processed_data/combined_data.csv


## 4. Load the combined CSV to memory and perform a simple EDA

### 4.1 Using Pandas

In [22]:
%%time
%memit

df = pd.read_csv("../figshareausraincloud/processed_data/combined_data.csv")
print(df["model"].value_counts())

peak memory: 6671.32 MiB, increment: 0.79 MiB
MPI-ESM1-2-HR       5154240
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
TaiESM1             3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
GFDL-CM4            3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
FGOALS-g3           1287720
KIOST-ESM           1287720
MPI-ESM-1-2-HAM      966420
NESM3                966420
MPI-ESM1-2-LR        966420
AWI-ESM-1-1-LR       966420
NorESM2-LM           919800
CanESM5              551880
BCC-ESM1             551880
observed              46020
Name: model, dtype: int64
Wall time: 2min 19s


### 4.2 Changing dtype of the data using Pandas

In [23]:
def _megabytes(frame):
    """Return the summed per-column memory usage of `frame`, in units of 1e6 bytes."""
    return frame.memory_usage().sum() / 1e6


# Compare the frame as loaded against a copy cast to float32; errors='ignore' is passed
# to astype exactly as before.
print(f"Memory usage with float64: {_megabytes(df):.2f} MB")
print(f"Memory usage with float32: {_megabytes(df.astype('float32', errors='ignore')):.2f} MB")

Memory usage with float64: 3500.78 MB
Memory usage with float32: 2250.50 MB


### 4.3 Loading data with columns that we are interested in

In [25]:
%%time
%%memit

use_cols = ['lat_min','lat_max', 'rain (mm/day)', 'model']
df = pd.read_csv("../figshareausraincloud/processed_data/combined_data.csv",usecols=use_cols)
print(df["model"].value_counts())

MPI-ESM1-2-HR       5154240
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
TaiESM1             3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
GFDL-CM4            3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
FGOALS-g3           1287720
KIOST-ESM           1287720
MPI-ESM-1-2-HAM      966420
NESM3                966420
MPI-ESM1-2-LR        966420
AWI-ESM-1-1-LR       966420
NorESM2-LM           919800
CanESM5              551880
BCC-ESM1             551880
observed              46020
Name: model, dtype: int64
peak memory: 9159.09 MiB, increment: 3812.83 MiB
Wall time: 1min 23s


### 4.4 Loading data in chunks using Pandas

In [24]:
%%time
%memit

counts = pd.Series(dtype=int)
for chunk in pd.read_csv("../figshareausraincloud/processed_data/combined_data.csv", chunksize=10_000_000):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)
print(counts.astype(int))

peak memory: 5162.25 MiB, increment: 0.58 MiB
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
AWI-ESM-1-1-LR       966420
BCC-CSM2-MR         3035340
BCC-ESM1             551880
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
CanESM5              551880
EC-Earth3-Veg-LR    3037320
FGOALS-f3-L         3219300
FGOALS-g3           1287720
GFDL-CM4            3219300
GFDL-ESM4           3219300
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
MIROC6              2070900
MPI-ESM-1-2-HAM      966420
MPI-ESM1-2-HR       5154240
MPI-ESM1-2-LR        966420
MRI-ESM2-0          3037320
NESM3                966420
NorESM2-LM           919800
NorESM2-MM          3541230
SAM0-UNICON         3541153
TaiESM1             3541230
observed              46020
dtype: int32
Wall time: 2min 3s


### 4.5 Loading data using Dask

In [16]:
import dask.dataframe as dd

In [26]:
%%time
%memit

ddf = dd.read_csv("../figshareausraincloud/processed_data/combined_data.csv")
print(ddf["model"].value_counts())

peak memory: 2227.01 MiB, increment: 0.00 MiB
Dask Series Structure:
npartitions=1
    int64
      ...
Name: model, dtype: int64
Dask Name: value-counts-agg, 300 tasks
Wall time: 7.7 s


### Discussion
- 16GB RAM, Windows 10 Professional
- Download and unzip the data: roughly 2 mins in total (1 min 32 s to download, 27.2 s to unzip)
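- Combining the data: 10 mins; the reported peak memory of 82.52 MiB comes from a bare `%memit` line, which profiles an empty statement rather than the cell body, so it is not the peak of the combine step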
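- Using pandas to read the csv file: 2 mins 19 sec; the reported peak memory of 6671.32 MiB comes from a bare `%memit` line (increment 0.79 MiB), so it reflects the kernel's memory when the magic ran rather than the peak of the read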
- Changing the data types using pandas: the memory decreases from 3500.78 MB by using `float64` to 2250.50 MB using `float32`.
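- Loading data with only the columns that we are interested in (2 fewer columns): 1 min 23 sec, peak memory is 9159.09 MiB (the increment for this cell is 3812.83 MiB; the peak also includes memory the kernel already held before the cell ran)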
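- Loading data in chunks using pandas: 2 mins 3 sec; the reported peak memory of 5162.25 MiB comes from a bare `%memit` line (increment 0.58 MiB), so it reflects the kernel's memory when the magic ran rather than the peak of the chunked read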
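- Loading data using Dask: 7.7 sec, peak memory is 2227.01 MiB (the cell printed only the lazy Dask Series structure, not the counts, so these figures do not include actually computing the result)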