In [2]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

# Downloading the data

In [2]:
%pwd
## Change it to the location that you want to download your files to.

'C:\\Users\\robin\\Downloads\\MDS\\Block 6\\DSCI 525\\525-group-01\\notebooks'

In [3]:
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "rainfall/"

In [4]:
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text) 
files = data["files"]            
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

In [None]:
%%time
files_to_dl = ["data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

In [None]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

# Combining data CSVs

In [5]:
%cd rainfall/

C:\Users\robin\Downloads\MDS\Block 6\DSCI 525\525-group-01\notebooks\rainfall


In [7]:
%%time
## here we are using a normal python way for merging the data 

files = glob.glob('*.csv')
if "observed_daily_rainfall_SYD.csv" in files:
    files.remove("observed_daily_rainfall_SYD.csv")

if "combined_data.csv" in files:
    os.remove("combined_data.csv")
    files.remove("combined_data.csv")
    
df = pd.concat((pd.read_csv(file, index_col=0)
                .assign(model=re.findall("([^_]*)", file)[0])
                for file in files)
              )
#df.to_csv("combined_data.csv")
with open("combined_data.csv", "w") as f:
    df.to_csv(f)

CPU times: total: 9min 43s
Wall time: 9min 44s


In [8]:
df.model.unique(), df.model.nunique()

(array(['ACCESS-CM2', 'ACCESS-ESM1-5', 'AWI-ESM-1-1-LR', 'BCC-CSM2-MR',
        'BCC-ESM1', 'CanESM5', 'CMCC-CM2-HR4', 'CMCC-CM2-SR5', 'CMCC-ESM2',
        'EC-Earth3-Veg-LR', 'FGOALS-f3-L', 'FGOALS-g3', 'GFDL-CM4',
        'GFDL-ESM4', 'INM-CM4-8', 'INM-CM5-0', 'KIOST-ESM', 'MIROC6',
        'MPI-ESM-1-2-HAM', 'MPI-ESM1-2-HR', 'MPI-ESM1-2-LR', 'MRI-ESM2-0',
        'NESM3', 'NorESM2-LM', 'NorESM2-MM', 'SAM0-UNICON', 'TaiESM1'],
       dtype=object),
 27)

# Load the combined CSV to memory and perform a simple EDA

## Changing `dtype` of our data. Specifically, I will only test the target column`rain` and the index column `time`, while keeping all the other columns. 

In [9]:
df.index.dtype, df['rain (mm/day)'].dtype

(dtype('O'), dtype('float64'))

In [10]:
print(f"Memory usage with float64 rain and object type time: {df.memory_usage().sum() / 1e6:.2f} MB")

Memory usage with float64 rain and object type time: 3498.20 MB


In [11]:
df.index = pd.to_datetime(df.index)
df["rain (mm/day)"] = df['rain (mm/day)'].astype('float32')

In [12]:
df.index.dtype, df["rain (mm/day)"].dtype

(dtype('<M8[ns]'), dtype('float32'))

In [13]:
import numpy as np
np.dtype('datetime64[ns]') == np.dtype('<M8[ns]')

True

In [14]:
print(f"Memory usage with float32 rain and datetime time/index column: {df.memory_usage().sum() / 1e6:.2f} MB")

Memory usage with float32 rain and datetime time/index column: 3248.33 MB


Although this might seem insignificant, when we only use the needed columns and apply this datatype, we should see better results. Below, I'll select only the required columns and then do this process again to check.

## Load just columns that we want

In [15]:
pd.read_csv("combined_data.csv")

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-36.250000,-35.00000,140.625,142.500,3.293256e-13,ACCESS-CM2
1,1889-01-02 12:00:00,-36.250000,-35.00000,140.625,142.500,0.000000e+00,ACCESS-CM2
2,1889-01-03 12:00:00,-36.250000,-35.00000,140.625,142.500,0.000000e+00,ACCESS-CM2
3,1889-01-04 12:00:00,-36.250000,-35.00000,140.625,142.500,0.000000e+00,ACCESS-CM2
4,1889-01-05 12:00:00,-36.250000,-35.00000,140.625,142.500,1.047658e-02,ACCESS-CM2
...,...,...,...,...,...,...,...
62467838,2014-12-27 12:00:00,-30.157068,-29.21466,153.125,154.375,5.543748e-01,TaiESM1
62467839,2014-12-28 12:00:00,-30.157068,-29.21466,153.125,154.375,7.028577e+00,TaiESM1
62467840,2014-12-29 12:00:00,-30.157068,-29.21466,153.125,154.375,2.347570e-01,TaiESM1
62467841,2014-12-30 12:00:00,-30.157068,-29.21466,153.125,154.375,2.097459e+00,TaiESM1


In [16]:
%%time
df2 = pd.read_csv("combined_data.csv", usecols=['time', 'rain (mm/day)', 'model'])

CPU times: total: 1min 4s
Wall time: 1min 4s


In [17]:
print(f"Memory usage with selected columns: {df2.memory_usage().sum() / 1e6:.2f} MB")

Memory usage with selected columns: 1499.23 MB


Now, I'll convert the datatypes again as done before and check how much it makes a difference.

In [18]:
df2.time = pd.to_datetime(df2.time)
df2["rain (mm/day)"] = df2['rain (mm/day)'].astype('float32')

In [19]:
df2.time.dtype, df2["rain (mm/day)"].dtype

(dtype('<M8[ns]'), dtype('float32'))

In [20]:
print(f"Memory usage with selected columns and datatypes: {df2.memory_usage().sum() / 1e6:.2f} MB")

Memory usage with selected columns and datatypes: 1249.36 MB


As we can see, the memory usage change is significant compared to the original. 

## Loading in chunks

In [5]:
%%time
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("combined_data.csv", chunksize=10_000_000):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)
print(counts.astype(int))

ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
AWI-ESM-1-1-LR       966420
BCC-CSM2-MR         3035340
BCC-ESM1             551880
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
CanESM5              551880
EC-Earth3-Veg-LR    3037320
FGOALS-f3-L         3219300
FGOALS-g3           1287720
GFDL-CM4            3219300
GFDL-ESM4           3219300
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
MIROC6              2070900
MPI-ESM-1-2-HAM      966420
MPI-ESM1-2-HR       5154240
MPI-ESM1-2-LR        966420
MRI-ESM2-0          3037320
NESM3                966420
NorESM2-LM           919800
NorESM2-MM          3541230
SAM0-UNICON         3541153
TaiESM1             3541230
dtype: int32
CPU times: total: 1min 22s
Wall time: 1min 22s


# Perform a simple EDA in R

In [None]:
#!pip install rpy2_arrow
import pyarrow.dataset as ds
import pyarrow as pa
import pandas as pd
import pyarrow 
from pyarrow import csv
import rpy2_arrow.pyarrow_rarrow as pyra

R[write to console]: Error: cons memory exhausted (limit reached?)

R[write to console]: Error: no more error handlers available (recursive errors?); invoking 'abort' restart

