## Downloading data

In [48]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
url = "https://api.figshare.com/v2/articles/14096681"
output_directory = "figsharerainfall/"

response = requests.request("GET", url)
data = json.loads(response.text)
files = data["files"]
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

In [9]:
%%time
files_to_dl = ["data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 5.7 s, sys: 3.8 s, total: 9.5 s
Wall time: 4min 13s


In [10]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: user 14.5 s, sys: 1.15 s, total: 15.7 s
Wall time: 16.5 s


In [12]:
%ls -ltr figsharerainfall/

total 12102840
-rw-r--r--   1 sukhleen  staff  814041183 28 Mar 15:15 data.zip
-rw-r--r--   1 sukhleen  staff   95376895 28 Mar 15:15 MPI-ESM-1-2-HAM_daily_rainfall_NSW.csv
-rw-r--r--   1 sukhleen  staff   94960113 28 Mar 15:15 AWI-ESM-1-1-LR_daily_rainfall_NSW.csv
-rw-r--r--   1 sukhleen  staff   82474546 28 Mar 15:15 NorESM2-LM_daily_rainfall_NSW.csv
-rw-r--r--   1 sukhleen  staff  127613760 28 Mar 15:15 ACCESS-CM2_daily_rainfall_NSW.csv
-rw-r--r--   1 sukhleen  staff  232118894 28 Mar 15:15 FGOALS-f3-L_daily_rainfall_NSW.csv
-rw-r--r--   1 sukhleen  staff  330360682 28 Mar 15:15 CMCC-CM2-HR4_daily_rainfall_NSW.csv
-rw-r--r--   1 sukhleen  staff  254009247 28 Mar 15:15 MRI-ESM2-0_daily_rainfall_NSW.csv
-rw-r--r--   1 sukhleen  staff  235661418 28 Mar 15:15 GFDL-CM4_daily_rainfall_NSW.csv
-rw-r--r--   1 sukhleen  staff  294260911 28 Mar 15:15 BCC-CSM2-MR_daily_rainfall_NSW.csv
-rw-r--r--   1 sukhleen  staff  295768615 28 Mar 15:15 EC-Earth3-Veg-LR_daily_rainfall_NSW.csv
-rw-r--r--   1

## Combining CSVs

In [40]:
%%time
use_cols = ["time", "lat_min", "lat_max", "lon_min", "lon_max", "rain (mm/day)"]
files = glob.glob('figsharerainfall/*.csv')
files.remove('figsharerainfall/observed_daily_rainfall_SYD.csv')

df = pd.concat((pd.read_csv(file, index_col=0, usecols=use_cols)
                .assign(model=re.findall(r'[^\/]+(?=\.)', file)[0].split("_daily")[0])
                for file in files)
              )
df.to_csv("figsharerainfall/combined_data.csv")

CPU times: user 12min 1s, sys: 23.9 s, total: 12min 25s
Wall time: 12min 39s


In [41]:
%%sh
du -sh figsharerainfall/combined_data.csv

 11G	figsharerainfall/combined_data.csv


In [54]:
df.shape

(124935686, 6)

In [None]:
df.head()

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


## EDA in Python

### Loading the combined data into memory and performing simple EDA

In [105]:
%%time
df = pd.read_csv("figsharerainfall/combined_data.csv", index_col=0)
df['model'].value_counts()

CPU times: user 1min 33s, sys: 14.1 s, total: 1min 47s
Wall time: 1min 55s


In [106]:
df

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM
...,...,...,...,...,...,...
2014-12-27 12:00:00,-30.157068,-29.214660,153.1250,154.3750,6.689683e+00,SAM0-UNICON
2014-12-28 12:00:00,-30.157068,-29.214660,153.1250,154.3750,7.862555e+00,SAM0-UNICON
2014-12-29 12:00:00,-30.157068,-29.214660,153.1250,154.3750,1.000503e+01,SAM0-UNICON
2014-12-30 12:00:00,-30.157068,-29.214660,153.1250,154.3750,8.541592e+00,SAM0-UNICON


In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 124935686 entries, 1889-01-01 12:00:00 to 2014-12-31 12:00:00
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   lat_min        float64
 1   lat_max        float64
 2   lon_min        float64
 3   lon_max        float64
 4   rain (mm/day)  float64
 5   model          object 
dtypes: float64(5), object(1)
memory usage: 6.5+ GB


In [108]:
%%time
df.describe()

Unnamed: 0,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
count,118497100.0,124935700.0,118497100.0,124935700.0,118497100.0
mean,-33.10482,-31.97757,146.9059,148.215,1.90117
std,1.963549,1.992067,3.793784,3.809994,5.585735
min,-36.46739,-36.0,140.625,141.25,-3.807373e-12
25%,-34.86911,-33.66221,143.4375,145.0,3.838411e-06
50%,-33.0,-32.04188,146.875,148.125,0.06154947
75%,-31.4017,-30.15707,150.1875,151.3125,1.020918
max,-29.9,-27.90606,153.75,155.625,432.9395


> The entire dataset usage more than 6.5GB worth of memory! Let's try to explore methods to reduce the memory usage.

## Investigating methods to reduce memory usage

### Method 1: Changing the data types of the columns

In [109]:
print(f"Memory usage with float64: {df[['lat_min','lat_max','lon_min','lon_max','rain (mm/day)']].memory_usage().sum() / 1e6:.2f} MB")
print(f"Memory usage with float32: {df[['lat_min','lat_max','lon_min','lon_max','rain (mm/day)']].astype('float32', errors='ignore').memory_usage().sum() / 1e6:.2f} MB")

Memory usage with float64: 5996.91 MB
Memory usage with float32: 3498.20 MB


In [110]:
%%time
dtypes = {"lat_min": "float32",
          "lat_max": "float32",
          "lon_min": "float32",
          "lon_max": "float32",
          "rain (mm/day)": "float32"}

df_red_dtypes = pd.read_csv("figsharerainfall/combined_data.csv", index_col=0, dtype=dtypes)
print(df_red_dtypes["model"].value_counts())

combined_data       62467843
MPI-ESM1-2-HR        5154240
CMCC-ESM2            3541230
NorESM2-MM           3541230
TaiESM1              3541230
CMCC-CM2-SR5         3541230
CMCC-CM2-HR4         3541230
SAM0-UNICON          3541153
GFDL-CM4             3219300
FGOALS-f3-L          3219300
GFDL-ESM4            3219300
MRI-ESM2-0           3037320
EC-Earth3-Veg-LR     3037320
BCC-CSM2-MR          3035340
MIROC6               2070900
ACCESS-CM2           1932840
ACCESS-ESM1-5        1610700
INM-CM5-0            1609650
INM-CM4-8            1609650
KIOST-ESM            1287720
FGOALS-g3            1287720
AWI-ESM-1-1-LR        966420
MPI-ESM1-2-LR         966420
NESM3                 966420
MPI-ESM-1-2-HAM       966420
NorESM2-LM            919800
BCC-ESM1              551880
CanESM5               551880
Name: model, dtype: int64
CPU times: user 1min 38s, sys: 11.5 s, total: 1min 50s
Wall time: 1min 57s


In [111]:
df_red_dtypes

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-35.439865,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1889-01-02 12:00:00,-35.439865,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
1889-01-03 12:00:00,-35.439865,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
1889-01-04 12:00:00,-35.439865,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
1889-01-05 12:00:00,-35.439865,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM
...,...,...,...,...,...,...
2014-12-27 12:00:00,-30.157068,-29.214659,153.1250,154.3750,6.689683e+00,SAM0-UNICON
2014-12-28 12:00:00,-30.157068,-29.214659,153.1250,154.3750,7.862556e+00,SAM0-UNICON
2014-12-29 12:00:00,-30.157068,-29.214659,153.1250,154.3750,1.000503e+01,SAM0-UNICON
2014-12-30 12:00:00,-30.157068,-29.214659,153.1250,154.3750,8.541592e+00,SAM0-UNICON


In [112]:
df_red_dtypes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 124935686 entries, 1889-01-01 12:00:00 to 2014-12-31 12:00:00
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   lat_min        float32
 1   lat_max        float32
 2   lon_min        float32
 3   lon_max        float32
 4   rain (mm/day)  float32
 5   model          object 
dtypes: float32(5), object(1)
memory usage: 4.2+ GB


> By changing the data type of numeric columns from `float64` to `float32`, we have reduced the memory consumption to around 4.2GB.

In [113]:
%%time
df_red_dtypes.describe()

Unnamed: 0,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
count,118497100.0,124935700.0,118497100.0,124935700.0,118497100.0
mean,-33.10474,-31.9773,146.9064,148.2143,1.901167
std,1.963549,1.992067,3.793784,3.809994,5.585735
min,-36.46739,-36.0,140.625,141.25,-3.807373e-12
25%,-34.86911,-33.66221,143.4375,145.0,3.838411e-06
50%,-33.0,-32.04189,146.875,148.125,0.06154947
75%,-31.4017,-30.15707,150.1875,151.3125,1.020918
max,-29.9,-27.90606,153.75,155.625,432.9395


### Method 2: Loading only selective columns - `time`, `rain (mm/day)` and `model`

In [115]:
%%time
use_cols = ['time','rain (mm/day)', 'model']
df_subset = pd.read_csv("figsharerainfall/combined_data.csv",usecols=use_cols, index_col=0)
print(df_subset['model'].value_counts())

combined_data       62467843
MPI-ESM1-2-HR        5154240
CMCC-ESM2            3541230
NorESM2-MM           3541230
TaiESM1              3541230
CMCC-CM2-SR5         3541230
CMCC-CM2-HR4         3541230
SAM0-UNICON          3541153
GFDL-CM4             3219300
FGOALS-f3-L          3219300
GFDL-ESM4            3219300
MRI-ESM2-0           3037320
EC-Earth3-Veg-LR     3037320
BCC-CSM2-MR          3035340
MIROC6               2070900
ACCESS-CM2           1932840
ACCESS-ESM1-5        1610700
INM-CM5-0            1609650
INM-CM4-8            1609650
KIOST-ESM            1287720
FGOALS-g3            1287720
AWI-ESM-1-1-LR        966420
MPI-ESM1-2-LR         966420
NESM3                 966420
MPI-ESM-1-2-HAM       966420
NorESM2-LM            919800
BCC-ESM1              551880
CanESM5               551880
Name: model, dtype: int64
CPU times: user 1min 25s, sys: 10.2 s, total: 1min 36s
Wall time: 1min 42s


In [72]:
df_subset

Unnamed: 0_level_0,rain (mm/day)
time,Unnamed: 1_level_1
1889-01-01 12:00:00,4.244226e-13
1889-01-02 12:00:00,4.217326e-13
1889-01-03 12:00:00,4.498125e-13
1889-01-04 12:00:00,4.251282e-13
1889-01-05 12:00:00,4.270161e-13
...,...
2014-12-27 12:00:00,6.689683e+00
2014-12-28 12:00:00,7.862555e+00
2014-12-29 12:00:00,1.000503e+01
2014-12-30 12:00:00,8.541592e+00


In [114]:
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 124935686 entries, 1889-01-01 12:00:00 to 2014-12-31 12:00:00
Data columns (total 1 columns):
 #   Column         Dtype  
---  ------         -----  
 0   rain (mm/day)  float64
dtypes: float64(1)
memory usage: 1.9+ GB


> The memory usage of the selected columns `time` and `rain (mm/day)` is just 1.9GB. By selecting only the columns that we wanted to work with, we have reduced the memory usage significantly.

In [116]:
%%time
df_subset.describe()

CPU times: user 3.46 s, sys: 2.44 s, total: 5.91 s
Wall time: 6.2 s


Unnamed: 0,rain (mm/day)
count,118497100.0
mean,1.90117
std,5.585735
min,-3.807373e-12
25%,3.838411e-06
50%,0.06154947
75%,1.020918
max,432.9395


### Method 3: Loading data in chunks

In [117]:
%%time
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("figsharerainfall/combined_data.csv", chunksize=10_000_000):
    counts = counts.add(chunk['model'].value_counts(), fill_value=0)
print(counts.astype(int))

ACCESS-CM2           1932840
ACCESS-ESM1-5        1610700
AWI-ESM-1-1-LR        966420
BCC-CSM2-MR          3035340
BCC-ESM1              551880
CMCC-CM2-HR4         3541230
CMCC-CM2-SR5         3541230
CMCC-ESM2            3541230
CanESM5               551880
EC-Earth3-Veg-LR     3037320
FGOALS-f3-L          3219300
FGOALS-g3            1287720
GFDL-CM4             3219300
GFDL-ESM4            3219300
INM-CM4-8            1609650
INM-CM5-0            1609650
KIOST-ESM            1287720
MIROC6               2070900
MPI-ESM-1-2-HAM       966420
MPI-ESM1-2-HR        5154240
MPI-ESM1-2-LR         966420
MRI-ESM2-0           3037320
NESM3                 966420
NorESM2-LM            919800
NorESM2-MM           3541230
SAM0-UNICON          3541153
TaiESM1              3541230
combined_data       62467843
dtype: int64
CPU times: user 1min 32s, sys: 7.63 s, total: 1min 40s
Wall time: 1min 41s


> There is a noticeable reduction in the processing and run time when the data is loaded in chunks.

### EDA on dataframe with reduced `dtype`

In [None]:
%%time
plt.hist(df_red_dtypes, figsize=(20, 20), bins=20);