In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import numpy as np
import dask.dataframe as dd
import pyarrow.dataset as ds
import rpy2_arrow.pyarrow_rarrow as pyra

## Part 3 - Downloading the data

In [2]:
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "figshareweather/"

In [3]:
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  # this contains all the articles data, feel free to check it out
files = data["files"]             # this is just the data about the files, which is what we want
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

In [4]:
file = files[3]
file

{'id': 26766812,
 'name': 'data.zip',
 'size': 814041183,
 'is_link_only': False,
 'download_url': 'https://ndownloader.figshare.com/files/26766812',
 'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
 'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9'}

In [5]:
# Download the data.zip, will take around 10-15 minutes
os.makedirs(output_directory, exist_ok=True)
urlretrieve(file["download_url"], output_directory + file["name"])

('figshareweather/data.zip', <http.client.HTTPMessage at 0x1c5b635db80>)

In [6]:
# Unzip contents of data.zip
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

## Part 4 - Combining the data CSVs

In [7]:
files = glob.glob('figshareweather/*.csv')
files.remove('figshareweather\observed_daily_rainfall_SYD.csv') 
#files.remove('figshareweather/observed_daily_rainfall_SYD.csv') 

In [8]:
%%time
df = pd.concat((pd.read_csv(file, index_col=0)
                    .assign(model=re.findall(r"(?<=figshareweather\/)(.*)(?=_daily)", file)[0])
                    for file in files))
#df = pd.concat((pd.read_csv(file, index_col=0).assign(model=re.findall(r"[^\/]+(?=_daily)", file)[0]) for file in files))
df.to_csv('figshareweather/combined_data.csv')

CPU times: total: 7min 20s
Wall time: 7min 22s


In [9]:
# df = pd.read_csv('figshareweather/combined_data.csv')
df

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-36.250000,-35.00000,140.625,142.500,3.293256e-13,figshareweather\ACCESS-CM2
1889-01-02 12:00:00,-36.250000,-35.00000,140.625,142.500,0.000000e+00,figshareweather\ACCESS-CM2
1889-01-03 12:00:00,-36.250000,-35.00000,140.625,142.500,0.000000e+00,figshareweather\ACCESS-CM2
1889-01-04 12:00:00,-36.250000,-35.00000,140.625,142.500,0.000000e+00,figshareweather\ACCESS-CM2
1889-01-05 12:00:00,-36.250000,-35.00000,140.625,142.500,1.047658e-02,figshareweather\ACCESS-CM2
...,...,...,...,...,...,...
2014-12-27 12:00:00,-30.157068,-29.21466,153.125,154.375,5.543748e-01,figshareweather\TaiESM1
2014-12-28 12:00:00,-30.157068,-29.21466,153.125,154.375,7.028577e+00,figshareweather\TaiESM1
2014-12-29 12:00:00,-30.157068,-29.21466,153.125,154.375,2.347570e-01,figshareweather\TaiESM1
2014-12-30 12:00:00,-30.157068,-29.21466,153.125,154.375,2.097459e+00,figshareweather\TaiESM1


| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Cuthbert |  MacOS                |  32GB   | Intel i9           | Yes       |     5:59       |
| Abhiket  |Windows 10 Education            | 16GB      |   Intel i7        |  Yes       |  7:20          |
| Paniz    |  MacOS                |  16GB   |   Intel i7         |   Yes     |    6:08       |
| Irene    |       MacOS           |   8GB  |     Intel i5      |    Yes    |     10:18       |

## Part 5 - Python EDA

### Preliminary EDA without any pre-processing

In [10]:
%%time
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62467843 entries, 1889-01-01 12:00:00 to 2014-12-31 12:00:00
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   lat_min        float64
 1   lat_max        float64
 2   lon_min        float64
 3   lon_max        float64
 4   rain (mm/day)  float64
 5   model          object 
dtypes: float64(5), object(1)
memory usage: 3.3+ GB
CPU times: total: 15.6 ms
Wall time: 14 ms


In [11]:
%%time
df.describe()

CPU times: total: 11 s
Wall time: 11 s


Unnamed: 0,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
count,59248540.0,62467840.0,59248540.0,62467840.0,59248540.0
mean,-33.10482,-31.97757,146.9059,148.215,1.90117
std,1.963549,1.992067,3.793784,3.809994,5.585735
min,-36.46739,-36.0,140.625,141.25,-3.807373e-12
25%,-34.86911,-33.66221,143.4375,145.0,3.838413e-06
50%,-33.0,-32.04188,146.875,148.125,0.06154947
75%,-31.4017,-30.15707,150.1875,151.3125,1.020918
max,-29.9,-27.90606,153.75,155.625,432.9395


In [12]:
%%time
df.model.value_counts()

CPU times: total: 1.77 s
Wall time: 1.8 s


figshareweather\MPI-ESM1-2-HR       5154240
figshareweather\TaiESM1             3541230
figshareweather\NorESM2-MM          3541230
figshareweather\CMCC-CM2-HR4        3541230
figshareweather\CMCC-CM2-SR5        3541230
figshareweather\CMCC-ESM2           3541230
figshareweather\SAM0-UNICON         3541153
figshareweather\FGOALS-f3-L         3219300
figshareweather\GFDL-CM4            3219300
figshareweather\GFDL-ESM4           3219300
figshareweather\EC-Earth3-Veg-LR    3037320
figshareweather\MRI-ESM2-0          3037320
figshareweather\BCC-CSM2-MR         3035340
figshareweather\MIROC6              2070900
figshareweather\ACCESS-CM2          1932840
figshareweather\ACCESS-ESM1-5       1610700
figshareweather\INM-CM5-0           1609650
figshareweather\INM-CM4-8           1609650
figshareweather\KIOST-ESM           1287720
figshareweather\FGOALS-g3           1287720
figshareweather\MPI-ESM1-2-LR        966420
figshareweather\NESM3                966420
figshareweather\AWI-ESM-1-1-LR  

In [13]:
df2 = df.copy()
df2.dtypes

lat_min          float64
lat_max          float64
lon_min          float64
lon_max          float64
rain (mm/day)    float64
model             object
dtype: object

### Changing dtype of your data

In [14]:
df2.index = pd.to_datetime(df2.index)
df2['model'] = df2['model'].astype('category')
df2['rain (mm/day)'] = df2['rain (mm/day)'].astype(np.float32)
df2.loc[:, ['lat_min', 'lat_max', 'lon_min', 'lon_max']] = df2.loc[:, ['lat_min', 'lat_max', 'lon_min', 'lon_max']].astype(np.float32)

In [15]:
%%time
df2.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 62467843 entries, 1889-01-01 12:00:00 to 2014-12-31 12:00:00
Data columns (total 6 columns):
 #   Column         Dtype   
---  ------         -----   
 0   lat_min        float32 
 1   lat_max        float32 
 2   lon_min        float32 
 3   lon_max        float32 
 4   rain (mm/day)  float32 
 5   model          category
dtypes: category(1), float32(5)
memory usage: 1.7 GB
CPU times: total: 812 ms
Wall time: 837 ms


In [16]:
%%time
df2.describe()

CPU times: total: 8.5 s
Wall time: 8.53 s


Unnamed: 0,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
count,59248540.0,62467840.0,59248540.0,62467840.0,59248540.0
mean,-33.10463,-31.97747,146.9054,148.2152,1.901175
std,1.963549,1.992067,3.793784,3.809994,5.585735
min,-36.46739,-36.0,140.625,141.25,-3.807373e-12
25%,-34.86911,-33.66221,143.4375,145.0,3.838413e-06
50%,-33.0,-32.04189,146.875,148.125,0.06154947
75%,-31.4017,-30.15707,150.1875,151.3125,1.020918
max,-29.9,-27.90606,153.75,155.625,432.9395


In [17]:
%%time
df2.model.value_counts()

CPU times: total: 344 ms
Wall time: 313 ms


figshareweather\MPI-ESM1-2-HR       5154240
figshareweather\TaiESM1             3541230
figshareweather\NorESM2-MM          3541230
figshareweather\CMCC-CM2-HR4        3541230
figshareweather\CMCC-CM2-SR5        3541230
figshareweather\CMCC-ESM2           3541230
figshareweather\SAM0-UNICON         3541153
figshareweather\FGOALS-f3-L         3219300
figshareweather\GFDL-CM4            3219300
figshareweather\GFDL-ESM4           3219300
figshareweather\EC-Earth3-Veg-LR    3037320
figshareweather\MRI-ESM2-0          3037320
figshareweather\BCC-CSM2-MR         3035340
figshareweather\MIROC6              2070900
figshareweather\ACCESS-CM2          1932840
figshareweather\ACCESS-ESM1-5       1610700
figshareweather\INM-CM5-0           1609650
figshareweather\INM-CM4-8           1609650
figshareweather\KIOST-ESM           1287720
figshareweather\FGOALS-g3           1287720
figshareweather\MPI-ESM1-2-LR        966420
figshareweather\NESM3                966420
figshareweather\AWI-ESM-1-1-LR  

### Loading in Chunks

In [18]:
%%time
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("figshareweather/combined_data.csv", chunksize=10_000_000):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)
print(counts.astype(int))

figshareweather\ACCESS-CM2          1932840
figshareweather\ACCESS-ESM1-5       1610700
figshareweather\AWI-ESM-1-1-LR       966420
figshareweather\BCC-CSM2-MR         3035340
figshareweather\BCC-ESM1             551880
figshareweather\CMCC-CM2-HR4        3541230
figshareweather\CMCC-CM2-SR5        3541230
figshareweather\CMCC-ESM2           3541230
figshareweather\CanESM5              551880
figshareweather\EC-Earth3-Veg-LR    3037320
figshareweather\FGOALS-f3-L         3219300
figshareweather\FGOALS-g3           1287720
figshareweather\GFDL-CM4            3219300
figshareweather\GFDL-ESM4           3219300
figshareweather\INM-CM4-8           1609650
figshareweather\INM-CM5-0           1609650
figshareweather\KIOST-ESM           1287720
figshareweather\MIROC6              2070900
figshareweather\MPI-ESM-1-2-HAM      966420
figshareweather\MPI-ESM1-2-HR       5154240
figshareweather\MPI-ESM1-2-LR        966420
figshareweather\MRI-ESM2-0          3037320
figshareweather\NESM3           

### Difference in memory

In [19]:
print(f"Memory usage with float64: {df[['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']].memory_usage().sum() / 1e9:.2f} GB")
print(f"Memory usage with float32: {df2[['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']].memory_usage().sum() / 1e9:.2f} GB")

Memory usage with float64: 3.00 GB
Memory usage with float32: 1.75 GB


### Preliminary EDA, changing the dtypes and Loading in Chunks
- Change dtype of model to categorical, numerical to float32, and time to datetime
    - In practice, we probably cannot change precision of latitudes and longitudes because we would lose precision in our geographical points. 
    - Might be okay to reduce the significant figures for the rainfall numbers since they are measured in mm already. 
    - Making these changes reduces the dataframe memory usage from 3 GB to 1.75 GB. (this alone does not significantly change EDA time).
- It appears we want all the columns as they contain pertinent information to our investigation, and thus we cannot omit any of the columns when reading them in.
- Loading the data in chunks for the purpose of our EDA has actually drastically increased the time taken to perform the value_counts by an order of magnitude.

<br>

| Team Member | Operating System | RAM | Processor | Is SSD | Time taken (before) | Time taken (dtype)|Time taken(chunks)|
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|:----------:|:----------:|
| Cuthbert |  MacOS                |  32GB   | Intel i9           | Yes       | 3.32s         |294ms|55s|
| Abhiket    | Windows 10 Education |   16GB  | Intel i7          |   Yes     |  1.77s          |344ms|1:14|
| Paniz    |   MacOS               | 16GB    | Intel i7          |    Yes    |   2 s         |467 ms| 58.3s|
| Irene    |                  |     |           |        |            |||

## Part 6 - File Transfer, R EDA

In [20]:
%load_ext rpy2.ipython



### Parquet File

In [21]:
%%time
df.to_parquet("figshareweather/combined_data.parquet")

CPU times: total: 29.2 s
Wall time: 28.4 s


In [22]:
%%R 
start_time <- Sys.time()
suppressMessages(library(dplyr))
suppressMessages(library(arrow))
df_parquet <- open_dataset("figshareweather/combined_data.parquet")
result <- df_parquet %>% count(model)
end_time <- Sys.time()
print(result %>% collect())
print(end_time - start_time)

[38;5;246m# A tibble: 27 x 2[39m
   model                                     n
   [3m[38;5;246m<chr>[39m[23m                                 [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m [38;5;246m"[39mfigshareweather\\ACCESS-CM2[38;5;246m"[39m       1[4m9[24m[4m3[24m[4m2[24m840
[38;5;250m 2[39m [38;5;246m"[39mfigshareweather\\ACCESS-ESM1-5[38;5;246m"[39m    1[4m6[24m[4m1[24m[4m0[24m700
[38;5;250m 3[39m [38;5;246m"[39mfigshareweather\\AWI-ESM-1-1-LR[38;5;246m"[39m    [4m9[24m[4m6[24m[4m6[24m420
[38;5;250m 4[39m [38;5;246m"[39mfigshareweather\\BCC-CSM2-MR[38;5;246m"[39m      3[4m0[24m[4m3[24m[4m5[24m340
[38;5;250m 5[39m [38;5;246m"[39mfigshareweather\\BCC-ESM1[38;5;246m"[39m          [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 6[39m [38;5;246m"[39mfigshareweather\\CanESM5[38;5;246m"[39m           [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 7[39m [38;5;246m"[39mfigshareweather\\CMCC-CM2-HR4[38;5;246m"[39m     3

### Feather File

In [23]:
%%time
df.reset_index().to_feather("figshareweather/combined_data.feather")

CPU times: total: 15 s
Wall time: 11.6 s


In [24]:
%%R 
start_time <- Sys.time()
suppressMessages(library(dplyr))
df_feather <- read_feather("figshareweather/combined_data.feather")
result <- df_feather %>% count(model)
end_time <- Sys.time()
print(result %>% collect())
print(end_time - start_time)

[38;5;246m# A tibble: 27 x 2[39m
   model                                     n
   [3m[38;5;246m<chr>[39m[23m                                 [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m [38;5;246m"[39mfigshareweather\\ACCESS-CM2[38;5;246m"[39m       1[4m9[24m[4m3[24m[4m2[24m840
[38;5;250m 2[39m [38;5;246m"[39mfigshareweather\\ACCESS-ESM1-5[38;5;246m"[39m    1[4m6[24m[4m1[24m[4m0[24m700
[38;5;250m 3[39m [38;5;246m"[39mfigshareweather\\AWI-ESM-1-1-LR[38;5;246m"[39m    [4m9[24m[4m6[24m[4m6[24m420
[38;5;250m 4[39m [38;5;246m"[39mfigshareweather\\BCC-CSM2-MR[38;5;246m"[39m      3[4m0[24m[4m3[24m[4m5[24m340
[38;5;250m 5[39m [38;5;246m"[39mfigshareweather\\BCC-ESM1[38;5;246m"[39m          [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 6[39m [38;5;246m"[39mfigshareweather\\CanESM5[38;5;246m"[39m           [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 7[39m [38;5;246m"[39mfigshareweather\\CMCC-CM2-HR4[38;5;246m"[39m     3

### Arrow Exchange

In [25]:
%%time
df_arrow = ds.dataset("figshareweather/combined_data.csv", format="csv")
table = df_arrow.to_table()
r_table = pyra.converter.py2rpy(table)

CPU times: total: 1min 20s
Wall time: 1min 20s


In [26]:
%%R -i r_table
start_time <- Sys.time()
suppressMessages(library(dplyr))
result <- r_table %>% count(model)
end_time <- Sys.time()
print(result %>% collect())
print(end_time - start_time)

[38;5;246m# A tibble: 27 x 2[39m
   model                                     n
   [3m[38;5;246m<chr>[39m[23m                                 [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m [38;5;246m"[39mfigshareweather\\ACCESS-CM2[38;5;246m"[39m       1[4m9[24m[4m3[24m[4m2[24m840
[38;5;250m 2[39m [38;5;246m"[39mfigshareweather\\ACCESS-ESM1-5[38;5;246m"[39m    1[4m6[24m[4m1[24m[4m0[24m700
[38;5;250m 3[39m [38;5;246m"[39mfigshareweather\\AWI-ESM-1-1-LR[38;5;246m"[39m    [4m9[24m[4m6[24m[4m6[24m420
[38;5;250m 4[39m [38;5;246m"[39mfigshareweather\\BCC-CSM2-MR[38;5;246m"[39m      3[4m0[24m[4m3[24m[4m5[24m340
[38;5;250m 5[39m [38;5;246m"[39mfigshareweather\\BCC-ESM1[38;5;246m"[39m          [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 6[39m [38;5;246m"[39mfigshareweather\\CanESM5[38;5;246m"[39m           [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 7[39m [38;5;246m"[39mfigshareweather\\CMCC-CM2-HR4[38;5;246m"[39m     3

### Discussion about file types

- The parquet file was fast to create and exchange. 
- The feather file was fast to create, but the exchange was a bit long. 
- The arrow csv was slow to create but very fast to exchange.
- We did not use pandas exchange because the process is very slow. 

Among all the three approaches we tried, we picked parquet file because overall it was the fastest in terms of creating and exchanging the file.