In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

# 3. Downloading the data

## 1. Download the data from figshare 

In [10]:
# Necessary metadata: identifies the figshare article and the local download folder
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"  # figshare API endpoint built from article_id
headers = {"Content-Type": "application/json"}  # headers passed along with the metadata request
output_directory = "figshareairline"  # local folder name (no trailing slash, so build paths with os.path.join)

In [4]:
# Fetch the article metadata from the figshare API.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error right here instead of letting it show
# up later as a confusing KeyError on data["files"].
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()  # this contains all the article's data, feel free to check it out
files = data["files"]   # this is just the data about the files, which is what we want
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

## 2. Download and extract the zip file

In [5]:
%%time
files_to_dl = ["data.zip"]  # feel free to add other files here
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 5.49 s, sys: 3.71 s, total: 9.2 s
Wall time: 1min 47s


In [13]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory + '/data')

CPU times: user 14.6 s, sys: 1.14 s, total: 15.8 s
Wall time: 16.6 s


In [14]:
# List the extracted files in long format, sorted by modification time, oldest first
%ls -ltr figshareairline/data

total 10540336
-rw-r--r--   1 bananabook  staff   95376895 Mar 28 14:53 MPI-ESM-1-2-HAM_daily_rainfall_NSW.csv
-rw-r--r--   1 bananabook  staff   94960113 Mar 28 14:53 AWI-ESM-1-1-LR_daily_rainfall_NSW.csv
-rw-r--r--   1 bananabook  staff   82474546 Mar 28 14:53 NorESM2-LM_daily_rainfall_NSW.csv
-rw-r--r--   1 bananabook  staff  127613760 Mar 28 14:53 ACCESS-CM2_daily_rainfall_NSW.csv
-rw-r--r--   1 bananabook  staff  232118894 Mar 28 14:53 FGOALS-f3-L_daily_rainfall_NSW.csv
-rw-r--r--   1 bananabook  staff  330360682 Mar 28 14:53 CMCC-CM2-HR4_daily_rainfall_NSW.csv
-rw-r--r--   1 bananabook  staff  254009247 Mar 28 14:53 MRI-ESM2-0_daily_rainfall_NSW.csv
-rw-r--r--   1 bananabook  staff  235661418 Mar 28 14:53 GFDL-CM4_daily_rainfall_NSW.csv
-rw-r--r--   1 bananabook  staff  294260911 Mar 28 14:53 BCC-CSM2-MR_daily_rainfall_NSW.csv
-rw-r--r--   1 bananabook  staff  295768615 Mar 28 14:53 EC-Earth3-Veg-LR_daily_rainfall_NSW.csv
-rw-r--r--   1 bananabook  staff  328852379 Mar 28 14:53 C

In [4]:
%%time
### just listing to get an idea how individual file looks like
df = pd.read_csv(
    "figshareairline/data/ACCESS-CM2_daily_rainfall_NSW.csv", dtype={"TailNum": "str"}
)
df

CPU times: user 995 ms, sys: 136 ms, total: 1.13 s
Wall time: 1.21 s


Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,1889-01-01 12:00:00,-36.25,-35.00,140.625,142.50,3.293256e-13
1,1889-01-02 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
2,1889-01-03 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
3,1889-01-04 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
4,1889-01-05 12:00:00,-36.25,-35.00,140.625,142.50,1.047658e-02
...,...,...,...,...,...,...
1932835,2014-12-27 12:00:00,-30.00,-28.75,151.875,153.75,2.951144e-02
1932836,2014-12-28 12:00:00,-30.00,-28.75,151.875,153.75,2.257118e-01
1932837,2014-12-29 12:00:00,-30.00,-28.75,151.875,153.75,1.204670e-01
1932838,2014-12-30 12:00:00,-30.00,-28.75,151.875,153.75,2.632404e-02


# 4. Combining data CSVs

## 1. Combine data CSVs into a single CSV using pandas
## 2. Add an extra column called “model” that identifies the model
## 3. Compare run times

In [23]:
file = "figshareairline/data/ACCESS-CM2_daily_rainfall_NSW.csv"
# The model name is the part of the file name before its first underscore.
# os.path.basename() replaces the earlier "/"-anchored regex so the extraction
# also works when the path uses the platform's own separator (e.g. the
# backslashes glob returns on Windows) and is not thrown off by underscores
# in directory names.
model_name = os.path.basename(file).split("_")[0]
model_name

'ACCESS-CM2'

In [24]:
%%time
## here we are using a normal python way for merging the data 
import pandas as pd

files = glob.glob('figshareairline/data/*.csv')
df = pd.concat((pd.read_csv(file, index_col=0)
                .assign(model=re.findall(r'(?<=\/)[a-zA-Z0-9-]*(?=\_)', file)[0])
                for file in files)
              )
df.to_csv("figshareairline/combined_data.csv")

CPU times: user 5min 52s, sys: 9.79 s, total: 6min 1s
Wall time: 6min 8s


# 5. Load the combined CSV into memory and perform a simple EDA

## 1.1 Baseline: run time and memory usage when reading the original CSV

In [3]:
%%time
import pandas as pd
# Baseline load: read the whole combined CSV without specifying dtypes,
# using the first CSV column as the index.
df = pd.read_csv("figshareairline/combined_data.csv", index_col=0)
df.head()

CPU times: user 56.9 s, sys: 6.91 s, total: 1min 3s
Wall time: 1min 9s


Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


In [4]:
%%time
# Summarise the baseline frame: row count, column dtypes and reported memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62513863 entries, 1889-01-01 12:00:00 to 2014-12-31 12:00:00
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   lat_min        float64
 1   lat_max        float64
 2   lon_min        float64
 3   lon_max        float64
 4   rain (mm/day)  float64
 5   model          object 
dtypes: float64(5), object(1)
memory usage: 3.3+ GB


## 1.2 Change data type to reduce memory usage

In [36]:
%%time

# 5. change data type
df_float32 = pd.read_csv("figshareairline/combined_data.csv", index_col=0).astype(
    "float32", errors="ignore"
)
df_float32.head()

CPU times: user 42.5 s, sys: 8.54 s, total: 51 s
Wall time: 1min 6s


Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-35.439865,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1889-01-02 12:00:00,-35.439865,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
1889-01-03 12:00:00,-35.439865,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
1889-01-04 12:00:00,-35.439865,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
1889-01-05 12:00:00,-35.439865,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


In [37]:
# Same summary for the float32 frame, for comparison with the float64 one.
df_float32.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62513863 entries, 1889-01-01 12:00:00 to 2014-12-31 12:00:00
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   lat_min        float32
 1   lat_max        float32
 2   lon_min        float32
 3   lon_max        float32
 4   rain (mm/day)  float32
 5   model          object 
dtypes: float32(5), object(1)
memory usage: 2.1+ GB


In [50]:
# Compare the memory reported for the float64 and float32 frames.
# df_float32 already holds float32 columns, so it is measured as-is; re-casting it
# with .astype() only built a throwaway copy of the whole frame.
print(f"Memory usage with float64: {df.memory_usage().sum() / 1e6:.2f} MB")
print(f"Memory usage with float32: {df_float32.memory_usage().sum() / 1e6:.2f} MB")

Memory usage with float64: 3534.59 MB
Memory usage with float32: 2250.50 MB


## 1.3 Load only the required column to reduce memory usage

In [51]:
%%time

# 5. Load just the column that we want
use_cols=['model']
df_models = pd.read_csv("figshareairline/combined_data.csv", usecols=use_cols)
df_model_counts = pd.DataFrame(df_models.value_counts())
df_model_counts

CPU times: user 23.6 s, sys: 2.58 s, total: 26.2 s
Wall time: 27.5 s


Unnamed: 0_level_0,0
model,Unnamed: 1_level_1
MPI-ESM1-2-HR,5154240
TaiESM1,3541230
CMCC-CM2-HR4,3541230
CMCC-CM2-SR5,3541230
CMCC-ESM2,3541230
NorESM2-MM,3541230
SAM0-UNICON,3541153
FGOALS-f3-L,3219300
GFDL-CM4,3219300
GFDL-ESM4,3219300


In [52]:
# Compare the memory reported for the full frame with that of the single-column frame.
print(f"Memory usage with all the columns: {df.memory_usage().sum() / 1e6:.2f} MB")
print(f"Memory usage with just the column: {df_models.memory_usage().sum() / 1e6:.2f} MB")

Memory usage with all the columns: 3534.59 MB
Memory usage with just the column: 500.11 MB


In [13]:
import altair as alt

# Build the plotting table explicitly: one row per model with its number of rows.
# The chart used to encode 'index' and 'model' columns, which df_model_counts does
# not have (its model names live in the index and its only column holds the
# counts), so it could not render on a fresh run. rename_axis() plus
# reset_index(name=...) yields the same column names on every pandas version.
model_counts_plot = (
    df_models["model"]
    .value_counts()
    .rename_axis("model")
    .reset_index(name="n_rows")
)
alt.Chart(model_counts_plot).mark_bar().encode(
    x = 'model:N',
    y = 'n_rows:Q')

In [53]:
%%time
df_rain = df[["rain (mm/day)"]].describe().reset_index()
df_rain

CPU times: user 1.97 s, sys: 1.9 s, total: 3.87 s
Wall time: 5.02 s


Unnamed: 0,index,rain (mm/day)
0,count,59294560.0
1,mean,1.901827
2,std,5.588275
3,min,-3.807373e-12
4,25%,3.876672e-06
5,50%,0.06161705
6,75%,1.021314
7,max,432.9395


# 6. Perform a simple EDA in R

## 1. Pick an approach to transfer the dataframe from Python to R
- Parquet

In [18]:
%%time
# Write the combined frame to a Parquet file, the format chosen for the hand-off to R.
df.to_parquet("figshareairline/combined_data.parquet")

CPU times: user 19.1 s, sys: 5.32 s, total: 24.4 s
Wall time: 25.7 s


In [27]:
# Load the rpy2 IPython extension so the %%R cells below can run.
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [59]:
%%R
# Attach arrow and dplyr, silencing their start-up messages and conflict warnings.
suppressMessages(library(arrow, warn.conflicts = FALSE))
suppressMessages(library(dplyr, warn.conflicts = FALSE))

In [61]:
%%time
%%R
# Open the Parquet file written from pandas as an arrow dataset object `ds`.
ds <- open_dataset("figshareairline/combined_data.parquet")

CPU times: user 22 ms, sys: 14 ms, total: 36 ms
Wall time: 57.5 ms


In [63]:
%%time
%%R

# Preview the first rows of the dataset; the printout lists each column with its type.
head(ds)

Table
6 rows x 7 columns
$lat_min <double>
$lat_max <double>
$lon_min <double>
$lon_max <double>
$rain (mm/day) <double>
$model <string>
$time <string>

See $metadata for additional Schema metadata
CPU times: user 4.06 s, sys: 4.45 s, total: 8.51 s
Wall time: 6.46 s


## 2. Perform a simple EDA in R

In [64]:
%%time
%%R

# Show the structure of the dataset object itself (its classes and members).
str(ds)

Classes 'FileSystemDataset', 'Dataset', 'ArrowObject', 'R6' <FileSystemDataset>
  Inherits from: <Dataset>
  Public:
    .:xp:.: externalptr
    .class_title: function () 
    clone: function (deep = FALSE) 
    files: active binding
    filesystem: active binding
    format: active binding
    initialize: function (xp) 
    invalidate: function () 
    metadata: active binding
    NewScan: function () 
    num_cols: active binding
    num_rows: active binding
    pointer: function () 
    print: function (...) 
    schema: active binding
    set_pointer: function (xp) 
    ToString: function () 
    type: active binding 
CPU times: user 14.3 ms, sys: 20.5 ms, total: 34.8 ms
Wall time: 60.2 ms


In [67]:
%%time
%%R

# Report the dataset's dimensions: number of rows and number of columns.
dim(ds)

[1] 62513863        7
CPU times: user 12.9 ms, sys: 15.7 ms, total: 28.5 ms
Wall time: 50.2 ms
