In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import dask.dataframe as dd
import pyarrow.feather as feather

## 3. Downloading the data

rubric={correctness:10}

<div class="alert alert-block alert-info">
Download the data from figshare to your local computer using the figshare API (you need to make use of requests library).
Extract the zip file, again programmatically, similar to how we did it in class.

You can download the data and unzip it manually. But we learned about APIs, so we can do it in a reproducible way with the requests library, similar to how we did it in class.

There are 5 files in the figshare repo. The one we want is: `data.zip`
</div>

### 3.1 Setting up API

Code adopted from lecture 2 notes.

In [2]:
# Necessary metadata
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "figsharerainfall/"

In [3]:
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  # this contains all the articles data, feel free to check it out
files = data["files"]             # this is just the data about the files, which is what we want
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

### 3.2 Download the data and unzip it

Code adopted from lecture 2 notes.

In [4]:
%%time
files_to_dl = ["data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: total: 3.61 s
Wall time: 25.4 s


In [5]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: total: 14.5 s
Wall time: 14.8 s


In [6]:
%ls -ltr figsharerainfall/

Invalid switch - "".


### 3.3 Preview data

In [7]:
%%time
# Checking out the file
df_sample = pd.read_csv("figsharerainfall/ACCESS-CM2_daily_rainfall_NSW.csv",
                 index_col="time",
                 parse_dates=True)
df_sample.head()

CPU times: total: 1.89 s
Wall time: 1.92 s


Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13
1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.0
1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.0
1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.0
1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,0.01047658


## 4. Combining data CSVs

rubric={correctness:10,reasoning:10}

<div class="alert alert-block alert-info">

1. Combine data CSVs into a single CSV using pandas.

2. When combining the CSV files, add an extra column called "model" that identifies the model. 
    > Tip 1: you can get this column populated from the file name, eg: for file name "SAM0-UNICON_daily_rainfall_NSW.csv", the model name is SAM0-UNICON 
    
    > Tip 2: Remember how we added year when we combined airline CSVs. Tip 3: You can use regex generator.

Note: There is a file called observed_daily_rainfall_SYD.csv in the data folder that you downloaded. Make sure you exclude this file (programmatically or just take out that file from folder) before you combine CSVs. We will use this file in our next milestone.
</div>

In [8]:
os.remove("figsharerainfall/observed_daily_rainfall_SYD.csv")

In [9]:
%%time
files = glob.glob('figsharerainfall/*.csv')
df = pd.concat(
    (pd.read_csv(file, index_col="time", parse_dates=True)
                .assign(model=re.findall(r'(?<=fall\\)(.+)?(?=_daily)', file)[0])
                for file in files)
              )
df.to_csv("figsharerainfall/combined_data.csv")

CPU times: total: 6min 35s
Wall time: 6min 36s


In [10]:
print(df.shape)

df.head()

(62467843, 6)


Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13,ACCESS-CM2
1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,0.01047658,ACCESS-CM2


In [11]:
df.tail()

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-12-27 12:00:00,-30.157068,-29.21466,153.125,154.375,0.554375,TaiESM1
2014-12-28 12:00:00,-30.157068,-29.21466,153.125,154.375,7.028577,TaiESM1
2014-12-29 12:00:00,-30.157068,-29.21466,153.125,154.375,0.234757,TaiESM1
2014-12-30 12:00:00,-30.157068,-29.21466,153.125,154.375,2.097459,TaiESM1
2014-12-31 12:00:00,-30.157068,-29.21466,153.125,154.375,0.548421,TaiESM1


## 5. Load the combined CSV to memory and perform a simple EDA

rubric={correctness:10,reasoning:10}

<div class="alert alert-block alert-info">

Investigate at least two of the following approaches to reduce memory usage while performing the EDA (e.g., value_counts).
- Changing dtype of your data
- Load just columns what we want
- Loading in chunks
- Dask

Compare run times on different machines within your team and summarize your observations.
</div>

## 5.1 Convert to float 32

In [12]:
%%time
df_32 = df.loc[:, df.columns != "model"].astype('float32')
df_32["model"] = df["model"]

CPU times: total: 1.73 s
Wall time: 1.71 s


**Float 64 results**

In [13]:
%%time
print(df["model"].value_counts())

MPI-ESM1-2-HR       5154240
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-CM4            3219300
GFDL-ESM4           3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
CPU times: total: 1.53 s
Wall time: 1.55 s


**Float 32 results**

In [14]:
%%time
print(df_32["model"].value_counts())

MPI-ESM1-2-HR       5154240
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-CM4            3219300
GFDL-ESM4           3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
CPU times: total: 1.55 s
Wall time: 1.55 s


## 5.2 Chunk method

In [15]:
%%time
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("figsharerainfall/combined_data.csv", chunksize=10_000_000,
                        index_col="time", parse_dates=True):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)
print(counts.astype(int))

ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
AWI-ESM-1-1-LR       966420
BCC-CSM2-MR         3035340
BCC-ESM1             551880
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
CanESM5              551880
EC-Earth3-Veg-LR    3037320
FGOALS-f3-L         3219300
FGOALS-g3           1287720
GFDL-CM4            3219300
GFDL-ESM4           3219300
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
MIROC6              2070900
MPI-ESM-1-2-HAM      966420
MPI-ESM1-2-HR       5154240
MPI-ESM1-2-LR        966420
MRI-ESM2-0          3037320
NESM3                966420
NorESM2-LM           919800
NorESM2-MM          3541230
SAM0-UNICON         3541153
TaiESM1             3541230
dtype: int32
CPU times: total: 1min 13s
Wall time: 1min 13s


### 5.3 Using Dask

In [16]:
dask_df = dd.read_csv("figsharerainfall/combined_data.csv")

In [17]:
%%time
print(dask_df["model"].value_counts().compute())

MPI-ESM1-2-HR       5154240
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-CM4            3219300
GFDL-ESM4           3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
CPU times: total: 45.3 s
Wall time: 15.7 s


## 6. Perform a simple EDA in R
rubric={correctness:15,reasoning:10}


<div class="alert alert-block alert-info">

Pick an approach to transfer the dataframe from python to R.
 
- Parquet file
- Feather file
- Pandas exchange
- Arrow exchange

Discuss why you chose this approach over others.

</div>

In [18]:
%%time
df_2 = pd.read_csv("figsharerainfall/combined_data.csv")
df_2.to_feather("figsharerainfall/combined_data.feather")

CPU times: total: 57 s
Wall time: 54.1 s


In [19]:
dfeather = pd.read_feather("figsharerainfall/combined_data.feather")

In [20]:
%%sh
du -sh figsharerainfall/combined_data.feather

1.2G	figsharerainfall/combined_data.feather


In [2]:
%load_ext rpy2.ipython



In [3]:
%%R

suppressMessages(library(dplyr))
suppressMessages(library(arrow))

feather_r <- read_feather("figsharerainfall/combined_data.feather")
start_time <- Sys.time()
result <- feather_r  %>% count(model)
print(result)
end_time <- Sys.time()
print(end_time - start_time)

[38;5;246m# A tibble: 27 x 2[39m
   model                  n
   [3m[38;5;246m<chr>[39m[23m              [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m ACCESS-CM2       1[4m9[24m[4m3[24m[4m2[24m840
[38;5;250m 2[39m ACCESS-ESM1-5    1[4m6[24m[4m1[24m[4m0[24m700
[38;5;250m 3[39m AWI-ESM-1-1-LR    [4m9[24m[4m6[24m[4m6[24m420
[38;5;250m 4[39m BCC-CSM2-MR      3[4m0[24m[4m3[24m[4m5[24m340
[38;5;250m 5[39m BCC-ESM1          [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 6[39m CanESM5           [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 7[39m CMCC-CM2-HR4     3[4m5[24m[4m4[24m[4m1[24m230
[38;5;250m 8[39m CMCC-CM2-SR5     3[4m5[24m[4m4[24m[4m1[24m230
[38;5;250m 9[39m CMCC-ESM2        3[4m5[24m[4m4[24m[4m1[24m230
[38;5;250m10[39m EC-Earth3-Veg-LR 3[4m0[24m[4m3[24m[4m7[24m320
[38;5;246m# ... with 17 more rows[39m
Time difference of 4.525287 secs
