# Download the Data

In [3]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

In [4]:
%cd /Users/JulieS/Documents/MDS/Block_6/DSCI_525/DSCI525_group13/

/Users/JulieS/Documents/MDS/Block_6/DSCI_525/DSCI525_group13


In [5]:
# Necessary metadata
article_id = 14096681 
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "data/"

In [6]:
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  # this contains all the articles data, feel free to check it out
files = data["files"]             # this is just the data about the files, which is what we want
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

In [7]:
%%time
files_to_dl = ["data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 3.09 s, sys: 5.14 s, total: 8.23 s
Wall time: 3min 17s


## Extract data

In [None]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: user 10.1 s, sys: 1.5 s, total: 11.6 s
Wall time: 12 s


## Combine CSV files

In [15]:
%%time
## here we are using a normal python way for merging the data 
use_cols = ["time", "lat_min", "lat_max", "lon_min", "lon_max", "rain (mm/day)"]
files = glob.glob('data/*.csv')
df = pd.concat((pd.read_csv(file, index_col=0, usecols=use_cols)
                .assign(model=re.findall("/([^_]*)", file)[0])
                for file in files)
              )
df.to_csv("data/combined_data.csv")

CPU times: user 4min 18s, sys: 12.3 s, total: 4min 30s
Wall time: 4min 33s


In [18]:
df.shape

(62467843, 2)

## EDA

### Use just the `time` and `rain` columns and change dtype from `float64` to `float32`

In [20]:
import numpy as np

In [23]:
%%time
use_cols = ["time", "rain (mm/day)", 'model']
df = pd.read_csv("data/combined_data.csv",usecols=use_cols, 
                 dtype = {'rain (mm/day)': np.float32})
print(df["model"].value_counts())
print(df.info())
print(df.describe())

MPI-ESM1-2-HR       5154240
CMCC-CM2-HR4        3541230
CMCC-ESM2           3541230
CMCC-CM2-SR5        3541230
NorESM2-MM          3541230
TaiESM1             3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
GFDL-CM4            3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
FGOALS-g3           1287720
KIOST-ESM           1287720
AWI-ESM-1-1-LR       966420
MPI-ESM1-2-LR        966420
NESM3                966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62467843 entries, 0 to 62467842
Data columns (total 3 columns):
 #   Column         Dtype  
---  ------         -----  
 0   time           object 
 1   rain (mm/day)  

## EDA in R

In [40]:
%reset -f

In [41]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [42]:
filepathcsv = "/Users/JulieS/Documents/MDS/Block_6/DSCI_525/DSCI525_group13/data/combined_data.csv"
filepathparquet = "/Users/JulieS/Documents/MDS/Block_6/DSCI_525/DSCI525_group13/data/combined_data.parquet"
filepathparquetr = "/Users/JulieS/Documents/MDS/Block_6/DSCI_525/DSCI525_group13/data/combined_data_r.parquet"

In [43]:
# !pip install rpy2_arrow
import pyarrow.dataset as ds
import pyarrow as pa
import pandas as pd
import pyarrow 
from pyarrow import csv
import rpy2_arrow.pyarrow_rarrow as pyra

In [44]:
%%time
dataset = ds.dataset(filepathcsv, format="csv")
table = dataset.to_table()
r_table = pyra.converter.py2rpy(table)

CPU times: user 15.8 s, sys: 1.47 s, total: 17.3 s
Wall time: 16.3 s


In [39]:
%%time
%%R -i r_table
start_time <- Sys.time()
suppressMessages(library(dplyr))
result <- r_table |> summarise()
end_time <- Sys.time()
print(result |> collect())
print(end_time - start_time)

RParsingError: Parsing status not OK - PARSING_STATUS.PARSE_ERROR

TokenError: ('EOF in multi-line statement', (9, 0))