In [15]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import numpy as np
import pyarrow.feather as feather
from memory_profiler import memory_usage

In [2]:
# rpy2's IPython extension is left disabled.
# NOTE(review): presumably intended for the "EDA R" section — confirm before re-enabling.
# %load_ext rpy2.ipython
# Load the memory_profiler IPython extension, which provides the memit magic
# used in the data-combining cell.
%load_ext memory_profiler

## Download Data

In [3]:
# Attribution: DSCI 525 lecture notebook
# Settings for the figshare download: where files are stored locally and which
# article the API request targets.
output_directory = "figsharerainfall/"  # local folder that receives the downloaded files
headers = {"Content-Type": "application/json"}  # sent with the figshare API request
article_id = 14096681  # unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"  # article endpoint built from the id

In [4]:
# Ask the figshare API for the article's metadata and keep its file listing.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports an HTTP error right here instead of letting it
# show up later as a confusing KeyError on "files".
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()
data = response.json()
# Each entry is a dict; the download cell reads its "name" and "download_url" keys.
files = data["files"]

In [5]:
%%time
files_to_dl = ["data.zip"]  
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 5.71 s, sys: 5.6 s, total: 11.3 s
Wall time: 1min 33s


In [6]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: user 19.5 s, sys: 4.19 s, total: 23.7 s
Wall time: 34.2 s


## Combine Data

In [7]:
# Disabled preprocessing step, kept for reference (nothing in this cell executes).
# Purpose: make observed_daily_rainfall_SYD.csv have the same columns as other csvs.
# If uncommented, it adds NaN-filled lat_min / lat_max / lon_min / lon_max columns
# to that file and overwrites it in place (written without the index).
# NOTE(review): cannot tell from this notebook whether the step was already applied
# to the file on disk — confirm before re-enabling or deleting this cell.
# df = pd.read_csv('figsharerainfall/observed_daily_rainfall_SYD.csv')
# df["lat_min"] = np.nan
# df["lat_max"] = np.nan
# df["lon_min"] = np.nan
# df["lon_max"] = np.nan
# df.reset_index(inplace=True, drop=True)
# df.to_csv("figsharerainfall/observed_daily_rainfall_SYD.csv", index = False)
# df.head()

In [10]:
%%time
%memit
# Shows time that regular python takes to merge file
# Join all data together
## here we are using a normal python way of merging the data 
# use_cols = ["time", "lat_min", "lat_max", "lon_min","lon_max","rain (mm/day)"]
files = glob.glob('figsharerainfall/*.csv')
df = pd.concat((pd.read_csv(file, index_col=0)
                .assign(model=re.findall(r'^[^_]+(?=_)', file)[0])
                for file in files)
              )
df.to_csv("figsharerainfall/combined_data.csv")

peak memory: 71.81 MiB, increment: 0.68 MiB
CPU times: user 25min 1s, sys: 3min 34s, total: 28min 36s
Wall time: 33min 54s


In [16]:
# Also store the combined frame as a Feather file alongside the CSV.
feather.write_feather(df, dest="figsharerainfall/combined_data.feather")

In [11]:
%%sh
# Size of the combined CSV on disk (-s: one total line, -h: human-readable units).
du -s -h figsharerainfall/combined_data.csv

 21G	figsharerainfall/combined_data.csv


In [12]:
%%time
df = pd.read_csv("figsharerainfall/combined_data.csv")

CPU times: user 4min 6s, sys: 2min 38s, total: 6min 45s
Wall time: 10min 56s


In [13]:
# (rows, columns) of the loaded frame. As the cell's last expression it is
# displayed by the notebook directly, so no print() wrapper is needed.
df.shape

(250055452, 7)


In [14]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,figsharerainfall/MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,figsharerainfall/MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,figsharerainfall/MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,figsharerainfall/MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,figsharerainfall/MPI-ESM-1-2-HAM


## EDA Python

## EDA R