# Download Data

In [1]:
import glob
import json
import os
import re
import zipfile
from urllib.request import urlretrieve

import pandas as pd
import requests

In [2]:
# Necessary metadata
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "figshareairline/"

To send a GET request to list the available files:

In [3]:
response = requests.request("GET", url, headers=headers)
data = json.loads(
    response.text
)  # this contains all the articles data, feel free to check it out
files = data["files"]  # this is just the data about the files, which is what we want
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

To get the file named `data.zip`:

In [None]:
%%time
files_to_dl = ["data.zip"]  # feel free to add other files here
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

In [4]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), "r") as f:
    f.extractall(output_directory)

CPU times: user 18 s, sys: 3.32 s, total: 21.3 s
Wall time: 22.9 s


# Combine Data CSVs

To combine all these files using python:

In [7]:
%%time
import pandas as pd

files = glob.glob("figshareairline/*.csv")
try:
    files.remove(os.path.join("figshareairline", "observed_daily_rainfall_SYD.csv"))
except:
    pass
try:
    files.remove(os.path.join("figshareairline", "combined_data.csv"))
except:
    pass
            
df = pd.concat(
    (
        pd.read_csv(file, index_col=False)
        .assign(
            file_name=re.findall("[ \w-]+\.", file)[0],
            model=lambda x: x.file_name.str.split("_", expand=True)[0],
        )
        .drop(columns="file_name")
        for file in files
    )
)
df.to_csv("figshareairline/combined_data.csv")

CPU times: user 12min 12s, sys: 1min 47s, total: 13min 59s
Wall time: 15min 22s


In [8]:
%%sh
du -sh figshareairline/combined_data.csv

6.0G	figshareairline/combined_data.csv


In [12]:
df.shape

(62467843, 7)

In [10]:
df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


### Run Times:


Alex: 

- CPU times: total: 5min 10s

- Wall time: 5min 29s

Harry:

- CPU times: total: 12min 12s

- Wall time: 15min 22s

Brandon:

Anthea:

### Summary:
> TBC