In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import platform

## Downloading the data 

In [2]:
# Adapted from lecture notes
# Settings needed to query the figshare API and to store what it returns
output_directory = "data/"
headers = {"Content-Type": "application/json"}
article_id = 14096681  # figshare's unique identifier for the article
url = "https://api.figshare.com/v2/articles/" + str(article_id)

In [3]:
# Ask the figshare API for the article's metadata.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead of
# letting an error body be parsed as if it were the article.
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()
data = response.json()  # the full article metadata
files = data["files"]   # one dict of metadata per file attached to the article
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

In [4]:
%%time
files_to_dl = ["data.zip"]

# Create the target directory once, up front, rather than on every iteration
os.makedirs(output_directory, exist_ok=True)

for file_info in files:
    if file_info["name"] not in files_to_dl:
        continue
    destination = os.path.join(output_directory, file_info["name"])
    # Skip the download when a copy with the byte size figshare reports is
    # already on disk, so re-running the notebook does not fetch it again
    if os.path.isfile(destination) and os.path.getsize(destination) == file_info["size"]:
        continue
    urlretrieve(file_info["download_url"], destination)

CPU times: user 4.55 s, sys: 3.25 s, total: 7.8 s
Wall time: 42.8 s


In [5]:
%%time
# Unpack the downloaded archive into the data directory it was saved in
zip_path = os.path.join(output_directory, "data.zip")
with zipfile.ZipFile(zip_path, mode="r") as archive:
    archive.extractall(path=output_directory)

CPU times: user 14.7 s, sys: 1.08 s, total: 15.7 s
Wall time: 15.8 s


## Combining data CSVs

In [6]:
# Collect the CSVs to combine, leaving out two files that must not be merged:
# the observed-rainfall file, and the combined output this notebook writes into
# the same directory (present whenever the notebook has been run before).
# Names are compared on the basename because glob returns paths with the
# operating system's own separator, so a hard-coded "data/..." string would not
# match on Windows.
excluded_names = {"observed_daily_rainfall_SYD.csv", "combined_data.csv"}
files = [
    path
    for path in glob.glob('data/*.csv')
    if os.path.basename(path) not in excluded_names
]

In [7]:
# Re-run helper: if combined_data.csv already exists from an earlier run, uncomment the
# line below so it is dropped from the list of files to combine -- DELETE FOR SUBMISSION

# files.remove('data/combined_data.csv')

In [8]:
def combine_csv(regex_string, files):
    """
    Combine several CSV files into one dataframe, labelling rows by source file.

    Each file is read with its first column as the index and gains a ``model``
    column holding the first match of ``regex_string`` in that file's path.

    Parameters
    ----------
    regex_string : str
        Regular expression applied to each path to extract the model name.
    files : list of str
        Paths of the CSV files to combine.

    Returns
    -------
    df : pandas.DataFrame
        The rows of every file, concatenated in the order of ``files``.

    Raises
    ------
    ValueError
        If ``regex_string`` matches nothing in one of the paths, or (raised by
        pandas) if ``files`` is empty.
    """
    # Resolve every model name before reading anything, so that a path the
    # pattern cannot handle is reported immediately and by name instead of
    # surfacing as a bare IndexError part-way through the reads.
    labelled_files = []
    for file in files:
        matches = re.findall(regex_string, file)
        if not matches:
            raise ValueError(
                f"Cannot extract a model name from {file!r} "
                f"using pattern {regex_string!r}"
            )
        labelled_files.append((file, matches[0]))

    df = pd.concat(
        pd.read_csv(file, index_col=0).assign(model=model)
        for file, model in labelled_files
    )
    return df

In [9]:
%%time

# The model name is whatever sits between the data directory and "_daily" in
# each path. The character class accepts either path separator, so one pattern
# works on every operating system and no platform sniffing is needed.
model_name_pattern = r"(?<=data[\\/])(.*)(?=_daily)"

df = combine_csv(model_name_pattern, files)

df.to_csv("data/combined_data.csv")

CPU times: user 5min 57s, sys: 8.22 s, total: 6min 5s
Wall time: 6min 7s


In [10]:
# Preview the first rows of the combined dataframe
df.head()

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Qingqing   |   Windows               |   16GB  |     Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz 2.59 GHz      |   T     |     1min 32s       |
| Lianna   |       MacOS           |  16GB   |    Apple M1 - 8 Core       |    T    |     6min 5s       |
| Linhan    |                  |     |           |        |            |
|  Doris    |                  |     |           |        |            |

## Loading the combined CSV to memory and performing a simple EDA