# Milestone 1 Notebook

*Note: Steps 1 (team work contract) and 2 (creating repository) for this milestone are not included in this notebook.*

## Imports

In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

## 3. Downloading the data

### Download using figshare's API

In [2]:
# figshare article metadata
article_id = 14096681
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}

# directories for output
output_directory = "figsharerain/"
output_directory_files = "figsharerain/data/"

# get the article metadata from figshare; the timeout stops the cell from
# hanging indefinitely on a stalled connection
response = requests.get(url, headers=headers, timeout=60)
# fail here with a clear HTTP error rather than with a KeyError on "files" below
response.raise_for_status()
data = response.json()
files = data["files"]

files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

In [3]:
%%time

files_to_dl = ['data.zip']

# create the download directory once, up front, instead of once per matching file
os.makedirs(output_directory, exist_ok=True)

# download files, this takes some time
for file in files:
    if file["name"] not in files_to_dl:
        continue
    destination = os.path.join(output_directory, file["name"])
    # skip the slow download when a copy of the expected size is already on disk,
    # so that re-running the notebook does not fetch the archive again
    if os.path.exists(destination) and os.path.getsize(destination) == file.get("size"):
        continue
    urlretrieve(file["download_url"], destination)

CPU times: user 3.42 s, sys: 7.2 s, total: 10.6 s
Wall time: 20min 6s


### Extract and view files

In [4]:
%%time

# unpack the downloaded archive into the data directory
os.makedirs(output_directory_files, exist_ok=True)
archive_path = os.path.join(output_directory, "data.zip")
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=output_directory_files)

CPU times: user 16.5 s, sys: 2.88 s, total: 19.4 s
Wall time: 20 s


## 4. Combine data CSVs with pandas

### Combine CSVs and add "model" column

In [5]:
%%time
# files to combine: every CSV in the data directory except the observed-rainfall
# file and the combined output written at the end of this cell (otherwise a
# re-run would read its own previous output back in). Matching on the base name
# keeps the exclusion independent of the path separator glob returns.
excluded = {"observed_daily_rainfall_SYD.csv", "combined_data.csv"}
files = [
    file
    for file in glob.glob('figsharerain/data/*.csv')
    if os.path.basename(file) not in excluded
]

# combine with pandas; the model column is the part of each file name before
# the first underscore, computed once per file rather than once per row
df = pd.concat(
    pd.read_csv(file, index_col=0).assign(model=os.path.basename(file).split("_")[0])
    for file in files
)

# save combined file
df.to_csv("figsharerain/data/combined_data.csv")

CPU times: user 6min 43s, sys: 22.9 s, total: 7min 6s
Wall time: 7min 11s


In [6]:
# preview the first five rows of the combined frame
df.head(n=5)

time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


In [7]:
# (rows, columns) of the combined frame; the notebook displays a bare last expression
df.shape

(62467843, 6)


| Team Member | Operating System | RAM   | Processor | Is SSD | Time taken   |
|:-----------:|:----------------:|:-----:|:---------:|:------:|:------------:|
| Nico        | Mac              | 32 GB | Intel i9  | Yes    | 7 min 11 sec |
| Kristin     | Mac              | 8 GB  | Intel i5  | Yes    | 9 min 8 sec  |
| Jennifer    |                  |       |           |        |              |
| Morgan      |                  |       |           |        |              |

*Summary of observations of runtimes*

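**TODO:** Add a summary of the runtime observations once every team member has recorded their time for combining the CSVs.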

## 5. Load the combined CSV and perform EDA

| Team Member | Operating System | RAM   | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:-----:|:---------:|:------:|:----------:|
| Nico        | Mac              | 32 GB | Intel i9  | Yes    |            |
| Kristin     | Mac              | 8 GB  | Intel i5  | Yes    |            |
| Jennifer    |                  |       |           |        |            |
| Morgan      |                  |       |           |        |            |

*Summary of observations of runtimes*

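**TODO:** Add a summary of the runtime observations once every team member has recorded their time for loading the combined CSV and running the EDA.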

## 6. Perform EDA in R