# MDS DSCI 525 - Group 15 Milestone 1

**Author**: Lennon Lok Lam Au-Yeung, Ke Wang, Ty Andrews, Peng Zhang

## Step 0 Importing library

In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

## Step 1 Downloading the data via API

Navigate to the location of your computer where you would like to download the files to.

In [2]:
%cd ~/MDS/525_labs/figshareexp
## Change it to the location that you want to download your files to.

/Users/pengzh/MDS/525_labs/figshareexp


In [3]:
# Necessary metadata
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "figsharerainfall/"

In [4]:
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  # this contains all the articles data
files = data["files"]             # this is just the data about the files, which is what we want
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

In [5]:
%%time
files_to_dl = ["data.zip"] 
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 3.24 s, sys: 5.11 s, total: 8.35 s
Wall time: 1min 51s


In [6]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: user 7.2 s, sys: 819 ms, total: 8.02 s
Wall time: 10.2 s


In [7]:
%ls -ltr figsharerainfall

total 12428672
drwxr-xr-x  30 pengzh  staff        960 27 Mar 20:18 [1m[34m__MACOSX[m[m/
-rw-r--r--   1 pengzh  staff  814041183 28 Mar 12:45 data.zip
-rw-r--r--   1 pengzh  staff   95376895 28 Mar 12:45 MPI-ESM-1-2-HAM_daily_rainfall_NSW.csv
-rw-r--r--   1 pengzh  staff   94960113 28 Mar 12:45 AWI-ESM-1-1-LR_daily_rainfall_NSW.csv
-rw-r--r--   1 pengzh  staff   82474546 28 Mar 12:45 NorESM2-LM_daily_rainfall_NSW.csv
-rw-r--r--   1 pengzh  staff  127613760 28 Mar 12:45 ACCESS-CM2_daily_rainfall_NSW.csv
-rw-r--r--   1 pengzh  staff  232118894 28 Mar 12:45 FGOALS-f3-L_daily_rainfall_NSW.csv
-rw-r--r--   1 pengzh  staff  330360682 28 Mar 12:45 CMCC-CM2-HR4_daily_rainfall_NSW.csv
-rw-r--r--   1 pengzh  staff  254009247 28 Mar 12:45 MRI-ESM2-0_daily_rainfall_NSW.csv
-rw-r--r--   1 pengzh  staff  235661418 28 Mar 12:45 GFDL-CM4_daily_rainfall_NSW.csv
-rw-r--r--   1 pengzh  staff  294260911 28 Mar 12:45 BCC-CSM2-MR_daily_rainfall_NSW.csv
-rw-r--r--   1 pengzh  staff  295768615 28 Mar 12:4

## Step 2 Combining data CSVs

Combine csv files into one file. Note that `observed_daily_rainfall_SYD.csv` has been manually removed as per the milestone 1 requirement.

In [8]:
%%time
# We are using a normal python way for merging the data 
# add extra column of "model"
use_cols = ["time", "lat_min", "lat_max", "lon_min","lon_max","rain (mm/day)"]
files = glob.glob('figsharerainfall/*.csv')
df = pd.concat((pd.read_csv(file, index_col=0, usecols=use_cols)
                .assign(model=re.findall("/([^_]*)", file)[0])
                for file in files)
              )
df.to_csv("figsharerainfall/combined_data.csv")

CPU times: user 2min 59s, sys: 7.87 s, total: 3min 7s
Wall time: 3min 8s


Compare the time for combining CSVs on team member's local computers. See the following table for results.

| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Lennon Lok Lam |                  |     |           |        |            |
| Ke             |                  |     |           |        |            |
| Ty             |                  |     |           |        |            |
| Peng           | MacOS Ventura V13.2.1 | 16GB | Apple M2 | Yes | 3min 8s  |

## Step 3 Load combined CSV to memory and perform a simple EDA in Python

In [11]:
df.head()

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


In [12]:
df.tail()

Unnamed: 0_level_0,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-12-27 12:00:00,-30.157068,-29.21466,153.125,154.375,6.689683,SAM0-UNICON
2014-12-28 12:00:00,-30.157068,-29.21466,153.125,154.375,7.862555,SAM0-UNICON
2014-12-29 12:00:00,-30.157068,-29.21466,153.125,154.375,10.005026,SAM0-UNICON
2014-12-30 12:00:00,-30.157068,-29.21466,153.125,154.375,8.541592,SAM0-UNICON
2014-12-31 12:00:00,-30.157068,-29.21466,153.125,154.375,68.117489,SAM0-UNICON


In [14]:
%%time
df.describe()

CPU times: user 6.39 s, sys: 1.38 s, total: 7.77 s
Wall time: 7.78 s


Unnamed: 0,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
count,59248540.0,62467840.0,59248540.0,62467840.0,59248540.0
mean,-33.10482,-31.97757,146.9059,148.215,1.90117
std,1.963549,1.992067,3.793784,3.809994,5.585735
min,-36.46739,-36.0,140.625,141.25,-3.807373e-12
25%,-34.86911,-33.66221,143.4375,145.0,3.838413e-06
50%,-33.0,-32.04188,146.875,148.125,0.06154947
75%,-31.4017,-30.15707,150.1875,151.3125,1.020918
max,-29.9,-27.90606,153.75,155.625,432.9395


## Step 4 Perform a simple EDA in R