# DSCI 525 Group 4 - Data retrieval using figshare API

## Import libraries:

In [1]:
import re
import os
import sys
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Specify Meta variables

In [2]:
article_id = 14096681
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "../data/"
file_to_download = "data.zip"
rerun = True

## List of files available for download

In [3]:
%%time
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)
files = data["files"]
files

CPU times: total: 328 ms
Wall time: 1.24 s


[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

## Download specified file

In [4]:
%%time
if os.path.exists(f"{output_directory}/{file_to_download}"):
    print("data.zip is already exists!")
else:
    files_to_dl = ["data.zip"]

    for file in files:
        if file["name"] in files_to_dl:
            os.makedirs(output_directory, exist_ok=True)
            urlretrieve(file["download_url"], output_directory + file["name"])

data.zip is already exists!
CPU times: total: 0 ns
Wall time: 0 ns


## Unzip downloaded file

In [5]:
%%time
if rerun:
    with zipfile.ZipFile(os.path.join(output_directory, file_to_download), 'r') as f:
        f.extractall(output_directory)
else:
    print("Some CSV files already exists, nothing is extraced. Please check if files in data directory are correct.")

CPU times: total: 14.6 s
Wall time: 14.8 s


## 4. Combining data CSVs
rubric={correctness:10,reasoning:10}

1. Combine data CSVs into a single CSV using pandas.

2. When combining the CSV files, add an extra column called "model" that identifies the model. Tip 1: you can get this column populated from the file name, eg: for file name "SAM0-UNICON_daily_rainfall_NSW.csv", the model name is SAM0-UNICON Tip 2: Remember how we added year when we combined airline CSVs. Tip 3: You can use regex generator.

Note: There is a file called observed_daily_rainfall_SYD.csv in the data folder that you downloaded. Make sure you exclude this file (programmatically or just take out that file from folder) before you combine CSVs. We will use this file in our next milestone.

3. Compare run times on different machines within your team and summarize your observations.

In [6]:
#%%time
#files = glob.glob(f'{output_directory}/*.csv')
#files = [f for f in files if f.find("observed_daily_rainfall_SYD.csv")==-1 and f.find("combined_data.csv")==-1]

#df = pd.concat((pd.read_csv(file, index_col=0, parse_dates=True).assign(model=re.findall(r'(?<=\\)(.*)(?=_daily)', file)[0])
#                for file in files))
#print(df.shape)

In [7]:
%%time

if rerun:
    files = glob.glob(f'{output_directory}/*.csv')
    files = [f for f in files if f.find("observed_daily_rainfall_SYD.csv")==-1 and f.find("combined_data.csv")==-1]
    i = 1
    records = 0

    for file in files:
        df = pd.read_csv(file, index_col=0, parse_dates=True).assign(model=re.findall(r'(?<=\\)(.*)(?=_daily)', file)[0])
        print(f"Processing {file} \t total {len(df)} rows, \t {i} out of {len(files)} files.")
        records += len(df)

        if i == 1:
            df.to_csv(f"{output_directory}/combined_data.csv")
        else:
            df.to_csv(f'{output_directory}/combined_data.csv', mode='a', header=False)

        i+=1
    print("")
    print(f"Total rows: {records}.") #62467843 rows
    print("")

Processing ../data\ACCESS-CM2_daily_rainfall_NSW.csv 	 total 1932840 rows, 	 1 out of 27 files.
Processing ../data\ACCESS-ESM1-5_daily_rainfall_NSW.csv 	 total 1610700 rows, 	 2 out of 27 files.
Processing ../data\AWI-ESM-1-1-LR_daily_rainfall_NSW.csv 	 total 966420 rows, 	 3 out of 27 files.
Processing ../data\BCC-CSM2-MR_daily_rainfall_NSW.csv 	 total 3035340 rows, 	 4 out of 27 files.
Processing ../data\BCC-ESM1_daily_rainfall_NSW.csv 	 total 551880 rows, 	 5 out of 27 files.
Processing ../data\CanESM5_daily_rainfall_NSW.csv 	 total 551880 rows, 	 6 out of 27 files.
Processing ../data\CMCC-CM2-HR4_daily_rainfall_NSW.csv 	 total 3541230 rows, 	 7 out of 27 files.
Processing ../data\CMCC-CM2-SR5_daily_rainfall_NSW.csv 	 total 3541230 rows, 	 8 out of 27 files.
Processing ../data\CMCC-ESM2_daily_rainfall_NSW.csv 	 total 3541230 rows, 	 9 out of 27 files.
Processing ../data\EC-Earth3-Veg-LR_daily_rainfall_NSW.csv 	 total 3037320 rows, 	 10 out of 27 files.
Processing ../data\FGOALS-f3-L

## Compare results:

| Team Member          | Operating System | RAM (GB) | Processor                 | Is SSD | Time taken |
| -------------------- | ---------------- | -------- | ------------------------- | ------ | ---------- |
| Anahita Einolghozati | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Luke Collins         | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Zihan Zhou           | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Steven Lio           | Windows 10 x64   | 1x16     | AMD Ryzen 7 5800H 3.20GHz | Yes    | 6 mins 28s |

In [8]:
%%sh
du -sh "../data/combined_data.csv"

5.7G	../data/combined_data.csv


## 5. Load the combined CSV to memory and perform a simple EDA
rubric={correctness:10,reasoning:10}

1. Investigate at least two of the following approaches to reduce memory usage while performing the EDA (e.g., value_counts).

- Changing dtype of your data
- Load just columns what we want
- Loading in chunks
- Dask
2. Compare run times on different machines within your team and summarize your observations.

## Benchmark: Load everything

In [9]:
%%time

df = pd.read_csv(f"{output_directory}/combined_data.csv",index_col=0, parse_dates=True)
print("")
print(df.info(memory_usage='deep'))
print("")
print(df["model"].value_counts())


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 62467843 entries, 1889-01-01 12:00:00 to 2014-12-31 12:00:00
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   lat_min        float64
 1   lat_max        float64
 2   lon_min        float64
 3   lon_max        float64
 4   rain (mm/day)  float64
 5   model          object 
dtypes: float64(5), object(1)
memory usage: 6.7 GB
None

MPI-ESM1-2-HR       5154240
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-CM4            3219300
GFDL-ESM4           3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-L

## Compare results:

| Team Member          | Operating System | RAM (GB) | Processor                 | Is SSD | Time taken |
| -------------------- | ---------------- | -------- | ------------------------- | ------ | ---------- |
| Anahita Einolghozati | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Luke Collins         | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Zihan Zhou           | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Steven Lio           | Windows 10 x64   | 1x16     | AMD Ryzen 7 5800H 3.20GHz | Yes    | 1 mins 29s |

## Change dtype of data

In [10]:
%%time

dtypes = {'lat_min': 'float16', 
          'lat_max': 'float16', 
          'lon_min': 'float16', 
          'lon_max': 'float16',
          'rain (mm/day)':'float32',
          'model':'str'}
df = pd.read_csv(f"{output_directory}/combined_data.csv",index_col=0, parse_dates=True, dtype=dtypes)
print("")
print(df.info(memory_usage='deep'))
print("")
print(df["model"].value_counts())


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 62467843 entries, 1889-01-01 12:00:00 to 2014-12-31 12:00:00
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   lat_min        float16
 1   lat_max        float16
 2   lon_min        float16
 3   lon_max        float16
 4   rain (mm/day)  float32
 5   model          object 
dtypes: float16(4), float32(1), object(1)
memory usage: 5.1 GB
None

MPI-ESM1-2-HR       5154240
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-CM4            3219300
GFDL-ESM4           3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720


## Compare results:

| Team Member          | Operating System | RAM (GB) | Processor                 | Is SSD | Time taken |
| -------------------- | ---------------- | -------- | ------------------------- | ------ | ---------- |
| Anahita Einolghozati | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Luke Collins         | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Zihan Zhou           | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Steven Lio           | Windows 10 x64   | 1x16     | AMD Ryzen 7 5800H 3.20GHz | Yes    | 1 mins 27s |

## Load only minimum columns

In [11]:
%%time

use_cols = ["time","rain (mm/day)","model"]
df = pd.read_csv(f"{output_directory}/combined_data.csv",index_col=0, parse_dates=True, usecols=use_cols)
print("")
print(df.info(memory_usage='deep'))
print("")
print(df["model"].value_counts())


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 62467843 entries, 1889-01-01 12:00:00 to 2014-12-31 12:00:00
Data columns (total 2 columns):
 #   Column         Dtype  
---  ------         -----  
 0   rain (mm/day)  float64
 1   model          object 
dtypes: float64(1), object(1)
memory usage: 4.9 GB
None

MPI-ESM1-2-HR       5154240
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-CM4            3219300
GFDL-ESM4           3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM  

## Compare results:

| Team Member          | Operating System | RAM (GB) | Processor                 | Is SSD | Time taken |
| -------------------- | ---------------- | -------- | ------------------------- | ------ | ---------- |
| Anahita Einolghozati | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Luke Collins         | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Zihan Zhou           | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Steven Lio           | Windows 10 x64   | 1x16     | AMD Ryzen 7 5800H 3.20GHz | Yes    | 1 mins 18s |

## Load only minimum columns and specify column types

In [12]:
%%time

use_cols = ["time","rain (mm/day)","model"]
dtypes = {'rain (mm/day)':'float32',
          'model':'str'}
df = pd.read_csv(f"{output_directory}/combined_data.csv",index_col=0, parse_dates=True, usecols=use_cols,dtype=dtypes)
print("")
print(df.info(memory_usage='deep'))
print("")
print(df["model"].value_counts())


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 62467843 entries, 1889-01-01 12:00:00 to 2014-12-31 12:00:00
Data columns (total 2 columns):
 #   Column         Dtype  
---  ------         -----  
 0   rain (mm/day)  float32
 1   model          object 
dtypes: float32(1), object(1)
memory usage: 4.6 GB
None

MPI-ESM1-2-HR       5154240
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-CM4            3219300
GFDL-ESM4           3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM  

## Compare results:

| Team Member          | Operating System | RAM (GB) | Processor                 | Is SSD | Time taken |
| -------------------- | ---------------- | -------- | ------------------------- | ------ | ---------- |
| Anahita Einolghozati | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Luke Collins         | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Zihan Zhou           | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Steven Lio           | Windows 10 x64   | 1x16     | AMD Ryzen 7 5800H 3.20GHz | Yes    | 1 mins 28s |

## Loading in chunks

In [13]:
%%time
counts = pd.Series(dtype=int)
for chunk in pd.read_csv(f"{output_directory}/combined_data.csv", chunksize=1_000_000):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)
print(counts.astype(int))

ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
AWI-ESM-1-1-LR       966420
BCC-CSM2-MR         3035340
BCC-ESM1             551880
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
CanESM5              551880
EC-Earth3-Veg-LR    3037320
FGOALS-f3-L         3219300
FGOALS-g3           1287720
GFDL-CM4            3219300
GFDL-ESM4           3219300
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
MIROC6              2070900
MPI-ESM-1-2-HAM      966420
MPI-ESM1-2-HR       5154240
MPI-ESM1-2-LR        966420
MRI-ESM2-0          3037320
NESM3                966420
NorESM2-LM           919800
NorESM2-MM          3541230
SAM0-UNICON         3541153
TaiESM1             3541230
dtype: int32
CPU times: total: 56.2 s
Wall time: 56.3 s


## Compare results:

| Team Member          | Operating System | RAM (GB) | Processor                 | Is SSD | Time taken |
| -------------------- | ---------------- | -------- | ------------------------- | ------ | ---------- |
| Anahita Einolghozati | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Luke Collins         | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Zihan Zhou           | ???????????????? | ???????? | ????????????????????????? | ?????? | ?????????? |
| Steven Lio           | Windows 10 x64   | 1x16     | AMD Ryzen 7 5800H 3.20GHz | Yes    | 56s        |

## 6. Perform a simple EDA in R
rubric={correctness:15,reasoning:10}

1. Pick an approach to transfer the dataframe from python to R.
- Parquet file
- Feather file
- Pandas exchange
- Arrow exchange
2. Discuss why you chose this approach over others.