In [4]:
import io
import os
import json
import glob
import requests
import numpy as np
import pandas as pd
from urllib.request import urlretrieve
from memory_profiler import memory_usage
import zipfile

## Step1: Download the data from figshare to local computer

In [2]:
%pwd

'C:\\Users\\yxiong\\DSCI_525_group21\\notebooks'

In [3]:
%cd /Users/yxiong/DSCI_525_group21/notebooks/

C:\Users\yxiong\DSCI_525_group21\notebooks


In [4]:
# Necessary metadata
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "figshare/"

In [5]:
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  # this contains all the articles data, feel free to check it out
files = data["files"]             # this is just the data about the files, which is what we want
files
# the file we target for is the 4th file with .zip. 

[{'is_link_only': False,
  'name': 'daily_rainfall_2014.png',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'id': 26579150,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'size': 58863},
 {'is_link_only': False,
  'name': 'environment.yml',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'id': 26579171,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'size': 192},
 {'is_link_only': False,
  'name': 'README.md',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'id': 26586554,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'size': 5422},
 {'is_link_only': False,
  'name': 'data.zip',
  'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'id': 26766812,
  'download_url': 'https://

In [6]:
%%time
files_to_dl = ["data.zip"]  # feel free to add other files here
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

Wall time: 1min 22s


In [7]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

Wall time: 3min 6s


## Step 2: Combining CSVs

In [8]:
### just listing to get an idea how individual file looks like 
use_cols = ["time",'lat_min','lat_max','lon_min', 'lon_max', 'rain (mm/day)']
df = pd.read_csv("./figshare/ACCESS-CM2_daily_rainfall_NSW.csv", usecols=use_cols)
df

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,1889-01-01 12:00:00,-36.25,-35.00,140.625,142.50,3.293256e-13
1,1889-01-02 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
2,1889-01-03 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
3,1889-01-04 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
4,1889-01-05 12:00:00,-36.25,-35.00,140.625,142.50,1.047658e-02
...,...,...,...,...,...,...
1932835,2014-12-27 12:00:00,-30.00,-28.75,151.875,153.75,2.951144e-02
1932836,2014-12-28 12:00:00,-30.00,-28.75,151.875,153.75,2.257118e-01
1932837,2014-12-29 12:00:00,-30.00,-28.75,151.875,153.75,1.204670e-01
1932838,2014-12-30 12:00:00,-30.00,-28.75,151.875,153.75,2.632404e-02


In [5]:
%load_ext memory_profiler

In [30]:
%%time
%memit
# Shows time that regular python takes to merge file
# Join all data together
import pandas as pd
use_cols = ["time",'lat_min','lat_max','lon_min', 'lon_max', 'rain (mm/day)']

files = glob.glob('./figshare/*.csv')

df_all = None

for file in files:
    
    filename = os.path.basename(file)
    
    if '_daily_rainfall_NSW.csv' in filename:
        print(f"Processing the file {filename}")
        model = filename.split('_daily_rainfall_NSW.csv')[0]

        df = pd.read_csv(file, usecols=use_cols, index_col=0)
        df['model'] = model    

        if df_all is None:
            df_all = df
        else:
            df_all = df_all.append(df)

peak memory: 3499.92 MiB, increment: 0.04 MiB
Processing the file ACCESS-CM2_daily_rainfall_NSW.csv
Processing the file ACCESS-ESM1-5_daily_rainfall_NSW.csv
Processing the file AWI-ESM-1-1-LR_daily_rainfall_NSW.csv
Processing the file BCC-CSM2-MR_daily_rainfall_NSW.csv
Processing the file BCC-ESM1_daily_rainfall_NSW.csv
Processing the file CanESM5_daily_rainfall_NSW.csv
Processing the file CMCC-CM2-HR4_daily_rainfall_NSW.csv
Processing the file CMCC-CM2-SR5_daily_rainfall_NSW.csv
Processing the file CMCC-ESM2_daily_rainfall_NSW.csv
Processing the file EC-Earth3-Veg-LR_daily_rainfall_NSW.csv
Processing the file FGOALS-f3-L_daily_rainfall_NSW.csv
Processing the file FGOALS-g3_daily_rainfall_NSW.csv
Processing the file GFDL-CM4_daily_rainfall_NSW.csv
Processing the file GFDL-ESM4_daily_rainfall_NSW.csv
Processing the file INM-CM4-8_daily_rainfall_NSW.csv
Processing the file INM-CM5-0_daily_rainfall_NSW.csv
Processing the file KIOST-ESM_daily_rainfall_NSW.csv
Processing the file MIROC6_dai

In [31]:
df_all.to_csv('./figshare/combined_data.csv')

In [32]:
%%sh
du -sh figshare/combined_data.csv

5.7G	figshare/combined_data.csv


#### Observation: 
#### Computer: IntelI5, 8GB Ram. Wall time 31 min. peak memory: 3499.92 MiB, increment: 0.04 MiB

### Step 3: Load combined CSV in memory and perform simple EDA

### Loading data using Dask

In [3]:
import dask.dataframe as dd
import pandas as pd

In [4]:
%%time
%%memit
use_cols = ["time",'lat_min','lat_max','lon_min', 'lon_max', 'rain (mm/day)']

files = glob.glob('./figshare/*.csv')

ddf_all = None

for file in files:
    
    filename = os.path.basename(file)
    
    if '_daily_rainfall_NSW.csv' in filename:
        print(f"Processing the file {filename}")
        model = filename.split('_daily_rainfall_NSW.csv')[0]

        ddf = dd.read_csv(file, assume_missing=True, usecols=use_cols)
        ddf['model'] = model    

        if ddf_all is None:
            ddf_all = ddf
        else:
            ddf_all = ddf_all.append(ddf)

Processing the file ACCESS-CM2_daily_rainfall_NSW.csv
Processing the file ACCESS-ESM1-5_daily_rainfall_NSW.csv
Processing the file AWI-ESM-1-1-LR_daily_rainfall_NSW.csv
Processing the file BCC-CSM2-MR_daily_rainfall_NSW.csv
Processing the file BCC-ESM1_daily_rainfall_NSW.csv
Processing the file CanESM5_daily_rainfall_NSW.csv
Processing the file CMCC-CM2-HR4_daily_rainfall_NSW.csv
Processing the file CMCC-CM2-SR5_daily_rainfall_NSW.csv
Processing the file CMCC-ESM2_daily_rainfall_NSW.csv
Processing the file EC-Earth3-Veg-LR_daily_rainfall_NSW.csv
Processing the file FGOALS-f3-L_daily_rainfall_NSW.csv
Processing the file FGOALS-g3_daily_rainfall_NSW.csv
Processing the file GFDL-CM4_daily_rainfall_NSW.csv
Processing the file GFDL-ESM4_daily_rainfall_NSW.csv
Processing the file INM-CM4-8_daily_rainfall_NSW.csv
Processing the file INM-CM5-0_daily_rainfall_NSW.csv
Processing the file KIOST-ESM_daily_rainfall_NSW.csv
Processing the file MIROC6_daily_rainfall_NSW.csv
Processing the file MPI-ES

#### Observation: 
#### Computer: IntelI5, 8GB Ram. Using Dask, now the Wall time 9s. peak memory: 249.95 MiB, increment: 7.42 MiB. This is way way faster than using Pandas to load the data. 

### Some basic EDA analysis

In [12]:
ddf["model"].value_counts()

Dask Series Structure:
npartitions=1
    int64
      ...
Name: model, dtype: int64
Dask Name: value-counts-agg, 25 tasks

In [13]:
ddf.dtypes

time              object
lat_min          float64
lat_max          float64
lon_min          float64
lon_max          float64
rain (mm/day)    float64
model             object
dtype: object

In [15]:
ddf.head(10)

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.811518,-34.86911,140.625,141.875,5.727971000000001e-17,TaiESM1
1,1889-01-02 12:00:00,-35.811518,-34.86911,140.625,141.875,-4.460195e-18,TaiESM1
2,1889-01-03 12:00:00,-35.811518,-34.86911,140.625,141.875,0.0,TaiESM1
3,1889-01-04 12:00:00,-35.811518,-34.86911,140.625,141.875,0.0,TaiESM1
4,1889-01-05 12:00:00,-35.811518,-34.86911,140.625,141.875,0.02592095,TaiESM1
5,1889-01-06 12:00:00,-35.811518,-34.86911,140.625,141.875,4.662255,TaiESM1
6,1889-01-07 12:00:00,-35.811518,-34.86911,140.625,141.875,3.083404e-08,TaiESM1
7,1889-01-08 12:00:00,-35.811518,-34.86911,140.625,141.875,0.0001055728,TaiESM1
8,1889-01-09 12:00:00,-35.811518,-34.86911,140.625,141.875,0.0001616503,TaiESM1
9,1889-01-10 12:00:00,-35.811518,-34.86911,140.625,141.875,0.0001176145,TaiESM1


### Step 4: Perform EDA in R