# DSCI 525 - Web and Cloud Computing
## Milestone 1: Tackling big data on your laptop

### Group #4
### Members: Heidi Ye, Junting He, Kamal MoravejJahromi, Tanmay Sharma

### GitHub Repo: **https://github.com/UBC-MDS/group4-525**

In [21]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
from memory_profiler import memory_usage
import dask.dataframe as dd
import pyarrow.feather as feather
import pyarrow.dataset as ds

In [2]:
# Load the IPython extensions that provide the %%R cell magic (rpy2) and the
# %memit / %%memit memory-profiling magics used further down.
%load_ext rpy2.ipython
%load_ext memory_profiler

## 1. Downloading the data

In [3]:
# Necessary metadata
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"  # figshare API endpoint queried for the article's metadata
headers = {"Content-Type": "application/json"}  # headers sent with the metadata request
output_directory = "figshareairline/"  # local folder that receives the download and the files derived from it

In [4]:
# Fetch the article metadata from the figshare API.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() turns an HTTP error into an explicit exception instead
# of a confusing KeyError on data["files"] below.
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()
data = response.json()  # this contains all the articles data, feel free to check it out
files = data["files"]   # this is just the data about the files, which is what we want
files

[{'is_link_only': False,
  'name': 'daily_rainfall_2014.png',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'id': 26579150,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'size': 58863},
 {'is_link_only': False,
  'name': 'environment.yml',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'id': 26579171,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'size': 192},
 {'is_link_only': False,
  'name': 'README.md',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'id': 26586554,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'size': 5422},
 {'is_link_only': False,
  'name': 'data.zip',
  'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'id': 26766812,
  'download_url': 'https://

## 2. Unzipping Data

In [5]:
%%time
files_to_dl = ["data.zip"]  # feel free to add other files here
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 4.53 s, sys: 4.34 s, total: 8.87 s
Wall time: 5min 59s


In [6]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: user 19.6 s, sys: 3.94 s, total: 23.5 s
Wall time: 24.7 s


## 3. Combining data CSVs

In [7]:
# Load a single model's CSV to inspect the column layout before combining all files.
df = pd.read_csv("./figshareairline/ACCESS-CM2_daily_rainfall_NSW.csv")
df

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,1889-01-01 12:00:00,-36.25,-35.00,140.625,142.50,3.293256e-13
1,1889-01-02 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
2,1889-01-03 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
3,1889-01-04 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
4,1889-01-05 12:00:00,-36.25,-35.00,140.625,142.50,1.047658e-02
...,...,...,...,...,...,...
1932835,2014-12-27 12:00:00,-30.00,-28.75,151.875,153.75,2.951144e-02
1932836,2014-12-28 12:00:00,-30.00,-28.75,151.875,153.75,2.257118e-01
1932837,2014-12-29 12:00:00,-30.00,-28.75,151.875,153.75,1.204670e-01
1932838,2014-12-30 12:00:00,-30.00,-28.75,151.875,153.75,2.632404e-02


In [8]:
%%time
%memit
# Shows time that regular python takes to merge file
# Join all data together
## here we are using a normal python way of merging the data 

files = glob.glob('figshareairline/*.csv')
df = pd.concat((pd.read_csv(file, index_col=0)
                .assign(model=re.findall(r'/([^_]*)', file)[0])
                for file in files)
              )
df.to_csv("figshareairline/combined_data.csv")

peak memory: 398.13 MiB, increment: 0.06 MiB
CPU times: user 14min 45s, sys: 1min 25s, total: 16min 11s
Wall time: 16min 56s


In [9]:
# Read the combined CSV back from disk to inspect the result.
# NOTE(review): this full frame stays in kernel memory and presumably raises the
# baseline of the later %%memit measurements — consider `del df_combined` once inspected.
df_combined = pd.read_csv("./figshareairline/combined_data.csv")

In [10]:
# Display the combined frame (pandas truncates the view to the first and last rows).
df_combined

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM
...,...,...,...,...,...,...,...
125027721,2014-12-27 12:00:00,-30.157068,-29.214660,153.1250,154.3750,6.689683e+00,SAM0-UNICON
125027722,2014-12-28 12:00:00,-30.157068,-29.214660,153.1250,154.3750,7.862555e+00,SAM0-UNICON
125027723,2014-12-29 12:00:00,-30.157068,-29.214660,153.1250,154.3750,1.000503e+01,SAM0-UNICON
125027724,2014-12-30 12:00:00,-30.157068,-29.214660,153.1250,154.3750,8.541592e+00,SAM0-UNICON


In [11]:
# List the distinct values of the "model" column that was added while combining the files.
df_combined["model"].unique()

array(['MPI-ESM-1-2-HAM', 'AWI-ESM-1-1-LR', 'NorESM2-LM', 'ACCESS-CM2',
       'FGOALS-f3-L', 'CMCC-CM2-HR4', 'MRI-ESM2-0', 'GFDL-CM4',
       'BCC-CSM2-MR', 'EC-Earth3-Veg-LR', 'CMCC-ESM2', 'NESM3',
       'MPI-ESM1-2-LR', 'ACCESS-ESM1-5', 'FGOALS-g3', 'INM-CM4-8',
       'MPI-ESM1-2-HR', 'TaiESM1', 'NorESM2-MM', 'CMCC-CM2-SR5',
       'combined', 'observed', 'KIOST-ESM', 'INM-CM5-0', 'MIROC6',
       'BCC-ESM1', 'GFDL-ESM4', 'CanESM5', 'SAM0-UNICON'], dtype=object)

In [12]:
%%sh
# Report the on-disk size of the combined CSV in human-readable units.
du -sh figshareairline/combined_data.csv

 11G	figshareairline/combined_data.csv


## 4. Load the combined CSV to memory and perform a simple EDA

In [13]:
%%time
%%memit
#simple pandas - This is how we do normally ,which means we are loading the entire data to the memory
df = pd.read_csv("figshareairline/combined_data.csv")
print(df["model"].value_counts())

combined            62513863
MPI-ESM1-2-HR        5154240
CMCC-CM2-SR5         3541230
CMCC-CM2-HR4         3541230
NorESM2-MM           3541230
TaiESM1              3541230
CMCC-ESM2            3541230
SAM0-UNICON          3541153
FGOALS-f3-L          3219300
GFDL-ESM4            3219300
GFDL-CM4             3219300
MRI-ESM2-0           3037320
EC-Earth3-Veg-LR     3037320
BCC-CSM2-MR          3035340
MIROC6               2070900
ACCESS-CM2           1932840
ACCESS-ESM1-5        1610700
INM-CM5-0            1609650
INM-CM4-8            1609650
FGOALS-g3            1287720
KIOST-ESM            1287720
AWI-ESM-1-1-LR        966420
MPI-ESM1-2-LR         966420
MPI-ESM-1-2-HAM       966420
NESM3                 966420
NorESM2-LM            919800
BCC-ESM1              551880
CanESM5               551880
observed               46020
Name: model, dtype: int64
peak memory: 6057.54 MiB, increment: 1579.50 MiB
CPU times: user 1min 57s, sys: 53.1 s, total: 2min 50s
Wall time: 3min 23s


In [14]:
# First rows of the combined data as loaded by pandas.
df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


In [15]:
# Check the dtype pandas inferred for each column; the float64 columns are the
# candidates for the downcast to float32 examined in the next section.
df.dtypes

time              object
lat_min          float64
lat_max          float64
lon_min          float64
lon_max          float64
rain (mm/day)    float64
model             object
dtype: object

### 4.1. Investigate approaches to reduce memory usage while performing the EDA 

### 4.1.1. Changing dtype of the data

In [16]:
# Compare the footprint of three float columns stored as float64 versus float32.
# Only the column names and the resulting sizes are kept, so no extra copy of
# the data outlives this cell.
float_cols = ['lat_min','lat_max','rain (mm/day)']
mb_float64 = df[float_cols].memory_usage().sum() / 1e6
mb_float32 = df[float_cols].astype('float32', errors='ignore').memory_usage().sum() / 1e6
print(f"Memory usage with float64: {mb_float64:.2f} MB")
print(f"Memory usage with float32: {mb_float32:.2f} MB")

Memory usage with float64: 3000.67 MB
Memory usage with float32: 1500.33 MB


### 4.1.2. Loading data in chunks using Pandas

In [17]:
%%time
%%memit
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("figshareairline/combined_data.csv", chunksize=10_000_000):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)
print(counts.astype(int))

ACCESS-CM2           1932840
ACCESS-ESM1-5        1610700
AWI-ESM-1-1-LR        966420
BCC-CSM2-MR          3035340
BCC-ESM1              551880
CMCC-CM2-HR4         3541230
CMCC-CM2-SR5         3541230
CMCC-ESM2            3541230
CanESM5               551880
EC-Earth3-Veg-LR     3037320
FGOALS-f3-L          3219300
FGOALS-g3            1287720
GFDL-CM4             3219300
GFDL-ESM4            3219300
INM-CM4-8            1609650
INM-CM5-0            1609650
KIOST-ESM            1287720
MIROC6               2070900
MPI-ESM-1-2-HAM       966420
MPI-ESM1-2-HR        5154240
MPI-ESM1-2-LR         966420
MRI-ESM2-0           3037320
NESM3                 966420
NorESM2-LM            919800
NorESM2-MM           3541230
SAM0-UNICON          3541153
TaiESM1              3541230
combined            62513863
observed               46020
dtype: int64
peak memory: 6671.34 MiB, increment: 2168.95 MiB
CPU times: user 1min 56s, sys: 18.1 s, total: 2min 14s
Wall time: 2min 19s


### 4.1.3. Loading data using Dask

In [18]:
%%time
%%memit
# dask way
# Here again I am using dask to read that csv file. Remember internally its loading chunks and doing it parallely.
# here see cpu time greater than wall time 
ddf = dd.read_csv('figshareairline/combined_data.csv')
print(ddf["model"].value_counts().compute())

combined            62513863
MPI-ESM1-2-HR        5154240
CMCC-CM2-HR4         3541230
TaiESM1              3541230
NorESM2-MM           3541230
CMCC-ESM2            3541230
CMCC-CM2-SR5         3541230
SAM0-UNICON          3541153
GFDL-ESM4            3219300
GFDL-CM4             3219300
FGOALS-f3-L          3219300
MRI-ESM2-0           3037320
EC-Earth3-Veg-LR     3037320
BCC-CSM2-MR          3035340
MIROC6               2070900
ACCESS-CM2           1932840
ACCESS-ESM1-5        1610700
INM-CM4-8            1609650
INM-CM5-0            1609650
FGOALS-g3            1287720
KIOST-ESM            1287720
MPI-ESM-1-2-HAM       966420
MPI-ESM1-2-LR         966420
NESM3                 966420
AWI-ESM-1-1-LR        966420
NorESM2-LM            919800
CanESM5               551880
BCC-ESM1              551880
observed               46020
Name: model, dtype: int64
peak memory: 2902.85 MiB, increment: 1236.12 MiB
CPU times: user 2min 35s, sys: 41.4 s, total: 3min 17s
Wall time: 1min 32s


### 4.2. Discuss your observations.

- For the three columns compared, memory usage with float32 (1500.33 MB) was half of the memory usage with float64 (3000.67 MB).
- Loading data in chunks with Pandas reduced sys time and thus overall wall time.
- We notice that the wall time went down with dask and CPU time was higher than wall time, suggesting CPU was performing operations in parallel. 

## 5. Perform a simple EDA in R

In [31]:
%%time
%%memit
## read more on the datasets here  https://arrow.apache.org/docs/python/dataset.html
dataset = ds.dataset("figshareairline/combined_data.csv", format="csv")
## this is of arrow table format
table = dataset.to_table()

peak memory: 3667.79 MiB, increment: 3489.57 MiB
CPU times: user 43.4 s, sys: 46.7 s, total: 1min 30s
Wall time: 1min 23s


In [32]:
%%time
# experiment in writing in feather format 
feather.write_feather(table, 'figshareairline/combined_data.feather')

CPU times: user 9.67 s, sys: 23.9 s, total: 33.6 s
Wall time: 20.8 s


In [33]:
%%time
%%memit
# NOTE(review): this cell repeats the CSV-to-Arrow load of the cell two above
# verbatim; it looks like a leftover re-run and could be removed to avoid
# parsing the CSV a second time.
## read more on the datasets here  https://arrow.apache.org/docs/python/dataset.html
dataset = ds.dataset("figshareairline/combined_data.csv", format="csv")
## this is of arrow table format
table = dataset.to_table()

peak memory: 8104.93 MiB, increment: 189.67 MiB
CPU times: user 44.7 s, sys: 1min 16s, total: 2min
Wall time: 1min 53s


In [34]:
%%time
# NOTE(review): repeats the feather write of the cell two above verbatim and
# overwrites the same output file; likely a leftover re-run.
# experiment in writing in feather format 
feather.write_feather(table, 'figshareairline/combined_data.feather')

CPU times: user 10.1 s, sys: 18.7 s, total: 28.8 s
Wall time: 14.7 s


In [37]:
%%time
%%R
### Time how long R takes to read the feather file written from Python and to count the rows per model
library(arrow)
start_time <- Sys.time()
r_table <- arrow::read_feather("figshareairline/combined_data.feather")
print(class(r_table))
library(dplyr)
result <- dplyr::count(r_table, model)
end_time <- Sys.time()
print(result)
elapsed <- end_time - start_time
print(elapsed)

[1] "tbl_df"     "tbl"        "data.frame"
[90m# A tibble: 29 x 2[39m
   model                 n
 [90m*[39m [3m[90m<chr>[39m[23m             [3m[90m<int>[39m[23m
[90m 1[39m ACCESS-CM2      1[4m9[24m[4m3[24m[4m2[24m840
[90m 2[39m ACCESS-ESM1-5   1[4m6[24m[4m1[24m[4m0[24m700
[90m 3[39m AWI-ESM-1-1-LR   [4m9[24m[4m6[24m[4m6[24m420
[90m 4[39m BCC-CSM2-MR     3[4m0[24m[4m3[24m[4m5[24m340
[90m 5[39m BCC-ESM1         [4m5[24m[4m5[24m[4m1[24m880
[90m 6[39m CanESM5          [4m5[24m[4m5[24m[4m1[24m880
[90m 7[39m CMCC-CM2-HR4    3[4m5[24m[4m4[24m[4m1[24m230
[90m 8[39m CMCC-CM2-SR5    3[4m5[24m[4m4[24m[4m1[24m230
[90m 9[39m CMCC-ESM2       3[4m5[24m[4m4[24m[4m1[24m230
[90m10[39m combined       62[4m5[24m[4m1[24m[4m3[24m863
[90m# … with 19 more rows[39m
Time difference of 58.48578 secs
CPU times: user 23.3 s, sys: 1min 3s, total: 1min 26s
Wall time: 59.2 s


In [65]:
%%R

library(tidyverse)

# Give the rainfall column a syntactic name. any_of() turns the rename into a
# no-op when the column has already been renamed, so the cell can be re-run;
# a plain rename() fails on the second run because `rain (mm/day)` no longer exists.
r_table <- r_table %>% rename(any_of(c(rain_mmperday = "rain (mm/day)")))

# Median of each coordinate column and of the rainfall column, computed over
# the rows that contain no missing values.
summary_table <- r_table %>%
    drop_na() %>%
    summarise(median_lat_min = median(lat_min),
              median_lat_max = median(lat_max),
              median_lon_min = median(lon_min),
              median_lon_max = median(lon_max),
              median_rain = median(rain_mmperday))

summary_table


  median_lat_min median_lat_max median_lon_min median_lon_max median_rain
1            -33      -32.04188        146.875        148.125  0.06154947


### 5.1 Discuss why you chose this approach over others

Feather was selected over Parquet, Pandas Exchange and Arrow Exchange for its comparatively high I/O speed, small footprint on disk and the fact that unpacking isn't necessary for the data to be loaded back into RAM. Additionally, Feather is relatively easy to use and is a suitable choice since the intent is not long-term storage.