# DSCI 525 - Web and Cloud Computing
## Project: Daily Rainfall Over NSW, Australia
## Milestone 1: Tackling Big Data on Your Laptop 
### Authors: Group 24 Huanhuan Li, Nash Makhija and Nicholas Wu

In [1]:
import glob
import json
import os
import re
import shutil
import zipfile
from urllib.request import urlretrieve

import dask.dataframe as dd
import pandas as pd
import requests
from memory_profiler import memory_usage

In [2]:
## install the packages https://arrow.apache.org/docs/python/install.html
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.parquet as pq
## How to install put instructions https://anaconda.org/conda-forge/rpy2
import rpy2.rinterface
# install this https://pypi.org/project/rpy2-arrow/#description  pip install rpy2-arrow
# have to install this as well conda install -c conda-forge r-arrow 
import rpy2_arrow.pyarrow_rarrow as pyra
### instruction
import pyarrow.feather as feather

In [3]:
# Enable the %%R cell magic (rpy2) used by the R cells in this notebook
%load_ext rpy2.ipython
# Enable the %memit / %%memit memory-measurement magics
%load_ext memory_profiler



In [4]:
# Code for this notebook was adapted from DSCI 525 course notes

## 1) Downloading the data

In [5]:
# Necessary metadata
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "../data/"

In [6]:
# Query the figshare API for the article's metadata.
# A timeout (seconds) keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() fails fast on an HTTP error instead of
# letting an error payload surface later as a confusing KeyError on "files".
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()
data = response.json()  # this contains all the article's data, feel free to check it out
files = data["files"]   # this is just the data about the files, which is what we want
files

[{'is_link_only': False,
  'name': 'daily_rainfall_2014.png',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'id': 26579150,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'size': 58863},
 {'is_link_only': False,
  'name': 'environment.yml',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'id': 26579171,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'size': 192},
 {'is_link_only': False,
  'name': 'README.md',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'id': 26586554,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'size': 5422},
 {'is_link_only': False,
  'name': 'data.zip',
  'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'id': 26766812,
  'download_url': 'https://

In [7]:
%%time
files_to_dl = ["data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

Wall time: 1min 56s


In [8]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

Wall time: 17.1 s


## 2) Combine data CSVs

CSVs were combined using `Pandas`.

In [9]:
# Load one model's CSV to get an idea of what an individual file looks like.
# `use_cols` is reused when all the CSVs are combined.
use_cols = [
    "time",
    "lat_min",
    "lat_max",
    "lon_min",
    "lon_max",
    "rain (mm/day)",
]
df = pd.read_csv(
    "../data/ACCESS-CM2_daily_rainfall_NSW.csv",
    usecols=use_cols,
)
df

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,1889-01-01 12:00:00,-36.25,-35.00,140.625,142.50,3.293256e-13
1,1889-01-02 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
2,1889-01-03 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
3,1889-01-04 12:00:00,-36.25,-35.00,140.625,142.50,0.000000e+00
4,1889-01-05 12:00:00,-36.25,-35.00,140.625,142.50,1.047658e-02
...,...,...,...,...,...,...
1932835,2014-12-27 12:00:00,-30.00,-28.75,151.875,153.75,2.951144e-02
1932836,2014-12-28 12:00:00,-30.00,-28.75,151.875,153.75,2.257118e-01
1932837,2014-12-29 12:00:00,-30.00,-28.75,151.875,153.75,1.204670e-01
1932838,2014-12-30 12:00:00,-30.00,-28.75,151.875,153.75,2.632404e-02


In [None]:
%%time
%memit
# Shows time that regular python takes to merge file
# Join all data together
## here we are using a normal python way of merging the data 
files = glob.glob('../data/*NSW.csv')
df = pd.concat((pd.read_csv(file, index_col=0, usecols=use_cols)
                .assign(model=file[8:file.index("_daily")])
                for file in files)
              )
df.to_csv("../data/combined_data.csv")

peak memory: 425.69 MiB, increment: 0.09 MiB


In [None]:
%%time

# Time how long pandas takes to read the combined CSV back into memory
df_pandas = pd.read_csv("../data/combined_data.csv")

In [None]:
# Preview the first rows of the combined data
df_pandas.head()

### 2. Summary of Observation on Run Times and Memory Usage Comparison on Different Machines

#### Team member comparison:

Huanhuan: Total time taken to concatenate and create combined_data.csv was 6min 9s, with peak memory usage of 293 MiB. Time taken to read combined_data.csv into pandas was 1min 6s.

Nash: Total time taken to concatenate and create combined_data.csv was 6min 7s, with peak memory usage of 359.25 MiB. Time taken to read combined_data.csv into pandas was 1min 15s. I initially ran into storage issues because my hard drive was nearly full, and had to free up space before I could successfully create combined_data.csv.

Nicholas: Total time taken to concatenate and create combined_data.csv was 4min 44s with peak memory usage of 397 MiB. Time taken to read combined_data.csv into pandas was 50.5s.

## 3) Load the Combined CSV to Memory and Perform a Simple EDA


### Approach 1. Load the Entire Data to Memory

In [None]:
%%time
%%memit
#simple pandas - This is how we do normally ,which means we are loading the entire data to the memory
df_pandas = pd.read_csv("../data/combined_data.csv")
print(df_pandas["model"].value_counts())


### Approach 2. Changing `dtype` of the data 

In [None]:
# Check each column's dtype (the next cell compares float64 vs float32 memory usage)
df_pandas.dtypes

In [None]:
# Compare the memory footprint of the numeric columns as stored (float64)
# against the same columns downcast to float32.
float_cols = ["lat_min", "lat_max", "lon_min", "lon_max", "rain (mm/day)"]
mb_float64 = df_pandas[float_cols].memory_usage().sum() / 1e6
mb_float32 = df_pandas[float_cols].astype("float32", errors="ignore").memory_usage().sum() / 1e6
print(f"Memory usage with float64: {mb_float64:.2f} MB")
print(f"Memory usage with float32: {mb_float32:.2f} MB")

### Approach 3. Loading in chunks

In [None]:
%%time
%%memit
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("../data/combined_data.csv", chunksize=10_000_000):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)
print(counts.astype(int))

### Approach 4. Load using DASK

In [None]:
%%time
%%memit
# dask way

df_dask = dd.read_csv("../data/combined_data.csv")
print(df_dask["model"].value_counts().compute())

### 3. Discussion on Observations

- Loading the entire data to memory takes the longest in wall time. 
- Changing the columns with the float64 data type to float32 reduced their memory usage significantly, from 2,498 MB to 1,249 MB.
- Loading in chunks reduced total CPU and sys time as well as wall time. 
- Loading with DASK reduced wall time by almost half, but we also noticed that the CPU and sys time became greater than the wall time, which is likely because DASK processes the data in parallel.

## 4) Perform a Simple EDA in R

### 1. Store the data in different formats

In [None]:
%%R
# Load the R packages used by the R cells below; this also confirms that
# they are installed in the R environment rpy2 is using.
library("arrow")
library("dplyr")

In [None]:
%%time
%%memit
# Open the combined CSV as a pyarrow dataset
# (dataset docs: https://arrow.apache.org/docs/python/dataset.html)
dataset = ds.dataset("../data/combined_data.csv", format="csv")
# Read the dataset into an Arrow table; the cells below write it out as feather and parquet
table = dataset.to_table()

#### feather format

In [None]:
%%time
# Write the Arrow table to disk in feather format
feather.write_feather(table, '../data/example.feather')

#### parquet format

In [None]:
%%time
# Write the Arrow table to disk as a single parquet file
pq.write_table(table, '../data/example.parquet')

In [None]:
%%time
## writing as a partitioned parquet 
pq.write_to_dataset(table, '../data/example_partitioned.parquet',partition_cols=['model'])

In [None]:
%%sh
# Report the on-disk size of each storage format
for path in \
    ../data/combined_data.csv \
    ../data/example.feather \
    ../data/example.parquet \
    ../data/example_partitioned.parquet
do
    du -sh "$path"
done

### 2. Experimenting different approaches

#### Approach 1. Pandas Exchange

In [None]:
%%time
%%memit
# Plain pandas: read the entire dataset into memory
df = pd.read_csv("../data/combined_data.csv")

In [None]:
## The pandas exchange below is commented out because it runs out of memory on this machine.
#%%time
#%%R -i df
### Transferring the python dataframe to R
#start_time <- Sys.time()
#library(dplyr)
#print(class(df))
#result <- df %>% count(model)
#print(result)
#end_time <- Sys.time()
#print(end_time - start_time)

#### Approach 2. Arrow Exchange

In [None]:
%%time
%%memit
# Open the combined CSV as a pyarrow dataset
dataset = ds.dataset("../data/combined_data.csv", format="csv")
# Read the dataset into an Arrow table; it is converted for R in the next cell
table = dataset.to_table()

In [None]:
%%time
%%memit
# Convert the pyarrow table loaded in the previous cell with rpy2-arrow so it
# can be handed to the %%R cell below via `-i r_table`
# (presumably this yields the R arrow counterpart of the table; confirm against the rpy2-arrow docs)
r_table = pyra.converter.py2rpy(table)

In [None]:
%%time
%%R -i r_table
# Count the rows per model from the Arrow table handed over from Python,
# and report how long that takes on the R side.
start_time <- Sys.time()
print(class(r_table))
library(dplyr)
# Arrow supports only some dplyr operations directly (see
# https://arrow.apache.org/docs/r/articles/dataset.html), so collect() is used
# to pull the table into R before counting. Collect once and reuse the result:
# collecting a second time just to print the class would roughly double the
# conversion cost inside the timed region.
r_df <- r_table %>% collect()
result <- r_df %>% count(model)
print(class(r_df))
rm(r_df)  # release the collected copy; only `result` is needed from here on
end_time <- Sys.time()
print(result)
print(end_time - start_time)

#### Approach 3. Feather File

In [None]:
%%time
%%R
# Time how long R needs to read the feather file written from Python
# and count the rows per model.
library(arrow)
t0 <- Sys.time()
r_table <- arrow::read_feather("../data/example.feather")
print(class(r_table))
library(dplyr)
result <- count(r_table, model)
elapsed <- Sys.time() - t0
print(result)
print(elapsed)

#### Approach 4. Parquet File

In [None]:
%%time
%%R
# Time how long R needs to read the single parquet file written from Python
# and count the rows per model.
library(arrow)
t0 <- Sys.time()
r_table <- arrow::read_parquet("../data/example.parquet")
print(class(r_table))
library(dplyr)
result <- count(r_table, model)
elapsed <- Sys.time() - t0
print(result)
print(elapsed)

In [None]:
%%time
%%R
# Time how long R needs to read the partitioned parquet dataset written from
# Python and count the rows per model.
library(arrow)
start_time <- Sys.time()
# The partitioned output is a directory of per-model parquet files, not a
# single file, so it has to be opened with open_dataset(); read_parquet()
# only reads one file.
r_table <- arrow::open_dataset("../data/example_partitioned.parquet")
print(class(r_table))
library(dplyr)
# Select just the column being counted and collect() it into R first, since
# Arrow datasets support only some dplyr operations directly.
result <- r_table %>% select(model) %>% collect() %>% count(model)
end_time <- Sys.time()
print(result)
print(end_time - start_time)

### 3. Discussion on Observations
- Compared to the 5.7G CSV file, the feather file takes 1.1G, while the parquet file takes only 542M. Both the feather and parquet formats are more space-efficient than CSV.
- When exchanging data to R with Pandas, my computer ran out of memory and failed to exchange the data.
- Exchanging data to R with Arrow Exchange, 