In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import numpy as np
from memory_profiler import memory_usage
import dask.dataframe as dd
import matplotlib.pyplot as plt
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.parquet as pq
import rpy2.rinterface
import rpy2_arrow.pyarrow_rarrow as pyra
import pyarrow.feather as feather

In [2]:
# Enable the %%R cell magic (rpy2) used by the R cells in section 6
%load_ext rpy2.ipython
# Enable the %memit / %%memit memory-measurement magics used throughout
%load_ext memory_profiler



# 1. Teamwork contract

The teamwork contract for our team, **team 17**, can be found [**here**](https://docs.google.com/document/d/15_jlrMTtFVXrCJXXRBJv0j-UZxT8hCKHzhsw8ymYr8o/edit?usp=sharing).


 # 2. Create repository and project structure
 
 The repository URL: **https://github.com/UBC-MDS/525_group17**

# 3. Download the data
We will get the data using the figshare API.

First we need to ensure we are in the root directory and create the necessary directories for the raw and combined data.

In [4]:
# Show the current working directory before changing to the repository root
!pwd

'D:\\MDS_Block_6\\525_group17\\notebooks'

In [6]:
# Move up one level so that relative paths such as "data/" resolve from the repository root.
# NOTE: not idempotent - re-running this cell moves up one more directory each time.
%cd ..

D:\MDS_Block_6\525_group17


In [7]:
!rm -r data
!mkdir data
!rm -r combined_data
!mkdir combined_data
!ls

combined_data
data
img
LICENSE
notebooks
README.md


In [8]:
# Necessary metadata for using figshare API
article_id = 14096681  # figshare article whose file listing is requested below
url = f"https://api.figshare.com/v2/articles/{article_id}"  # article endpoint queried with GET
headers = {"Content-Type": "application/json"}
output_directory = "data/"  # download and extraction target, relative to the working directory

In [9]:
# Query the figshare API for the article metadata.
# raise_for_status() fails loudly on an HTTP error instead of letting an error
# page be parsed as JSON; the timeout stops the cell from hanging forever.
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()
data = response.json()   # this contains all the articles data, feel free to check it out
files = data["files"]    # this is just the data about the files, which is what we want

The file we want is `data.zip`. We will download it with `urllib.request.urlretrieve()` then extract it with `zipfile`.

In [10]:
%%time
files_to_dl = ["data.zip"] 
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

Wall time: 1min 16s


In [11]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

Wall time: 53.5 s


### Result Comparison
| | Team member| Operating System | CPU | RAM | Run-time|
|:---:|:----------:|:----------------:|:---:|:---:|:-------:|
|**Machine 1**|Lara Habashy|   MacOS      |   Intel Core i5  |  16GB   |  1min 37s      |
|**Machine 2**|Cameron Harris|    MacOS            |  Intel Core i7   |  16GB   |   1min 45s      |
|**Machine 3**|Trevor Kinsey|   Windows 10 Pro     |  Intel Core i7-1065G7   | 16GB  |  15min 42s   |
|**Machine 4**|Guanshu Tao|      Windows 10 Pro          |  10th Generation Intel Core i5-10210U   |   16GB     |    1min 11s       |

### Discussion 
- The data was downloaded fairly quickly, except for Trevor, whose internet connection is slow. 



# 4. Combining data CSVs
There are now many `.csv` files that we want to merge into a single file. 

All the files but one have the same columns:
- `time`
- `lat_min`
- `lat_max`
- `lon_min`
- `lon_max`
- `rain (mm/day)`

The file `observed_daily_rainfall_SYD.csv` only has:
- `time`
- `rain (mm/day)`

We want to keep all these columns and later fill in the missing columns of `observed_daily_rainfall_SYD.csv` by looking up the latitude and longitude information for this data, which seems to be from Sydney. For now the missing data will appear as NaNs in the combined dataframe.

### Using pandas to combine data
One of the files, `observed_daily_rainfall_SYD.csv` was missing 4 columns the others had, so we made them and filled them with NaNs. Also we made a column 'model' that says which file the data came from. 

It took us 7-10 minutes to combine and save the data as `combined_data.csv`, then about 1 minute to load the file back into a dataframe. This is a big dataframe, with over 62 million rows.


In [12]:
# Column layout shared by the model files. nrows=0 parses only the header row,
# so the whole CSV is not loaded just to learn its column names.
use_cols = pd.read_csv("data/ACCESS-CM2_daily_rainfall_NSW.csv", nrows=0).columns.to_list()
obs_df = pd.read_csv("data/observed_daily_rainfall_SYD.csv")
obs_df.columns.to_list()

['time', 'rain (mm/day)']

In [13]:
# Give the observed-rainfall file the same column layout as the model files.
# reindex() creates any column from `use_cols` that is missing and fills it with NaN,
# so no explicit insert() calls are needed. Unlike the insert() version (which passed
# allow_duplicates=True), this is idempotent: re-running after the file has already
# been rewritten does not create duplicate columns or raise in reindex().
obs_df = pd.read_csv("data/observed_daily_rainfall_SYD.csv").reindex(columns=use_cols)

# NOTE: this overwrites the downloaded file in place so the combine step can glob data/*.csv
obs_df.to_csv("data/observed_daily_rainfall_SYD.csv", index = False)

In [14]:
%%time
%memit
# combine data into a single giant `combined_data.csv` file
files = glob.glob('data/*.csv')
df = pd.concat((pd.read_csv(file)
                .assign(model=re.findall(r'(?<=data\\)[^\/]+(?=\_d)', file)[0])
                for file in files)
              )
df.to_csv("combined_data/combined_data.csv", index = False)

peak memory: 306.81 MiB, increment: 0.07 MiB
Wall time: 26min 26s


In [15]:
%%time
# Read the combined CSV back from disk to time the load.
# The file was written with index=False, so no index_col argument is needed.
df = pd.read_csv("combined_data/combined_data.csv")

Wall time: 4min 27s


In [16]:
%%time
# Preview the first rows of the combined dataframe
df.head()

Wall time: 15.7 ms


Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13,ACCESS-CM2
1,1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
2,1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
3,1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
4,1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,0.01047658,ACCESS-CM2


In [17]:
# (rows, columns) of the combined dataframe
df.shape

(62513863, 7)

## <br>

### Using DASK to combine data

DASK took slightly longer  to combine the data into one .csv file compared to  pandas. 

DASK is able to load the dataframe much faster using `dd.read_csv()`.

The `combined_data_dask.csv` file made from a DASK ddf takes up more space on disk than the `combined_data.csv` file made from a pandas df. 

In [16]:
%%time
%%memit

ddf = dd.read_csv("data/*.csv", 
                  include_path_column = True,
                  assume_missing = True)
ddf.to_csv("combined_data/combined_data_dask.csv", 
           single_file = True)

peak memory: 10700.45 MiB, increment: 5541.69 MiB
Wall time: 9min 45s


In [17]:
%%time
# Open the dask-written CSV as a dask dataframe (note the millisecond wall time below)
ddf = dd.read_csv("combined_data/combined_data_dask.csv")

Wall time: 35.5 ms


In [18]:
%%time
# Preview the first rows of the dask dataframe
ddf.head()

Wall time: 709 ms


Unnamed: 0.1,Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),path
0,0,1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13,C:/Users/Trevor_Kinsey/MDS/Block_6/DSCI_525/52...
1,1,1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.0,C:/Users/Trevor_Kinsey/MDS/Block_6/DSCI_525/52...
2,2,1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.0,C:/Users/Trevor_Kinsey/MDS/Block_6/DSCI_525/52...
3,3,1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.0,C:/Users/Trevor_Kinsey/MDS/Block_6/DSCI_525/52...
4,4,1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,0.01047658,C:/Users/Trevor_Kinsey/MDS/Block_6/DSCI_525/52...


In [19]:
%%sh
# Compare the on-disk size of the dask-written and pandas-written combined CSVs
du -sh combined_data/combined_data_dask.csv
du -sh combined_data/combined_data.csv

11G	combined_data/combined_data_dask.csv
5.7G	combined_data/combined_data.csv


### Result Comparison 
| | Team member| Operating System | CPU | RAM | Run-time (using pandas)|  Run-time (using DASK)| 
|:---:|:----------:|:----------------:|:---:|:---:|:-------:|:--------:|
|**Machine 1**|Lara Habashy|   MacOS      |   Intel Core i7  |  16GB   |  10min 10s       | 12min 31s |
|**Machine 2**|Cameron Harris|    MacOS            |  Intel Core i7   |  16GB   |    8min 50s     | - |
|**Machine 3**|Trevor Kinsey|   Windows 10 Pro     |  Intel Core i7-1065G7   | 16GB  | 6min 56s |10min 20s |
|**Machine 4**|Guanshu Tao|      Windows 10 Pro          |  10th Generation Intel Core i5-10210U   |   16GB     | 8min 50s         |58min 37s|

#### Discussion 
- This was a slow process. 
- A few of us had difficulty combining the data with DASK. Those who did use DASK didn't see any advantage over using pandas when it comes to combining the data.

- **Conclusion:** Pandas seems to be the better of these two options because it was faster and the file it created was smaller.
<br><br><br>

# 5. Load the combined CSV to memory and perform a simple EDA

The EDA will be to use `value_counts()` to count the number of data points that came from each .csv file, as recorded in the `model` column of `combined_data.csv`.

We will try and assess several methods to do this, then choose our favourite.

### 5.1 Pandas (no chunking)

We will take this method to be a baseline with which to compare other methods. Note that the entire dataframe requires 8GB of memory.

**Verdict:** It's annoyingly slow, but not impossible to open a large file. Memory use could be a problem.

In [20]:
%%time
%%memit
# Baseline: load every column of the combined CSV into a single pandas dataframe
df = pd.read_csv("combined_data/combined_data.csv")

peak memory: 9974.68 MiB, increment: 8330.80 MiB
Wall time: 1min 3s


In [21]:
%%time
%%memit
# Count the rows contributed by each model and print the counts as integers
counts = df["model"].value_counts()
print(counts.astype(int))

MPI-ESM1-2-HR       5154240
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
TaiESM1             3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
GFDL-CM4            3219300
FGOALS-f3-L         3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NESM3                966420
MPI-ESM1-2-LR        966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
observed              46020
Name: model, dtype: int32
peak memory: 6664.56 MiB, increment: 60.50 MiB
Wall time: 4.71 s


### 5.2 Pandas (with chunking)

This took slightly less time than pandas without chunking, but used much less memory. 

The effect of chunk size:
- smaller chunk size (1,000,000) takes about the same time as larger chunk size (10,000,000)
- smaller chunk size (1,000,000) uses less memory than larger chunk size (10,000,000)

We could not time the loading of the data and performing the EDA (`value_counts()`) separately because the data from each chunk had to be counted as it became available. However the time  to do the combined operations using chunking was almost the same as doing the same operations in sequence without chunking.

**Verdict:** Chunking decreases the memory needed but doesn't save much time.

In [22]:
%%time
%%memit
CHUNKSIZE = 1_000_000
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("combined_data/combined_data.csv",
                         chunksize=CHUNKSIZE):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)

# suppress output to avoid repeating lengthy printout    
# print(counts.astype(int))

peak memory: 6822.82 MiB, increment: 218.68 MiB
Wall time: 1min 8s


In [23]:
%%time
%%memit
CHUNKSIZE = 10_000_000
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("combined_data/combined_data.csv", chunksize=CHUNKSIZE):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)
print(counts.astype(int))

ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
AWI-ESM-1-1-LR       966420
BCC-CSM2-MR         3035340
BCC-ESM1             551880
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
CanESM5              551880
EC-Earth3-Veg-LR    3037320
FGOALS-f3-L         3219300
FGOALS-g3           1287720
GFDL-CM4            3219300
GFDL-ESM4           3219300
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
MIROC6              2070900
MPI-ESM-1-2-HAM      966420
MPI-ESM1-2-HR       5154240
MPI-ESM1-2-LR        966420
MRI-ESM2-0          3037320
NESM3                966420
NorESM2-LM           919800
NorESM2-MM          3541230
SAM0-UNICON         3541153
TaiESM1             3541230
observed              46020
dtype: int32
peak memory: 8377.54 MiB, increment: 1720.98 MiB
Wall time: 1min 2s


### 5.3 Pandas (loading only some columns, no chunking)
Since we only want the model for EDA, we will import just the `model` column. This is faster and uses less memory than loading the whole dataframe. 

Running `value_counts` takes the same time as it did using the entire data set, probably because it has to iterate through the same number of rows.

**Verdict:** This should be done whenever possible. It reduces memory required and speeds up loading data but has no effect on EDA time. 

In [24]:
%%time
%%memit
# Load only the `model` column, the single column the EDA needs
df = pd.read_csv("combined_data/combined_data.csv", 
                 usecols = ["model"])

peak memory: 6889.91 MiB, increment: 958.11 MiB
Wall time: 36.4 s


In [25]:
%%time
%%memit
# Same EDA on the single-column dataframe; the result is neither assigned nor printed,
# so this cell only reports time and memory
df["model"].value_counts()

peak memory: 1508.04 MiB, increment: 110.69 MiB
Wall time: 4.96 s


### 5.4 DASK dataframe

This method is much faster than using pandas (without chunking), and uses less memory. The memory use is similar to using pandas with chunking, depending on the chunk size used. The `value_counts()` step took longer than the baseline pandas method because of the `.compute()` step, which is required when we want to work with the data and view the result.

**Verdict:** Fast and light on memory. Similar to chunked pandas but easier to use.

In [26]:
%%time
%%memit
# Open the pandas-written combined CSV as a dask dataframe
ddf = dd.read_csv("combined_data/combined_data.csv")

peak memory: 1402.02 MiB, increment: 4.66 MiB
Wall time: 2.95 s


In [27]:
%%time
%%memit
# Do EDA: count rows per model; .compute() materialises the result so it can be printed
print(ddf["model"].value_counts().compute())

MPI-ESM1-2-HR       5154240
TaiESM1             3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
NorESM2-MM          3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-CM4            3219300
GFDL-ESM4           3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM-1-2-HAM      966420
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
NorESM2-LM           919800
CanESM5              551880
BCC-ESM1             551880
observed              46020
Name: model, dtype: int64
peak memory: 2973.74 MiB, increment: 1576.16 MiB
Wall time: 38.1 s


<br>

### 5.5 Changing dtype of the data

Five of the columns are in float64 format so they could be converted to float32 and reduce the memory required by 1.25GB. 

**Verdict:** Should be done regardless of the method used to store and import data. This is low-hanging fruit to save space and should be used whenever possible.


In [28]:
df = pd.read_csv("combined_data/combined_data.csv")

# Measure the footprint of the float columns as loaded and after casting to float32
float_cols = ['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']
bytes_float64 = df[float_cols].memory_usage().sum()
bytes_float32 = df[float_cols].astype('float32', errors='ignore').memory_usage().sum()
print(f"Memory usage with float64: {bytes_float64 / 1e6:.2f} MB")
print(f"Memory usage with float32: {bytes_float32 / 1e6:.2f} MB")

Memory usage with float64: 2500.55 MB
Memory usage with float32: 1250.28 MB


### Result Comparison
| | Team member| Operating System | CPU | RAM | Run-time (using pandas)|  Run-time (using DASK)| Chunking | 
|:---:|:----------:|:----------------:|:---:|:---:|:-------:|:--------:|:----:|
|**Machine 1**|Lara Habashy|   MacOS      |   Intel Core i5  |  16GB   |   1min 10s     | 48s  |2min 18s |
|**Machine 2**|Cameron Harris|   MacOS             |  Intel Core i7   |  16GB   |    1min 50s     | - | 2min 10s|
|**Machine 3**|Trevor Kinsey|   Windows 10 Pro     |  Intel Core i7-1065G7   | 16GB  | 1min 8s  |  41s   |  1min 6s  |
|**Machine 4**|Guanshu Tao|    Windows 10 Pro          |  10th Generation Intel Core i5-10210U   |   16GB     |  1min 23s       |45s|1min 27s|


#### Discussion 
- Changing dtype and selecting only the columns you want are both effective in reducing time to load data and the memory required
- Using DASK to open and do EDA was faster than using pandas, but has proved to be unreliable because not all of us could get it to work.
- **Conclusion:** Change the dtype of the data and select only the columns you want whenever possible. Use DASK if you can.

# 6. Perform a simple EDA in R
To do this we will pass data from python to R in various ways, assess each method, then decide which one is best suited.

In [29]:
%%R
# Load the R packages used by the R cells: dplyr for %>% / count(), arrow for reading feather and parquet
library(dplyr)
library(arrow)

R[write to console]: 
Attaching package: 'dplyr'


R[write to console]: The following objects are masked from 'package:stats':

    filter, lag


R[write to console]: The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




### 6.1 Using %%R -i

This didn't work because it seems our laptops didn't have enough memory, probably because we needed to have two large data frames (in python and R) open at the same time. This doesn't seem to be a good way to pass a large dataframe from python to R.

**Verdict:** NO THANKS!

In [30]:
# %%time
# %%R -i df

# start_time <- Sys.time()
# library(dplyr)
# counts <- df %>% count(model)
# end_time <- Sys.time()

# print(end_time - start_time)

### 6.2 Using arrow table and pyra

The arrow table is an intermediary form that can easily be moved between python and R. Once it is made, it can be converted using `pyra.converter.py2rpy()` to a table that is readable in R. This is a rather roundabout way to get data into R. It would be nicer to be able to load a file directly into R.

To create a table from file in python, convert the table, then pass to R took about the same time as loading the file and doing the EDA in python. This is impressive, since it took the same amount of time to do much more. 

**Verdict: Ok, but inconvenient.** This method works, but it is inconvenient to have to create a table in python, convert it, then pass that to R.

In [18]:
%%time
%%memit
# Read data file and prepare arrow table.
# `table` is reused later when writing the feather and parquet files.
dataset = ds.dataset("combined_data/combined_data.csv", format="csv")
table = dataset.to_table()

peak memory: 8649.15 MiB, increment: 3493.14 MiB
Wall time: 1min 41s


In [19]:
%%time
%%memit
# Convert the arrow table into an R-side arrow object so it can be passed to R with `%%R -i`
r_table = pyra.converter.py2rpy(table)

5756
rarrow.ChunkedArray: 0.11634445190429688
5756
rarrow.ChunkedArray: 0.06902265548706055
5756
rarrow.ChunkedArray: 0.07774066925048828
5756
rarrow.ChunkedArray: 0.07131099700927734
5756
rarrow.ChunkedArray: 0.09745645523071289
5756
rarrow.ChunkedArray: 0.1260051727294922
5756
rarrow.ChunkedArray: 0.12061429023742676
peak memory: 8644.70 MiB, increment: 72.16 MiB
Wall time: 1min 52s


In [34]:
%%time
%%R -i r_table
# `-i r_table` passes the converted table from python into the R session

# Time only the EDA itself: collect() the arrow table, then count rows per model
start_time <- Sys.time()
library(dplyr)
counts <- r_table %>% collect() %>% count(model)
end_time <- Sys.time()

print(counts)
print(end_time - start_time)

# A tibble: 28 x 2
   model                  n
   <chr>              <int>
 1 ACCESS-CM2       1932840
 2 ACCESS-ESM1-5    1610700
 3 AWI-ESM-1-1-LR    966420
 4 BCC-CSM2-MR      3035340
 5 BCC-ESM1          551880
 6 CanESM5           551880
 7 CMCC-CM2-HR4     3541230
 8 CMCC-CM2-SR5     3541230
 9 CMCC-ESM2        3541230
10 EC-Earth3-Veg-LR 3037320
# ... with 18 more rows
Time difference of 5.679571 secs
Wall time: 5.87 s


### 6.3 Using feather
If we have an arrow table it can be saved as a .feather file, which R can read directly. 

**Verdict:** This method was very fast for those who could get it to work.

In [20]:
%%time
# Create .feather file from the existing arrow table (`table`, built in section 6.2)
feather.write_feather(table, 'combined_data/combined_data.feather')

Wall time: 10.5 s


In [35]:
%%time
%%R
# Read .feather file into R, timing only the read itself
library(arrow)
start_time <- Sys.time()
r_table <- arrow::read_feather("combined_data/combined_data.feather")
end_time <- Sys.time()
print(end_time - start_time)

Time difference of 6.747203 secs
Wall time: 6.86 s


In [36]:
%%time
%%R
# Perform EDA in R on the table read from the .feather file: count rows per model
library(dplyr)
start_time <- Sys.time()
counts <- r_table %>% collect() %>% count(model)
end_time <- Sys.time()

print(counts)
print(end_time - start_time)

[38;5;246m# A tibble: 28 x 2[39m
   model                  n
 [38;5;250m*[39m [3m[38;5;246m<chr>[39m[23m              [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m ACCESS-CM2       1[4m9[24m[4m3[24m[4m2[24m840
[38;5;250m 2[39m ACCESS-ESM1-5    1[4m6[24m[4m1[24m[4m0[24m700
[38;5;250m 3[39m AWI-ESM-1-1-LR    [4m9[24m[4m6[24m[4m6[24m420
[38;5;250m 4[39m BCC-CSM2-MR      3[4m0[24m[4m3[24m[4m5[24m340
[38;5;250m 5[39m BCC-ESM1          [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 6[39m CanESM5           [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 7[39m CMCC-CM2-HR4     3[4m5[24m[4m4[24m[4m1[24m230
[38;5;250m 8[39m CMCC-CM2-SR5     3[4m5[24m[4m4[24m[4m1[24m230
[38;5;250m 9[39m CMCC-ESM2        3[4m5[24m[4m4[24m[4m1[24m230
[38;5;250m10[39m EC-Earth3-Veg-LR 3[4m0[24m[4m3[24m[4m7[24m320
[38;5;246m# ... with 18 more rows[39m
Time difference of 2.271981 secs
Wall time: 2.41 s


### 6.4 Using parquet

Our tests show that using arrow and parquet to perform the following steps:
- create an arrow table from `combined_data.csv` in python, 
- write the table to a parquet file, 
- read the parquet file into R, 
- do the EDA in R

is very fast. Faster than simply reading the original `combined_data.csv`. The upside of this method is that we have a .parquet file that can be accessed in the future without having to create the arrow table.

**Verdict: This is the best choice in terms of speed and file size.**

In [38]:
%%time

# Write the existing arrow table (`table`, built in section 6.2) to a parquet file
pq.write_table(table, 'combined_data/combined_data.parquet')

Wall time: 12.9 s


In [42]:
%%time
%%R
# Read .parquet file into R, timing only the read itself
library(arrow)
start_time <- Sys.time()
r_table <- arrow::read_parquet("combined_data/combined_data.parquet")
end_time <- Sys.time()
print(end_time - start_time)

Time difference of 12.57204 secs
Wall time: 12.7 s


In [43]:
%%time
%%R
# Perform EDA in R on the table read from the .parquet file: count rows per model
library(dplyr)
start_time <- Sys.time()
counts <- r_table %>% collect() %>% count(model)
end_time <- Sys.time()

print(counts)
print(end_time - start_time)

# A tibble: 28 x 2
   model                  n
   <chr>              <int>
 1 ACCESS-CM2       1932840
 2 ACCESS-ESM1-5    1610700
 3 AWI-ESM-1-1-LR    966420
 4 BCC-CSM2-MR      3035340
 5 BCC-ESM1          551880
 6 CanESM5           551880
 7 CMCC-CM2-HR4     3541230
 8 CMCC-CM2-SR5     3541230
 9 CMCC-ESM2        3541230
10 EC-Earth3-Veg-LR 3037320
# ... with 18 more rows
Time difference of 2.032011 secs
Wall time: 2.11 s


#### Compare filesize of various formats

In [41]:
%%sh

# On-disk size of the same combined data stored as CSV, feather and parquet
du -sh combined_data/combined_data.csv
du -sh combined_data/combined_data.feather
du -sh combined_data/combined_data.parquet

5.7G	combined_data/combined_data.csv
1.1G	combined_data/combined_data.feather
542M	combined_data/combined_data.parquet


### Result Comparison
| | Team member| Operating System | CPU | RAM | Run-time (arrow table + pyra)|  Run-time (feather)| Run-time (parquet) |
|:---:|:----------:|:----------------:|:---:|:---:|:-------:|:--------:|:----:|
|**Machine 1**|Lara Habashy|   MacOS      |  Intel Core i5   |  16GB   |    58s     | -  |  - |
|**Machine 2**|Cameron Harris|    MacOS            |  Intel Core i7   |  16GB   |    56s     | - | - |
|**Machine 3**|Trevor Kinsey|   Windows 10 Pro     |  Intel Core i7-1065G7   | 16GB  | 1min 5s |  -  | 1min 3 s    |
|**Machine 4**|Guanshu Tao|    Windows 10 Pro          |  10th Generation Intel Core i5-10210U   |   16GB     |   1min 18s       | 9.3s| 26s|


#### Discussion 
- The feather and parquet time include the time to create the arrow table. Once this is done, the time to open a file and perform EDA will be reduced.
- Not everyone could get the .feather and .parquet file to work, but these methods are very fast when they do work. The .feather works faster than .parquet. 
- **Conclusion:** Use a .feather or .parquet file because they are both very fast, but they don't work for every computer.