# Test for saving dataframe to disk
## Using the following formats:
* pickle
* HDF5 
* Feather - Note: must be installed manually with `pip install feather-format`
* Parquet 

In [None]:
#!pip install feather-format

In [None]:
import os
import pandas as pd
import numpy as np
import feather

# Load the NYC taxi dataset into `df`, keeping a local CSV copy so the large
# download only has to happen on the first run.
csv_path = "data/nyc_taxi.csv"

if os.path.exists(csv_path):
    print("Loading data to Data frame")
    # Read the same path that was just checked (previously a bare
    # 'nyc_taxi.csv' in the working directory was read instead).
    df = pd.read_csv(csv_path)
else:
    print("Downloading 1.6gb file data from internet... ")
    print("This process will take a few min..")
    df = pd.read_csv('http://s3.amazonaws.com/datashader-data/nyc_taxi.zip', compression='zip')
    print("saving data to csv")
    # Every output in this notebook goes under data/, so make sure it exists.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    # index=False keeps the row index out of the file; otherwise a later
    # reload would gain an extra "Unnamed: 0" column and differ from the
    # freshly downloaded frame.
    df.to_csv(csv_path, index=False)
    

# Write to file

In [None]:
%%timeit -n1 -r1
# Timed step: serialise the whole DataFrame with pickle.
pkl_path = "data/nyc_taxi.pkl"
# Optional clean start (disabled): os.remove(pkl_path)
df.to_pickle(path=pkl_path)


In [None]:
%%timeit -n1 -r1
# Timed step: store the DataFrame in an HDF5 file under the key 'nyc_taxi_org'.
hd5path = 'data/nyc_taxi.h5'
#os.remove(hd5path)
# The context manager closes the store even if the write raises, so a failed
# run does not leave the file handle open for later cells.
with pd.HDFStore(hd5path) as hd5store:
    hd5store['nyc_taxi_org'] = df


In [None]:
%%timeit -n1 -r1
# Timed step: write the DataFrame in Feather format via the feather package.
featherpath = "data/nyc_taxi.feather"
# Optional clean start (disabled): os.remove(featherpath)
feather.write_dataframe(df, featherpath)


In [None]:
%%timeit -n1 -r1
# Timed step: write the DataFrame as a Parquet file.
# NOTE(review): the '.parque' extension looks like a typo for '.parquet';
# the read cell uses the same path, so change both together if renaming.
parquetpath = "data/nyc_taxi.parque"
# Optional clean start (disabled): os.remove(parquetpath)
df.to_parquet(path=parquetpath)

# read from file


In [None]:
%%timeit -n1 -r10
# Timed step: load the pickled DataFrame back from disk.
# The path is redefined here rather than reused from the write cell.
pkl_path = "data/nyc_taxi.pkl"
df1 = pd.read_pickle(pkl_path)

In [None]:
%%timeit -n1 -r10
# Timed step: load the DataFrame stored under 'nyc_taxi_org' from the HDF5 file.
hd5path = 'data/nyc_taxi.h5'
# Open read-only so a missing file raises instead of being created empty, and
# use the context manager so the store is closed even if the key lookup fails.
with pd.HDFStore(hd5path, mode='r') as hd5store:
    df2 = hd5store['nyc_taxi_org']

In [None]:
%%timeit -n1 -r10
# Timed step: load the Feather file back into a DataFrame.
# The path is redefined here rather than reused from the write cell.
featherpath = "data/nyc_taxi.feather"
df3 = feather.read_dataframe(featherpath)

In [None]:
%%timeit -n1 -r10
# Timed step: load the Parquet file back into a DataFrame.
# NOTE(review): '.parque' matches the path used by the write cell; it looks
# like a typo for '.parquet', so rename both together if changing it.
parquetpath = "data/nyc_taxi.parque"
df4 = pd.read_parquet(parquetpath)