In [2]:
from pathlib import Path
import pandas as pd

In [3]:
# Root of the local EDGAR data set and the sub-directory holding the 2018 Q1 notes.
data_path = Path('/drive/data/algo_trading/data/edgar')
notes_dir = data_path.joinpath('notes', '2018q1_notes')

In [6]:
# Directory that holds the filing index used as the benchmark payload.
filing_path = data_path.joinpath('filings', 'index')

In [7]:
# Accumulates one entry per storage format: read timings, write timings, file size.
results = dict()

## Get Data

In [8]:
# Load the filing index; this DataFrame is the payload written and read by
# every storage-format benchmark below.
df = pd.read_parquet(filing_path / 'filing_index.parquet')

## Parquet

### Size

In [9]:
# Scratch file (relative to the working directory) for the parquet benchmark.
parquet_file = Path('test.parquet')

In [10]:
# Write the frame once and record the resulting file size in bytes; the file
# is left in place so the read benchmark can use it.
df.to_parquet(parquet_file)
size = parquet_file.stat().st_size

### Read

In [11]:
%%timeit -o
# Time reading the parquet file back into a DataFrame; `-o` makes the magic
# return a TimeitResult that the next cell captures.
df = pd.read_parquet(parquet_file)

6.43 s ± 92.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 6.43 s ± 92.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [12]:
# `_` is IPython's most recent cell output, here the TimeitResult returned by
# the preceding `%%timeit -o` read benchmark.
read = _

In [13]:
# Remove the file used for the size/read measurements; the write benchmark
# creates and deletes its own copy on every loop.
parquet_file.unlink()

### Write

In [17]:
%%timeit -o
# Time writing the frame to parquet. The unlink is part of the timed body, so
# each loop writes a fresh file (and the deletion cost is included).
df.to_parquet(parquet_file)
parquet_file.unlink()

1min 7s ± 37.6 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 1min 7s ± 37.6 s per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [18]:
# Capture the TimeitResult returned by the preceding `%%timeit -o` write
# benchmark (IPython exposes the last cell output as `_`).
write = _

### Results

In [19]:
# Store the parquet measurements: per-run read and write timings plus file size.
results['parquet'] = {
    'read': read.all_runs,
    'write': write.all_runs,
    'size': size,
}

## HDF5

In [20]:
# Scratch HDF5 file (relative to the working directory) shared by all HDF5 benchmarks.
test_store = Path('index.h5')

### Fixed Format

#### Size

In [21]:
# Write the frame under key 'file' with `put` and record the file size in
# bytes; the store is left on disk for the read benchmark.
with pd.HDFStore(test_store) as store:
    store.put('file', df)
size = test_store.stat().st_size

#### Read

In [22]:
%%timeit -o
# Time opening the store and fetching the object stored under 'file'; the
# result is discarded, only the elapsed time matters.
with pd.HDFStore(test_store) as store:
    store.get('file')

11.3 s ± 89.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 11.3 s ± 89.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [23]:
# Capture the TimeitResult returned by the preceding `%%timeit -o` read
# benchmark (IPython exposes the last cell output as `_`).
read = _

In [24]:
# Remove the store used for the size/read measurements; the write benchmark
# creates and deletes its own file on every loop.
test_store.unlink()

#### Write

In [25]:
%%timeit -o
# Time writing the frame with `put`. The unlink is inside the timed body so
# every loop starts without an existing store.
with pd.HDFStore(test_store) as store:
    store.put('file', df)
test_store.unlink()

18 s ± 283 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 18 s ± 283 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [26]:
# Capture the TimeitResult returned by the preceding `%%timeit -o` write
# benchmark (IPython exposes the last cell output as `_`).
write = _

#### Results

In [27]:
# Store the HDF5 `put` measurements: per-run read and write timings plus file size.
results['hdf_fixed'] = {
    'read': read.all_runs,
    'write': write.all_runs,
    'size': size,
}

### Table Format

#### Size

In [78]:
# Write the frame with `append(..., format='t')` and record the file size in
# bytes; the store is left on disk for the read benchmark.
with pd.HDFStore(test_store) as store:
    store.append('file', df, format='t')
size = test_store.stat().st_size    

#### Read

In [None]:
%%timeit -o
# Time opening the store and reading back the whole object stored under 'file'.
with pd.HDFStore(test_store) as store:
    df = store.get('file')

In [None]:
# Capture the TimeitResult returned by the preceding `%%timeit -o` read
# benchmark (IPython exposes the last cell output as `_`).
read = _

In [None]:
# Remove the store used for the size/read measurements; the write benchmark
# creates and deletes its own file on every loop.
test_store.unlink()

#### Write

In [None]:
%%timeit -o
# Time writing the frame with `append(..., format='t')`. The unlink is inside
# the timed body so each loop appends into a brand-new store rather than
# growing an existing one.
with pd.HDFStore(test_store) as store:
    store.append('file', df, format='t')
test_store.unlink()    

In [None]:
# Capture the TimeitResult returned by the preceding `%%timeit -o` write
# benchmark (IPython exposes the last cell output as `_`).
write = _

#### Results

In [None]:
# Store the HDF5 table-format measurements: per-run read and write timings plus file size.
results['hdf_table'] = {
    'read': read.all_runs,
    'write': write.all_runs,
    'size': size,
}

### Table Select

#### Size

In [28]:
# Write the frame in table format, declaring 'company' and 'form' as data
# columns so they can be used in `select` queries, then record the file size.
with pd.HDFStore(test_store) as store:
    store.append('file', df, format='t', data_columns=['company', 'form'])
size = test_store.stat().st_size 

#### Read

In [29]:
# Value referenced by name in the `select` query string of the read benchmark.
company = 'APPLE INC'

In [30]:
%%timeit
with pd.HDFStore(test_store) as store:
    s = store.select('file', 'company = company')

29.9 ms ± 243 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
# `_` is IPython's most recent cell output. NOTE(review): this only captures
# the select timing if the preceding `%%timeit` cell was run with `-o` so that
# it returns a TimeitResult; otherwise `_` refers to an older output.
read = _

In [32]:
# Remove the store used for the size/read measurements; the write benchmark
# creates and deletes its own file on every loop.
test_store.unlink()

#### Write

In [33]:
%%timeit
with pd.HDFStore(test_store) as store:
    store.append('file', df, format='t', data_columns=['company', 'form'])
test_store.unlink() 

3min 20s ± 7.1 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [34]:
# `_` is IPython's most recent cell output. NOTE(review): this only captures
# the select write timing if the preceding `%%timeit` cell was run with `-o`
# so that it returns a TimeitResult; otherwise `_` refers to an older output.
write = _

#### Results

In [35]:
# Store the measurements for the table with queryable data columns.
# NOTE(review): `read` and `write` hold this section's timings only if the two
# timing cells above returned a TimeitResult (i.e. ran with `%%timeit -o`).
results['hdf_select'] = {
    'read': read.all_runs,
    'write': write.all_runs,
    'size': size,
}

## CSV

In [36]:
# Scratch file (relative to the working directory) for the CSV benchmark.
test_csv = Path('test.csv')

### Size

In [37]:
# Write the frame as CSV and record its on-disk size in bytes. The value must
# be bound to `size` (not merely displayed) because the CSV results cell
# stores `size`; otherwise that entry would reuse the size left over from the
# HDF5 section.
df.to_csv(test_csv)
size = test_csv.stat().st_size
size  # keep showing the size as the cell output

1711018429

### Read

In [38]:
%%timeit -o
# Time parsing the CSV file back into a DataFrame; `-o` makes the magic return
# a TimeitResult that the next cell captures.
df = pd.read_csv(test_csv)

17.9 s ± 60.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 17.9 s ± 60.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [39]:
# Capture the TimeitResult returned by the preceding `%%timeit -o` read
# benchmark (IPython exposes the last cell output as `_`).
read = _

In [40]:
# Remove the file used for the size/read measurements; the write benchmark
# creates and deletes its own copy on every loop.
test_csv.unlink()  

### Write

In [43]:
%%timeit -o
# Time writing the frame as CSV. The unlink is part of the timed body, so each
# loop writes a fresh file (and the deletion cost is included).
df.to_csv(test_csv)
test_csv.unlink()

1min 33s ± 672 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 1min 33s ± 672 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [44]:
# Capture the TimeitResult returned by the preceding `%%timeit -o` write
# benchmark (IPython exposes the last cell output as `_`).
write = _

### Results

In [45]:
# Store the CSV measurements: per-run read and write timings plus file size.
# NOTE(review): `size` is whatever was last assigned to that name — confirm the
# CSV size cell binds it, otherwise this records a previous format's size.
results['csv'] = {
    'read': read.all_runs,
    'write': write.all_runs,
    'size': size,
}

## Store Results

In [46]:
# Persist every format's measurements as '<format>.csv' in the working
# directory, one row per timing run.
for fmt_name, measurements in results.items():
    out_name = '{}.csv'.format(fmt_name)
    pd.DataFrame(measurements).to_csv(out_name)