In [155]:
import pyarrow.parquet as pq

In [156]:
import numpy as np
import pandas as pd
import pyarrow as pa

In [157]:
df = pd.read_csv("../../data/prices.csv", index_col=0)
df.head(2)

Unnamed: 0,symbol,date,open,high,low,close,volume,adjusted
1,FLWS,1999-08-03,21.75,22.25,17.875,18.1875,9543400.0,18.1875
2,FLWS,1999-08-04,18.9375,20.5,16.875,17.0,4731800.0,17.0


In [158]:
current = df.loc[df.loc[:, 'symbol'] == 'IBM', ['date', 'open', 'high', 'low', 'close', 'volume', 'adjusted']].reset_index(drop=True)
future = current.iloc[5404:, :]
future

Unnamed: 0,date,open,high,low,close,volume,adjusted
5404,2001-05-21,112.7629,114.6272,112.3805,113.8050,8796232.0,66.19723
5405,2001-05-22,114.4359,114.4359,112.0937,112.8203,6281544.0,65.62444
5406,2001-05-23,112.6195,113.7189,111.9503,112.2371,6697120.0,65.28524
5407,2001-05-24,112.3327,114.3403,112.3327,114.3403,8081814.0,66.50865
5408,2001-05-25,114.2639,114.6272,112.5813,112.6195,5888039.0,65.50768
...,...,...,...,...,...,...,...
10805,2022-11-07,136.6400,138.7000,136.5100,138.3400,4043100.0,136.71000
10806,2022-11-08,139.0000,140.9300,138.7200,140.0400,5042800.0,138.39000
10807,2022-11-09,137.9500,138.9000,136.9400,137.3900,4720000.0,137.39000
10808,2022-11-10,140.2600,141.3700,138.2900,141.2300,5389000.0,141.23000


In [159]:
current = current.iloc[0:5404, :]
current

Unnamed: 0,date,open,high,low,close,volume,adjusted
0,1980-01-02,15.05736,15.41587,14.93786,14.93786,1723808.0,4.514468
1,1980-01-03,14.93786,15.17686,14.63910,15.17686,2553495.0,4.586700
2,1980-01-04,15.17686,15.23662,15.05736,15.14699,1978195.0,4.577672
3,1980-01-07,15.14699,15.14699,14.96773,15.08724,1480718.0,4.559614
4,1980-01-08,15.08724,16.13289,14.96773,16.10301,2968130.0,4.866599
...,...,...,...,...,...,...,...
5399,2001-05-14,106.30980,108.20270,106.11850,107.60990,5496521.0,62.593720
5400,2001-05-15,108.50860,109.13000,107.55260,108.58510,6143576.0,63.160940
5401,2001-05-16,107.55260,110.70750,107.26580,110.70750,8155348.0,64.395480
5402,2001-05-17,110.70750,111.94070,109.99040,110.00960,7204534.0,63.989570


In [160]:
future.iloc[0, :]

date        2001-05-21
open          112.7629
high          114.6272
low           112.3805
close          113.805
volume       8796232.0
adjusted      66.19723
Name: 5404, dtype: object

In [161]:
for i in range(future.index[0], future.index[-1]):
	current = current.append(future.loc[i, :])
	#table = pa.Table.from_pandas(current).replace_schema_metadata(None)
	#pq.write_table(table, 'example.parquet')
	# 96900 milliseconds / 5406 updates = 18 milliseconds each
	# it seems these tests indicate we can resave the entire table every time for most datasets/datastreams
	current.to_csv('example.csv')
 	# 361100 milliseconds / 5406 updates = 67 milliseconds each
	# it seems these tests indicate we can resave the entire table every time for most datasets/datastreams

In [165]:
df.to_csv('largeExample.csv')
# 1000000 * (60+44.2) = 8 microseconds per row

In [166]:
table = pa.Table.from_pandas(df).replace_schema_metadata(None)
pq.write_table(table, 'largeExample.parquet')
# 1000000*8.6 / 12995445 = .66 microseconds per row

In [167]:
# though to the to_csv might be ok for small datasets, it is not for large ones,
# it doesn't scale well. it's 3.72 times slower on smaller dataset, 
# and on a large dataset, it's 12.12 times slower.
# it's also larger on disk:
# 754 kb vs 521 kb, and 856,351 kb vs 369142 kb
# the only plus sides being: it's easy to understand, 
# and could handle multi columned dataframes

# of course the fastest solution is to save recent incrementals individually
# and then merge them all together with the larger history on occasion.
# there is, however, only one way to do this: to hard code the incrementals to
# to be merged every x number of observations, starting at the beginning of the
# dataset since everyone must merge on the same observaiton. That would cause a
# global slow down and frequency.

# or... if we don't want all ipfs addresses to match, which technically they 
# don't have to, we could keep with the current design, efficient, yet complex
# as it is, and allow users to download from any ipfs address they want (meaning
# if the originator of the stream is not available for some reason, the user can
# just download it from anyone who has reported an ipfs address for that stream)

# that's probably sufficient. I wanted the ipfs address to match for all users
# using the dataset, so that, like torrents the downloads could be faster. but
# that is probably less necessary than efficiency while running. and datasets
# that have a rare occurance could avoid having incrementals anyway and 
# therefore all match...

# we'll stick with our current design then, with the introduction of the author,
# and the default assumption that we don't need incrementals.

Unnamed: 0,symbol,date,open,high,low,close,volume,adjusted
1,FLWS,1999-08-03,21.7500,22.2500,17.875,18.1875,9543400.0,18.1875
2,FLWS,1999-08-04,18.9375,20.5000,16.875,17.0000,4731800.0,17.0000
3,FLWS,1999-08-05,17.0625,17.1250,13.500,16.0000,2851900.0,16.0000
4,FLWS,1999-08-06,16.0000,17.7500,16.000,17.0000,1411800.0,17.0000
5,FLWS,1999-08-09,17.9375,17.9375,15.875,16.0000,583000.0,16.0000
...,...,...,...,...,...,...,...,...
12995441,ZUMZ,2022-11-07,22.4100,22.8800,22.020,22.6700,172900.0,22.6700
12995442,ZUMZ,2022-11-08,22.8700,23.3600,22.220,22.4700,311800.0,22.4700
12995443,ZUMZ,2022-11-09,22.3600,22.4300,20.640,20.7800,239800.0,20.7800
12995444,ZUMZ,2022-11-10,21.8300,23.6900,21.780,23.5100,327600.0,23.5100


In [162]:
current

Unnamed: 0,date,open,high,low,close,volume,adjusted
0,1980-01-02,15.05736,15.41587,14.93786,14.93786,1723808.0,4.514468
1,1980-01-03,14.93786,15.17686,14.63910,15.17686,2553495.0,4.586700
2,1980-01-04,15.17686,15.23662,15.05736,15.14699,1978195.0,4.577672
3,1980-01-07,15.14699,15.14699,14.96773,15.08724,1480718.0,4.559614
4,1980-01-08,15.08724,16.13289,14.96773,16.10301,2968130.0,4.866599
...,...,...,...,...,...,...,...
10804,2022-11-04,135.65000,137.73000,134.94000,136.96000,4176600.0,135.346300
10805,2022-11-07,136.64000,138.70000,136.51000,138.34000,4043100.0,136.710000
10806,2022-11-08,139.00000,140.93000,138.72000,140.04000,5042800.0,138.390000
10807,2022-11-09,137.95000,138.90000,136.94000,137.39000,4720000.0,137.390000
