# Sample data

In [1]:
import pandas as pd
import numpy as np

def generate_sample_data(rows=5, cols=4,
                         start_date='2021-01-01 00:00:00',
                         freq='min',  # 1-minute frequency ('T' is a deprecated alias in recent pandas)
                         initial=100,
                         noise_std=0.05,
                         seed=None):
    """
    Generate a DataFrame of synthetic financial test data.

    Every cell is ``initial`` plus Gaussian noise scaled by ``noise_std``.

    Parameters:
      - rows: number of rows (timestamps).
      - cols: number of columns (e.g. assets).
      - start_date: first timestamp of the index.
      - freq: pandas frequency string for the timestamps (default 'min').
      - initial: base value every cell starts from (default 100).
      - noise_std: standard deviation of the Gaussian noise.
      - seed: optional seed. When given, a dedicated
        ``np.random.default_rng(seed)`` generator is used so the output is
        reproducible. When None (default), the global NumPy random state is
        used, exactly as ``np.random.randn`` would.

    Returns:
      - pandas DataFrame with a DatetimeIndex of ``rows`` entries and columns
        named 'No0' ... 'No{cols-1}' holding the simulated values.
    """
    # Build the timestamp index
    dates = pd.date_range(start=start_date, periods=rows, freq=freq)
    # Pick the noise source: the global NumPy state (legacy behaviour) or a
    # private, seeded generator when reproducibility is requested
    rng = np.random if seed is None else np.random.default_rng(seed)
    noise = rng.standard_normal((rows, cols))
    # Base level plus scaled Gaussian noise
    data = initial + noise * noise_std
    # Wrap in a DataFrame with generated column names
    df = pd.DataFrame(data, index=dates, columns=[f'No{i}' for i in range(cols)])
    return df

# In Jupyter, the %time line magic times a single statement (%%time is the cell-magic form).
# Builds 5,000,000 rows x 10 columns and rounds the values to 4 decimals.
%time data = generate_sample_data(rows=int(5e6), cols=10).round(4)


  dates = pd.date_range(start=start_date, periods=rows, freq=freq)


CPU times: user 1.41 s, sys: 213 ms, total: 1.62 s
Wall time: 1.68 s


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5000000 entries, 2021-01-01 00:00:00 to 2030-07-05 05:19:00
Freq: min
Data columns (total 10 columns):
 #   Column  Dtype  
---  ------  -----  
 0   No0     float64
 1   No1     float64
 2   No2     float64
 3   No3     float64
 4   No4     float64
 5   No5     float64
 6   No6     float64
 7   No7     float64
 8   No8     float64
 9   No9     float64
dtypes: float64(10)
memory usage: 419.6 MB


In [3]:
# Open the HDF5 store in write mode.
# NOTE(review): assumes the data/ directory already exists — confirm before running on a fresh checkout.
h5 = pd.HDFStore('data/data.h5', 'w')

In [4]:
# Store the whole DataFrame under the key 'data' and time the write.
%time h5['data'] = data

CPU times: user 10.2 ms, sys: 542 ms, total: 552 ms
Wall time: 560 ms


In [13]:
# Close the write handle before the file is reopened read-only in the next cell.
h5.close()

In [6]:
# Reopen the same file, this time read-only, to time reading the data back.
h5 = pd.HDFStore('data/data.h5', 'r')

In [7]:
%time data_copy = h5['data']

CPU times: user 167 ms, sys: 667 ms, total: 834 ms
Wall time: 833 ms


In [15]:
%time data.to_hdf('data/data.h5', 'data', format='table')



CPU times: user 2.14 s, sys: 386 ms, total: 2.52 s
Wall time: 2.53 s


In [17]:
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5000000 entries, 2021-01-01 00:00:00 to 2030-07-05 05:19:00
Freq: min
Data columns (total 10 columns):
 #   Column  Dtype  
---  ------  -----  
 0   No0     float64
 1   No1     float64
 2   No2     float64
 3   No3     float64
 4   No4     float64
 5   No5     float64
 6   No6     float64
 7   No7     float64
 8   No8     float64
 9   No9     float64
dtypes: float64(10)
memory usage: 419.6 MB


In [10]:
ls -n data/data.*

-rw-rw-rw- 1 1000 1000 440007240 Feb  2 03:45 data/data.h5


In [18]:
# Read the 'data' key back through the top-level pd.read_hdf helper (no explicit store handle) and time it.
%time data_copy = pd.read_hdf('data/data.h5', 'data')

CPU times: user 99.7 ms, sys: 386 ms, total: 486 ms
Wall time: 489 ms


# PyTables

In [19]:
import tables as tb

In [20]:
# Open the same file directly with PyTables, read-only.
# Note: this rebinds `h5`, which earlier referred to a pandas HDFStore, to a PyTables file object.
h5 = tb.open_file('data/data.h5', 'r')

In [21]:
h5

File(filename=data/data.h5, title=np.str_(''), mode='r', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) np.str_('')
/data (Group) np.str_('')
/data/table (Table(np.int64(5000000),)) np.str_('')
  description := {
  "index": Int64Col(shape=(), dflt=np.int64(0), pos=0),
  "values_block_0": Float64Col(shape=(np.int64(10),), dflt=np.float64(0.0), pos=1)}
  byteorder := 'little'
  chunkshape := (np.int64(2978),)
  autoindex := True
  colindexes := {
    "index": Index(6, mediumshuffle, zlib(1)).is_csi=False}

In [22]:
# Peek at the first three raw rows of the /data/table node. As the output shows, the
# index is stored as int64 (epoch nanoseconds, judging by the values) and the ten
# float columns are packed into one 'values_block_0' field.
h5.root.data.table[:3]

array([(1609459200000000000, [100.0323, 100.072 , 100.173 ,  99.9869,  99.9844,  99.9848,  99.9497, 100.0594,  99.9781, 100.0162]),
       (1609459260000000000, [ 99.9792,  99.9165, 100.0028, 100.0905,  99.9693,  99.9937,  99.9851, 100.0402,  99.9695,  99.9898]),
       (1609459320000000000, [100.0095,  99.9756,  99.9739,  99.9813,  99.9866,  99.9516, 100.0443, 100.0604,  99.9292, 100.0886])],
      dtype=[('index', '<i8'), ('values_block_0', '<f8', (10,))])

In [23]:
h5.close()

In [24]:
rm data/data.h5