In [None]:
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
from pat2vec.util.elasticsearch_methods import ingest_data_to_elasticsearch


In [None]:
# Two relative directories that are made importable for this notebook.
# NOTE(review): the `pat2vec` import in the first cell executes before this
# cell, so on a fresh kernel that import only succeeds if the package is
# already importable by other means — consider running this cell first.
pat2vec_path = '../../'
pat2vec_path2 = '../../pat2vec/'

# Specify the path you want to add (kept as a separate name in case later
# cells read `new_path`).
new_path = pat2vec_path

# Add each path to sys.path only if it is missing. Checking the entries
# independently avoids two problems of a single shared guard: skipping the
# second path when only the first is already present, and appending a
# duplicate of the second path when only the first is missing.
for candidate_path in (new_path, pat2vec_path2):
    if candidate_path not in sys.path:
        sys.path.append(candidate_path)

# Verify that the paths have been added
print(sys.path)

In [None]:

# Example DataFrame creation.
# The random "value" column is drawn from a locally seeded generator so the
# notebook produces the same frame on every Restart & Run All, without
# touching NumPy's global random state.
data_in = pd.DataFrame({
    "id": range(1, 101),  # 100 unique IDs
    "name": [f"Name_{i}" for i in range(1, 101)],
    "value": np.random.default_rng(42).random(100),  # Reproducible floats in [0, 1)
    "timestamp": pd.date_range(start="2023-01-01", periods=100, freq="D")  # Dates
})

In [None]:
# Target index name, normalised to lower case once and reused by every
# ingestion call below so the two calls cannot drift apart.
index_name = "test_index_example".lower()

# First ingestion with replace_index=True. Only the empty, columns-only frame
# `data_in.head(0)` is sent here.
# NOTE(review): presumably this resets the index before rows are appended —
# confirm against ingest_data_to_elasticsearch.
ingest_data_to_elasticsearch(temp_df=data_in.head(0), index_name=index_name, index_mapping=None, replace_index=True)

# Define a safe maximum number of cells per chunk
max_cells = 100000  # Maximum cells that the system can safely handle

# Number of rows and columns
num_rows, num_columns = data_in.shape

# Calculate the largest chunk size (in rows) that stays within max_cells.
# Integer division avoids a float round-trip; the inner max() guards against
# a frame with zero columns and the outer max() ensures at least 1 row per chunk.
chunk_size = max(1, max_cells // max(1, num_columns))

# Calculate the total number of chunks (integer ceiling division; 0 for an
# empty frame)
n_chunks = -(-num_rows // chunk_size)

# Print results
print(f"Data dimensions: {data_in.shape[0]} rows, {data_in.shape[1]} columns")
print(f"Maximum safe cells per chunk: {max_cells}")
print(f"Calculated chunk size: {chunk_size} rows per chunk")
print(f"Total number of chunks: {n_chunks}")

# Splitting DataFrame into consecutive row slices of at most chunk_size rows.
# Plain iloc slicing is used instead of np.array_split because array_split
# raises ValueError when asked for 0 sections (empty frame) and can emit a
# deprecation FutureWarning when given a DataFrame on recent pandas versions.
chunks = [
    data_in.iloc[start:start + chunk_size]
    for start in range(0, num_rows, chunk_size)
]

# Ingesting each chunk with replace_index=False
for chunk in tqdm(chunks):
    ingest_data_to_elasticsearch(temp_df=chunk, index_name=index_name, index_mapping=None, replace_index=False)