# Load parquet files to APM

In [None]:
from modules.util.database import SQLAlchemyClient

# ------------------------------------------------------------------------------ #
# Configuration
# ------------------------------------------------------------------------------ #

# Identifier of the configuration to use; the later cells pass it to the logger,
# the config lookup, the eIOT API client and the database client.
CONFIG_ID = "CUSTOM_TEST"

# Set up the database client for this configuration.
# NOTE(review): table_create_all() presumably creates the tables the later cells
# read and write (e.g. the upload status table) -- confirm in modules.util.database.

db = SQLAlchemyClient(CONFIG_ID)
db.table_create_all()

# Combine parquet files

First we'll read and combine all files within the same folder of the same indicator group.
The new dataset will be stored in a subfolder named as `ready` within your folder where
the transformed time-series files are stored.
Within this combining we will also make sure that each file for the same indicator group
has not more than 1.000.000 rows as this is a limitation by file upload api from eIOT.

In [None]:
import os
from modules.util.helpers import Logger
from modules.util.config import get_config_by_id
import pandas as pd

log = Logger.get_logger(CONFIG_ID)
config = get_config_by_id(CONFIG_ID)

# TO-DO: move to configuration and create folders if not exist  (use pathlib)
TRANSFORMED_FOLDER = config["transform"]["time-series"]["directory"]
READY_FOLDER = os.path.join(TRANSFORMED_FOLDER, "ready")

# Columns every input file must provide; they are also the key used to
# detect duplicate rows after combining.
key_columns = ["managedObjectId", "measuringNodeId", "_time"]

# Maximum number of rows written to a single combined file.
max_lines_per_file = 1000000

# make sure the ready folder exists
os.makedirs(READY_FOLDER, exist_ok=True)
ready_folder_abs = os.path.abspath(READY_FOLDER)

# we need to iterate over all folders which are within the transformed folder
for root, dirs, files in os.walk(TRANSFORMED_FOLDER):
    # READY_FOLDER lives inside TRANSFORMED_FOLDER. Prune it in place so that
    # os.walk does not descend into it; otherwise the chunks written below
    # (in this run or an earlier one) would be read back and combined again.
    dirs[:] = [
        d for d in dirs if os.path.abspath(os.path.join(root, d)) != ready_folder_abs
    ]

    # log the current folder
    log.info(f"Processing folder: {root} with {len(files)} files")

    # Collect the DataFrames of this folder that contain all key columns
    dataframes = []

    for f in files:
        file_path = os.path.join(root, f)
        df = pd.read_parquet(file_path)

        # Ensure the key columns exist in the DataFrame
        if all(col in df.columns for col in key_columns):
            dataframes.append(df)
        else:
            log.error(f"Skipping {f}: Missing key columns.")

    if not dataframes:
        log.info("No valid Parquet files found.")
        continue

    # Stack all DataFrames of this folder and keep one row per key
    combined_df = pd.concat(dataframes, ignore_index=True)
    combined_df = combined_df.drop_duplicates(subset=key_columns)

    # Stepping through the rows in strides of max_lines_per_file yields exactly
    # ceil(rows / max_lines_per_file) chunks, so no empty trailing chunk is
    # written when the row count is an exact multiple of the limit (or zero).
    for i, start_idx in enumerate(range(0, len(combined_df), max_lines_per_file)):
        chunk_df = combined_df.iloc[start_idx : start_idx + max_lines_per_file]

        # Save each chunk to a new parquet file named after the source folder
        chunk_file_path = os.path.join(
            READY_FOLDER,
            f"{os.path.basename(root)}_combined_chunk_{i}.parquet",
        )
        chunk_df.to_parquet(chunk_file_path, index=False)
        log.debug(f"Chunk {i} saved to {chunk_file_path}.")

# Upload files to eIOT

The next step is to upload the data to eIOT. As the API has some limitations, we need also to
check that the file size of each file does not reach the limit of 50 MB. Therefore it might be
that we split the files even further (besides the 1 million lines limit).

Afterwards the files can be uploaded to eIOT file interface. As a result we'll get a file id
back which can be used to check the processing status of the file. We're storing the fild id
therefore in the database table T_EIOT_UPLOAD_STATUS.

In [None]:
import os
from modules.apm.eiot import EIoTApi
from modules.util.helpers import Logger
from modules.util.database import SQLAlchemyClient, EIotUploadStatus, EIotUploadStatusValues
from modules.util.config import get_config_by_id
# import pyarrow as pa
import pyarrow.parquet as pq

log = Logger.get_logger(CONFIG_ID)
config = get_config_by_id(CONFIG_ID)
api_eiot = EIoTApi(CONFIG_ID)
db = SQLAlchemyClient(CONFIG_ID)

UPLOADED_FOLDER = config["load"]["time-series"]["directory"]
TRANSFORMED_FOLDER = config["transform"]["time-series"]["directory"]
READY_FOLDER = os.path.join(TRANSFORMED_FOLDER, "ready")

# Size threshold above which a file is split before uploading.
# NOTE(review): the API limit described for this step is 50 MB; 30 MB presumably
# leaves headroom because the per-chunk size below is only an estimate -- confirm.
max_file_size = 30 * 1024 * 1024  # 30 MB in bytes
# Row threshold above which a file is split before uploading.
max_lines = 1000000

# make sure the folder exists
os.makedirs(UPLOADED_FOLDER, exist_ok=True)


def _is_already_uploaded(file_name):
    """Return True if the status table has a non-failed entry for file_name."""
    file_status = db.select_one(
        model=EIotUploadStatus, where=[EIotUploadStatus.fileName == file_name]
    )
    return bool(file_status) and file_status["status"] != EIotUploadStatusValues.FAILURE


def _upload_and_archive(path, label):
    """Upload one parquet file, record the upload and move it to UPLOADED_FOLDER.

    label is only used in the log message (e.g. "file x.parquet").
    """
    file_response = api_eiot.upload_file(parquet_file_and_path=path)
    log.info(f"Uploaded {label} with ID {file_response['fileId']}")
    db.insert_one(
        EIotUploadStatus(
            fileName=file_response["fileName"],
            fileId=file_response["fileId"],
            status=EIotUploadStatusValues.UPLOADED,
            statusDescription="upload_initiated",
            statusTimestamp=file_response["uploadedTime"],
        )
    )
    # move the file after uploading to the uploaded folder
    os.rename(path, os.path.join(UPLOADED_FOLDER, os.path.basename(path)))


# in the previous step we have saved the combined dataframes to parquet files
# we now need to upload these files to the EIOT platform
for root, _dirs, files in os.walk(READY_FOLDER):
    for f in files:
        # only parquet files are uploaded
        if not f.endswith(".parquet"):
            continue

        file_path = os.path.join(root, f)
        # A chunk file left behind by an interrupted run may already have been
        # re-created, uploaded and moved while its source file was processed.
        if not os.path.exists(file_path):
            continue

        file_size = os.path.getsize(file_path)
        # Only the footer metadata is needed for the row count; the table itself
        # is read further down, and only when the file has to be split.
        lines = pq.read_metadata(file_path).num_rows

        if file_size <= max_file_size and lines <= max_lines:
            # the file is within both limits: upload it as is, unless it is
            # already uploaded
            if _is_already_uploaded(f):
                log.info(f"File {f} is already uploaded, skipping.")
                continue
            _upload_and_archive(file_path, f"file {f}")
            continue

        log.info(
            f"File {f} is larger than {max_file_size // (1024 * 1024)} MB "
            "or has more than 1 million lines, splitting into chunks."
        )
        table = pq.read_table(file_path)

        # Estimate how many rows fit into max_file_size from the average on-disk
        # bytes per row, and never exceed the row limit. This is an estimate
        # only: the size of a parquet file is not strictly proportional to its
        # number of rows.
        rows_per_chunk = min(max_lines, max(1, lines * max_file_size // file_size))

        # Build chunk names from the file stem so that only the extension of the
        # file itself is touched (str.replace would also rewrite ".parquet"
        # occurring elsewhere in the path).
        stem, ext = os.path.splitext(file_path)

        for chunk_number, start in enumerate(range(0, table.num_rows, rows_per_chunk)):
            chunk_file_path = f"{stem}_chunk_{chunk_number}{ext}"

            # Same guard as for unsplit files, so that re-running after a
            # partial failure does not upload a chunk twice.
            if _is_already_uploaded(os.path.basename(chunk_file_path)):
                log.info(f"Chunk {chunk_number} of file {f} is already uploaded, skipping.")
                continue

            pq.write_table(table.slice(start, rows_per_chunk), chunk_file_path)
            _upload_and_archive(chunk_file_path, f"chunk {chunk_number} of file {f}")

        # All chunks are handled: move the source file out of the ready folder as
        # well, otherwise every re-run would split and process it again.
        os.rename(file_path, os.path.join(UPLOADED_FOLDER, f))

# check file upload status

The step starts by loading all outstanding file uploads from the internal database. This means all process
which are not finished and not failed. Next we check for each entry the current status of processing
and update also the databse status if it has changed.

After each iteration we'll wait for some time before checking the status again.

This step is stopped by the user or when all uploads are done or failed.



In [None]:
import time
from modules.apm.eiot import EIoTApi
from modules.util.helpers import Logger
from modules.util.database import (
    SQLAlchemyClient,
    EIotUploadStatus,
    EIotUploadStatusValues,
)
from modules.util.config import get_config_by_id

log = Logger.get_logger(CONFIG_ID)
config = get_config_by_id(CONFIG_ID)
api_eiot = EIoTApi(CONFIG_ID)
db = SQLAlchemyClient(CONFIG_ID)

# Status texts handled from api_eiot.get_file_status() and the value stored in
# the database for each of them.
db_status_by_api_status = {
    "Processed": EIotUploadStatusValues.PROCESSED,
    "Received": EIotUploadStatusValues.RECEIVED,
    "In Process": EIotUploadStatusValues.IN_PROCESS,
    "Scanned": EIotUploadStatusValues.SCANNED,
    "Processing Failed": EIotUploadStatusValues.FAILURE,
}
# API statuses after which a file is no longer polled.
final_api_statuses = ("Processing Failed", "Processed")

# Load all uploads which are neither processed nor failed. The two conditions
# are combined with `&`: Python's `and` does not build a SQL AND, it evaluates
# the truthiness of the first expression and returns just one of the operands.
uploaded_entries = db.select(
    model=EIotUploadStatus,
    where=[
        (EIotUploadStatus.status != EIotUploadStatusValues.PROCESSED)
        & (EIotUploadStatus.status != EIotUploadStatusValues.FAILURE)
    ],
)

# Poll until every entry has reached a final status. Entries which are done are
# dropped from the list so that they are not requested again.
pending_entries = list(uploaded_entries)
count = 1

while pending_entries:
    still_pending = []

    for entry in pending_entries:
        file_id = entry["fileId"]
        status = api_eiot.get_file_status(file_id)
        api_status = status["status"]
        log.info(f"File {file_id} has status: {api_status}.")

        if entry["status"] != api_status:
            # we have a new status, update the database
            if api_status not in db_status_by_api_status:
                raise Exception(f"Unknown status: {api_status}")

            if api_status == "Received":
                log.info(f"File {file_id} is received.")
            elif api_status == "Processing Failed":
                log.error(f"File {file_id} processing failed: {status['description']}")

            db.update_one(
                model=EIotUploadStatus,
                where=[EIotUploadStatus.fileId == file_id],
                values={
                    "status": db_status_by_api_status[api_status],
                    "statusDescription": status["description"],
                },
            )
            # remember the API status so that the next pass only writes to the
            # database when the status has changed again
            entry["status"] = api_status

        # This check has to run for unchanged statuses as well: a file which is
        # still e.g. "In Process" must keep the polling loop alive.
        if api_status not in final_api_statuses:
            still_pending.append(entry)

    pending_entries = still_pending

    # wait for some time before checking the status again; the pause grows by
    # 30 seconds with every pass
    if pending_entries:
        time.sleep(30 * count)
        count += 1


log.info("done")