In [1]:
import getpass
import json
import os
from datetime import datetime

import numpy as np
import pandas as pd
import psycopg2

### Db connection

In [2]:
# Entity ids (as stored in asset_datapoint.entity_id) mapped to the readable
# names that are later used to build the pivoted column labels.
idMap = {
    "4gBLwNljLBlc0oYpBdpGkD": "ht_wp_1_id",
    "475wGcmfSE0L1wguboGfpW": "ht_wp_2_id",
    "43wZIInlpD6YFsEc8TkwlN": "weather",
    "30cjCSTbgsokSNalNoBbJh": "energie_prijs",
}

id_keys = list(idMap.keys())
id_names = list(idMap.values())

# Attribute names to fetch for the entities above.
attribute_names = [
    'energyImportTotal',
    'temperature',
    'energy',
    'currentPrice',
]

# Number of rows requested per call in the batch-processing cell.
batch_size = 500000

# Connection settings are read from the environment so that credentials are
# never stored in the notebook. Non-secret settings fall back to the values
# this notebook has always used; the password has no default and is prompted
# for when OPENREMOTE_DB_PASSWORD is not set.
# NOTE(review): a password was previously committed in this cell -- it should
# be rotated on the database server.
conn = psycopg2.connect(
    host=os.environ.get(
        "OPENREMOTE_DB_HOST",
        "ec2-63-35-201-247.eu-west-1.compute.amazonaws.com",
    ),
    port=os.environ.get("OPENREMOTE_DB_PORT", "5432"),
    user=os.environ.get("OPENREMOTE_DB_USER", "postgres"),
    password=(
        os.environ.get("OPENREMOTE_DB_PASSWORD")
        or getpass.getpass("Database password: ")
    ),
    database=os.environ.get("OPENREMOTE_DB_NAME", "openremote"),
)


cur = conn.cursor()

### Procedure 

`process_asset_datapoints_proc`

```sql
BEGIN
    OPEN result_set FOR
        SELECT 
            ad.timestamp AS data_timestamp,  
            ad.entity_id::text, 
            ad.attribute_name::text,  
            ad.value::text  
        FROM asset_datapoint ad
        WHERE ad.entity_id = ANY(id_keys)  
        AND ad.attribute_name = ANY(attribute_names)  
        ORDER BY ad.timestamp;
END;
```


### Procedure description

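The procedure returns the raw datapoints in long format (one row per timestamp, entity and attribute) without pivoting them, so further processing is required; the pivot is done later in this notebook. Note that the batch loop below calls `process_asset_datapoints_v2` with a batch size and an offset; that function's definition is not shown here.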

In [3]:
def get_total_rows():
    """Return the number of datapoints matching the configured ids and attributes.

    Counts rows of ``asset_datapoint`` whose ``entity_id`` is one of the keys
    of the module-level ``idMap`` and whose ``attribute_name`` is one of the
    module-level ``attribute_names``, using the module-level cursor ``cur``.

    Returns:
        The row count from ``SELECT COUNT(*)``.
    """
    # Values are passed as query parameters (psycopg2 adapts Python lists to
    # SQL arrays) instead of being spliced into the SQL text, so quoting is
    # handled by the driver. The explicit ::text[] cast also keeps the query
    # valid when a list is empty, where "IN ()" would be a syntax error.
    query = """
    SELECT COUNT(*)
    FROM asset_datapoint
    WHERE entity_id = ANY(%s::text[])
      AND attribute_name = ANY(%s::text[])
    """
    cur.execute(query, (list(idMap), list(attribute_names)))
    return cur.fetchone()[0]

In [4]:
# Count the matching datapoints up front so the fetch can be split into batches.
total_rows = get_total_rows()
print(f"Total rows to process: {total_rows}")

# Ceiling division: a trailing partial batch still needs its own query.
full_batches, leftover_rows = divmod(total_rows, batch_size)
num_batches = full_batches + (1 if leftover_rows else 0)

Total rows to process: 560543


### Batch processing

In [5]:
# Column labels applied to the four values returned per row.
columns = ['timestamp', 'entity_id', 'attribute_name', 'value']

# The four placeholders are bound, in order, to id_keys, attribute_names,
# batch_size and the running offset.
# NOTE(review): process_asset_datapoints_v2 is not defined in this notebook;
# paging by offset is only reliable if it returns rows in a stable order --
# confirm against the function definition.
query = """
SELECT *
FROM process_asset_datapoints_v2(
    %s::text[],
    %s::text[],
    %s,
    %s
);
"""

processed_batches = []
try:
    for i in range(num_batches):
        offset = i * batch_size
        print(f"Processing batch {i + 1}: OFFSET {offset}")

        cur.execute(query, (id_keys, attribute_names, batch_size, offset))
        processed_batches.append(pd.DataFrame(cur.fetchall(), columns=columns))
finally:
    # Release the cursor and connection even when a batch fails part-way.
    cur.close()
    conn.close()

if processed_batches:
    # ignore_index gives the combined frame one unique 0..n-1 index instead of
    # repeating each batch's own 0-based labels.
    processed_data = pd.concat(processed_batches, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep the expected columns instead.
    processed_data = pd.DataFrame(columns=columns)

Processing batch 1: OFFSET 0
Processing batch 2: OFFSET 500000


In [6]:
# Preview the first rows of the fetched long-format datapoints.
processed_data.head()

Unnamed: 0,timestamp,entity_id,attribute_name,value
0,2023-12-16 00:00:01.870,30cjCSTbgsokSNalNoBbJh,currentPrice,67.19999694824219
1,2023-12-16 00:00:01.877,30cjCSTbgsokSNalNoBbJh,currentPrice,75.16
2,2023-12-16 01:00:00.235,30cjCSTbgsokSNalNoBbJh,currentPrice,55.64
3,2023-12-16 02:00:01.620,30cjCSTbgsokSNalNoBbJh,currentPrice,56.0
4,2023-12-16 03:00:01.894,30cjCSTbgsokSNalNoBbJh,currentPrice,58.0


In [7]:
# Summary statistics for the fetched long-format datapoints.
processed_data.describe()

Unnamed: 0,timestamp
count,560543
mean,2024-07-28 16:46:33.310590464
min,2023-12-16 00:00:01.870000
25%,2024-05-08 16:47:20.997499904
50%,2024-07-09 03:47:02.708999936
75%,2024-10-20 09:30:20.141500160
max,2025-02-04 03:58:03.445000


### Pivot

In [8]:

# Label every datapoint with "<readable entity name>_<attribute name>".
entity_names = processed_data['entity_id'].map(idMap)
processed_data['entity_name'] = entity_names
processed_data['atributename_id'] = entity_names + '_' + processed_data['attribute_name']

# Go from long to wide: one row per timestamp, one column per label. When a
# timestamp/label pair occurs more than once, the first value is kept.
wide = processed_data.pivot_table(
    values='value',
    index='timestamp',
    columns='atributename_id',
    aggfunc='first',
)

# Columns that are still object-typed after the pivot are cast to float64,
# then the timestamp index is turned back into a regular column.
object_columns = wide.select_dtypes(include=['object']).columns
df_res = wide.astype(dict.fromkeys(object_columns, 'float64')).reset_index()

In [9]:
# Preview the first rows of the pivoted (wide) frame.
df_res.head()

atributename_id,timestamp,energie_prijs_currentPrice,ht_wp_1_id_energyImportTotal,ht_wp_2_id_energyImportTotal,weather_temperature
0,2023-12-16 00:00:01.870,67.199997,,,
1,2023-12-16 00:00:01.877,75.16,,,
2,2023-12-16 01:00:00.235,55.64,,,
3,2023-12-16 02:00:01.620,56.0,,,
4,2023-12-16 03:00:01.894,58.0,,,


In [10]:
# Summary statistics for every column of the pivoted frame.
df_res.describe(include='all')

atributename_id,timestamp,energie_prijs_currentPrice,ht_wp_1_id_energyImportTotal,ht_wp_2_id_energyImportTotal,weather_temperature
count,510771,31296.0,246876.0,254895.0,27476.0
mean,2024-07-28 04:55:52.351399680,102.07255,189379.281161,311609.868197,13.015815
min,2023-12-16 00:00:01.870000,-200.0,0.0,0.0,-3.37
25%,2024-05-08 11:00:42.588999936,79.9,174148.0,297054.5,7.29
50%,2024-07-06 15:23:17.817999872,101.0,184260.0,310912.0,13.7
75%,2024-10-20 22:39:41.895000064,123.99,201127.25,325953.0,18.22
max,2025-02-04 03:58:03.445000,550.0,232357.0,354738.0,32.29
std,,48.372422,18810.211377,21225.049924,6.990488


In [11]:
# Show the column dtypes of the pivoted frame after the float64 conversion.
print(df_res.dtypes)

atributename_id
timestamp                       datetime64[ns]
energie_prijs_currentPrice             float64
ht_wp_1_id_energyImportTotal           float64
ht_wp_2_id_energyImportTotal           float64
weather_temperature                    float64
dtype: object
