In [2]:
# Separating the DHW and CH consumption

import os
import pandas as pd
import glob
import pyarrow.parquet as pq
from tqdm import tqdm
import numpy as np
import datetime


In [5]:

# Table of maximum power values. It is looked up by column using the parquet file's
# base name, and the first row of that column is used to scale ActPow[%] / 100
# (see extract_and_resample_and_calculate).
MaxPowList = pd.read_csv(r'D:\2min-resample\MetaDataSeparation\MetaData Filtered\PowerMaxT.csv',index_col=0)
Cwater = 4.2 # specific heat capacity of water, kJ/(kg*K)
ita = 0.9 # efficiency; the DHW heat flow is divided by this value when DHWpow is computed


def mainsw_temp(DateTime):
    """Return the modelled mains (cold) water temperature for a timestamp.

    The model is a yearly sinusoid: a mean of 11 plus a swing of +/-5 that
    crosses the mean on day-of-year 141.

    Parameters
    ----------
    DateTime : object exposing a ``dayofyear`` attribute (e.g. ``pd.Timestamp``).

    Returns
    -------
    float
        ``11 + 5 * sin((dayofyear - 141) * 2*pi / 365)``.
    """
    mean_temp = 11
    # Phase angle of the yearly cycle; zero on day 141.
    phase = (DateTime.dayofyear - 141) * 2 * np.pi / 365
    seasonal_swing = 5 * np.sin(phase)
    return mean_temp + seasonal_swing


def extract_and_resample_and_calculate(file, resample_freq='30min'):
    """Compute the central-heating (CH) power series for one parquet file.

    Steps:
      1. Read the parquet file and convert its index with ``pd.to_datetime``.
      2. Estimate the mains water temperature for every timestamp with
         ``mainsw_temp``.
      3. ``DHWpow = HwFlow[L/min] * (HwTOutlet[degC] - mains temperature)
         * ChActive * Cwater / (ita * 60)``.
      4. Total power = ``ActPow[%] * ChActive * MaxPowList[<file name>].iloc[0] / 100``.
      5. ``CHPow = total power - DHWpow``.

    Parameters
    ----------
    file : str
        Path to a parquet file. Its base name must be a column of the
        module-level ``MaxPowList`` table.
    resample_freq : str, default '30min'
        Kept for backward compatibility only; it is not used. No resampling
        is done here, so the result keeps the file's own timestamps.

    Returns
    -------
    pandas.Series
        Series named ``'CHPow'`` indexed like the file's datetime index.

    Raises
    ------
    KeyError
        If the file's base name is not a column of ``MaxPowList``.
    """
    file_name = os.path.basename(file)
    if file_name not in MaxPowList.columns:
        # Same exception type as a plain lookup, but it names the offending file.
        raise KeyError(f"{file_name} has no entry in MaxPowList")
    max_power = MaxPowList[file_name].iloc[0]

    df = pq.read_table(file).to_pandas()
    df.index = pd.to_datetime(df.index)

    # Evaluate the seasonal model once on the whole index instead of calling it
    # per row through .apply(); the values are assigned positionally.
    mains_temp = np.asarray(mainsw_temp(df.index), dtype=float)

    ch_active = df['ChActive']
    dhw_pow = (
        df['HwFlow[L/min](float32)']
        * (df['HwTOutlet[degC](float32)'] - mains_temp)
        * ch_active
        * Cwater
        / (ita * 60)
    )

    # Held in a local so the percentage column is not overwritten with
    # values in a different unit.
    total_pow = df['ActPow[%](float32)'] * ch_active * max_power / 100

    return (total_pow - dhw_pow).rename('CHPow')

def create_common_time_index(file_list, resample_freq='2min'):
    """Build one regular time index spanning all files and save it as CSV.

    Every parquet file is read to find the earliest and latest timestamp in
    its index. A ``pd.date_range`` between the overall minimum and maximum is
    written to ``time_index<resample_freq>.csv`` (single column ``datetime``)
    and returned.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the parquet files to scan.
    resample_freq : str, default '2min'
        Frequency of the generated index; also part of the CSV file name.

    Returns
    -------
    pandas.DatetimeIndex

    Raises
    ------
    ValueError
        If ``file_list`` is empty or none of the files has a valid timestamp.
    """
    file_list = list(file_list)
    if not file_list:
        raise ValueError("file_list is empty; cannot determine a time range")

    min_date, max_date = None, None

    # Determine the overall time range from all files
    for file in tqdm(file_list, desc="Getting Date Stamps"):
        df = pq.read_table(file).to_pandas()
        df.index = pd.to_datetime(df.index)
        current_min_date, current_max_date = df.index.min(), df.index.max()
        # An empty (or all-NaT) index yields NaT. Comparisons with NaT are always
        # False, so storing it would freeze min_date/max_date for every later file.
        if pd.isna(current_min_date) or pd.isna(current_max_date):
            continue
        if min_date is None or current_min_date < min_date:
            min_date = current_min_date
        if max_date is None or current_max_date > max_date:
            max_date = current_max_date

    if min_date is None:
        raise ValueError("none of the files contains a valid timestamp")

    # Create a common time index based on the overall time range
    common_time_index = pd.date_range(start=min_date, end=max_date, freq=resample_freq)

    index_df = pd.DataFrame(common_time_index, columns=['datetime'])
    # Raw f-string: backslashes stay literal, so the path contains no invalid
    # escape sequences such as "\M".
    index_df.to_csv(rf'D:\2min-resample\MetaDataSeparation\MetaData Filtered\time_index{resample_freq}.csv', index=False)

    return common_time_index


def main():
    """Compute CH power for every parquet file and write one combined CSV.

    Collects all ``*.parquet`` files from the two input directories, computes
    each file's ``CHPow`` series with ``extract_and_resample_and_calculate``,
    aligns it to the common time index stored in ``time_index2min.csv``
    (missing timestamps stay NaN), and saves a DataFrame with one column per
    file to ``combined_consumption_WODHW_2min.csv`` in ``out_dir``.
    """
    out_dir = r'D:\2min-resample\MetaDataSeparation\MetaData Filtered\load duration model_WODHW'
    # Directories containing the parquet files
    directories = [
        r'D:\2min-resample\MetaDataSeparation\MetaData Filtered\With_RetT\processed_files_Version3\\',
        r'D:\\2min-resample\MetaDataSeparation\MetaData Filtered\WO_RetT\processed_files_Version3\\'
    ]

    # Create the output directory up front so a missing folder cannot make the
    # final to_csv fail after the long processing loop has already run.
    os.makedirs(out_dir, exist_ok=True)

    # Collect file paths from all directories
    file_list = []
    for directory in directories:
        file_list.extend(glob.glob(os.path.join(directory, '*.parquet')))

    # The common time index is loaded from disk. It can be rebuilt from the data
    # with create_common_time_index(file_list), which reads every file once more.
    index_df = pd.read_csv(r'D:\2min-resample\MetaDataSeparation\MetaData Filtered\time_index2min.csv')
    # Convert the 'datetime' column back to a real DatetimeIndex
    common_time_index = pd.DatetimeIndex(pd.to_datetime(index_df['datetime']))
    # NOTE(review): reindexing presumably needs the parquet indices to have the same
    # timezone awareness as the timestamps stored in this CSV - confirm.

    # One aligned series per file, keyed by file name.
    # NOTE(review): a file name present in both directories would overwrite the
    # earlier entry - confirm the two directories are disjoint.
    data_dict = {}
    for file in tqdm(file_list, desc="Processing files"):
        file_name = os.path.basename(file)
        resampled_series = extract_and_resample_and_calculate(file)

        # Align to the common time index without filling missing values
        data_dict[file_name] = resampled_series.reindex(common_time_index)

    # Combine all series into a single DataFrame
    combined_df = pd.DataFrame(data_dict, index=common_time_index)

    # Save the combined DataFrame to a CSV file
    output_csv_path = os.path.join(out_dir, 'combined_consumption_WODHW_2min.csv')
    combined_df.to_csv(output_csv_path, index=True)

# Run the whole pipeline when this cell (or the file as a script) is executed directly.
if __name__ == "__main__":
    main()

Processing files: 100%|██████████| 3231/3231 [1:08:37<00:00,  1.27s/it]


In [4]:
# Regular 2-minute grid from 2021-07-24 to 2022-08-24 (UTC offsets given explicitly).
# main() reads the CSV written here, so this cell has to be run before main().
common_time_index = pd.date_range(start="2021-07-24 00:00:00+00:00", end="2022-08-24 00:00:00+00:00", freq="2min")
index_df = pd.DataFrame(common_time_index, columns=['datetime'])
# Save the DataFrame to a CSV file. A raw string keeps the backslashes literal,
# so the path contains no invalid escape sequences such as "\M".
index_df.to_csv(r'D:\2min-resample\MetaDataSeparation\MetaData Filtered\time_index2min.csv', index=False)

In [None]:
# Load the combined per-file CH consumption table written by main(); the first CSV
# column becomes the index. Then preview the first 30 rows.
combined_csv_path = r"D:\2min-resample\MetaDataSeparation\MetaData Filtered\load duration model_WODHW\combined_consumption_WODHW_2min.csv"
df = pd.read_csv(combined_csv_path, index_col=0)
preview_rows = df.head(30)
print(preview_rows)


In [8]:
# Row-wise total over all file columns. Any existing 'sum' column is dropped first,
# so re-running this cell does not add the previous total into the new one.
df['sum'] = df.drop(columns='sum', errors='ignore').sum(axis=1)
dfout = df['sum']
# to_csv returns None when given a path, so its result is not assigned back to
# dfout; dfout stays the summed Series.
dfout.to_csv(r"D:\2min-resample\MetaDataSeparation\MetaData Filtered\load duration model_WODHW\sum_2min.csv",index=True)