In [2]:
import pandas as pd
import glob
import pyarrow.parquet as pq
from tqdm import tqdm

def get_all_column_names(file_list):
    all_columns = set()
    for file in tqdm(file_list, desc="Extracting column names"):
        df = pq.read_table(file).to_pandas()
        all_columns.update(df.columns)
    return list(all_columns)

def count_same_values(file, aggregated_counts, all_columns):
    df = pq.read_table(file).to_pandas()
    
    for col in all_columns:
        if col in df.columns:
            data = df[col].values
            for t in range(2, len(data)):
                if data[t-1] == data[t-2]:
                    aggregated_counts[col]['same_2_previous'] += 1
                    if data[t] == data[t-1]:
                        aggregated_counts[col]['same_3_consecutive'] += 1

def calculate_probabilities(aggregated_counts):
    probabilities = {}
    for col, count in aggregated_counts.items():
        if count['same_2_previous'] > 0:
            probabilities[col] = count['same_3_consecutive'] / count['same_2_previous']
        else:
            probabilities[col] = None  # No cases where t-1 == t-2
    return probabilities

def main():
    # Directory containing the parquet files
    directory = f'D:\\2min-resample\MetaDataSeparation\MetaData Filtered\WO_RetT\\'
    file_list = glob.glob(directory + '*.parquet')
    # Step 1: Get all column names
    all_columns = get_all_column_names(file_list)
    
    # Initialize the aggregated counts dictionary
    aggregated_counts = {col: {'same_2_previous': 0, 'same_3_consecutive': 0} for col in all_columns}
    
    # Step 2: Count values for each file and aggregate
    for file in tqdm(file_list, desc="Processing files"):
        count_same_values(file, aggregated_counts, all_columns)
    
    # Step 3: Calculate probabilities
    probabilities = calculate_probabilities(aggregated_counts)

    # Step 4: Create a DataFrame to display results
    results = []
    for col in all_columns:
        same_2_previous = aggregated_counts[col]['same_2_previous']
        same_3_consecutive = aggregated_counts[col]['same_3_consecutive']
        prob = probabilities[col]
        results.append([col, same_2_previous, same_3_consecutive, prob])

    df_results = pd.DataFrame(results, columns=['Column', 'Same_2_Previous_Count', 'Same_3_Consecutive_Count', 'Conditional_Probability'])
    df_results.to_csv(f'D:\\2min-resample\MetaDataSeparation\MetaData Filtered\WO_RetT\conse_counts.csv',index=False)
    print(df_results)

if __name__ == "__main__":
    main()

Extracting column names: 100%|██████████| 1699/1699 [01:42<00:00, 16.54it/s]
Processing files: 100%|██████████| 1699/1699 [1:26:08<00:00,  3.04s/it]

                      Column  Same_2_Previous_Count  Same_3_Consecutive_Count  \
0                   PrimMaxT              140497018                 140493293   
1                      Brand               11395200                  11395118   
2                BurnNoStart              461384922                 442444309   
3                    ChBlock                1995802                   1995794   
4   HwTOutlet[degC](float32)              141063621                  98744424   
..                       ...                    ...                       ...   
72               ServBoilOpM                3136260                   3136238   
73                     ServY                2566031                   2566013   
74               ChViaSwitch              472613795                 467055774   
75                  FlameCur              434189691                 422304907   
76           ChimneySwActive              322032716                 322025392   

    Conditional_Probability




In [4]:
import os
import pandas as pd
import glob
import pyarrow.parquet as pq
from tqdm import tqdm

def replace_long_sequences(df, column, max_length):
    current_length = 1
    start_index = 0
    
    data = df[column].values
    for i in range(1, len(data)):
        if data[i] == data[i-1]:
            if current_length == 1:
                start_index = i - 1
            current_length += 1
        else:
            if current_length > max_length and data[i-1] != 0:
                df.iloc[start_index:i, df.columns.get_loc(column)] = None
            current_length = 1

    # Check the last sequence
    if current_length > max_length and data[-1] != 0:
        df.iloc[start_index:len(data), df.columns.get_loc(column)] = None

def process_file(file, max_lengths):
    df = pq.read_table(file).to_pandas()

    # Set 'datetime' column as index
    if 'datetime[](datetime)' in df.columns:
        df.set_index('datetime[](datetime)', inplace=True)
    
    for col, max_length in max_lengths.items():
        if col in df.columns:
            replace_long_sequences(df, col, max_length)

    return df

def main():
    # Directory containing the parquet files
    directory = f'D:\\2min-resample\MetaDataSeparation\MetaData Filtered\WO_RetT/'
    file_list = glob.glob(os.path.join(directory, '*.parquet'))

    # Directory to save processed files
    processed_directory = os.path.join(directory, 'processed_files_Version3')
    os.makedirs(processed_directory, exist_ok=True)

    # Read the CSV file containing the longest sequence lengths
    max_lengths_df = pd.read_csv(f'D:\\2min-resample\MetaDataSeparation\MetaData Filtered\With_RetT\conse_counts.csv')
    columns_of_interest = [
 "RetT", "BoilOpTime",
        "SafSensT", "HwOpTime", "BurnOpTime", 
          "GasValMain", "HwActive[bool](float32)",
        "HwFlow[L/min](float32)",  "ActPow[%](float32)", 
        "HwTOutlet[degC](float32)", 
        "OutTemp", "SysPrimT"
    ]
    max_lengths_df = max_lengths_df[max_lengths_df['Column'].isin(columns_of_interest)]
    max_lengths = max_lengths_df.set_index('Column')['Max_Length'].to_dict()

    # Process each file
    for file in tqdm(file_list, desc="Processing files"):
        processed_df = process_file(file, max_lengths)
        
        # Generate processed file path
        file_name = os.path.basename(file)
        processed_file_path = os.path.join(processed_directory, file_name)
        
        # Save the processed DataFrame to the new parquet file in the processed directory
        processed_df.to_parquet(processed_file_path, index=True)

if __name__ == "__main__":
    main()

Processing files: 100%|██████████| 1699/1699 [23:12<00:00,  1.22it/s]
