This notebook subsets the columns of the raw input dataset and writes one CSV file per combination of selection criteria, so that each file is ready for a different training scenario.

In [1]:
import itertools
from pathlib import Path
from functools import reduce
from copy import deepcopy
import pandas as pd
import numpy as np


In [2]:
import logging

# Configure logging once for the notebook: INFO and above, each record
# rendered as "MM/DD/YYYY HH:MM:SS - LEVEL: message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
)

# Read the data

In [3]:
# Folder holding the raw input table; the subset CSVs are written here as well.
# NOTE(review): hard-coded absolute path -- adjust when running in another environment.
dir_data = Path("/qfs/people/jian449/KIM/examples/im_cloudmodel/data")
f_state = dir_data.joinpath("Input_512.csv")

# Load the full raw table with pandas' default CSV parsing options.
df_state = pd.read_csv(f_state)



In [10]:
# Break every column name into its '_'-separated fields (split once per key)
# and keep the first three fields in parallel lists, in column order.
# NOTE(review): the fields look like <location>_<time>_<variable> -- confirm against the CSV header.
keys = list(df_state.columns)
key_fields = [key.split('_') for key in keys]
locs = [fields[0] for fields in key_fields]
times = [fields[1] for fields in key_fields]
varns = [fields[2] for fields in key_fields]
nkeys = len(keys)


# Subsetting

In [11]:
# Location labels ('s<index>') grouped by where the point sits.
middle_pts = [f's{i}' for i in (1, 10, 19)]
cold_pts = [f's{i}' for i in (2, 3, 9, 11, 12, 18, 20, 21, 27)]
warm_pts = [f's{i}' for i in (5, 6, 7, 14, 15, 16, 23, 24, 25)]

In [12]:
# Scenario switches; every combination of these values is one subsetting case.
middle_pts_r = [True, False]            # keep (True) or drop (False) the columns at the middle points
all_ss_varns = [True, False]            # keep or drop the variables whose name starts with 'SS'
wstd = [True, False]                    # keep or drop the 'Wstd' variable
wall_types = ['cold', 'warm', 'both']   # which wall's points to retain
temperature = [True, False]             # keep or drop the variables whose name starts with 'T'

# itertools.product returns a one-shot iterator. Materialize it as a list so the
# cases can be iterated more than once (e.g. when the loop cell is re-run on its
# own) instead of silently yielding nothing the second time.
combinations = list(itertools.product(middle_pts_r, all_ss_varns, wstd, wall_types, temperature))

In [13]:
# Boolean masks over the columns of df_state (True = column matches the criterion).
# They depend only on the column names, not on the scenario, so they are built
# once here rather than once per case inside the loop.
keep_all = np.zeros(nkeys, dtype=bool)  # mask that removes nothing
at_middle_pts = np.array([loc in middle_pts for loc in locs], dtype=bool)
at_cold_pts = np.array([loc in cold_pts for loc in locs], dtype=bool)
at_warm_pts = np.array([loc in warm_pts for loc in locs], dtype=bool)
is_ss_varn = np.array([v.startswith('SS') for v in varns], dtype=bool)
is_wstd_varn = np.array([v == 'Wstd' for v in varns], dtype=bool)
is_t_varn = np.array([v.startswith('T') for v in varns], dtype=bool)

# Columns to drop for the single-wall scenarios: using one wall removes the
# other wall's points.
wall_removals = {'cold': at_warm_pts, 'warm': at_cold_pts}

for mp, ss, w, wall, t in combinations:
    # Middle points: when dropped, every variable at those locations is removed
    # (not only the ones whose name starts with 'R', despite the 'nompR' label).
    label1, removed_set1 = ('mp', keep_all) if mp else ('nompR', at_middle_pts)

    # Variables whose name starts with 'SS'
    label2, removed_set2 = ('ss', keep_all) if ss else ('noss', is_ss_varn)

    # The 'Wstd' variable
    label3, removed_set3 = ('wstd', keep_all) if w else ('nowstd', is_wstd_varn)

    # Wall selection; any value other than 'cold'/'warm' keeps both walls
    label4 = wall if wall in wall_removals else 'both'
    removed_set4 = wall_removals.get(wall, keep_all)

    # Variables whose name starts with 'T' (temperature data)
    label5, removed_set5 = ('t', keep_all) if t else ('not', is_t_varn)

    # A column is removed as soon as any single criterion removes it
    removed_set = np.logical_or.reduce(
        [removed_set1, removed_set2, removed_set3, removed_set4, removed_set5]
    )

    # Subset the data: keep only the columns that no criterion removed
    df_state_subset = df_state.iloc[:, ~removed_set].copy()
    logging.info(f'Combination: {label1} {label2} {label3} {label4} {label5}; total number of keys: {df_state_subset.shape[1]}')

    # One output file per case, named after the five labels
    f_state_subset = dir_data / f'Input-{label1}-{label2}-{label3}-{label4}-{label5}.csv'
    # NOTE(review): to_csv also writes the row index as an unnamed first column,
    # which the raw input file does not appear to have. Pass index=False if the
    # subsets are read back without index_col -- confirm against the consumers.
    df_state_subset.to_csv(f_state_subset)


11/13/2024 09:40:17 - INFO: Combination: mp ss wstd cold t; total number of keys: 972
11/13/2024 09:40:18 - INFO: Combination: mp ss wstd cold not; total number of keys: 756
11/13/2024 09:40:19 - INFO: Combination: mp ss wstd warm t; total number of keys: 972
11/13/2024 09:40:19 - INFO: Combination: mp ss wstd warm not; total number of keys: 756
11/13/2024 09:40:20 - INFO: Combination: mp ss wstd both t; total number of keys: 1458
11/13/2024 09:40:21 - INFO: Combination: mp ss wstd both not; total number of keys: 1134
11/13/2024 09:40:22 - INFO: Combination: mp ss nowstd cold t; total number of keys: 864
11/13/2024 09:40:23 - INFO: Combination: mp ss nowstd cold not; total number of keys: 648
11/13/2024 09:40:24 - INFO: Combination: mp ss nowstd warm t; total number of keys: 864
11/13/2024 09:40:24 - INFO: Combination: mp ss nowstd warm not; total number of keys: 648
11/13/2024 09:40:25 - INFO: Combination: mp ss nowstd both t; total number of keys: 1296
11/13/2024 09:40:26 - INFO: Com

In [58]:
# Inspect the column-removal mask left in the kernel by the last iteration of the subsetting loop
removed_set

array([ True, False, False, ..., False, False, False])