In [5]:
import pickle
import pandas as pd

def loadFromPickle(fullname):
    """Read a pickled pandas object from ``fullname``.

    The file is first read as a gzip-compressed pickle. If that attempt raises
    any ``Exception``, the file is read again with pandas' default compression
    handling; an error raised by this second attempt propagates to the caller.

    Parameters:
        fullname: path of the pickle file to read.

    Returns:
        Whatever object ``pd.read_pickle`` reconstructs from the file.

    Note:
        Unpickling can execute arbitrary code, so only load files from a
        trusted source.
    """
    gzip_options = {'method': 'gzip'}
    try:
        loaded = pd.read_pickle(fullname, compression=gzip_options)
    except Exception:
        # Best-effort fallback for files that were not written with gzip.
        loaded = pd.read_pickle(fullname)
    return loaded

def saveToPickle(fullname, df):
    """Write ``df`` to ``fullname`` as a gzip-compressed pickle.

    Parameters:
        fullname: destination path of the pickle file.
        df: object exposing ``to_pickle`` (a pandas DataFrame or Series).

    Returns:
        None.
    """
    # compresslevel=1 favours speed over size; per the pandas documentation,
    # mtime=1 pins the gzip header timestamp so the archive is reproducible.
    gzip_options = dict(method='gzip', compresslevel=1, mtime=1)
    df.to_pickle(fullname, compression=gzip_options)

def loadTrainTests(name, version):
    """Load the train and test pickles that belong to one data set version.

    The files read are ``<name>_train_<version>.pickle`` and
    ``<name>_test_<version>.pickle``, in that order, via ``loadFromPickle``.

    Parameters:
        name: path prefix of the data set (string).
        version: version suffix used in the file names (string).

    Returns:
        A ``(train, test)`` tuple of the two loaded objects.
    """
    return tuple(
        loadFromPickle(name + '_' + split + '_' + version + ".pickle")
        for split in ('train', 'test')
    )

def mergeTrainTest(name, version):
    """Load the train and test splits and stack them into a single frame.

    Parameters:
        name: path prefix of the data set, forwarded to ``loadTrainTests``.
        version: version suffix, forwarded to ``loadTrainTests``.

    Returns:
        The train rows followed by the test rows, re-indexed from 0
        (``ignore_index=True``).
    """
    splits = list(loadTrainTests(name, version))
    return pd.concat(splits, ignore_index=True)

In [34]:
# Combine the train and test splits of the "./workloads" data set, version "9".
data = mergeTrainTest("./workloads", "9")
#PERF_OBJS = [0.9, 0.95, 0.965, 0.98, 0.99, 0.995, 0.997]
# Keep only the rows whose perf_target_level is exactly 0.98.
data = data.loc[data['perf_target_level'].eq(0.98)]


In [36]:
# Show the perf_target_level column of the filtered frame.
print(data['perf_target_level'])
print("_______________")
# Select one workload combination at a buffer pool size of 8589934592
# (= 8 * 1024**3, i.e. 8 GiB if the column is expressed in bytes).
# Alternative filter kept for reference:
#df = df[df['observation.normalized_buf_size'] == 1]
df = data[
    (data['combined_column'] == '5 0.1 2 Ga')
    & (data['observation.innodb_buffer_pool_size'] == 8 * 1024 ** 3)
]
print(df['sysbench_filtered.latency_mean'])

36864    0.98
36865    0.98
36866    0.98
36867    0.98
36868    0.98
         ... 
98299    0.98
98300    0.98
98301    0.98
98302    0.98
98303    0.98
Name: perf_target_level, Length: 15360, dtype: float64
_______________
37056    7.520286
Name: sysbench_filtered.latency_mean, dtype: float64


In [None]:
#pickle_file_paths = ["/home/cloud/poc_fanfan/vstune-bandits/df_eval.pkl"]
# Input pickles and, position for position, the files the cleaned frames are written to.
# NOTE(review): these are absolute, machine-specific paths — consider a configurable base directory.
pickle_file_paths = ["/home/cloud/src/app/workloads_test_11.pickle", "/home/cloud/src/app/workloads_train_11.pickle"]
pickle_file_paths_to_save = ["/home/cloud/src/app/workloads_c098_test_11.pickle", "/home/cloud/src/app/workloads_c098_train_11.pickle"]

# Columns that together identify one workload combination. The tuple printed
# for each removed combination lists its values in this order.
GROUP_COLUMNS = ["tables_rows", "wl_clients", "randtype", 'tables', 'db_size_mb']

# Threshold on the "iperf01" column: a row with iperf01 >= iperf_sla meets the SLA.
iperf_sla = 0.98


def _sla_met_then_missed(iperf_sorted, sla):
    """Tell whether the SLA is reached and later lost along an ordered series.

    Parameters:
        iperf_sorted: pandas Series of "iperf01" values, already ordered by
            ascending "buf_size".
        sla: threshold; values >= sla meet the SLA, values < sla miss it.

    Returns:
        True when at least one value meeting the SLA appears at an earlier
        position than some value missing it, False otherwise (including when
        the series has no value on one of the two sides).
    """
    meets = (iperf_sorted >= sla).to_numpy()
    # Strict comparison: a value exactly equal to the threshold meets the SLA
    # and must not also be counted as a miss.
    misses = (iperf_sorted < sla).to_numpy()
    if not (meets.any() and misses.any()):
        return False
    # Work on positions in the sorted order, not on index labels: labels carry
    # the original row numbering, which need not follow buf_size.
    first_meet_pos = int(meets.argmax())  # argmax returns the first True
    last_miss_pos = len(misses) - 1 - int(misses[::-1].argmax())  # last True
    return first_meet_pos < last_miss_pos


# Fail before any expensive load if an input file has no matching output path.
if len(pickle_file_paths_to_save) < len(pickle_file_paths):
    raise ValueError("pickle_file_paths_to_save must provide one output path per input path")

for idx, pickle_file_path in enumerate(pickle_file_paths):
    print(f'Loading {pickle_file_path}...')
    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    data = loadFromPickle(pickle_file_path)

    print("Initial df length ", len(data))

    indices_to_remove = []

    # Examine every workload combination separately.
    for combo, df_sub in data.groupby(GROUP_COLUMNS):
        # Order by buf_size so "before"/"after" means smaller/larger buffer;
        # the stable sort keeps rows with equal buf_size in their original order.
        df_sub_sorted = df_sub.sort_values("buf_size", ascending=True, kind="stable")

        # A combination is discarded as a whole when it meets the SLA at some
        # buffer size and misses it again further along the ordering.
        if _sla_met_then_missed(df_sub_sorted["iperf01"], iperf_sla):
            # Header labels follow the order of GROUP_COLUMNS.
            print("               num rows    num cons    distribution   num tables    size db")
            print(f"Combination to kill: {combo}")
            print(df_sub_sorted.index.tolist())
            print("----------")
            indices_to_remove.extend(df_sub_sorted.index.tolist())

    # Remove the rows of every flagged combination.
    # NOTE(review): drop() works on labels, so this assumes data.index is unique — confirm.
    data_cleaned = data.drop(indices_to_remove)

    print("Cleaned df length ", len(data_cleaned), " vs. Initial ", len(data))

    print(f'Save cleaned data: {pickle_file_paths_to_save[idx]}')
    saveToPickle(pickle_file_paths_to_save[idx], data_cleaned)
