In [19]:
import pandas as pd
import ast

# Load ground truth and prediction files
ground_truth = pd.read_csv('../MetaOD/MetaOD_top_3.csv')
predictions = pd.read_csv('recommended_models_with_runtime_top_3.csv')

# Convert ground truth string to actual list
ground_truth['top3_ground_truth'] = ground_truth['top3_ground_truth'].apply(ast.literal_eval)

# Normalize dataset names to match
ground_truth['dataset'] = ground_truth['dataset'].str.lower().str.replace('.csv', '', regex=False)
predictions['dataset'] = predictions['FileName'].str.lower().str.replace('_test.csv', '', regex=False)

# Merge on normalized dataset name
merged = pd.merge(
    predictions[['dataset', 'Rec_1', 'Rec_2', 'Rec_3']],
    ground_truth[['dataset', 'top3_ground_truth']],
    on='dataset',
    how='inner'
)

# Create top3_prediction as a list
merged['top3_prediction'] = merged[['Rec_1', 'Rec_2', 'Rec_3']].values.tolist()

# Final DataFrame with necessary columns
top3_preds_df = merged[['dataset', 'top3_prediction', 'top3_ground_truth']]

# Save to CSV
output_csv_path = "Orthus_top_3.csv"
top3_preds_df.to_csv(output_csv_path, index=False)

print(f"✅ Saved to: {output_csv_path}")


✅ Saved to: Orthus_top_3.csv


In [23]:
top3_preds_df['top3_prediction'][0][0]

'OmniAnomaly'

In [24]:
df = top3_preds_df

# NDCG

In [30]:
import numpy as np

def compute_dcg(relevance_scores):
    return sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores))

def compute_ndcg(ground_truth, prediction, k=None):
    if k is None:
        k = len(prediction)

    # Binary relevance
    relevance = [1 if item in ground_truth else 0 for item in prediction[:k]]
    print(relevance)
    # DCG
    dcg = compute_dcg(relevance)
    print(dcg)
    # Ideal DCG (relevant items all at top ranks)
    ideal_relevance = [1] * min(len(ground_truth), k) + [0] * (k - min(len(ground_truth), k))
    print(ideal_relevance)
    idcg = compute_dcg(ideal_relevance)
    print(idcg)
    return dcg / idcg if idcg > 0 else 0.0

# Test

def compute_meta_ndcg(df, k):
    ndcg = []
    for _, row in df.iterrows():
        pred_top = row['top3_prediction'][:k] if row['top3_prediction'] else []
        gt  = row['top3_ground_truth'][0] # use top 3 as relevant
        print(pred_top)
        print(gt)
        ndcg_score = compute_ndcg(gt, pred_top, k)
        print(f"NDCG: {ndcg_score:.4f}")
        ndcg.append(ndcg_score)
        break
    return np.mean(ndcg)





In [31]:
df = pd.read_csv("Orthus_top_3.csv")
mean_ndcg = compute_meta_ndcg(df, 3)
mean_ndcg

['O
[
[1, 0, 0]
1.0
[1, 0, 0]
1.0
NDCG: 1.0000


1.0

In [34]:
df['top3_prediction'][0][0]

'['

In [35]:
top3_preds_df['top3_prediction'][0]

"['OmniAnomaly', 'MCD', 'ALLM4TS']"

In [32]:


# Compute Hit top 3 
def compute_hit_top_3(df):
    hit_count = 0
    total = len(df)

    for _, row in df.iterrows():
        pred_top1 = row['top3_prediction'][0] if row['top3_prediction'] else None
        truth_top3 = row['top3_ground_truth']
        if pred_top1 in truth_top3:
            hit_count += 1

    hit_rate = (hit_count / total) * 100 if total > 0 else 0
    return hit_rate

hit_top_3 = compute_hit_top_3(top3_preds_df)
hit_top_3


100.0

In [14]:


# Compute  top 3 best 
def compute_top_3_best(df):
    hit_count = 0
    total = len(df)

    for _, row in df.iterrows():
        pred_top3 = row['top3_prediction'] 
        print(pred_top3)
        truth_top1 = row['top3_ground_truth'][0] if row['top3_ground_truth'] else None
        print(truth_top1)
        if truth_top1 in pred_top3:
            hit_count += 1

    hit_rate = (hit_count / total) * 100 if total > 0 else 0
    return hit_rate

top_3_best = compute_top_3_best(top3_preds_df)
top_3_best

['OmniAnomaly', 'MCD', 'ALLM4TS']
[
['MCD', 'CNN', 'LOF']
[
['PCA', 'MCD', 'ALLM4TS']
[
['MCD', 'CNN', 'ALLM4TS']
[
['OmniAnomaly', 'PCA', 'MCD']
[
['IForest', 'KMeansAD', 'ALLM4TS']
[
['PCA', 'MCD', 'ALLM4TS']
[
['OmniAnomaly', 'PCA', 'MCD']
[
['OmniAnomaly', 'PCA', 'MCD']
[
['OmniAnomaly', 'PCA', 'KMeansAD']
[
['PCA', 'KMeansAD', 'ALLM4TS']
[
['OmniAnomaly', 'PCA', 'KMeansAD']
[
['PCA', 'KMeansAD', 'ALLM4TS']
[
['CNN', 'KMeansAD', 'LOF']
[
['PCA', 'KMeansAD', 'ALLM4TS']
[
['OmniAnomaly', 'PCA', 'KMeansAD']
[
['OmniAnomaly', 'PCA', 'KMeansAD']
[
['OmniAnomaly', 'PCA', 'KMeansAD']
[
['PCA', 'KMeansAD', 'ALLM4TS']
[
['OmniAnomaly', 'PCA', 'KMeansAD']
[
['OmniAnomaly', 'PCA', 'KMeansAD']
[
['OmniAnomaly', 'PCA', 'KMeansAD']
[
['PCA', 'MCD', 'ALLM4TS']
[
['PCA', 'KMeansAD', 'ALLM4TS']
[
['PCA', 'KMeansAD', 'ALLM4TS']
[
['PCA', 'KMeansAD', 'ALLM4TS']
[
['PCA', 'KMeansAD', 'ALLM4TS']
[
['OmniAnomaly', 'PCA', 'MCD']
[
['OmniAnomaly', 'PCA', 'KMeansAD']
[
['OmniAnomaly', 'PCA', 'KMeansAD']
[


100.0

# runtime 

In [2]:
import pandas as pd

df = pd.read_csv('recommended_models_with_runtime.csv')
df["dataset_group"] = df["FileName"].str.split("_").str[0]

# Group by dataset_group and compute mean runtime
mean_runtime = df.groupby("dataset_group")["RunTimeSeconds"].mean().reset_index()

# Sort for readability
mean_runtime = mean_runtime.sort_values(by="RunTimeSeconds", ascending=False).reset_index(drop=True)

print(mean_runtime)


     dataset_group  RunTimeSeconds
0           cicids       19.622162
1       creditcard       11.381611
2             swan        9.944022
3              PSM        7.223324
4      OPPORTUNITY        7.159275
5              GHL        4.081409
6              SMD        3.773699
7         Exathlon        2.590681
8            GECCO        2.422797
9          Daphnet        1.602919
10        GutenTAG        1.483593
11           metro        1.182768
12           MITDB        1.090990
13         Genesis        1.070981
14            SMAP        0.981866
15            SVDB        0.750473
16             MSL        0.281833
17  room-occupancy        0.201931


In [3]:
import pandas as pd

# Load the runtime records
df = pd.read_csv('recommended_models_with_runtime.csv')
df["dataset_group"] = df["FileName"].str.replace("_test.csv", "", regex=False).str.strip()

# Load dataset size info
df_summary = pd.read_csv('../Datasets/data_summary.csv')  # contains File and NumRows
df_summary["Dataset"] = df_summary["File"].str.replace("_test.csv", "", regex=False).str.strip()

# Define size bins
bins = [0, 5000, 20000, 50000, float('inf')]
labels = ['1-5k', '5k-20k', '20k-50k', '>50k']
df_summary['SizeCategory'] = pd.cut(df_summary['NumRows'], bins=bins, labels=labels, right=False)

# Merge size category into runtime dataframe
df = df.merge(df_summary[['Dataset', 'SizeCategory']], left_on='dataset_group', right_on='Dataset', how='left')

# Group by size category and compute mean runtime
mean_runtime_by_size = df.groupby('SizeCategory')['RunTimeSeconds'].mean().reset_index()

# Sort categories in logical order
size_order = ['1-5k', '5k-20k', '20k-50k', '>50k']
mean_runtime_by_size['SizeCategory'] = pd.Categorical(mean_runtime_by_size['SizeCategory'], categories=size_order, ordered=True)
mean_runtime_by_size = mean_runtime_by_size.sort_values('SizeCategory')

print(mean_runtime_by_size)


  SizeCategory  RunTimeSeconds
0         1-5k        0.262813
1       5k-20k        1.507565
2      20k-50k        4.843169
3         >50k        2.273170


  mean_runtime_by_size = df.groupby('SizeCategory')['RunTimeSeconds'].mean().reset_index()
