# Setup

In [1]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

fatal: destination path 'rapidsai-csp-utils' already exists and is not an empty directory.


[38;5;57m[1m⚡️ Tip[0m	Check organization access: [4mhttps://github.com/settings/connections/applications/c7457225b242a94d60c6[0m

Installing RAPIDS remaining 24.4.* libraries
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com

        ***********************************************************************
        The pip install of RAPIDS is complete.
        
        Please do not run any further installation from the conda based installation methods, as they may cause issues!
        
        Please ensure that you're pulling from the git repo to remain updated with the latest working install scripts.

        Troubleshooting:
            - If there is an installation failure, please check back on RAPIDSAI owned templates/notebooks to see how to update your personal files. 
            - If an installation failure persists when using the latest script, please make an is

## Some GPU Megiks

In [2]:
# Import each RAPIDS library and print its version string.
import cudf
print(cudf.__version__)

import cuml
print(cuml.__version__)

import cugraph
print(cugraph.__version__)

import cuspatial
print(cuspatial.__version__)

import cuxfilter
print(cuxfilter.__version__)

# Load the cudf.pandas IPython extension.
# NOTE(review): presumably this has to run before `import pandas` (next cell)
# to take effect — confirm against the cudf.pandas documentation.
%load_ext cudf.pandas

24.04.01
24.04.00
24.04.00
24.04.00
24.04.01


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import time

# Report every sub-directory of 'scores' whose entry count, or count of
# distinct entry names, differs from 250.
for score_dir in os.listdir('scores'):
    entries = os.listdir(os.path.join('scores', score_dir))
    n_files, n_unique = len(entries), len(set(entries))
    if not (n_files == 250 and n_unique == 250):
        print(f'Dir: {score_dir}')
        print(f'#files: {n_files}')
        print(f"#unique: {n_unique}\n")


In [4]:
from utilities import applyWindowing, reverseWindowing
from dataloader import DataLoader
# from lof import LocalOutlierFactor
from ocsvm import OCSVM
from benchmarker import benchmark

In [5]:
# Opt in to pandas copy-on-write mode for the rest of the notebook.
pd.options.mode.copy_on_write = True
# Directory holding the UCR time-series files that later cells list and load.
UCR = 'ucrdata'

In [10]:
import numpy as np
from lof import LocalOutlierFactor
from isolation_forest import IsolationForest
from utilities import applyWindowing, reverseWindowing
from dataloader import TimeSeries
from sklearn.preprocessing import MinMaxScaler
import time

class IFLOFMultiplicativeEnsemble:
    '''
    Multiplicative ensemble of an Isolation Forest and a Local Outlier Factor detector.

    Both detectors score the test data, each score vector is min-max scaled,
    and the final score is the element-wise product of the two scaled vectors.

    Usage:
        ensemble = IFLOFMultiplicativeEnsemble(window_size=100, if_estimators=100, lof_neighbors=20)
        ensemble.fit(train_ts)
        result_ts, processing_time = ensemble.predict(test_ts)
    '''
    def __init__(self, window_size=50, if_estimators=100, lof_neighbors=50):
        """
        :param window_size: window size handed to both base detectors
        :param if_estimators: ``n_estimators`` for the Isolation Forest
        :param lof_neighbors: ``neighbors`` for the Local Outlier Factor
        """
        self.window_size = window_size
        self.if_estimators = if_estimators
        self.lof_neighbors = lof_neighbors
        self.iforest = IsolationForest(windowSize=window_size, n_estimators=if_estimators)
        self.lof = LocalOutlierFactor(windowSize=window_size, neighbors=lof_neighbors)
        self.scaler = MinMaxScaler()

    def fit(self, tsObject: TimeSeries):
        """Fit the Isolation Forest on ``tsObject``."""
        # NOTE(review): only the Isolation Forest is fitted here; the LOF detector
        # is used unfitted in predict(). Confirm LocalOutlierFactor needs no fit().
        self.iforest.fit(tsObject)

    def _unit_scale(self, scores):
        """Scale a 1-D score array with ``self.scaler`` and return it flattened."""
        # fit_transform refits the scaler on every call, so sharing one scaler
        # between the two detectors carries no state from one to the other.
        return self.scaler.fit_transform(scores.reshape(-1, 1)).flatten()

    def predict(self, tsObject: TimeSeries):
        """
        Score ``tsObject`` with both detectors and combine the results.

        :param tsObject: time series whose ``testData`` is scored
        :return: tuple ``(tsObject, processing_time)``; ``tsObject.testData["Score"]``
                 is overwritten with the combined scores and ``processing_time`` is the
                 elapsed time in seconds for scoring and combining
        """
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by wall-clock adjustments.
        start_time = time.perf_counter()

        # Scale the IF scores before running LOF: LOF is given the same tsObject
        # and may replace its "Score" column.
        if_result, _ = self.iforest.predict(tsObject)
        if_scores_normalized = self._unit_scale(if_result.testData["Score"].values)

        lof_result, _ = self.lof.predict(tsObject)
        lof_scores_normalized = self._unit_scale(lof_result.testData["Score"].values)

        # Final_Score = IF_Score * LOF_Score (element-wise)
        final_scores = if_scores_normalized * lof_scores_normalized

        processing_time = time.perf_counter() - start_time

        # Publish the combined scores on the caller's object.
        tsObject.testData["Score"] = final_scores

        return tsObject, processing_time

    def toString(self):
        """Return an identifier that encodes the window size, IF estimators and LOF neighbors."""
        return f"iflof_mult_w{self.window_size}_e{self.if_estimators}_n{self.lof_neighbors}"

In [11]:
# Sanity check: run the IF*LOF ensemble on one UCR file and report whether the
# highest-scoring position lies inside the first entry of ts.anomalies.
# os.listdir returns entries in arbitrary order, so sort to make the choice of
# "first file" reproducible across runs and machines.
filename = sorted(os.listdir(UCR))[0]
file_path = os.path.join(UCR, filename)

dataloader = DataLoader()
ts = dataloader.load_file(file_path)

detector = IFLOFMultiplicativeEnsemble()
detector.fit(ts)
ts, t = detector.predict(ts)

scores = ts.testData["Score"]
prediction = np.argmax(scores)

correct = ts.anomalies[0].start <= prediction <= ts.anomalies[0].end
print(f"Correct: {correct}")


Correct: False


In [9]:
# Benchmark the IF*LOF ensemble over the UCR directory. The detector's
# toString() identifier names both the results CSV and the scores folder.
estimator = IFLOFMultiplicativeEnsemble()
estimator_name = estimator.toString()
results_file = os.path.join(os.curdir, "results", estimator_name + ".csv")
scores_dir = os.path.join(os.curdir, "scores", estimator_name)

benchmark(estimator, UCR, results_file, scores_dir)

Benchmarking:   0%|          | 1/250 [00:15<1:04:00, 15.42s/it]


KeyboardInterrupt: 

# LOF detectors

First, a sanity check:

In [None]:
# filename = os.listdir(UCR)[0]
# file_path = os.path.join(UCR, filename)

# dataloader = DataLoader()
# ts = dataloader.load_file(file_path)

# ts

# detector = OCSVM(windowSize=10)
# detector.fit(ts)
# ts, t = detector.predict(ts)

# scores = ts.testData["Scores"]
# prediction = np.argmax(scores)

# correct = ts.anomalies[0].start <= prediction <= ts.anomalies[0].end
# print(f"Correct: {correct}")


In [None]:
# lof = LocalOutlierFactor(windowSize=50, neighbors=50, gpu=True)
# results_file = os.path.join('.', "results", f"{lof.toString()}.csv")
# scores_dir = os.path.join('.', "results", lof.toString(), "scores")

# benchmark(lof, UCR, results_file, scores_dir)

# WV-OLS

In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


def wv_knn(scores, k=3):
    """
    Weighted Voting with KNN

    For every detector, a KNeighborsRegressor predicts that detector's scores
    from the other detectors' scores; the in-sample R^2 of that prediction is
    the detector's raw weight. Weights are floored at 1e-10, normalized to sum
    to one, and the final score of each sample is the maximum weighted detector
    score.

    :param scores: numpy array of shape (n_samples, n_detectors)
    :param k: number of neighbors for KNN
    :return: tuple (final_scores, weights) - final anomaly scores of shape
             (n_samples,) and normalized detector weights of shape (n_detectors,)
    """
    _, n_detectors = scores.shape
    weights = np.zeros(n_detectors)

    for d in range(n_detectors):
        # Leave-one-out: predict detector d from all remaining detectors.
        X = np.delete(scores, d, axis=1)
        y = scores[:, d]

        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(X, y)
        y_pred = knn.predict(X)

        weights[d] = r2_score(y, y_pred)

    # R^2 can be zero or negative. Floor the weights (same floor as wv_ols_r2)
    # so normalization cannot divide by zero or flip the sign of a weight.
    weights = np.maximum(weights, 1e-10)

    # Normalize weights
    weights = weights / np.sum(weights)

    # Compute final anomaly scores
    final_scores = np.max(weights * scores, axis=1)

    return final_scores, weights

def wv_ols(scores):
    """
    Weighted Voting with Ordinary Least Squares

    For every detector, a LinearRegression predicts that detector's scores from
    the other detectors' scores; the raw weight is ``max(0, 1 - RMSE)`` of that
    in-sample prediction. Weights are floored at 1e-10, normalized to sum to
    one, and the final score of each sample is the maximum weighted detector
    score.

    :param scores: numpy array of shape (n_samples, n_detectors)
    :return: tuple (final_scores, weights) - final anomaly scores of shape
             (n_samples,) and normalized detector weights of shape (n_detectors,)
    """
    _, n_detectors = scores.shape
    weights = np.zeros(n_detectors)

    for d in range(n_detectors):
        # Leave-one-out: predict detector d from all remaining detectors.
        X = np.delete(scores, d, axis=1)
        y = scores[:, d]

        ols = LinearRegression()
        ols.fit(X, y)
        y_pred = ols.predict(X)

        rmse = np.sqrt(np.mean((y - y_pred)**2))
        weights[d] = max(0, 1 - rmse)

    # Every weight is 0 when all RMSEs are >= 1. Floor the weights (same floor
    # as wv_ols_r2) so normalization never computes 0/0.
    weights = np.maximum(weights, 1e-10)

    # Normalize weights
    weights = weights / np.sum(weights)

    # Compute final anomaly scores
    final_scores = np.max(weights * scores, axis=1)

    return final_scores, weights

# def wv_ols_r2(scores):
#     """
#     Weighted Voting with Ordinary Least Squares using R^2
    
#     :param scores: numpy array of shape (n_samples, n_detectors)
#     :return: numpy array of final anomaly scores, and weights
#     """
#     n_samples, n_detectors = scores.shape
#     weights = np.zeros(n_detectors)
    
#     for d in range(n_detectors):
#         X = np.delete(scores, d, axis=1)
#         y = scores[:, d]
        
#         ols = LinearRegression()
#         ols.fit(X, y)
#         y_pred = ols.predict(X)
        
#         weights[d] = r2_score(y, y_pred)
    
#     # Normalize weights
#     weights = weights / np.sum(weights)
    
#     # Compute final anomaly scores
#     final_scores = np.max(weights * scores, axis=1)
    
#     return final_scores, weights    

def wv_ols_r2(scores, emphasize_diversity=False):
    """
    Weighted voting where each detector's weight is an OLS R^2.

    Each detector is regressed (LinearRegression) on all other detectors and
    the in-sample R^2 of that fit becomes its raw weight. Raw weights are
    optionally inverted, floored at 1e-10 and normalized to sum to one; each
    sample's final score is the largest weighted detector score.

    :param scores: numpy array of shape (n_samples, n_detectors)
    :param emphasize_diversity: if True, use ``1 - R^2`` so that detectors that
                                are harder to predict from the others weigh more
    :return: tuple (final_scores, weights)
    """
    _, detector_count = scores.shape

    def _leave_one_out_r2(column):
        # Regress one detector on the remaining ones and score the fit.
        others = np.delete(scores, column, axis=1)
        target = scores[:, column]
        model = LinearRegression()
        model.fit(others, target)
        return r2_score(target, model.predict(others))

    raw = np.array(
        [_leave_one_out_r2(column) for column in range(detector_count)],
        dtype=float,
    )

    # Optional inversion: a low R^2 turns into a high weight.
    oriented = 1 - raw if emphasize_diversity else raw

    # The floor keeps the normalization below away from a zero denominator.
    floored = np.maximum(oriented, 1e-10)
    weights = floored / floored.sum()

    final_scores = (weights * scores).max(axis=1)
    return final_scores, weights

# # Example usage
# np.random.seed(42)
# n_samples = 1000
# n_detectors = 5

# # Simulating anomaly scores from multiple detectors
# scores = np.random.rand(n_samples, n_detectors)

# # Apply the method with original assumption (a)
# final_scores_a, weights_a = wv_ols_r2(scores, emphasize_diversity=False)

# # Apply the method with assumption (b) to emphasize diversity
# final_scores_b, weights_b = wv_ols_r2(scores, emphasize_diversity=True)



def wv_ols_r2_topk(scores, k):
    """
    Average the k detectors with the lowest OLS R^2.

    Each detector is regressed (LinearRegression) on all other detectors; the
    k detectors with the smallest in-sample R^2 are kept and their scores are
    averaged per sample.

    :param scores: numpy array of shape (n_samples, n_detectors)
    :param k: number of top detectors to use
    :return: tuple (final_scores, top_k_indices) - per-sample mean score of the
             selected detectors, and the selected detectors' column indices
    """
    _, detector_count = scores.shape

    def _leave_one_out_r2(column):
        # Regress one detector on the remaining ones and score the fit.
        others = np.delete(scores, column, axis=1)
        target = scores[:, column]
        model = LinearRegression()
        model.fit(others, target)
        return r2_score(target, model.predict(others))

    r2_scores = np.array(
        [_leave_one_out_r2(column) for column in range(detector_count)],
        dtype=float,
    )

    # Ascending sort: the detectors with the lowest R^2 come first.
    ranking = np.argsort(r2_scores)
    top_k_indices = ranking[:k]

    selected = scores[:, top_k_indices]
    final_scores = selected.mean(axis=1)

    return final_scores, top_k_indices

# # Example usage
# np.random.seed(42)
# n_samples = 1000
# n_detectors = 5

# # Simulating anomaly scores from multiple detectors
# scores = np.random.rand(n_samples, n_detectors)

# # Apply the new method
# final_scores, selected_detectors = wv_ols_r2_topk(scores, k=3)

# print("Selected detectors:", selected_detectors)

# # Visualize results
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 6))

# plt.subplot(2, 1, 1)
# plt.plot(final_scores, label='Top-k R^2 Voting')
# plt.legend()
# plt.title('Anomaly Scores')
# plt.xlabel('Sample')
# plt.ylabel('Anomaly Score')

# plt.subplot(2, 1, 2)
# plt.bar(range(n_detectors), [1 if i in selected_detectors else 0 for i in range(n_detectors)])
# plt.title('Selected Detectors')
# plt.xlabel('Detector Index')
# plt.ylabel('Selected (1) / Not Selected (0)')

# plt.tight_layout()
# plt.show()

# Soft voting

In [None]:

# base_learners = os.listdir('scores')
# dataloader = DataLoader()

# time_series = []
# status = []

# for f in tqdm(os.listdir(UCR)):
#     ts_name = f.split('.')[0]
#     base_learner_scores = []

#     for base_learner in base_learners:
#         base_learner_scores.append(pd.read_csv(f'scores/{base_learner}/{ts_name}_scores.csv').to_numpy())

#     scores = np.column_stack(tuple(base_learner_scores))
#     final_scores =  np.mean(scores, axis=1)

#     df_final_scores = pd.DataFrame(final_scores, columns=['Score'])
#     prediction = np.argmax(final_scores)

#     ts = dataloader.load_file(file_path = os.path.join(UCR, f'{ts_name}.csv'))
#     if (ts.anomalies[0].start <= prediction <= ts.anomalies[0].end):
#         status.append(True)
#     else:
#         status.append(False)

#     time_series.append(f"{ts_name}.csv")

In [None]:
# df_results = pd.DataFrame({'name':time_series, 'status':status})
# accuracy = df_results['status'].sum()/df_results.shape[0]
# print(f'Accuracy: {accuracy}' )

# WV-OLS

In [None]:

# base_learners = os.listdir('scores')
# dataloader = DataLoader()

# time_series = []
# status = []

# for f in tqdm(os.listdir(UCR)):
#     ts_name = f.split('.')[0]
#     base_learner_scores = []

#     for base_learner in base_learners:
#         base_learner_scores.append(pd.read_csv(f'scores/{base_learner}/{ts_name}_scores.csv').to_numpy())

#     scores = np.column_stack(tuple(base_learner_scores))
#     final_scores, weights = wv_ols(scores)


#     df_final_scores = pd.DataFrame(final_scores, columns=['Score'])
#     ts = dataloader.load_file(file_path = os.path.join(UCR, f'{ts_name}.csv'))
#     prediction = np.argmax(final_scores)

#     if (ts.anomalies[0].start <= prediction <= ts.anomalies[0].end):
#         status.append(True)
#     else:
#         status.append(False)

#     time_series.append(f"{ts_name}.csv")


In [None]:
# df_results = pd.DataFrame({'name':time_series, 'status':status})
# accuracy = df_results['status'].sum()/df_results.shape[0]
# print(f'Accuracy: {accuracy}' )

# WV_OLS_R2

In [None]:

# base_learners = os.listdir('scores')
# dataloader = DataLoader()

# time_series = []
# status = []

# for f in tqdm(os.listdir(UCR)):
#     ts_name = f.split('.')[0]
#     base_learner_scores = []

#     for base_learner in base_learners:
#         base_learner_scores.append(pd.read_csv(f'scores/{base_learner}/{ts_name}_scores.csv').to_numpy())

#     scores = np.column_stack(tuple(base_learner_scores))
#     final_scores, weights = wv_ols_r2(scores)


#     df_final_scores = pd.DataFrame(final_scores, columns=['Score'])
#     ts = dataloader.load_file(file_path = os.path.join(UCR, f'{ts_name}.csv'))
#     prediction = np.argmax(final_scores)

#     if (ts.anomalies[0].start <= prediction <= ts.anomalies[0].end):
#         status.append(True)
#     else:
#         status.append(False)

#     time_series.append(f"{ts_name}.csv")


In [None]:
# df_results = pd.DataFrame({'name':time_series, 'status':status})
# accuracy = df_results['status'].sum()/df_results.shape[0]
# print(f'Accuracy: {accuracy}' )

In [None]:
# base_learners = os.listdir('scores')
# dataloader = DataLoader()

# time_series = []
# status = []

# for f in tqdm(os.listdir(UCR)):
#     ts_name = f.split('.')[0]
#     base_learner_scores = []

#     for base_learner in base_learners:
#         base_learner_scores.append(pd.read_csv(f'scores/{base_learner}/{ts_name}_scores.csv').to_numpy())

#     scores = np.column_stack(tuple(base_learner_scores))
#     final_scores, weights = wv_knn(scores)


#     df_final_scores = pd.DataFrame(final_scores, columns=['Score'])
#     ts = dataloader.load_file(file_path = os.path.join(UCR, f'{ts_name}.csv'))
#     prediction = np.argmax(final_scores)

#     if (ts.anomalies[0].start <= prediction <= ts.anomalies[0].end):
#         status.append(True)
#     else:
#         status.append(False)

#     time_series.append(f"{ts_name}.csv")


# WV_OLS_R2_DIVERSITY_TRUE

In [None]:

# base_learners = os.listdir('scores')
# dataloader = DataLoader()

# time_series = []
# status = []

# for f in tqdm(os.listdir(UCR)):
#     ts_name = f.split('.')[0]
#     base_learner_scores = []

#     for base_learner in base_learners:
#         base_learner_scores.append(pd.read_csv(f'scores/{base_learner}/{ts_name}_scores.csv').to_numpy())

#     scores = np.column_stack(tuple(base_learner_scores))
#     final_scores, weights = wv_ols_r2(scores, emphasize_diversity=True)


#     df_final_scores = pd.DataFrame(final_scores, columns=['Score'])
#     ts = dataloader.load_file(file_path = os.path.join(UCR, f'{ts_name}.csv'))
#     prediction = np.argmax(final_scores)

#     if (ts.anomalies[0].start <= prediction <= ts.anomalies[0].end):
#         status.append(True)
#     else:
#         status.append(False)

#     time_series.append(f"{ts_name}.csv")


In [None]:
# df_results = pd.DataFrame({'name':time_series, 'status':status})
# accuracy = df_results['status'].sum()/df_results.shape[0]
# print(f'Accuracy: {accuracy}' )

Preliminary experiments showed that this (emphasising diversity in the weights) was a bad idea.

# Top-K

In [None]:

# base_learners = os.listdir('scores')
# dataloader = DataLoader()

# time_series = []
# status = []

# for f in tqdm(os.listdir(UCR)):
#     ts_name = f.split('.')[0]
#     base_learner_scores = []

#     for base_learner in base_learners:
#         base_learner_scores.append(pd.read_csv(f'scores/{base_learner}/{ts_name}_scores.csv').to_numpy())

#     scores = np.column_stack(tuple(base_learner_scores))
#     final_scores, weights = wv_ols_r2_topk(scores, 10)


#     df_final_scores = pd.DataFrame(final_scores, columns=['Score'])
#     ts = dataloader.load_file(file_path = os.path.join(UCR, f'{ts_name}.csv'))
#     prediction = np.argmax(final_scores)

#     if (ts.anomalies[0].start <= prediction <= ts.anomalies[0].end):
#         status.append(True)
#     else:
#         status.append(False)

#     time_series.append(f"{ts_name}.csv")


In [None]:
# df_results = pd.DataFrame({'name':time_series, 'status':status})
# accuracy = df_results['status'].sum()/df_results.shape[0]
# print(f'Accuracy: {accuracy}' )

# Isolation Forest

In [None]:
# from sklearn.ensemble import IsolationForest

# if_meta_detector = IsolationForest()

# base_learners = os.listdir('scores')
# dataloader = DataLoader()

# time_series = []
# status = []

# for f in tqdm(os.listdir(UCR)):
#     ts_name = f.split('.')[0]
#     base_learner_scores = []

#     for base_learner in base_learners:
#         base_learner_scores.append(pd.read_csv(f'scores/{base_learner}/{ts_name}_scores.csv').to_numpy())

#     scores = np.column_stack(tuple(base_learner_scores))
#     if_meta_detector.fit(scores)
#     # NB! THIS IS WRONG! THE DECISION FUNCTION VALUES NEED TO BE USED AS WEIGHTS FOR THE DETECTORS
#     final_scores = -if_meta_detector.decision_function(scores)

#     df_final_scores = pd.DataFrame(final_scores, columns=['Score'])
#     prediction = np.argmax(final_scores)

#     ts = dataloader.load_file(file_path = os.path.join(UCR, f'{ts_name}.csv'))
#     if (ts.anomalies[0].start <= prediction <= ts.anomalies[0].end):
#         status.append(True)
#     else:
#         status.append(False)

#     time_series.append(f"{ts_name}.csv")

In [None]:
# df_results = pd.DataFrame({'name':time_series, 'status':status})
# accuracy = df_results['status'].sum()/df_results.shape[0]
# print(f'Accuracy: {accuracy}' )

# OneClassSVM

Remember: this is the shape of the input data to your OneClassSVM.

```
t | d1 | d2 | d3 | ... | dn
```

It assigns greater _weight_ to the detector with the most **different** score

# NB! NB! NB!
USE THE "UN-CORRELATEDNESS" AS A MEASURE OF CAPTURING INFORMATION, AND THEN ONLY SELECT THE TOP-K MOST UNCORRELATED DETECTORS

# EXPERIMENTS

# NB! LOF(50, 50) RESULTS FILE IS INVALID.  SCORES ARE STILL VALID.  NEED TO RE-CALCULATE RESULTS FILE ONLY!!!

## Experiment 1

In [None]:
# import numpy as np
# import pandas as pd
# from tqdm import tqdm

# from lof import LocalOutlierFactor
# from matrix_profile import MatrixProfile
# from isolation_forest import IsolationForest
# from kmeans import KMeans
# # Import other necessary modules (MP, etc.)

# from dataloader import DataLoader
# from benchmarker import benchmark, process_precomputed_scores
# import os
# import itertools

# # Define base learners and their parameter ranges
# base_learners = {
#     'LOF': {
#         'class': LocalOutlierFactor,
#         'params': {
#             'windowSize': [25, 50, 100, 150, 200, 250], #[25, 250], #
#             'neighbors': [10, 20, 50, 100], #[50, 100], #
#             'gpu': [True]
#         }
#     },
#     'IF': {
#         'class': IsolationForest,
#         'params': {
#             'windowSize': [20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450, 500, 550, 600]
#         }
#     },
#     'KMeans': {
#         'class': KMeans,
#         'params': {
#             'windowSize': [50, 100, 200, 500],
#             'n_clusters': [10, 20, 50, 100, 200],
#         }
#     },
#     'MP': {
#         'class': MatrixProfile,
#         'params': {
#             'windowSize': [20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450, 500, 550, 600]
#         }
#     }
# }

# def run_experiment(base_learner_name, base_learner_class, params, ucr_path, results_path, scores_path):
#     """Run experiment for a single base learner with given parameters."""
#     learner = base_learner_class(**params)
#     benchmark_results_file = os.path.join(results_path, f"{learner.toString()}.csv")
#     # benchmark_scores_dir = os.path.join(scores_path, f"{learner.toString()}")
        
#     # Load results
#     summary = pd.read_csv(benchmark_results_file, nrows=1)
    
    
#     return {
#         'params': params,
#         'ucr_score': summary['accuracy'].values[0],
#         'computational_time': summary['total_time'].values[0],
#     }

# def generate_param_combinations(param_dict):
#     """Generate all combinations of parameters."""
#     keys = list(param_dict.keys())
#     values = list(param_dict.values())
#     for instance in itertools.product(*values):
#         yield dict(zip(keys, instance))

# # Main experiment loop
# ucr_path = 'ucrdata'
# results_path = 'results'
# scores_path = 'scores'

# all_results = []

# for base_learner_name, base_learner_info in base_learners.items():
#     print(f"Running experiments for {base_learner_name}")
#     base_learner_class = base_learner_info['class']
#     param_combinations = list(generate_param_combinations(base_learner_info['params']))
    
#     for params in tqdm(param_combinations, desc=f"{base_learner_name} configurations"):
#         result = run_experiment(base_learner_name, base_learner_class, params, ucr_path, results_path, scores_path)
#         result['base_learner'] = base_learner_name
#         all_results.append(result)

# # Convert results to DataFrame
# results_df = pd.DataFrame(all_results)

# # Analysis
# for base_learner_name in base_learners.keys():
#     learner_results = results_df[results_df['base_learner'] == base_learner_name]
    
#     print(f"\nAnalysis for {base_learner_name}:")
#     for metric in ['ucr_score', 'computational_time']:
#         mean = learner_results[metric].mean()
#         std = learner_results[metric].std()
#         cv = std / mean
#         print(f"{metric}:")
#         print(f"  Mean: {mean:.4f}")
#         print(f"  Std Dev: {std:.4f}")
#         print(f"  Coeff of Variation: {cv:.4f}")
    
#     best_config = learner_results.loc[learner_results['ucr_score'].idxmax()]
#     print(f"\nBest configuration for {base_learner_name}:")
#     print(best_config['params'])
#     print(f"UCR Score: {best_config['ucr_score']:.4f}")

# # Save full results
# experiments_path = os.path.join('experiments', 'experiment_e1_results.csv')
# results_df.to_csv(experiments_path, index=False)
# print(f"\nFull results saved to {experiments_path}")

We trained a diverse set of algorithms.  Diversity was introduced in two ways, namely:
1. Multiple hyper-parameter configurations (homogeneous and heterogeneous ensembles)
2. Multiple model classes (heterogeneous ensembles only)

A very practical consideration for real-world TSAD is that the performance of a given model cannot typically be validated.  To simulate this lack of prior knowledge about the performance of a model, we randomly select 20 (or 30, since this is what the SoTA used) models for each ensemble learner and determine performance.  We repeat this process for a total of 30 repetitions, in order to draw general conclusions from the results.

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Load the data
# df = pd.read_csv('experiments/experiment_e1_results.csv')

# # Convert params column from string to dictionary
# df['params'] = df['params'].apply(eval)

# # 1. Line Plots
# plt.figure(figsize=(20, 15))

# for i, learner in enumerate(['LOF', 'IF', 'KMeans', 'MP'], 1):
#     plt.subplot(2, 2, i)
#     learner_df = df[df['base_learner'] == learner]
    
#     if learner == 'LOF':
#         for neighbors in learner_df['params'].apply(lambda x: x['neighbors']).unique():
#             data = learner_df[learner_df['params'].apply(lambda x: x['neighbors'] == neighbors)]
#             plt.plot(data['params'].apply(lambda x: x['windowSize']), data['ucr_score'], 
#                      label=f'neighbors={neighbors}')
#     elif learner == 'KMeans':
#         for n_clusters in learner_df['params'].apply(lambda x: x['n_clusters']).unique():
#             data = learner_df[learner_df['params'].apply(lambda x: x['n_clusters'] == n_clusters)]
#             plt.plot(data['params'].apply(lambda x: x['windowSize']), data['ucr_score'], 
#                      label=f'n_clusters={n_clusters}')
#     else:  # This covers both IF and MP
#         plt.plot(learner_df['params'].apply(lambda x: x['windowSize']), learner_df['ucr_score'], 
#                  label=learner)
    
#     plt.title(f'{learner} Performance')
#     plt.xlabel('Window Size')
#     plt.ylabel('UCR Score')
#     plt.legend()

# plt.tight_layout()
# plt.savefig('visualisations/line_plots.png')
# plt.close()

# # 2. Scatter Plot
# plt.figure(figsize=(12, 8))
# for learner in df['base_learner'].unique():
#     learner_df = df[df['base_learner'] == learner]
#     plt.scatter(learner_df['computational_time'], learner_df['ucr_score'], label=learner, alpha=0.7)

# plt.xlabel('Computational Time')
# plt.ylabel('UCR Score')
# plt.title('Performance vs Computational Time')
# plt.legend()
# plt.savefig('visualisations/scatter_plot.png')
# plt.close()

# # 3. Box Plot
# plt.figure(figsize=(10, 6))
# sns.boxplot(x='base_learner', y='ucr_score', data=df)
# plt.title('Distribution of UCR Scores by Base Learner')
# plt.savefig('visualisations/box_plot.png')
# plt.close()

# # 4. Summary Table
# summary = df.groupby('base_learner').agg({
#     'ucr_score': ['max', 'mean', 'std'],
#     'computational_time': 'mean'
# }).reset_index()

# summary.columns = ['Base Learner', 'Best UCR Score', 'Mean UCR Score', 'Std UCR Score', 'Mean Computational Time']

# # Get parameters for best score
# best_params = df.loc[df.groupby('base_learner')['ucr_score'].idxmax(), ['base_learner', 'params']]
# best_params.columns = ['Base Learner', 'Best Parameters']

# # Merge summary and best_params
# summary = summary.merge(best_params, on='Base Learner')

# # Calculate best performance-time trade-off (you may want to adjust this metric)
# df['trade_off'] = df['ucr_score'] / df['computational_time']
# best_trade_off = df.loc[df.groupby('base_learner')['trade_off'].idxmax(), ['base_learner', 'params']]
# best_trade_off.columns = ['Base Learner', 'Best Trade-off Parameters']

# # Merge summary and best_trade_off
# summary = summary.merge(best_trade_off, on='Base Learner')

# # Save summary to CSV
# summary.to_csv('visualisations/summary_table.csv', index=False)

# print("Visualizations and summary table have been created and saved.")

In [None]:
# summary_table = pd.read_csv('visualisations/summary_table.csv')
# summary_table

In [None]:
import numpy as np
from dataloader import DataLoader, TimeSeries
from lof import LocalOutlierFactor
from matrix_profile import MatrixProfile
from kmeans import KMeans
from isolation_forest import IsolationForest
from ensemble_detector import EnsembleDetector
from benchmarker import benchmark

# Demo: build three EnsembleDetector configurations from cached base-learner
# scores, run the heterogeneous one on a single UCR file, then benchmark it
# and print the accuracy / total time read back from its results file.

# Define the path to your UCR dataset
UCR_PATH = 'ucrdata'
cached_scores_dir = 'scores'

# Initialize DataLoader
dataloader = DataLoader()

# Create base learners
lof1 = LocalOutlierFactor(windowSize=50, neighbors=50, gpu=True)
lof2 = LocalOutlierFactor(windowSize=50, neighbors=100, gpu=True)
mp = MatrixProfile(windowSize=140)
km = KMeans(windowSize=50, n_clusters=100)
if_detector = IsolationForest(windowSize=100)

# Example 1: Homogeneous Ensemble
homogeneous_ensemble = EnsembleDetector(
    base_learners=[lof1, lof2],
    method='simple_average',
    scores_dir = cached_scores_dir
)

# Example 2: Heterogeneous Ensemble
heterogeneous_ensemble = EnsembleDetector(
    base_learners=[lof1, mp, km, if_detector],
    method='wv_ols_r2',
    method_params={'emphasize_diversity': False},
    scores_dir = cached_scores_dir
)

# Example 3: Ensemble with top-k method
topk_ensemble = EnsembleDetector(
    base_learners=[lof1, lof2, mp, km, if_detector],
    method='wv_ols_r2_topk',
    method_params={'k': 3},
    scores_dir = cached_scores_dir
)

# Load a time series from the UCR dataset
ts_file = '001_UCR_Anomaly_DISTORTED1sddb40.csv'
ts = dataloader.load_file(f"{UCR_PATH}/{ts_file}")

# Fit the ensemble that is actually used for prediction and benchmarking
# below, so that fit and predict act on the same detector.
heterogeneous_ensemble.fit(ts)
het_ts, het_time = heterogeneous_ensemble.predict(ts)

print(f"Heterogeneous Ensemble Name: {heterogeneous_ensemble.toString()}")
print(f"Prediction Time: {het_time:.2f} seconds")

# Run benchmark on the ensemble detector
save_results_file = f"ensembles/results/{heterogeneous_ensemble.toString()}.csv"
save_scores_dir = f"ensembles/scores/{heterogeneous_ensemble.toString()}"

benchmark(heterogeneous_ensemble, UCR_PATH, save_results_file, save_scores_dir)

# Analyze results: only the first data row of the results CSV is read.
results = pd.read_csv(save_results_file, nrows=1)
accuracy = results['accuracy'].values[0]
total_time = results['total_time'].values[0]

print(f"Ensemble Accuracy: {accuracy:.4f}")
print(f"Total Benchmark Time: {total_time:.2f} seconds")


In [None]:
# Build one EnsembleDetector and benchmark it, writing results and scores under
# names that combine the ensemble's toString() identifier with `rep`.
# NOTE(review): base_learners, method, method_params, rep, CACHED_SCORES_DIR,
# ENSEMBLE_RESULTS_DIR and ENSEMBLE_SCORES_DIR are not assigned anywhere in the
# visible part of this notebook (the only `base_learners = ...` assignments are
# commented out). Presumably this was lifted out of a loop over repetitions and
# methods — confirm these names are defined before running.
ensemble = EnsembleDetector(
                    base_learners=base_learners,
                    method=method,
                    method_params=method_params,
                    scores_dir=CACHED_SCORES_DIR
                )
                
ensemble_name = ensemble.toString()
save_results_file = os.path.join(ENSEMBLE_RESULTS_DIR, f"{ensemble_name}_rep{rep}.csv")
save_scores_dir = os.path.join(ENSEMBLE_SCORES_DIR, f"{ensemble_name}_rep{rep}")
                
benchmark(ensemble, UCR_PATH, save_results_file, save_scores_dir)