# Define analysis functions

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
import time
import math
import os
import scipy.io
from scipy.io import arff
import copy
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score
)

from pyod.models.lof import LOF
from fastlof import FastLOF
from ranged_lof import RangedLOF

# Module-level placeholders for benchmark results, all initialised to None.
# (`global` statements are a no-op at module scope, so plain assignments are
# sufficient to create these names.)
# NOTE(review): presumably these are filled in by later notebook cells -- confirm.
lof_auto_scores = None
lof_auto_time_stats = None
lof_brute_time_stats = None
fastlof_results = None
lof_auto_metrics = None
ranged_lof_scores = None
ranged_lof_time_stats = None
ranged_lof_metrics = None


def load_dataset(filepath, fraction=1.0):
    """Load a dataset from a ``.csv``, ``.mat`` or ``.arff`` file.

    A short preview (first five rows) is printed as a side effect. A failing
    preview is reported on stdout and never aborts the load.

    Parameters
    ----------
    filepath : str
        Path to the dataset. The (case-insensitive) extension selects the parser.
    fraction : float, default 1.0
        Share of the rows to keep, taken from the start of the data. Values
        ``>= 1.0`` keep everything. Must be strictly positive; at least one
        row is always kept.

    Returns
    -------
    X : ndarray
        Feature matrix (float)
    y : ndarray or None
        Binary labels (1 = anomaly, 0 = normal) if available, else None

    Raises
    ------
    ValueError
        If ``fraction`` is not positive, the extension is unsupported, a CSV
        yields no valid rows, or the loaded matrix is empty.
    Exception
        If reading a ``.mat`` or ``.arff`` file fails; the underlying error is
        chained as ``__cause__``.
    """
    # A negative fraction would otherwise turn into a negative slice bound and
    # silently drop rows from the *end* of the dataset.
    if fraction <= 0:
        raise ValueError(f"fraction must be > 0, got {fraction}.")

    _, ext = os.path.splitext(filepath)
    ext = ext.lower()
    X = None
    y = None

    # Lightweight preview for CSV (first 5 rows with column headings).
    # .mat and .arff are previewed from the already-loaded data further down,
    # so the file is only parsed once.
    if ext == '.csv':
        try:
            preview_df = pd.read_csv(filepath, nrows=5)
            print("\n--- Dataset preview (first 5 rows) ---")
            print(preview_df.head())
            print("--------------------------------------")
        except Exception as e:
            print(f"(Preview skipped due to error: {e})")

    def _label_from_value(value):
        """Convert textual/numeric labels to binary anomaly flags."""
        if isinstance(value, (bytes, bytearray)):
            value = value.decode('utf-8')
        value_str = str(value).strip().strip('"').strip("'").lower()
        if value_str in {
            '1', 'anomaly', 'attack', 'outlier', 'abnormal', 'yes', 'true',
            'o', 'outliers', 'anomalous', 'attack.', 'anomaly.', 'outlier.', 'abnormal.'
        }:
            return 1
        return 0

    if ext == '.csv':
        data = []
        labels = []
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split(',')
                try:
                    # Fully numeric row: every column is a feature.
                    row = [float(x) for x in parts]
                    data.append(row)
                except ValueError:
                    # Attempt to treat last column as label
                    try:
                        features = [float(x) for x in parts[:-1]]
                        label = _label_from_value(parts[-1])
                        data.append(features)
                        labels.append(label)
                    except ValueError:
                        print(f"Warning: Could not process row (even after label handling): {line}")
                        continue
        if not data:
            raise ValueError("No valid rows were loaded from the CSV.")
        X = np.array(data, dtype=float)
        # Labels are only trusted when every kept row carried one.
        if labels and len(labels) == len(data):
            y = np.array(labels, dtype=np.int32)

    elif ext == '.mat':
        try:
            mat_data = scipy.io.loadmat(filepath)
            if 'X' in mat_data and isinstance(mat_data['X'], np.ndarray) and mat_data['X'].ndim == 2:
                X = mat_data['X']
            elif 'data' in mat_data and isinstance(mat_data['data'], np.ndarray) and mat_data['data'].ndim == 2:
                X = mat_data['data']
            else:
                # Fall back to the largest 2-D array stored under a non-dunder key.
                potential_X = None
                max_size = 0
                for key, value in mat_data.items():
                    if isinstance(value, np.ndarray) and value.ndim == 2 and value.size > max_size and not key.startswith('__'):
                        potential_X = value
                        max_size = value.size
                if potential_X is not None:
                    X = potential_X
                else:
                    raise ValueError(f"Could not find a suitable 2D data array in .mat file. Available keys: {mat_data.keys()}")

            X = X.astype(float)

            # Preview first 5 rows for .mat as a DataFrame with generic column names
            try:
                n_cols = X.shape[1]
                col_names = [f"f{i}" for i in range(n_cols)]
                preview_df = pd.DataFrame(X[:5, :], columns=col_names)
                print("\n--- Dataset preview (first 5 rows) [from .mat X] ---")
                print(preview_df.head())
                print("--------------------------------------")
            except Exception as e:
                print(f"(Preview for .mat skipped due to error: {e})")

            # First label-like key whose length matches the row count wins.
            for label_key in ['y', 'Y', 'labels', 'label']:
                if label_key in mat_data:
                    label_arr = np.ravel(mat_data[label_key])
                    if label_arr.size == X.shape[0]:
                        y = np.array([_label_from_value(val) for val in label_arr], dtype=np.int32)
                        break

        except Exception as e:
            raise Exception(f"Error loading .mat file '{filepath}': {e}") from e

    elif ext == '.arff':
        try:
            data, meta = arff.loadarff(filepath)

            # Preview from the records that were just loaded (no second parse).
            try:
                preview_df = pd.DataFrame(data[:5].tolist(), columns=data.dtype.names)
                print("\n--- Dataset preview (first 5 rows) ---")
                print(preview_df.head())
                print("--------------------------------------")
            except Exception as e:
                print(f"(Preview skipped due to error: {e})")

            numeric_cols = []
            labels = None
            for col_name in data.dtype.names:
                col_data = data[col_name]
                if np.issubdtype(col_data.dtype, np.number):
                    numeric_cols.append(col_data)
                    continue
                # Non-numeric column: interpreted as labels; the last one wins.
                if labels is not None:
                    print(f"Warning: Multiple non-numeric columns detected. Using '{col_name}' as additional label information.")
                labels = np.array([_label_from_value(val) for val in col_data], dtype=np.int32)

            if not numeric_cols:
                raise ValueError("No numeric columns found in .arff file after filtering.")

            X = np.column_stack(numeric_cols).astype(float)
            if labels is not None and labels.shape[0] == X.shape[0]:
                y = labels

        except Exception as e:
            raise Exception(f"Error loading .arff file '{filepath}': {e}") from e

    else:
        raise ValueError(f"Unsupported file format: {ext}. Supported formats are .csv, .mat, .arff.")

    if X is None or X.size == 0:
        raise ValueError("Loaded data is empty or could not be processed.")

    if fraction < 1.0:
        # Keep at least one row so a tiny fraction cannot yield an empty dataset.
        n_samples = max(1, int(X.shape[0] * fraction))
        X = X[:n_samples]
        if y is not None:
            y = y[:n_samples]
        print(f"Loaded {fraction*100:.0f}% of dataset: {X.shape[0]} samples")

    return X, y


def run_lof_auto(X_normalized, k, contamination, n_runs):
    """Benchmark LOF with ``algorithm='auto'`` over ``n_runs`` timed fits.

    Parameters
    ----------
    X_normalized : array-like
        Data to fit; only ``.shape[0]`` and row slicing are used here.
    k : int
        Passed to LOF as ``n_neighbors``.
    contamination : float
        Passed to LOF as ``contamination``.
    n_runs : int
        Number of timed fits on the full data.

    Returns
    -------
    avg_scores : ndarray
        Element-wise mean of ``decision_scores_`` over all timed runs.
    time_stats : dict
        Keys ``avg``/``min``/``max``/``std``/``times``/``algorithm``.
    """
    print(f"\nTesting LOF (auto) ({n_runs} runs)...")

    def _fit_full():
        # One complete fit; also reports the fit method stored on the wrapped estimator.
        detector = LOF(n_neighbors=k, contamination=contamination, algorithm='auto')
        detector.fit(X_normalized)
        return detector.decision_scores_, detector.detector_._fit_method

    # Untimed warm-up: a subset of at most max(k + 1, 1024) rows, or a full
    # fit when the dataset is not larger than that.
    print("  Warming up LOF (auto)...")
    n_total = X_normalized.shape[0]
    n_warm = min(max(k + 1, 1024), n_total)
    if n_warm >= n_total:
        _fit_full()
    else:
        warm = LOF(n_neighbors=k, contamination=contamination, algorithm='auto')
        warm.fit(X_normalized[:n_warm])
        _ = warm.decision_scores_

    # Timed repetitions on the full dataset.
    per_run_scores = []
    durations = []
    fit_method = None
    for _ in range(n_runs):
        started = time.perf_counter()
        run_scores, fit_method = _fit_full()
        elapsed = time.perf_counter() - started
        per_run_scores.append(run_scores)
        durations.append(elapsed)

    print(f"Algorithm chosen by Auto: {fit_method}")
    mean_scores = np.mean(per_run_scores, axis=0)

    summary = {
        'avg': np.mean(durations),
        'min': np.min(durations),
        'max': np.max(durations),
        'std': np.std(durations),
        'times': durations,
        'algorithm': 'LOF auto: ' + fit_method
    }

    print(f"LOF (auto) - avg: {summary['avg']:.4f}s, "
          f"min: {summary['min']:.4f}s, "
          f"max: {summary['max']:.4f}s, "
          f"std: {summary['std']:.4f}s")

    return mean_scores, summary


def run_lof_brute(X_normalized, k, contamination, n_runs):
    """Benchmark LOF with ``algorithm='brute'`` over ``n_runs`` timed fits.

    Parameters
    ----------
    X_normalized : array-like
        Data to fit; only ``.shape[0]`` and row slicing are used here.
    k : int
        Passed to LOF as ``n_neighbors``.
    contamination : float
        Passed to LOF as ``contamination``.
    n_runs : int
        Number of timed fits on the full data.

    Returns
    -------
    avg_scores : ndarray
        Element-wise mean of ``decision_scores_`` over all timed runs.
    time_stats : dict
        Keys ``avg``/``min``/``max``/``std``/``times``/``algorithm``.
    """
    print(f"\nTesting LOF (brute) ({n_runs} runs)...")

    def _fit_full():
        # One complete brute-force fit on the whole dataset.
        detector = LOF(n_neighbors=k, contamination=contamination, algorithm='brute')
        detector.fit(X_normalized)
        return detector.decision_scores_

    # Untimed warm-up: a subset of at most max(k + 1, 1024) rows, or a full
    # fit when the dataset is not larger than that.
    print("  Warming up LOF (brute)...")
    n_total = X_normalized.shape[0]
    n_warm = min(max(k + 1, 1024), n_total)
    if n_warm >= n_total:
        _fit_full()
    else:
        warm = LOF(n_neighbors=k, contamination=contamination, algorithm='brute')
        warm.fit(X_normalized[:n_warm])
        _ = warm.decision_scores_

    # Timed repetitions on the full dataset.
    per_run_scores = []
    durations = []
    for _ in range(n_runs):
        started = time.perf_counter()
        run_scores = _fit_full()
        elapsed = time.perf_counter() - started
        per_run_scores.append(run_scores)
        durations.append(elapsed)

    mean_scores = np.mean(per_run_scores, axis=0)

    summary = {
        'avg': np.mean(durations),
        'min': np.min(durations),
        'max': np.max(durations),
        'std': np.std(durations),
        'times': durations,
        'algorithm': 'LOF_brute'
    }

    print(f"LOF (brute) - avg: {summary['avg']:.4f}s, "
          f"min: {summary['min']:.4f}s, "
          f"max: {summary['max']:.4f}s, "
          f"std: {summary['std']:.4f}s")

    return mean_scores, summary

def run_ranged_lof(X_normalized, k, contamination, n_runs, n_neighbors_lb=None):
    """Run Ranged LOF multiple times and collect scores and timing statistics.

    Parameters
    ----------
    X_normalized : array-like
        Data passed to ``RangedLOF.fit``.
    k : int
        Passed to RangedLOF as ``n_neighbors``.
    contamination : float
        Passed to RangedLOF as ``contamination``.
    n_runs : int
        Number of timed fits on the full data.
    n_neighbors_lb : int or None, default None
        Passed to RangedLOF as ``n_neighbors_lb``; defaults to ``k`` when None.

    Returns
    -------
    avg_scores : ndarray
        Element-wise mean of ``decision_scores_`` over all timed runs.
    time_stats : dict
        Keys ``avg``/``min``/``max``/``std``/``times``/``algorithm``/``n_neighbors_lb``.
    """
    print(f"\nTesting Ranged LOF ({n_runs} runs)...")

    if n_neighbors_lb is None:
        n_neighbors_lb = k

    def run_once():
        # Fresh estimator per call so that no fitted state is shared between runs.
        rlof = RangedLOF(
            n_neighbors=k,
            n_neighbors_lb=n_neighbors_lb,
            contamination=contamination,
            algorithm='auto',
        )
        rlof.fit(X_normalized)
        return rlof.decision_scores_

    # Warmup run (untimed, on the full dataset)
    print("  Warming up Ranged LOF...")
    _ = run_once()

    all_scores = []
    times = []
    for _ in range(n_runs):
        t0 = time.perf_counter()
        scores = run_once()
        t_run = time.perf_counter() - t0
        all_scores.append(scores)
        times.append(t_run)

    avg_scores = np.mean(all_scores, axis=0)

    time_stats = {
        'avg': np.mean(times),
        'min': np.min(times),
        'max': np.max(times),
        'std': np.std(times),
        'times': times,
        'algorithm': 'RangedLOF',
        'n_neighbors_lb': n_neighbors_lb,
    }

    print(
        f"Ranged LOF - avg: {time_stats['avg']:.4f}s, "
        f"min: {time_stats['min']:.4f}s, "
        f"max: {time_stats['max']:.4f}s, "
        f"std: {time_stats['std']:.4f}s"
    )

    return avg_scores, time_stats


def _aggregate_timing_info(timing_dicts):
    """Average timing dictionaries across runs, preserving structural info."""
    if not timing_dicts:
        return None

    phase_keys = [
        'total', 'initialization', 'chunk_processing',
        'lof_calculation', 'active_set_updates', 'finalization'
    ]
    detail_keys = ['distance_computation', 'neighbor_updates', 'self_distance_handling']

    agg = {k: 0.0 for k in phase_keys}
    agg['chunk_processing_details'] = {k: 0.0 for k in detail_keys}

    for timing in timing_dicts:
        for key in phase_keys:
            agg[key] += timing.get(key, 0.0)
        details = timing.get('chunk_processing_details', {})
        for key in detail_keys:
            agg['chunk_processing_details'][key] += details.get(key, 0.0)

    count = len(timing_dicts)
    for key in phase_keys:
        agg[key] /= count
    for key in detail_keys:
        agg['chunk_processing_details'][key] /= count

    # Keep a copy of the structural information from the last run (distances_computed, passes, iterations, n_chunks)
    last = timing_dicts[-1]
    agg['distances_computed'] = last.get('distances_computed')
    agg['passes'] = copy.deepcopy(last.get('passes', []))
    agg['iterations'] = copy.deepcopy(last.get('iterations', []))
    agg['n_chunks'] = last.get('n_chunks')

    return agg


def _compute_anomaly_metrics(y_true, scores, contamination):
    """Compute ROC/PR metrics given ground-truth labels."""
    if y_true is None:
        return None

    y_true = np.asarray(y_true).ravel()
    if y_true.shape[0] != scores.shape[0]:
        raise ValueError("Label array and scores must have the same length for metric evaluation.")

    # Check for only one class - crucial for metric calculation
    if np.unique(y_true).size < 2:
        print("Warning: Only one class present in labels. Skipping metric computation.")
        return None

    roc_auc = roc_auc_score(y_true, scores)
    pr_auc = average_precision_score(y_true, scores)

    k = max(1, int(len(y_true) * contamination))
    sorted_idx = np.argsort(scores)[::-1]
    top_k_idx = sorted_idx[:k]
    precision_at_k = np.sum(y_true[top_k_idx]) / k
    recall_at_k = np.sum(y_true[top_k_idx]) / max(1, np.sum(y_true))

    return {
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'precision_at_k': precision_at_k,
        'recall_at_k': recall_at_k,
        'k_top': k
    }


def run_fastlof_multiple_chunks(X_normalized, k, contamination, n_runs, dataset_size,
                                 min_chunk_size, max_chunk_size, chunk_interval, threshold=1.2,
                                 y_true=None):
    """Run FastLOF with multiple chunk sizes.

    Chunk sizes run from ``max_chunk_size`` (clipped to ``dataset_size``) down
    to ``min_chunk_size`` (raised to at least ``k + 1``) in steps of
    ``chunk_interval``; the minimum is always included. Each size is fitted
    ``n_runs`` times on the full data.

    Parameters
    ----------
    X_normalized : array-like
        Data passed to ``FastLOF.fit``.
    k : int
        Passed to FastLOF as ``n_neighbors``.
    contamination : float
        Passed to FastLOF as ``contamination``; also used for the top-k metrics.
    n_runs : int
        Timed repetitions per chunk size.
    dataset_size : int
        Upper bound for the chunk size and basis of the reported chunk count.
    min_chunk_size, max_chunk_size, chunk_interval : int-like
        Define the grid of chunk sizes (converted with ``int``).
    threshold : float, default 1.2
        Passed to FastLOF as ``threshold``.
    y_true : array-like or None
        Optional ground-truth labels; when given, metrics are computed on the
        run-averaged scores.

    Returns
    -------
    list of dict
        One entry per chunk size (largest first) with keys ``chunk_size``,
        ``chunk_count``, ``avg_scores``, ``time_stats``, ``timing``, ``metrics``.

    Raises
    ------
    ValueError
        If ``chunk_interval`` is not positive or the clipped minimum exceeds
        the clipped maximum chunk size.
    """
    print(f"\nTesting FastLOF with multiple chunk sizes ({n_runs} runs each)...")

    # Validate the chunk grid first so that bad parameters fail immediately
    # instead of after an (expensive) warm-up fit.
    chunk_interval = int(chunk_interval)
    if chunk_interval <= 0:
        raise ValueError("chunk_interval must be a positive integer")

    max_chunk_size = min(int(max_chunk_size), dataset_size)
    min_chunk_size = max(int(min_chunk_size), k + 1)

    if min_chunk_size > max_chunk_size:
        # Raise an error/warning as requested for per dataset handling.
        raise ValueError(f"min_chunk_size ({min_chunk_size}) must be <= max_chunk_size ({max_chunk_size}) (after min clip to k+1). Adjust parameters.")

    # Descending, de-duplicated grid that always contains the minimum size.
    chunk_sizes = sorted(
        set(range(max_chunk_size, min_chunk_size - 1, -chunk_interval)) | {min_chunk_size},
        reverse=True,
    )

    # Warmup run (untimed) on at most max(k + 1, 1024) rows
    print("  Warming up FastLOF...")
    warmup_n = min(max(k + 1, 1024), X_normalized.shape[0])
    warmup_model = FastLOF(n_neighbors=k, contamination=contamination, threshold=threshold)
    if warmup_n < X_normalized.shape[0]:
        warmup_model.fit(X_normalized[:warmup_n])
    else:
        warmup_model.fit(X_normalized)

    def run_fastlof(chunk_size):
        # Fresh estimator per run; the timing dict is copied so later fits
        # cannot mutate what has been recorded.
        fastlof = FastLOF(n_neighbors=k, contamination=contamination,
                          chunk_size=chunk_size, threshold=threshold)
        fastlof.fit(X_normalized)
        return fastlof.decision_scores_, copy.deepcopy(fastlof.timing_)

    results = []

    print(f"  Testing {len(chunk_sizes)} different chunk sizes...")

    # Store FastLOF results and times for final output
    for idx, chunk_size in enumerate(chunk_sizes):
        chunk_count = int(math.ceil(dataset_size / chunk_size))
        print(f"  [{idx+1}/{len(chunk_sizes)}] Chunk size: {chunk_size}, Chunk count: {chunk_count}")

        all_scores = []
        times = []
        timing_runs = []

        for _ in range(n_runs):
            t0 = time.perf_counter()
            scores, timing_info = run_fastlof(chunk_size)
            t_run = time.perf_counter() - t0
            all_scores.append(scores)
            times.append(t_run)
            timing_runs.append(timing_info)

        avg_scores = np.mean(all_scores, axis=0)

        time_stats = {
            'avg': np.mean(times),
            'min': np.min(times),
            'max': np.max(times),
            'std': np.std(times),
            'times': times,
            'algorithm': 'FastLOF',
            'chunk_size': chunk_size
        }

        timing_summary = _aggregate_timing_info(timing_runs)

        metrics = None
        if y_true is not None:
            metrics = _compute_anomaly_metrics(y_true, avg_scores, contamination)

        results.append({
            'chunk_size': chunk_size,
            'chunk_count': chunk_count,
            'avg_scores': avg_scores,
            'time_stats': time_stats,
            'timing': timing_summary,
            'metrics': metrics
        })

        print(f"    avg: {time_stats['avg']:.4f}s, min: {time_stats['min']:.4f}s, "
              f"max: {time_stats['max']:.4f}s")

    return results

def save_timing_csv(lof_auto_time_stats, lof_brute_time_stats, fastlof_results,
                    dataset_filepath, k, contamination, n_runs, threshold,
                    ranged_lof_time_stats=None):
    """Saves final timing results to a CSV file.

    Output structure (relative to the current working directory):
    results/<dataset_name>/k{k}_t{threshold}/timing_results_<dataset_name>.csv
    where <dataset_name> is the dataset filename without extension.

    Parameters
    ----------
    lof_auto_time_stats, lof_brute_time_stats, ranged_lof_time_stats : dict or None
        Timing summaries with keys ``avg``/``min``/``max``/``std``; falsy
        values are skipped.
    fastlof_results : iterable of dict or None
        Entries with ``chunk_size`` and ``time_stats``; None is treated as empty.
    dataset_filepath : str
        Only its base name (without extension) is used, for naming the output.
    k, contamination, n_runs, threshold
        Run parameters recorded in every row and/or the folder name.

    Returns
    -------
    str or None
        Path of the written CSV, or None when there was nothing to save.
    """

    def _row(algorithm, stats, chunk_size='N/A'):
        # One CSV row; the key order defines the column order of the file.
        return {
            'Algorithm': algorithm,
            'k': k,
            'Contamination': contamination,
            'N_Runs': n_runs,
            'Chunk_Size': chunk_size,
            'Avg_Time_s': stats['avg'],
            'Min_Time_s': stats['min'],
            'Max_Time_s': stats['max'],
            'Std_Dev_s': stats['std']
        }

    timing_data = []

    # Baselines (each optional): LOF auto, LOF brute, Ranged LOF
    baselines = (
        ('LOF_auto', lof_auto_time_stats),
        ('LOF_brute', lof_brute_time_stats),
        ('RangedLOF', ranged_lof_time_stats),
    )
    for algorithm, stats in baselines:
        if stats:
            timing_data.append(_row(algorithm, stats))

    # FastLOF results, one row per chunk size. The results may legitimately
    # still be None (nothing has been run yet), so treat that as empty.
    for result in (() if fastlof_results is None else fastlof_results):
        timing_data.append(_row('FastLOF', result['time_stats'], result['chunk_size']))

    if not timing_data:
        print("Warning: No timing data available to save.")
        return None

    df = pd.DataFrame(timing_data)

    # Build results directory from current working directory: results/<dataset_name>/k{k}_t{threshold}
    dataset_name = os.path.splitext(os.path.basename(dataset_filepath))[0]
    project_root = os.getcwd()
    results_base = os.path.join(project_root, 'results', dataset_name)
    k_threshold_folder = os.path.join(results_base, f"k{k}_t{threshold}")

    os.makedirs(k_threshold_folder, exist_ok=True)

    csv_filename = f"timing_results_{dataset_name}.csv"
    csv_path = os.path.join(k_threshold_folder, csv_filename)

    df.to_csv(csv_path, index=False)

    print(f"\n Timing results successfully saved to: '{csv_path}'")
    return csv_path


def create_plots(lof_auto_scores, lof_auto_time_stats, lof_brute_time_stats, fastlof_results,
                 n_runs, threshold, fraction, filepath, k,
                 lof_auto_metrics=None,
                 ranged_lof_time_stats=None,
                 ranged_lof_metrics=None,
                 ranged_lof_scores=None):
    """Create comparison plots and save to the results folder.

    Output structure (relative to project root):
    results/<dataset_name>/k{k}_t{threshold}/...
    where <dataset_name> is the dataset filename without extension.
    """
    
    # Build results directory from current working directory: results/<dataset_name>/k{k}_t{threshold}
    dataset_name = os.path.splitext(os.path.basename(filepath))[0]
    project_root = os.getcwd()
    results_base = os.path.join(project_root, 'results', dataset_name)
    k_threshold_folder = os.path.join(results_base, f"k{k}_t{threshold}")
    os.makedirs(k_threshold_folder, exist_ok=True)
    import datetime
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    
    if lof_auto_scores is None or fastlof_results is None or not fastlof_results:
        print("Skipping plot generation: Missing LOF auto scores or FastLOF results.")
        return []

    metrics_available = (
        (lof_auto_metrics is not None or ranged_lof_metrics is not None)
        and any(res.get('metrics') for res in fastlof_results)
    )
    num_plots = 4 if metrics_available else 3
    fig = plt.figure(figsize=(6 * num_plots, 5))

    fig.suptitle(
        f'FastLOF vs LOF Comparison (dataset={os.path.basename(filepath)}, runs={n_runs}, '
        f'FastLOF threshold={threshold}, k={k}, fraction={fraction})',
        fontsize=14, fontweight='bold', y=0.98
    )

    ax1 = plt.subplot(1, num_plots, 1)

    chunk_sizes = []
    correlations = []

    for result in fastlof_results:
        chunk_size = result['chunk_size']
        fastlof_scores = result['avg_scores']

        correlation = np.corrcoef(lof_auto_scores, fastlof_scores)[0, 1]
        chunk_sizes.append(chunk_size)
        correlations.append(correlation)

    # Sort by chunk size for consistent plotting
    sorted_indices = np.argsort(chunk_sizes)
    chunk_sizes_sorted = np.array(chunk_sizes)[sorted_indices]
    correlations_sorted = np.array(correlations)[sorted_indices]

    ax1.plot(chunk_sizes_sorted, correlations_sorted, 'o-', linewidth=2, markersize=6)
    ax1.set_xlabel('Chunk Size', fontsize=12)
    ax1.set_ylabel('Correlation with LOF (auto)', fontsize=12)
    ax1.set_title('FastLOF vs LOF (auto) Correlation', fontsize=13)
    ax1.grid(True, alpha=0.3)
    
    if len(chunk_sizes_sorted) <= 20: 
        ax1.set_xticks(chunk_sizes_sorted)
        ax1.set_xticklabels([str(int(cs)) for cs in chunk_sizes_sorted], rotation=45, ha='right', fontsize=9)
    else:
        # Simplified tick generation for many points
        min_cs = chunk_sizes_sorted.min()
        max_cs = chunk_sizes_sorted.max()
        ticks = np.linspace(min_cs, max_cs, min(10, len(chunk_sizes_sorted)))
        ax1.set_xticks(ticks)
        ax1.set_xticklabels([str(int(t)) for t in ticks], rotation=45, ha='right', fontsize=9)


    ax2 = plt.subplot(1, num_plots, 2)

    names = ['LOF (auto)', 'LOF (brute)']
    colors = ['blue', 'green']

    times_avg = [lof_auto_time_stats['avg'], lof_brute_time_stats['avg']]
    times_min = [lof_auto_time_stats['min'], lof_brute_time_stats['min']]
    times_max = [lof_auto_time_stats['max'], lof_brute_time_stats['max']]

    if ranged_lof_time_stats is not None:
        names.append('Ranged LOF')
        colors.append('orange')
        times_avg.append(ranged_lof_time_stats['avg'])
        times_min.append(ranged_lof_time_stats['min'])
        times_max.append(ranged_lof_time_stats['max'])

    x_pos = np.arange(len(names))

    error_lower = [max(0, avg - tmin) for avg, tmin in zip(times_avg, times_min)]
    error_upper = [tmax - avg for avg, tmax in zip(times_avg, times_max)]

    bars = ax2.bar(x_pos, times_avg, color=colors, alpha=0.7,
                     yerr=[error_lower, error_upper], capsize=8,
                     error_kw={'elinewidth': 2, 'capthick': 2})

    ax2.set_xticks(x_pos)
    ax2.set_xticklabels(names, fontsize=11)
    ax2.set_ylabel('Time (seconds)', fontsize=12)
    ax2.set_title('Runtime Comparison (LOF auto vs brute)', fontsize=13)
    ax2.grid(True, alpha=0.3, axis='y')

    for i, (bar, avg, tmin, tmax, err_upper) in enumerate(zip(bars, times_avg, times_min, times_max, error_upper)):
        height = bar.get_height()
        text_y = height + err_upper + height * 0.05
        current_top = ax2.get_ylim()[1]
        if height > 0 and height < (current_top * 0.1): # Avoid text overlap for very small bars
             text_y = current_top * 0.1
        
        ax2.text(bar.get_x() + bar.get_width()/2., text_y,
                 f'avg: {avg:.3f}s\nmin: {tmin:.3f}s\nmax: {tmax:.3f}s',
                 ha='center', va='bottom', fontsize=9,
                 bbox=dict(boxstyle='round,pad=0.4', facecolor='white', alpha=0.8))

    current_top = ax2.get_ylim()[1]
    ax2.set_ylim(bottom=0, top=current_top * 1.3)

    ax3 = plt.subplot(1, num_plots, 3)

    chunk_sizes = []
    times_avg = []
    times_min = []
    times_max = []

    for result in fastlof_results:
        chunk_sizes.append(result['chunk_size'])
        time_stats = result['time_stats']
        times_avg.append(time_stats['avg'])
        times_min.append(time_stats['min'])
        times_max.append(time_stats['max'])

    # Sort by chunk size for better visualization
    sorted_indices = np.argsort(chunk_sizes)
    chunk_sizes = [chunk_sizes[i] for i in sorted_indices]
    times_avg = [times_avg[i] for i in sorted_indices]
    times_min = [times_min[i] for i in sorted_indices]
    times_max = [times_max[i] for i in sorted_indices]

    x_pos = np.arange(len(chunk_sizes))

    error_lower = [max(0, avg - tmin) for avg, tmin in zip(times_avg, times_min)]
    error_upper = [tmax - avg for avg, tmax in zip(times_avg, times_max)]

    bars = ax3.bar(x_pos, times_avg, alpha=0.7, color='red',
                     yerr=[error_lower, error_upper], capsize=3,
                     error_kw={'elinewidth': 1.5, 'capthick': 1.5})

    lof_auto_avg = lof_auto_time_stats['avg'] if lof_auto_time_stats else 0
    lof_brute_avg = lof_brute_time_stats['avg'] if lof_brute_time_stats else 0
    ranged_lof_avg = ranged_lof_time_stats['avg'] if ranged_lof_time_stats else 0

    if lof_auto_time_stats:
        ax3.axhline(y=lof_auto_avg, color='blue', linestyle='--', linewidth=2,
                    label=f'LOF (auto) avg: {lof_auto_avg:.3f}s', alpha=0.8)
    if lof_brute_time_stats:
        ax3.axhline(y=lof_brute_avg, color='green', linestyle='--', linewidth=2,
                    label=f'LOF (brute) avg: {lof_brute_avg:.3f}s', alpha=0.8)
    if ranged_lof_time_stats:
        ax3.axhline(y=ranged_lof_avg, color='orange', linestyle='--', linewidth=2,
                    label=f'Ranged LOF avg: {ranged_lof_avg:.3f}s', alpha=0.8)

    ax3.set_xticks(x_pos)
    # Only label a subset of x-ticks if there are too many
    if len(chunk_sizes) > 20:
        step = len(chunk_sizes) // 10
        ax3.set_xticks(x_pos[::step])
        ax3.set_xticklabels([str(cs) for cs in chunk_sizes[::step]], rotation=45, ha='right', fontsize=8)
    else:
        ax3.set_xticklabels([str(cs) for cs in chunk_sizes], rotation=45, ha='right', fontsize=8)

    ax3.set_xlabel('Chunk Size', fontsize=12)
    ax3.set_ylabel('Time (seconds)', fontsize=12)
    ax3.set_title('FastLOF Runtime by Chunk Size', fontsize=13)
    ax3.grid(True, alpha=0.3, axis='y')
    ax3.legend(loc='best', fontsize=9)

    saved_paths = []
    
    if metrics_available:
        ax4 = plt.subplot(1, num_plots, 4)
        auc_sizes = []
        auc_values = []
        for result in fastlof_results:
            metrics = result.get('metrics')
            if metrics and metrics.get('roc_auc') is not None:
                auc_sizes.append(result['chunk_size'])
                auc_values.append(metrics['roc_auc'])

        if auc_sizes:
            sorted_auc_idx = np.argsort(auc_sizes)
            auc_sizes_sorted = np.array(auc_sizes)[sorted_auc_idx]
            auc_values_sorted = np.array(auc_values)[sorted_auc_idx]

            ax4.plot(auc_sizes_sorted, auc_values_sorted, 'o-', linewidth=2, markersize=6,
                      label='FastLOF ROC AUC')

            if lof_auto_metrics is not None and lof_auto_metrics.get('roc_auc') is not None:
                baseline_auc = lof_auto_metrics['roc_auc']
                ax4.axhline(y=baseline_auc, color='purple', linestyle='--', linewidth=2,
                             label=f'LOF (auto) ROC AUC: {baseline_auc:.3f}', alpha=0.8)

            if ranged_lof_metrics is not None and ranged_lof_metrics.get('roc_auc') is not None:
                ranged_auc = ranged_lof_metrics['roc_auc']
                ax4.axhline(y=ranged_auc, color='orange', linestyle='-.', linewidth=2,
                             label=f'Ranged LOF ROC AUC: {ranged_auc:.3f}', alpha=0.8)

            ax4.set_xlabel('Chunk Size', fontsize=12)
            ax4.set_ylabel('ROC AUC', fontsize=12)
            ax4.set_title('FastLOF ROC AUC by Chunk Size', fontsize=13)
            ax4.set_ylim(0.0, 1.05)
            ax4.grid(True, alpha=0.3)
            ax4.legend(loc='best', fontsize=9)
            
            if len(auc_sizes_sorted) <= 20: 
                ax4.set_xticks(auc_sizes_sorted)
                ax4.set_xticklabels([str(int(cs)) for cs in auc_sizes_sorted], rotation=45, ha='right', fontsize=9)
            else:
                min_cs = auc_sizes_sorted.min()
                max_cs = auc_sizes_sorted.max()
                ticks = np.linspace(min_cs, max_cs, min(10, len(auc_sizes_sorted)))
                ax4.set_xticks(ticks)
                ax4.set_xticklabels([str(int(t)) for t in ticks], rotation=45, ha='right', fontsize=9)


    plt.tight_layout(rect=[0, 0, 1, 0.96])
    comparison_path = os.path.join(k_threshold_folder, f"comparison_plots_{n_runs}.png")
    fig.savefig(comparison_path, dpi=300, bbox_inches="tight")
    # Also display the figure in the notebook
    plt.show()
    plt.close(fig)
    saved_paths.append(comparison_path)

    sorted_results = sorted(fastlof_results, key=lambda r: r['chunk_size']) if fastlof_results else []

    if sorted_results:
        n_results = len(sorted_results)
        n_cols = min(3, n_results)
        n_rows = int(np.ceil(n_results / n_cols))
        scatter_fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 4.5 * n_rows), squeeze=False)

        lof_min = np.min(lof_auto_scores)
        lof_max = np.max(lof_auto_scores)

        for idx, result in enumerate(sorted_results):
            row = idx // n_cols
            col = idx % n_cols
            ax = axes[row][col]

            fast_scores = result['avg_scores']
            # Filter out extreme scores (>100) for better scatter visualization, consistent with original logic
            mask = (lof_auto_scores < 100) & (fast_scores < 100) 

            if not np.any(mask):
                ax.text(0.5, 0.5, 'No points with scores < 100',
                          transform=ax.transAxes, ha='center', va='center', fontsize=12)
                ax.set_axis_off()
                continue

            lof_filtered = lof_auto_scores[mask]
            fast_filtered = fast_scores[mask]

            correlation = np.corrcoef(lof_filtered, fast_filtered)[0, 1] if np.shape(lof_filtered)[0] > 1 else np.nan

            combined_min = min(np.min(lof_filtered), np.min(fast_filtered))
            combined_max = max(np.max(lof_filtered), np.max(fast_filtered))

            ax.scatter(lof_filtered, fast_filtered, s=8, alpha=0.5, label='FastLOF vs LOF (auto)', color='teal')
            ax.plot([combined_min, combined_max], [combined_min, combined_max],
                      linestyle='--', color='red', linewidth=1.2, label='y = x')

            ax.set_xlabel('LOF (auto) score', fontsize=11)
            ax.set_ylabel('FastLOF score', fontsize=11)
            ax.set_title(f'Chunk size {result["chunk_size"]} (count {result["chunk_count"]})\n'
                          f'Correlation: {correlation:.4f}', fontsize=12)
            ax.grid(True, alpha=0.3)
            ax.set_xlim(combined_min, combined_max)
            ax.set_ylim(combined_min, combined_max)
            ax.legend(loc='best', fontsize=9)

        # Hide any unused subplots
        total_axes = n_rows * n_cols
        for idx in range(len(sorted_results), total_axes):
            row = idx // n_cols
            col = idx % n_cols
            axes[row][col].axis('off')

        scatter_fig.suptitle(f'FastLOF vs LOF (auto) Score Correlations\n(dataset={os.path.basename(filepath)}, runs={n_runs}, threshold={threshold}, fraction={fraction})',
                              fontsize=15, fontweight='bold', y=0.995)
        scatter_fig.tight_layout(rect=[0, 0, 1, 0.97])
        scatter_path = os.path.join(k_threshold_folder, f"score_scatter_{n_runs}.png")
        scatter_fig.savefig(scatter_path, dpi=300, bbox_inches="tight")
        # Also display the scatter figure in the notebook
        plt.show()
        plt.close(scatter_fig)
        saved_paths.append(scatter_path)

    timing_results = [result for result in sorted_results if result.get('timing')]
    if timing_results:
        timing_cols = min(3, len(timing_results))
        timing_rows = int(np.ceil(len(timing_results) / timing_cols))
        timing_fig, timing_axes = plt.subplots(timing_rows, timing_cols,
                                               figsize=(6 * timing_cols, 5 * timing_rows),
                                               squeeze=False)

        axes_flat = timing_axes.flatten()
        for idx, result in enumerate(timing_results):
            ax = axes_flat[idx]
            timing = result['timing'] or {}
            details = timing.get('chunk_processing_details', {})
            distance_time = details.get('distance_computation', 0.0)
            neighbor_time = details.get('neighbor_updates', 0.0)
            lof_time = timing.get('lof_calculation', 0.0)
            other_time = (
                timing.get('initialization', 0.0) +
                details.get('self_distance_handling', 0.0) +
                timing.get('active_set_updates', 0.0) +
                timing.get('finalization', 0.0)
            )

            values = [distance_time, neighbor_time, lof_time, other_time]
            labels = ['Distance', 'Neighbor', 'LOF calc', 'Others']
            total_time = timing.get('total', sum(values))

            if total_time <= 0 or np.isclose(sum(values), 0.0):
                ax.text(0.5, 0.5, 'No timing data', ha='center', va='center', fontsize=11)
                ax.axis('off')
                continue
            else:
                total_time = sum(values)

            def autopct_func(pct):
                if total_time <= 0: return ''
                value = pct * total_time / 100.0
                return f"{value:.2f}s\n({pct:.1f}%)"

            wedges, texts, autotexts = ax.pie(
                values,
                labels=labels,
                autopct=autopct_func,
                startangle=90,
                textprops={'fontsize': 8}
            )
            ax.axis('equal')
            distances_info = timing.get('distances_computed', 'N/A')
            ax.text(0.5, 1.15, f"Distances computed: {distances_info}",
                     transform=ax.transAxes, ha='center', va='center', fontsize=9, fontweight='bold')
            ax.set_title(
                f"Chunk size {result['chunk_size']} (count {result['chunk_count']})\n"
                f"Total time: {total_time:.2f}s",
                fontsize=11
            )

        for idx in range(len(timing_results), len(axes_flat)):
            axes_flat[idx].axis('off')

        timing_fig.suptitle(f'FastLOF Timing Breakdown (dataset={os.path.basename(filepath)})', fontsize=15, fontweight='bold', y=0.995)
        timing_fig.tight_layout(rect=[0, 0, 1, 0.97])
        timing_path = os.path.join(k_threshold_folder, f"timing_breakdown_{n_runs}.png")
        timing_fig.savefig(timing_path, dpi=300, bbox_inches="tight")
        # Also display the timing breakdown figure in the notebook
        plt.show()
        plt.close(timing_fig)
        saved_paths.append(timing_path)
        
    # AUC plot (separate file) - Only if metrics are available
    if metrics_available:
        auc_sizes = []
        auc_values = []
        for result in fastlof_results:
            metrics = result.get('metrics')
            if metrics and metrics.get('roc_auc') is not None:
                auc_sizes.append(result['chunk_size'])
                auc_values.append(metrics['roc_auc'])

        if auc_sizes:
            auc_fig, auc_ax = plt.subplots(figsize=(10, 6))
            
            sorted_auc_idx = np.argsort(auc_sizes)
            auc_sizes_sorted = np.array(auc_sizes)[sorted_auc_idx]
            auc_values_sorted = np.array(auc_values)[sorted_auc_idx]

            auc_ax.plot(auc_sizes_sorted, auc_values_sorted, 'o-', linewidth=2, markersize=8,
                        label='FastLOF ROC AUC', color='blue')

            if lof_auto_metrics is not None and lof_auto_metrics.get('roc_auc') is not None:
                baseline_auc = lof_auto_metrics['roc_auc']
                auc_ax.axhline(y=baseline_auc, color='purple', linestyle='--', linewidth=2,
                                  label=f'LOF (auto) ROC AUC: {baseline_auc:.3f}', alpha=0.8)

            if ranged_lof_metrics is not None and ranged_lof_metrics.get('roc_auc') is not None:
                ranged_auc = ranged_lof_metrics['roc_auc']
                auc_ax.axhline(y=ranged_auc, color='orange', linestyle='-.', linewidth=2,
                                  label=f'Ranged LOF ROC AUC: {ranged_auc:.3f}', alpha=0.8)

            auc_ax.set_xlabel('Chunk Size', fontsize=12)
            auc_ax.set_ylabel('ROC AUC', fontsize=12)
            auc_ax.set_title(f'FastLOF ROC AUC by Chunk Size\n(dataset={os.path.basename(filepath)}, runs={n_runs}, threshold={threshold}, fraction={fraction})', 
                              fontsize=13, fontweight='bold')
            auc_ax.set_ylim(0.0, 1.05)
            auc_ax.grid(True, alpha=0.3)
            auc_ax.legend(loc='best', fontsize=10)
            
            if len(auc_sizes_sorted) <= 20:
                auc_ax.set_xticks(auc_sizes_sorted)
                auc_ax.set_xticklabels([str(int(cs)) for cs in auc_sizes_sorted], rotation=45, ha='right', fontsize=9)
            else:
                min_cs = auc_sizes_sorted.min()
                max_cs = auc_sizes_sorted.max()
                ticks = np.linspace(min_cs, max_cs, min(10, len(auc_sizes_sorted)))
                auc_ax.set_xticks(ticks)
                auc_ax.set_xticklabels([str(int(t)) for t in ticks], rotation=45, ha='right', fontsize=9)
            
            auc_fig.tight_layout()
            auc_path = os.path.join(k_threshold_folder, f"auc_plot_{n_runs}.png")
            auc_fig.savefig(auc_path, dpi=300, bbox_inches="tight")
            # Also display the AUC figure in the notebook
            plt.show()
            plt.close(auc_fig)
            saved_paths.append(auc_path)


    # LOF vs Ranged LOF scatter plot (if Ranged LOF scores are available)
    if ranged_lof_scores is not None:
        try:
            lof_scores = np.asarray(lof_auto_scores)
            rlof_scores = np.asarray(ranged_lof_scores)
            if lof_scores.shape[0] == rlof_scores.shape[0]:
                # Optionally filter extreme scores for clearer visualization
                mask = (lof_scores < 100) & (rlof_scores < 100)
                if not np.any(mask):
                    mask = np.ones_like(lof_scores, dtype=bool)

                lof_f = lof_scores[mask]
                rlof_f = rlof_scores[mask]

                corr = np.corrcoef(lof_f, rlof_f)[0, 1] if lof_f.size > 1 else np.nan
                scatter_lr_fig, scatter_lr_ax = plt.subplots(figsize=(6, 5))
                scatter_lr_ax.scatter(lof_f, rlof_f, s=8, alpha=0.5, color='darkorange')

                combined_min = min(np.min(lof_f), np.min(rlof_f))
                combined_max = max(np.max(lof_f), np.max(rlof_f))
                scatter_lr_ax.plot([combined_min, combined_max], [combined_min, combined_max],
                                   linestyle='--', color='red', linewidth=1.2, label='y = x')

                scatter_lr_ax.set_xlabel('LOF (auto) score', fontsize=11)
                scatter_lr_ax.set_ylabel('Ranged LOF score', fontsize=11)
                scatter_lr_ax.set_title(
                    f'LOF (auto) vs Ranged LOF scores\n(correlation={corr:.4f})', fontsize=12
                )
                scatter_lr_ax.grid(True, alpha=0.3)
                scatter_lr_ax.legend(loc='best', fontsize=9)

                lof_rlof_scatter_path = os.path.join(k_threshold_folder, f"lof_vs_rangedlof_scatter_{n_runs}.png")
                scatter_lr_fig.savefig(lof_rlof_scatter_path, dpi=300, bbox_inches="tight")
                plt.show()
                plt.close(scatter_lr_fig)
                saved_paths.append(lof_rlof_scatter_path)
            else:
                print("Warning: Cannot plot LOF vs Ranged LOF scatter: length mismatch.")
        except Exception as e:
            print(f"Warning: Failed to create LOF vs Ranged LOF scatter plot: {e}")

    if saved_paths:
        print("\n Plots saved to " + ", ".join(f"'{path}'" for path in saved_paths))
        
    return saved_paths


def find_best_k_lof_auto(X_normalized, y, k_values, contamination, n_runs):
    """Evaluate LOF (auto) for every candidate k and keep the one with the best ROC AUC.

    Each k is scored with ``run_lof_auto`` and evaluated with
    ``_compute_anomaly_metrics``; candidates without a ROC AUC are skipped.

    Parameters
    ----------
    X_normalized : array-like
        Feature matrix forwarded to ``run_lof_auto``.
    y : array-like
        Ground-truth labels; required.
    k_values : iterable of int
        Candidate neighbourhood sizes, evaluated in the given order.
    contamination : float
        Forwarded to ``run_lof_auto`` and ``_compute_anomaly_metrics``.
    n_runs : int
        Forwarded to ``run_lof_auto``.

    Returns
    -------
    best_k : int
        k with the highest ROC AUC (near-ties go to the smaller k).
    results_by_k : dict
        Maps each successfully evaluated k to a dict with keys
        'scores', 'time_stats' and 'metrics'.

    Raises
    ------
    ValueError
        If ``y`` is None.
    RuntimeError
        If no candidate produced a ROC AUC.
    """
    if y is None:
        raise ValueError("find_best_k_lof_auto requires labels y to compute AUC.")

    evaluated = {}
    winner_k, winner_auc = None, -np.inf

    for k in k_values:
        print(f"\n=== Evaluating LOF (auto) for k={k} ===")
        scores, time_stats = run_lof_auto(X_normalized, k, contamination, n_runs)
        metrics = _compute_anomaly_metrics(y, scores, contamination)

        auc = metrics.get('roc_auc') if metrics is not None else None
        if auc is None:
            print(f"  Skipping k={k}: metrics not available (labels may be degenerate).")
            continue

        evaluated[k] = dict(scores=scores, time_stats=time_stats, metrics=metrics)
        print(f"  k={k}: ROC AUC={auc:.4f}")

        # Strictly better AUC wins outright; a numerically equal AUC only
        # wins when it comes from a smaller k.
        if auc > winner_auc:
            takes_lead = True
        else:
            takes_lead = np.isclose(auc, winner_auc) and (winner_k is None or k < winner_k)
        if takes_lead:
            winner_auc, winner_k = auc, k

    if winner_k is None:
        raise RuntimeError("Could not determine best k: no valid metrics computed.")

    print(f"\n>>> Best k chosen by ROC AUC: k={winner_k} (AUC={winner_auc:.4f})")
    return winner_k, evaluated


def process_dataset(dataset_filepath,
                    dataset_fraction,
                    k_values,
                    contamination,
                    n_runs,
                    fastlof_thresholds,
                    min_chunk_size,
                    max_chunk_size,
                    chunk_interval,
                    find_best_k=True,
                    default_k=20):
    """Run the full LOF / FastLOF / Ranged LOF comparison pipeline on one dataset.

    The dataset is loaded and standardized, k is chosen, LOF (auto),
    LOF (brute) and Ranged LOF are run once each, and then for every FastLOF
    threshold the chunked FastLOF sweep is executed, its correlation with the
    LOF (auto) scores is printed, and plots plus a timing CSV are written.

    Parameters
    ----------
    dataset_filepath : str
        Path handed to ``load_dataset``.
    dataset_fraction : float
        Fraction handed to ``load_dataset``.
    k_values : iterable of int
        Candidate k values for the ROC-AUC search (only used when the search runs).
    contamination : float
        Forwarded to every detector run and to the metric computation.
    n_runs : int
        Number of runs forwarded to the detector runners.
    fastlof_thresholds : iterable of float
        One FastLOF sweep (with plots and CSV) is produced per threshold.
    min_chunk_size, max_chunk_size, chunk_interval : int
        Forwarded to ``run_fastlof_multiple_chunks``.
    find_best_k : bool, default True
        Search ``k_values`` for the best k. The search needs labels, so when
        the dataset has none the function falls back to ``default_k``.
    default_k : int, default 20
        k used when no search is performed.

    Returns
    -------
    dict
        Keys: 'dataset', 'best_k', 'lof_auto_scores', 'lof_auto_time_stats',
        'lof_auto_metrics', 'lof_brute_scores', 'lof_brute_time_stats',
        'ranged_lof_scores', 'ranged_lof_time_stats', 'ranged_lof_metrics' and
        'fastlof_results_by_threshold' (threshold -> list of FastLOF results).

    Raises
    ------
    RuntimeError
        If ``load_dataset`` returns no feature matrix. Errors raised by
        ``find_best_k_lof_auto`` propagate unchanged.
    """
    print(f"\n=== Processing dataset: {dataset_filepath} ===")
    X, y = load_dataset(dataset_filepath, dataset_fraction)
    if X is None:
        raise RuntimeError("Dataset could not be loaded.")

    # Report the settings of THIS run instead of unrelated notebook globals.
    print_dataset_info(
        X, y,
        filepath=dataset_filepath,
        fraction=dataset_fraction,
        contamination=contamination,
    )

    print("\nNormalizing data...")
    scaler = StandardScaler()
    X_normalized = scaler.fit_transform(X)
    print("Data normalized using StandardScaler.")

    # 1) Choose k. The search is label-based, so unlabeled data uses default_k.
    if find_best_k and y is None:
        print(f"No labels available; skipping k search and using default_k={default_k}.")
        best_k = default_k
    elif find_best_k:
        # A single run per candidate is used for the search itself.
        best_k, _ = find_best_k_lof_auto(X_normalized, y, k_values, contamination, 1)
    else:
        best_k = default_k

    # 2) Run LOF auto with best k and get metrics
    lof_auto_scores, lof_auto_time_stats = run_lof_auto(
        X_normalized, best_k, contamination, n_runs
    )
    lof_auto_metrics = _compute_anomaly_metrics(y, lof_auto_scores, contamination) if y is not None else None

    # 3) Run LOF brute with best k
    lof_brute_scores, lof_brute_time_stats = run_lof_brute(
        X_normalized, best_k, contamination, n_runs
    )

    # 4) Run Ranged LOF (the trailing 10 is forwarded as-is; its meaning is
    #    defined by run_ranged_lof)
    ranged_lof_scores, ranged_lof_time_stats = run_ranged_lof(
        X_normalized, best_k, contamination, n_runs, 10
    )
    ranged_lof_metrics = _compute_anomaly_metrics(y, ranged_lof_scores, contamination) if y is not None else None

    all_results = {
        'dataset': dataset_filepath,
        'best_k': best_k,
        'lof_auto_scores': lof_auto_scores,
        'lof_auto_time_stats': lof_auto_time_stats,
        'lof_auto_metrics': lof_auto_metrics,
        'lof_brute_scores': lof_brute_scores,
        'lof_brute_time_stats': lof_brute_time_stats,
        'ranged_lof_scores': ranged_lof_scores,
        'ranged_lof_time_stats': ranged_lof_time_stats,
        'ranged_lof_metrics': ranged_lof_metrics,
        'fastlof_results_by_threshold': {},
    }

    # 5) For each FastLOF threshold, run chunked FastLOF, plots, and timing CSV
    for thr in fastlof_thresholds:
        print(f"\n=== FastLOF experiments for threshold={thr} ===")
        fastlof_results = run_fastlof_multiple_chunks(
            X_normalized, best_k, contamination, n_runs, X_normalized.shape[0],
            min_chunk_size, max_chunk_size, chunk_interval, thr,
            y_true=y,
        )

        # Add correlation info (for logging consistency)
        print(f"\n--- FastLOF Correlation Results vs LOF (auto) (threshold={thr}) ---")
        for result in fastlof_results:
            corr = np.corrcoef(lof_auto_scores, result['avg_scores'])[0, 1]
            result['correlation'] = corr
            metrics = result.get('metrics')
            metrics_str = f", ROC AUC = {metrics['roc_auc']:.4f}" if metrics and metrics.get('roc_auc') is not None else ""
            print(f"Chunk count {result['chunk_count']:3d} (size {result['chunk_size']:5d}): "
                  f"correlation = {corr:.6f}{metrics_str}")

        all_results['fastlof_results_by_threshold'][thr] = fastlof_results

        # Create plots and timing CSV for this threshold
        create_plots(
            lof_auto_scores,
            lof_auto_time_stats,
            lof_brute_time_stats,
            fastlof_results,
            n_runs,
            thr,
            dataset_fraction,
            dataset_filepath,
            best_k,
            lof_auto_metrics=lof_auto_metrics,
            ranged_lof_time_stats=ranged_lof_time_stats,
            ranged_lof_metrics=ranged_lof_metrics,
            ranged_lof_scores=ranged_lof_scores,
        )

        save_timing_csv(
            lof_auto_time_stats,
            lof_brute_time_stats,
            fastlof_results,
            dataset_filepath,
            best_k,
            contamination,
            n_runs,
            thr,
            ranged_lof_time_stats=ranged_lof_time_stats,
        )

    print("\n=== Finished processing dataset ===")
    return all_results


def print_dataset_info(X, y, filepath=None, fraction=None, contamination=None):
    """Print a summary of a loaded dataset.

    Parameters
    ----------
    X : ndarray
        Feature matrix; only its shape is reported.
    y : ndarray or None
        Labels; ``np.sum(y)`` is reported as the anomaly count. None means
        the dataset is unlabeled.
    filepath, fraction, contamination : optional
        Settings of the current run. When omitted, the notebook-level
        constants DATASET_FILEPATH, DATASET_FRACTION and CONTAMINATION_RATE
        are used if they exist; otherwise 'N/A' is printed, so the function
        never fails just because those constants were not defined yet.
    """
    notebook_globals = globals()
    if filepath is None:
        filepath = notebook_globals.get('DATASET_FILEPATH', 'N/A')
    if fraction is None:
        fraction = notebook_globals.get('DATASET_FRACTION', 'N/A')
    if contamination is None:
        contamination = notebook_globals.get('CONTAMINATION_RATE')

    print("\n--- Dataset Exploration ---")
    print(f"Filepath: {filepath}")
    print(f"Fraction Used: {fraction}")
    print(f"Data Shape (X): {X.shape[0]} samples, {X.shape[1]} features")
    if y is not None:
        num_anomalies = np.sum(y)
        total_samples = y.shape[0]
        anomaly_perc = (num_anomalies / total_samples) * 100 if total_samples > 0 else 0
        print(f"Labels Found: Yes (y shape: {y.shape})")
        print(f"Anomalies (1s): {num_anomalies}")
        print(f"Normals (0s): {total_samples - num_anomalies}")
        print(f"Anomaly Percentage: {anomaly_perc:.4f}%")
        if contamination is None:
            print("Contamination Rate (Targeted): N/A")
        else:
            print(f"Contamination Rate (Targeted): {contamination}")
            # Warn when the labelled anomaly share is more than 5 percentage
            # points away from the targeted contamination.
            if np.abs(anomaly_perc/100 - contamination) > 0.05:
                print("⚠️ **WARNING:** Actual anomaly percentage differs significantly from target contamination rate.")
    else:
        print("Labels Found: No (Unsupervised setup)")
    print("----------------------------")



# Pen Local Dataset

In [None]:
# Pen Local: FastLOF threshold sweep. find_best_k is left at its default, so
# process_dataset selects k from k_values by LOF (auto) ROC AUC.
k_values = [10, 20, 30, 40, 50]
fastlof_thresholds = [0.0, 1.0, 1.01, 1.1, 1.2]

res_pen_local = process_dataset(
    dataset_filepath="data/pen-local-unsupervised-ad.csv",
    dataset_fraction=1.0,
    contamination=0.0015,
    n_runs=10,
    k_values=k_values,
    fastlof_thresholds=fastlof_thresholds,
    # chunk-size settings forwarded to the FastLOF runs
    min_chunk_size=50,
    max_chunk_size=3000,
    chunk_interval=100,
)

# Split Code Demonstration

## Define Dataset and Experiment Parameters

In [None]:
# --- Dataset ---
DATASET_FILEPATH = "data/pen-local-unsupervised-ad.csv"
DATASET_FRACTION = 1.0

# --- Detector settings shared by the LOF / Ranged LOF / FastLOF cells ---
K_NEIGHBORS = 20
CONTAMINATION_RATE = 0.0015
N_RUNS = 10

# --- FastLOF settings ---
FASTLOF_THRESHOLD = 1.1
MIN_CHUNK_SIZE, MAX_CHUNK_SIZE, CHUNK_INTERVAL = 50, 3000, 100

# Placeholders filled in by the loading cell; None means "not loaded yet".
X = y = X_normalized = None

## Load, Explore, and Normalize Pen Local Dataset

In [None]:
# Start from "nothing loaded"; a failed load simply leaves these as None.
X, y = None, None
try:
    X, y = load_dataset(DATASET_FILEPATH, DATASET_FRACTION)
except FileNotFoundError:
    print(f"Error: Dataset file not found at '{DATASET_FILEPATH}'")
except Exception as e:
    print(f"An error occurred loading the dataset: {e}")

if X is None:
    print("Cannot proceed. Data loading failed.")
else:
    print_dataset_info(X, y)

    # Standardize the features before running any detector.
    print("\nNormalizing data...")
    scaler = StandardScaler()
    X_normalized = scaler.fit_transform(X)
    print("Data normalized using StandardScaler.")

## Run LOF Brute and Auto (Averaged)


In [None]:
def _format_metric(value):
    """Render a metric with four decimals, or 'N/A' when it is unavailable."""
    return "N/A" if value is None else f"{value:.4f}"


if X_normalized is None:
    print("Cannot proceed. Data is not loaded or normalized.")
else:
    # 1. Run LOF (auto) n_runs times
    lof_auto_scores, lof_auto_time_stats = run_lof_auto(
        X_normalized, K_NEIGHBORS, CONTAMINATION_RATE, N_RUNS
    )

    # Calculate metrics for LOF Auto (if labels exist). Reset first so that
    # metrics from an earlier, labelled run are never reused for this data.
    lof_auto_metrics = None
    if y is not None:
        lof_auto_metrics = _compute_anomaly_metrics(y, lof_auto_scores, CONTAMINATION_RATE)
        if lof_auto_metrics:
            # Individual metrics may be missing/None; print 'N/A' instead of crashing.
            print(
                f"LOF (auto) Metrics: ROC AUC={_format_metric(lof_auto_metrics.get('roc_auc'))}, "
                f"PR AUC={_format_metric(lof_auto_metrics.get('pr_auc'))}"
            )

    # 2. Run LOF (brute) n_runs times
    lof_brute_scores, lof_brute_time_stats = run_lof_brute(
        X_normalized, K_NEIGHBORS, CONTAMINATION_RATE, N_RUNS
    )

    # 3. Run Ranged LOF n_runs times
    ranged_lof_scores, ranged_lof_time_stats = run_ranged_lof(
        X_normalized, K_NEIGHBORS, CONTAMINATION_RATE, N_RUNS
    )

    # Calculate metrics for Ranged LOF (if labels exist); same reset as above.
    ranged_lof_metrics = None
    if y is not None:
        ranged_lof_metrics = _compute_anomaly_metrics(y, ranged_lof_scores, CONTAMINATION_RATE)
        if ranged_lof_metrics:
            print(
                f"Ranged LOF Metrics: ROC AUC={_format_metric(ranged_lof_metrics.get('roc_auc'))}, "
                f"PR AUC={_format_metric(ranged_lof_metrics.get('pr_auc'))}"
            )

    print("\n LOF Brute, Auto, and Ranged LOF tests complete. Results saved to memory.")

## Run FastLOF across Chunk Range (Averaged)

In [None]:
# FastLOF sweep over the configured chunk settings. It needs the normalized data
# and the LOF (auto) scores, which serve as the correlation reference.
if X_normalized is not None and lof_auto_scores is not None:
    try:
        fastlof_results = run_fastlof_multiple_chunks(
            X_normalized, K_NEIGHBORS, CONTAMINATION_RATE, N_RUNS, X_normalized.shape[0],
            MIN_CHUNK_SIZE, MAX_CHUNK_SIZE, CHUNK_INTERVAL, FASTLOF_THRESHOLD,
            y_true=y
        )

        print(f"\n--- FastLOF Correlation Results (vs LOF auto) ---")
        for result in fastlof_results:
            # Pearson correlation between the LOF (auto) and FastLOF scores
            correlation = np.corrcoef(lof_auto_scores, result['avg_scores'])[0, 1]
            result['correlation'] = correlation

            metrics = result.get('metrics')
            has_auc = metrics and metrics.get('roc_auc') is not None
            metrics_str = f", ROC AUC = {metrics['roc_auc']:.4f}" if has_auc else ""

            print(f"Chunk count {result['chunk_count']:3d} (size {result['chunk_size']:5d}): "
                  f"correlation = {correlation:.6f}{metrics_str}")

        print("\n FastLOF tests complete. Results saved to memory.")

    except ValueError as e:
        print(f" Parameter Error: {e}")
        fastlof_results = None
    except Exception as e:
        print(f" An error occurred during FastLOF execution: {e}")
        fastlof_results = None
elif X_normalized is None:
    print("Cannot proceed. Data is not loaded or normalized.")
else:
    print("Cannot proceed. LOF Auto scores are required for correlation calculation.")

## Create Plots and Save Timing CSV

In [None]:
# Plots and the timing CSV need the LOF baselines and the FastLOF sweep results.
if all([lof_auto_scores is not None, lof_auto_time_stats is not None,
        lof_brute_time_stats is not None, fastlof_results is not None]):

    # 1. Create and save plots
    saved_plot_paths = create_plots(
        lof_auto_scores,
        lof_auto_time_stats,
        lof_brute_time_stats,
        fastlof_results,
        N_RUNS,
        FASTLOF_THRESHOLD,
        DATASET_FRACTION,
        DATASET_FILEPATH,
        K_NEIGHBORS,
        lof_auto_metrics=lof_auto_metrics,
        ranged_lof_time_stats=ranged_lof_time_stats,
        ranged_lof_metrics=ranged_lof_metrics,
        # Passed so the LOF vs Ranged LOF scatter is produced, matching the
        # create_plots call made by process_dataset.
        ranged_lof_scores=ranged_lof_scores,
    )

    # 2. Save timing results to CSV
    saved_csv_path = save_timing_csv(
        lof_auto_time_stats,
        lof_brute_time_stats,
        fastlof_results,
        DATASET_FILEPATH,
        K_NEIGHBORS,
        CONTAMINATION_RATE,
        N_RUNS,
        FASTLOF_THRESHOLD,
        ranged_lof_time_stats=ranged_lof_time_stats,
    )

    print("\nExperiment finished!")

else:
    print("Cannot generate plots/CSV. One or more required result variables (LOF scores/timings, FastLOF results) are missing. Please run all preceding cells.")

# Creditcard dataset

In [None]:
# Creditcard: two FastLOF thresholds, n_runs=2; k is selected from k_values
# by process_dataset (find_best_k left at its default).
k_values = [10, 20, 30, 40, 50]
fastlof_thresholds = [1.1, 1.2]

res_pen_local = process_dataset(
    dataset_filepath="data/creditcard.csv",
    dataset_fraction=1.0,
    contamination=0.1,
    n_runs=2,
    k_values=k_values,
    fastlof_thresholds=fastlof_thresholds,
    # chunk-size settings forwarded to the FastLOF runs
    min_chunk_size=100,
    max_chunk_size=10000,
    chunk_interval=500,
)

# Breast Cancer Dataset

In [None]:
# Breast Cancer: FastLOF threshold sweep; k is selected from k_values by
# process_dataset (find_best_k left at its default).
k_values = [10, 20, 30, 40, 50]
fastlof_thresholds = [0.0, 1.0, 1.01, 1.1, 1.2]

res_pen_local = process_dataset(
    dataset_filepath="data/breast-cancer-unsupervised-ad.csv",
    dataset_fraction=1.0,
    contamination=0.0272,
    n_runs=10,
    k_values=k_values,
    fastlof_thresholds=fastlof_thresholds,
    # chunk-size settings forwarded to the FastLOF runs
    min_chunk_size=20,
    max_chunk_size=180,
    chunk_interval=10,
)

In [None]:
# Breast Cancer again, with the k search disabled: process_dataset uses the
# fixed default_k=20 (k_values is passed but not used for selection).
k_values = [10, 20, 30, 40, 50]
fastlof_thresholds = [0.0, 1.0, 1.01, 1.1, 1.2]

res_pen_local = process_dataset(
    dataset_filepath="data/breast-cancer-unsupervised-ad.csv",
    dataset_fraction=1.0,
    contamination=0.0272,
    n_runs=10,
    k_values=k_values,
    find_best_k=False,
    default_k=20,
    fastlof_thresholds=fastlof_thresholds,
    # chunk-size settings forwarded to the FastLOF runs
    min_chunk_size=20,
    max_chunk_size=180,
    chunk_interval=10,
)

# Artificial Unsupervised AD Dataset

In [None]:
# DFKI artificial dataset: FastLOF threshold sweep; k is selected from
# k_values by process_dataset (find_best_k left at its default).
k_values = [10, 20, 30, 40, 50]
fastlof_thresholds = [0.0, 1.0, 1.01, 1.1, 1.2]

res_pen_local = process_dataset(
    dataset_filepath="data/dfki-artificial-3000-unsupervised-ad.csv",
    dataset_fraction=1.0,
    contamination=0.0123,
    n_runs=10,
    k_values=k_values,
    fastlof_thresholds=fastlof_thresholds,
    # chunk-size settings forwarded to the FastLOF runs
    min_chunk_size=10,
    max_chunk_size=1500,
    chunk_interval=50,
)

# Satellite Dataset

In [None]:
# Satellite: FastLOF threshold sweep; k is selected from k_values by
# process_dataset (find_best_k left at its default).
k_values = [10, 20, 30, 40, 50]
fastlof_thresholds = [0.0, 1.0, 1.01, 1.1, 1.2]

res_pen_local = process_dataset(
    dataset_filepath="data/satellite-unsupervised-ad.csv",
    dataset_fraction=1.0,
    contamination=0.0149,
    n_runs=10,
    k_values=k_values,
    fastlof_thresholds=fastlof_thresholds,
    # chunk-size settings forwarded to the FastLOF runs
    min_chunk_size=50,
    max_chunk_size=2500,
    chunk_interval=100,
)

In [None]:
# Satellite again, with the k search disabled: process_dataset uses the fixed
# default_k=20 (k_values is passed but not used for selection).
k_values = [10, 20, 30, 40, 50]
fastlof_thresholds = [0.0, 1.0, 1.01, 1.1, 1.2]

res_pen_local = process_dataset(
    dataset_filepath="data/satellite-unsupervised-ad.csv",
    dataset_fraction=1.0,
    contamination=0.0149,
    n_runs=10,
    k_values=k_values,
    find_best_k=False,
    default_k=20,
    fastlof_thresholds=fastlof_thresholds,
    # chunk-size settings forwarded to the FastLOF runs
    min_chunk_size=50,
    max_chunk_size=2500,
    chunk_interval=100,
)

# Annthyroid Dataset

In [None]:
# Annthyroid: FastLOF threshold sweep; k is selected from k_values by
# process_dataset (find_best_k left at its default).
k_values = [10, 20, 30, 40, 50]
fastlof_thresholds = [0.0, 1.0, 1.01, 1.1, 1.2]

res_pen_local = process_dataset(
    dataset_filepath="data/annthyroid-unsupervised-ad.csv",
    dataset_fraction=1.0,
    contamination=0.0361,
    n_runs=10,
    k_values=k_values,
    fastlof_thresholds=fastlof_thresholds,
    # chunk-size settings forwarded to the FastLOF runs
    min_chunk_size=50,
    max_chunk_size=3500,
    chunk_interval=100,
)

# Pen Global Dataset

In [None]:
# Pen Global: FastLOF threshold sweep; k is selected from k_values by
# process_dataset (find_best_k left at its default).
k_values = [10, 20, 30, 40, 50]
fastlof_thresholds = [0.0, 1.0, 1.01, 1.1, 1.2]

res_pen_local = process_dataset(
    dataset_filepath="data/pen-global-unsupervised-ad.csv",
    dataset_fraction=1.0,
    contamination=0.11,
    n_runs=10,
    k_values=k_values,
    fastlof_thresholds=fastlof_thresholds,
    # chunk-size settings forwarded to the FastLOF runs
    min_chunk_size=10,
    max_chunk_size=300,
    chunk_interval=10,
)

In [None]:
# Pen Global again, with the k search disabled: process_dataset uses the fixed
# default_k=20 (k_values is passed but not used for selection).
k_values = [10, 20, 30, 40, 50]
fastlof_thresholds = [0.0, 1.0, 1.01, 1.1, 1.2]

res_pen_local = process_dataset(
    dataset_filepath="data/pen-global-unsupervised-ad.csv",
    dataset_fraction=1.0,
    contamination=0.11,
    n_runs=10,
    k_values=k_values,
    find_best_k=False,
    default_k=20,
    fastlof_thresholds=fastlof_thresholds,
    # chunk-size settings forwarded to the FastLOF runs
    min_chunk_size=10,
    max_chunk_size=300,
    chunk_interval=10,
)

# Kdd99 Dataset

In [None]:
# KDD99: single FastLOF threshold, n_runs=2, k search disabled (fixed
# default_k=20; k_values is passed but not used for selection).
k_values = [20]
fastlof_thresholds = [1.1]

res_pen_local = process_dataset(
    dataset_filepath="data/kdd99-unsupervised-ad.csv",
    dataset_fraction=1.0,
    contamination=0.017,
    n_runs=2,
    k_values=k_values,
    find_best_k=False,
    default_k=20,
    fastlof_thresholds=fastlof_thresholds,
    # chunk-size settings forwarded to the FastLOF runs
    min_chunk_size=100,
    max_chunk_size=10000,
    chunk_interval=500,
)

# Internet Ads Dataset

In [None]:
# Internet Ads: FastLOF threshold sweep; k is selected from k_values by
# process_dataset (find_best_k left at its default).
fastlof_thresholds = [0.0, 1.0, 1.01, 1.1, 1.2]
k_values = [10, 20, 30, 40, 50]

res_pen_local = process_dataset(
    # Forward slash: "data\I..." contained the invalid escape "\I" and only
    # resolved on Windows; "/" works on every platform.
    dataset_filepath="data/InternetAds_norm_02_v01.arff",
    dataset_fraction=1.0,
    k_values=k_values,
    contamination=0.11,
    n_runs=10,
    fastlof_thresholds=fastlof_thresholds,
    min_chunk_size=100,
    max_chunk_size=1500,
    chunk_interval=100,
)

In [None]:
# Internet Ads with the k search disabled: process_dataset uses the fixed
# default_k=20 (k_values is passed but not used for selection).
fastlof_thresholds = [0.0, 1.0, 1.01, 1.1, 1.2]
k_values = [10, 20, 30, 40, 50]

res_pen_local = process_dataset(
    # Forward slash: "data\I..." contained the invalid escape "\I" and only
    # resolved on Windows; "/" works on every platform.
    dataset_filepath="data/InternetAds_norm_02_v01.arff",
    dataset_fraction=1.0,
    k_values=k_values,
    contamination=0.11,
    n_runs=10,
    fastlof_thresholds=fastlof_thresholds,
    min_chunk_size=100,
    max_chunk_size=1500,
    chunk_interval=100,
    find_best_k=False,
    default_k=20,
)

In [None]:
# Internet Ads, thresholds 1.3-1.5, k search disabled (fixed default_k=20;
# k_values is passed but not used for selection).
fastlof_thresholds = [1.3, 1.4, 1.5]
k_values = [10, 20, 30, 40, 50]

res_pen_local = process_dataset(
    # Forward slash: "data\I..." contained the invalid escape "\I" and only
    # resolved on Windows; "/" works on every platform.
    dataset_filepath="data/InternetAds_norm_02_v01.arff",
    dataset_fraction=1.0,
    k_values=k_values,
    contamination=0.11,
    n_runs=10,
    fastlof_thresholds=fastlof_thresholds,
    min_chunk_size=100,
    max_chunk_size=1500,
    chunk_interval=100,
    find_best_k=False,
    default_k=20,
)

In [None]:
# Internet Ads, thresholds 1.6-2, k search disabled (fixed default_k=20;
# k_values is passed but not used for selection).
fastlof_thresholds = [1.6, 1.8, 2]
k_values = [10, 20, 30, 40, 50]

res_pen_local = process_dataset(
    # Forward slash: "data\I..." contained the invalid escape "\I" and only
    # resolved on Windows; "/" works on every platform.
    dataset_filepath="data/InternetAds_norm_02_v01.arff",
    dataset_fraction=1.0,
    k_values=k_values,
    contamination=0.11,
    n_runs=10,
    fastlof_thresholds=fastlof_thresholds,
    min_chunk_size=100,
    max_chunk_size=1500,
    chunk_interval=100,
    find_best_k=False,
    default_k=20,
)

In [None]:
# Internet Ads, thresholds 3-7, k search disabled (fixed default_k=20;
# k_values is passed but not used for selection).
fastlof_thresholds = [3, 5, 7]
k_values = [10, 20, 30, 40, 50]

res_pen_local = process_dataset(
    # Forward slash: "data\I..." contained the invalid escape "\I" and only
    # resolved on Windows; "/" works on every platform.
    dataset_filepath="data/InternetAds_norm_02_v01.arff",
    dataset_fraction=1.0,
    k_values=k_values,
    contamination=0.11,
    n_runs=10,
    fastlof_thresholds=fastlof_thresholds,
    min_chunk_size=100,
    max_chunk_size=1500,
    chunk_interval=100,
    find_best_k=False,
    default_k=20,
)

In [None]:
# Internet Ads, thresholds 10-20, k search disabled (fixed default_k=20;
# k_values is passed but not used for selection).
fastlof_thresholds = [10, 15, 20]
k_values = [10, 20, 30, 40, 50]

res_pen_local = process_dataset(
    # Forward slash: "data\I..." contained the invalid escape "\I" and only
    # resolved on Windows; "/" works on every platform.
    dataset_filepath="data/InternetAds_norm_02_v01.arff",
    dataset_fraction=1.0,
    k_values=k_values,
    contamination=0.11,
    n_runs=10,
    fastlof_thresholds=fastlof_thresholds,
    min_chunk_size=100,
    max_chunk_size=1500,
    chunk_interval=100,
    find_best_k=False,
    default_k=20,
)

# Pen Digits Dataset

In [None]:
# Pen Digits: FastLOF threshold sweep; k is selected from k_values by
# process_dataset (find_best_k left at its default).
fastlof_thresholds = [0, 1, 1.01, 1.1, 1.2]
k_values = [10, 20, 30, 40, 50]

res_pen_local = process_dataset(
    # Forward slash: "data\P..." contained the invalid escape "\P" and only
    # resolved on Windows; "/" works on every platform.
    dataset_filepath="data/PenDigits_withoutdupl_norm_v01.arff",
    dataset_fraction=1.0,
    k_values=k_values,
    contamination=0.1,
    n_runs=10,
    fastlof_thresholds=fastlof_thresholds,
    min_chunk_size=100,
    max_chunk_size=3000,
    chunk_interval=100,
)

# Mammography Dataset

In [None]:
# Mammography: FastLOF threshold sweep; k is selected from k_values by
# process_dataset (find_best_k left at its default).
fastlof_thresholds = [0, 1, 1.01, 1.1, 1.2]
k_values = [10, 20, 30, 40, 50]

res_pen_local = process_dataset(
    # Forward slash: "data\m..." contained the invalid escape "\m" and only
    # resolved on Windows; "/" works on every platform.
    dataset_filepath="data/mammography.mat",
    dataset_fraction=1.0,
    k_values=k_values,
    contamination=0.11,
    n_runs=10,
    fastlof_thresholds=fastlof_thresholds,
    min_chunk_size=100,
    max_chunk_size=3000,
    chunk_interval=100,
)

In [None]:
# Mammography with the k search disabled: process_dataset uses the fixed
# default_k=20 (k_values is passed but not used for selection).
fastlof_thresholds = [0, 1, 1.01, 1.1, 1.2]
k_values = [10, 20, 30, 40, 50]

res_pen_local = process_dataset(
    # Forward slash: "data\m..." contained the invalid escape "\m" and only
    # resolved on Windows; "/" works on every platform.
    dataset_filepath="data/mammography.mat",
    dataset_fraction=1.0,
    k_values=k_values,
    contamination=0.11,
    n_runs=10,
    fastlof_thresholds=fastlof_thresholds,
    min_chunk_size=100,
    max_chunk_size=3000,
    chunk_interval=100,
    find_best_k=False,
    default_k=20,
)

# Shuttle Dataset

In [None]:
# Shuttle: FastLOF threshold sweep; k is selected from k_values by
# process_dataset (find_best_k left at its default).
fastlof_thresholds = [0, 1, 1.01, 1.1, 1.2, 1.3]
k_values = [10, 20, 30, 40, 50]

res_pen_local = process_dataset(
    # Forward slash: "data\s..." contained the invalid escape "\s" and only
    # resolved on Windows; "/" works on every platform.
    dataset_filepath="data/shuttle-unsupervised-ad.csv",
    dataset_fraction=1.0,
    k_values=k_values,
    contamination=0.11,
    n_runs=10,
    fastlof_thresholds=fastlof_thresholds,
    min_chunk_size=500,
    max_chunk_size=10000,
    chunk_interval=500,
)

In [None]:
# Shuttle with the k search disabled: process_dataset uses the fixed
# default_k=20 (k_values is passed but not used for selection).
fastlof_thresholds = [0, 1, 1.01, 1.1, 1.2, 1.3]
k_values = [10, 20, 30, 40, 50]

res_pen_local = process_dataset(
    # Forward slash: "data\s..." contained the invalid escape "\s" and only
    # resolved on Windows; "/" works on every platform.
    dataset_filepath="data/shuttle-unsupervised-ad.csv",
    dataset_fraction=1.0,
    k_values=k_values,
    contamination=0.11,
    n_runs=10,
    fastlof_thresholds=fastlof_thresholds,
    min_chunk_size=500,
    max_chunk_size=10000,
    chunk_interval=500,
    find_best_k=False,
    default_k=20,
)

In [None]:
from sklearn.datasets import fetch_kddcup99

# Call the imported function directly: a `from sklearn.datasets import ...`
# statement does not bind the name `sklearn`, so the previous
# `sklearn.datasets.fetch_kddcup99(...)` call raised NameError.
# Kept as the cell's final expression so the notebook displays the result.
fetch_kddcup99(subset=None, data_home=None, shuffle=False, random_state=None, percent10=True, download_if_missing=True)