# Sliding Window Weighted Majority Voting Anomaly Globality Index

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyod.models.knn import KNN
from pyod.models.lof import LOF
import math

# Load the input dataset from a path relative to the working directory.
# The code below reads its 'attribute_1' and 'attribute_2' columns as the
# two coordinates handed to the detectors and used as plot axes.
df = pd.read_csv('../data/dfki-artificial-3000-unsupervised-ad.csv')

# ==========================================
# 1. ENSEMBLE GENERATOR
# ==========================================
def generate_ensemble_ranks_pyod(
    df,
    coordinate_cols=['attribute_1', 'attribute_2'],
    k_range=range(5, 55, 1)
):
    """
    Run KNN and LOF over a range of k values to stabilize the anomaly scores.

    For every ``k`` in ``k_range`` a PyOD ``KNN`` detector (``method='mean'``)
    and a PyOD ``LOF`` detector are fitted on the coordinate columns, and
    their ``decision_scores_`` are summed per point across all ``k``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied; the caller's frame is not modified.
    coordinate_cols : list of str
        Feature columns handed to the detectors. The default list is only
        read here, never mutated.
    k_range : iterable of int
        Neighbourhood sizes to sweep.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with four extra columns: ``knn_score`` and
        ``lof_score`` (scores summed over all k), and ``knn_rank`` and
        ``lof_rank`` (descending rank of the summed score, so 1 is the
        highest score; tied scores share their average rank).
    """
    df = df.copy()
    X = df[coordinate_cols].values

    n_points = len(df)
    knn_accum = np.zeros(n_points)
    lof_accum = np.zeros(n_points)

    for k in k_range:
        # KNN: distance-based (global bias)
        clf_knn = KNN(n_neighbors=k, method='mean')
        clf_knn.fit(X)
        knn_accum += clf_knn.decision_scores_

        # LOF: density-based (local bias)
        clf_lof = LOF(n_neighbors=k)
        clf_lof.fit(X)
        lof_accum += clf_lof.decision_scores_

    # Store accumulated scores (plain arrays are assigned positionally)
    df['knn_score'] = knn_accum
    df['lof_score'] = lof_accum

    # Convert to ranks (1 = most anomalous).
    # Rank the columns just written so the result carries df's own index.
    # A fresh pd.Series(array) would be labelled 0..N-1 and pandas aligns on
    # labels when assigning, which yields NaN / misplaced ranks whenever df
    # is not indexed 0..N-1 (e.g. after filtering).
    df['knn_rank'] = df['knn_score'].rank(ascending=False)
    df['lof_rank'] = df['lof_score'].rank(ascending=False)

    return df


# ==========================================
# 2. UNIFIED CONFIDENCE SCORER
# ==========================================
def simple_anomaly_scoring(
    df,
    contamination_rate=0.01,
    decay_power=1.0,
    sigma_margin=3.0,
    step_size=0.001
):
    """
    Calculate a unified confidence score in [0, 1] from two detector ranks.

    A sweep of fractional thresholds ``tau`` runs from ``1/N`` up to a cutoff.
    At each threshold, each detector "votes" for every point whose rank lies
    within the top ``ceil(N * tau)``; a vote is worth ``1 / tau**decay_power``.
    The score is the collected vote weight divided by the maximum attainable
    weight (both detectors voting at every threshold).

    - High score: global agreement (KNN + LOF)
    - Mid score: local-only or global-only detection

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``knn_rank`` and ``lof_rank`` columns (1 = top rank).
        It is copied; the caller's frame is not modified.
    contamination_rate : float
        Expected anomaly fraction, within [0, 1]. Drives the cutoff.
    decay_power : float
        Exponent of the vote weight ``1 / tau**decay_power``.
    sigma_margin : float
        Number of binomial standard deviations added to the expected
        anomaly count when placing the cutoff.
    step_size : float
        Spacing of the threshold sweep; must be positive.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``total_votes`` and ``confidence_score`` added.
        An empty input yields an empty frame with both columns present.

    Raises
    ------
    ValueError
        If ``contamination_rate`` is outside [0, 1] or ``step_size`` is not
        positive.
    """
    if not 0.0 <= contamination_rate <= 1.0:
        raise ValueError(
            f"contamination_rate must be within [0, 1], got {contamination_rate!r}"
        )
    if step_size <= 0:
        raise ValueError(f"step_size must be positive, got {step_size!r}")

    df = df.copy()
    N = len(df)

    # Nothing to score; also avoids dividing by N == 0 below.
    if N == 0:
        df['total_votes'] = np.zeros(0)
        df['confidence_score'] = np.zeros(0)
        return df

    # Dynamic cutoff using binomial 3-sigma rule
    mu = N * contamination_rate
    sigma = math.sqrt(N * contamination_rate * (1 - contamination_rate))
    limit_rank = mu + sigma_margin * sigma
    max_tau = min(limit_rank / N, 1.0)

    # The extra step_size on the stop value makes the sweep reach the cutoff;
    # its last threshold may exceed max_tau by less than one step.
    taus = np.arange(1.0 / N, max_tau + step_size, step_size)
    if taus.size == 0:
        # Cutoff lies below 1/N: still evaluate the top-1 threshold so the
        # normaliser is non-zero and scores are not 0/0 = NaN.
        taus = np.array([1.0 / N])

    print(f"Scanning top {max_tau*100:.2f}% ({len(taus)} steps)...")

    # Voting: accumulate in plain arrays and write the column once, rather
    # than updating the DataFrame through .loc on every threshold.
    knn_rank = df['knn_rank'].to_numpy()
    lof_rank = df['lof_rank'].to_numpy()
    votes = np.zeros(N)
    max_possible = 0.0

    for tau in taus:
        weight = 1.0 / (tau ** decay_power)
        max_possible += 2 * weight

        # Number of top-ranked points admitted at this threshold (at least 1)
        c_tau = max(int(np.ceil(N * tau)), 1)

        votes[knn_rank <= c_tau] += weight
        votes[lof_rank <= c_tau] += weight

    df['total_votes'] = votes
    df['confidence_score'] = votes / max_possible
    return df


# ==========================================
# 3. EXECUTION
# ==========================================

# A. Ensemble ranks and scores
# k_range is passed explicitly here (5..59), overriding the function's
# default sweep of range(5, 55, 1).
df_ensembled = generate_ensemble_ranks_pyod(
    df,
    k_range=range(5, 60, 1)
)

# B. Unified confidence score
# With contamination_rate=1 the binomial sigma inside the scorer is 0 and
# the cutoff rank equals N, so the voting sweep covers the entire ranking
# instead of only a top slice.
df_final = simple_anomaly_scoring(
    df_ensembled,
    contamination_rate=1,
    decay_power=1.0,
    sigma_margin=3.0
)

# ==========================================
# 4. LOF MAGNITUDE → POINT SIZE
# ==========================================

# Robust normalization of accumulated LOF score: clip to the band between
# the 5th and 99th percentiles so extreme values do not dominate the size
# scale, then rescale that band to [0, 1].

lof_raw = df_final['lof_score']
q_low = lof_raw.quantile(0.05)
q_high = lof_raw.quantile(0.99)

if q_high > q_low:
    df_final['lof_size_norm'] = (
        lof_raw.clip(q_low, q_high) - q_low
    ) / (q_high - q_low)
else:
    # Degenerate band (e.g. all scores equal): the division would be 0/0 and
    # every marker size would become NaN. Fall back to the smallest marker.
    df_final['lof_size_norm'] = 0.0

# Map to marker size (area)
min_size = 20
max_size = 300
gamma = 0.7  # exponent < 1 enlarges mid-range values relative to a linear map

df_final['marker_size'] = (
    min_size
    + (max_size - min_size)
    * (df_final['lof_size_norm'] ** gamma)
)

# ==========================================
# 5. VISUALIZATION
# ==========================================

fig, ax = plt.subplots(1, 1, figsize=(11, 8))

# Filter weak noise: points at or below a 0.01 confidence score are drawn
# as a faint grey backdrop instead of being colour-coded.
mask_signal = df_final['confidence_score'] > 0.01
df_plot = df_final[mask_signal].copy()
mask_noise = ~mask_signal

# Background points
ax.scatter(
    df_final.loc[mask_noise, 'attribute_1'],
    df_final.loc[mask_noise, 'attribute_2'],
    c='#e0e0e0',
    s=15,
    alpha=0.3,
    zorder=0
)

# Anomalies: color = confidence, size = LOF magnitude
sc = ax.scatter(
    df_plot['attribute_1'],
    df_plot['attribute_2'],
    c=df_plot['confidence_score'],
    cmap='plasma',
    s=df_plot['marker_size'],
    alpha=0.9,
    edgecolors='black',
    linewidth=0.5
)

# Colorbar: only two ticks are shown, labelled qualitatively
cbar = plt.colorbar(sc, ax=ax)
cbar.set_label("Anomaly Globality Index", rotation=270, labelpad=15)
cbar.set_ticks([0.1, 0.95])
cbar.set_ticklabels(['Local Outlier', 'Global Outlier'])

# The title names the colour channel exactly as the colorbar label does.
ax.set_title(
    "Anomaly Detection Explainability\n"
    "Color = Anomaly Globality Index | Size = Normalized LOF Score",
    fontsize=14
)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# ==========================================
# 6. TOP ANOMALIES
# ==========================================

# Report the five rows with the highest confidence score, showing only
# their coordinates, the score itself and the accumulated LOF score.
print("Top 5 Anomalies:")
print(
    df_final[['attribute_1', 'attribute_2', 'confidence_score', 'lof_score']]
    .sort_values(by='confidence_score', ascending=False)
    .iloc[:5]
)
