In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the DFKI artificial anomaly-detection dataset (path is relative to the notebook).
df = pd.read_csv('../data/dfki-artificial-3000-unsupervised-ad.csv')

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nOutlier label distribution:")
print(df['outlier_label'].value_counts())

# Binary flag: 1 where the label string is exactly 'outlier', 0 for anything else.
df['is_outlier'] = df['outlier_label'].eq('outlier').astype(int)

fig, ax = plt.subplots(figsize=(10, 8))

normal_mask = df['is_outlier'] == 0
outlier_mask = df['is_outlier'] == 1

# Normal points are drawn first so the outlier crosses end up on top of them.
group_styles = [
    (normal_mask, dict(c='blue', alpha=0.5, s=20, label='Normal')),
    (outlier_mask, dict(c='red', alpha=0.7, s=30, label='Outlier', marker='x')),
]
for mask, style in group_styles:
    ax.scatter(df.loc[mask, 'attribute_1'], df.loc[mask, 'attribute_2'], **style)

ax.set_xlabel('Attribute 1', fontsize=12)
ax.set_ylabel('Attribute 2', fontsize=12)
ax.set_title('DFKI Artificial Dataset (3000 samples)', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nSummary statistics:")
print(df[['attribute_1', 'attribute_2']].describe())



In [None]:
from pyod.models.knn import KNN
from sklearn.preprocessing import MinMaxScaler

# Feature matrix for the detector; y keeps the ground-truth flags and is not used for fitting.
X = df[['attribute_1', 'attribute_2']].values
y = df['is_outlier'].values

# Min-max scale both attributes before neighbour distances are computed.
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

k = 20
knn_detector = KNN(n_neighbors=k, contamination=0.01)
knn_detector.fit(X_normalized)
scores = knn_detector.decision_scores_

# Convert scores to ranks: the highest score receives rank 1.
sorted_indices = np.argsort(scores)[::-1]
ranks = np.empty_like(scores, dtype=int)
ranks[sorted_indices] = np.arange(1, len(scores) + 1)

df['knn_score'], df['knn_rank'] = scores, ranks

# Left panel: raw score. Right panel: rank derived from it.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
panels = [
    (axes[0], scores, 'viridis', f'Points colored by k-NN Score (k={k})', 'Anomaly Score'),
    (axes[1], ranks, 'plasma_r', 'Points colored by k-NN Rank', 'Rank (1 = most anomalous)'),
]
for ax, values, cmap_name, title, cbar_label in panels:
    sc = ax.scatter(
        df['attribute_1'], df['attribute_2'],
        c=values, cmap=cmap_name,
        s=30, alpha=0.6, edgecolors='black', linewidth=0.5,
    )
    ax.set_xlabel('Attribute 1', fontsize=12)
    ax.set_ylabel('Attribute 2', fontsize=12)
    ax.set_title(title, fontsize=13, fontweight='bold')
    plt.colorbar(sc, ax=ax).set_label(cbar_label, fontsize=11)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()



In [None]:
from pyod.models.lof import LOF
from sklearn.preprocessing import MinMaxScaler

# Feature matrix for the detector; y keeps the ground-truth flags and is not used for fitting.
X = df[['attribute_1', 'attribute_2']].values
y = df['is_outlier'].values

# Min-max scale both attributes before neighbour distances are computed.
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

k = 20
lof_detector = LOF(n_neighbors=k, contamination=0.01)
lof_detector.fit(X_normalized)
scores = lof_detector.decision_scores_

# Convert scores to ranks: the highest score receives rank 1.
sorted_indices = np.argsort(scores)[::-1]
ranks = np.empty_like(scores, dtype=int)
ranks[sorted_indices] = np.arange(1, len(scores) + 1)

df['lof_score'], df['lof_rank'] = scores, ranks

# Left panel: raw score. Right panel: rank derived from it.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
panels = [
    (axes[0], scores, 'viridis', f'Points colored by LOF Score (k={k})', 'Anomaly Score'),
    (axes[1], ranks, 'plasma_r', 'Points colored by LOF Rank', 'Rank (1 = most anomalous)'),
]
for ax, values, cmap_name, title, cbar_label in panels:
    sc = ax.scatter(
        df['attribute_1'], df['attribute_2'],
        c=values, cmap=cmap_name,
        s=30, alpha=0.6, edgecolors='black', linewidth=0.5,
    )
    ax.set_xlabel('Attribute 1', fontsize=12)
    ax.set_ylabel('Attribute 2', fontsize=12)
    ax.set_title(title, fontsize=13, fontweight='bold')
    plt.colorbar(sc, ax=ax).set_label(cbar_label, fontsize=11)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()



In [None]:
# Treat (k-NN score, LOF score) as a 2-D vector per point and describe it in polar form.
# NOTE(review): the two score columns are combined as stored, without rescaling to a
# common range, so whichever score is numerically larger dominates the magnitude.

knn_scores = df['knn_score'].values
lof_scores = df['lof_score'].values

mag = np.sqrt(np.square(knn_scores) + np.square(lof_scores))
angle = np.arctan2(lof_scores, knn_scores)  # angle measured from the k-NN axis towards the LOF axis

# Keep both quantities on the frame so later cells can use them.
df['euc_mag'], df['euc_angle'] = mag, angle

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
panels = [
    # (axis, colour values, colormap, title, colourbar label)
    (axes[0], mag, 'viridis', 'Outlier magnitude: sqrt(LOF^2 + kNN^2)', 'Magnitude'),
    (axes[1], angle, 'twilight_shifted', 'Locality vs Globality: angle = arctan2(LOF, kNN)', 'Angle (radians)'),
]
for ax, values, cmap_name, title, cbar_label in panels:
    sc = ax.scatter(
        df['attribute_1'], df['attribute_2'],
        c=values, cmap=cmap_name,
        s=30, alpha=0.7, edgecolors='black', linewidth=0.5,
    )
    ax.set_xlabel('Attribute 1', fontsize=12)
    ax.set_ylabel('Attribute 2', fontsize=12)
    ax.set_title(title, fontsize=13, fontweight='bold')
    plt.colorbar(sc, ax=ax).set_label(cbar_label, fontsize=11)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Rank-ratio view: for the union of the top-c points of both detectors, compare the
# LOF rank with the k-NN rank.
c = 100  # number of top-ranked points taken from each detector
p = 0    # constant added to both ranks before dividing
top_knn_indices = df.nsmallest(c, 'knn_rank').index.values
top_lof_indices = df.nsmallest(c, 'lof_rank').index.values

all_top_indices = np.unique(np.concatenate([top_knn_indices, top_lof_indices]))

# Select the union by index label. Writing ratio_values[label] directly is only
# correct while df has a default 0..n-1 index; a boolean mask is always positional.
mask_top = df.index.isin(all_top_indices)
mask_other = ~mask_top

# NaN marks points outside the union.
ratio_values = np.full(len(df), np.nan)
ratio_values[mask_top] = (
    (df.loc[mask_top, 'lof_rank'] + p) / (df.loc[mask_top, 'knn_rank'] + p)
).to_numpy()

top_ratios = ratio_values[mask_top]
ratio_min = top_ratios.min()
ratio_max = top_ratios.max()
ratio_mean = top_ratios.mean()

# Fixed colour limits (the data-driven limits were always overwritten by these values).
# Ratios outside [vmin, vmax] are drawn with the colours at the ends of the colormap.
vmin = 0.6
vmax = 1

fig, ax = plt.subplots(figsize=(12, 8))

scatter_other = ax.scatter(df.loc[mask_other, 'attribute_1'],
                           df.loc[mask_other, 'attribute_2'],
                           c='black', s=20, alpha=0.3, label='Other points')

scatter_top = ax.scatter(df.loc[mask_top, 'attribute_1'],
                         df.loc[mask_top, 'attribute_2'],
                         c=top_ratios, cmap='RdYlGn',
                         s=50, alpha=0.8, edgecolors='black', linewidth=0.5,
                         vmin=vmin, vmax=vmax, label='Top c points')

ax.set_xlabel('Attribute 1', fontsize=12)
ax.set_ylabel('Attribute 2', fontsize=12)
ax.set_title(f'Global vs Local Outliers (c={c})\nRatio = (LOF rank + {p}) / (k-NN rank + {p})',
             fontsize=13, fontweight='bold')
cbar = plt.colorbar(scatter_top, ax=ax)
cbar.set_label('Ratio (1 = global outlier, 0 = local outlier)', fontsize=11)
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Top {c} points by k-NN rank: {len(top_knn_indices)}")
print(f"Top {c} points by LOF rank: {len(top_lof_indices)}")
print(f"Total unique top points: {len(all_top_indices)}")
print(f"\nRatio statistics for top points:")
print(f"  Mean: {ratio_mean:.4f}")
print(f"  Min: {ratio_min:.4f}")
print(f"  Max: {ratio_max:.4f}")



In [None]:
# Rank-ratio view with smoothing: same comparison of LOF rank against k-NN rank,
# but with a constant added to both ranks before dividing.
c = 100  # number of top-ranked points taken from each detector
p = 20   # constant added to both ranks before dividing
top_knn_indices = df.nsmallest(c, 'knn_rank').index.values
top_lof_indices = df.nsmallest(c, 'lof_rank').index.values

all_top_indices = np.unique(np.concatenate([top_knn_indices, top_lof_indices]))

# Select the union by index label. Writing ratio_values[label] directly is only
# correct while df has a default 0..n-1 index; a boolean mask is always positional.
mask_top = df.index.isin(all_top_indices)
mask_other = ~mask_top

# NaN marks points outside the union.
ratio_values = np.full(len(df), np.nan)
ratio_values[mask_top] = (
    (df.loc[mask_top, 'lof_rank'] + p) / (df.loc[mask_top, 'knn_rank'] + p)
).to_numpy()

top_ratios = ratio_values[mask_top]
ratio_min = top_ratios.min()
ratio_max = top_ratios.max()
ratio_mean = top_ratios.mean()

# Fixed colour limits (the data-driven limits were always overwritten by these values).
# Ratios outside [vmin, vmax] are drawn with the colours at the ends of the colormap.
vmin = 0.6
vmax = 1

fig, ax = plt.subplots(figsize=(12, 8))

scatter_other = ax.scatter(df.loc[mask_other, 'attribute_1'],
                           df.loc[mask_other, 'attribute_2'],
                           c='black', s=20, alpha=0.3, label='Other points')

scatter_top = ax.scatter(df.loc[mask_top, 'attribute_1'],
                         df.loc[mask_top, 'attribute_2'],
                         c=top_ratios, cmap='RdYlGn',
                         s=50, alpha=0.8, edgecolors='black', linewidth=0.5,
                         vmin=vmin, vmax=vmax, label='Top c points')

ax.set_xlabel('Attribute 1', fontsize=12)
ax.set_ylabel('Attribute 2', fontsize=12)
ax.set_title(f'Global vs Local Outliers (c={c})\nRatio = (LOF rank + {p}) / (k-NN rank + {p})',
             fontsize=13, fontweight='bold')
cbar = plt.colorbar(scatter_top, ax=ax)
cbar.set_label('Ratio (1 = global outlier, 0 = local outlier)', fontsize=11)
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Top {c} points by k-NN rank: {len(top_knn_indices)}")
print(f"Top {c} points by LOF rank: {len(top_lof_indices)}")
print(f"Total unique top points: {len(all_top_indices)}")
print(f"\nRatio statistics for top points:")
print(f"  Mean: {ratio_mean:.4f}")
print(f"  Min: {ratio_min:.4f}")
print(f"  Max: {ratio_max:.4f}")



In [None]:
# Average detector score within each ground-truth group (0 = normal, 1 = outlier).
for heading, score_col in (
    ("k-NN mean score by label:", 'knn_score'),
    ("LOF mean score by label:", 'lof_score'),
):
    print(heading)
    print(df.groupby('is_outlier')[score_col].mean())


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np

def analyze_ensemble_globality(df, X_normalized, k_list=[3, 4, 5, 6, 7], contamination=0.05):
    """
    Score the top-LOF candidates by their average distance to the nearest KMeans centroid.

    For every k in ``k_list`` a KMeans model is fitted on ``X_normalized`` and each
    sample's distance to its closest centroid is recorded; the distances are then
    averaged over all k. Samples whose ``lof_score`` lies strictly above the
    ``1 - contamination`` quantile are the candidates. Their averaged distances are
    min-max scaled and written to ``df['ensemble_globality_index']`` (NaN for all
    other rows), and a scatter plot of the result is shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'lof_score', 'attribute_1' and 'attribute_2', with one row per
        row of ``X_normalized`` in the same order. Modified in place: the column
        'ensemble_globality_index' is created or overwritten.
    X_normalized : array-like of shape (n_samples, n_features)
        Feature matrix passed to KMeans.
    k_list : sequence of int
        Cluster counts to average over. The default list is never mutated here.
    contamination : float
        Fraction of samples, by LOF score, treated as candidates.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If 'lof_score' is missing, ``k_list`` is empty, or no sample exceeds the
        LOF threshold.
    """
    # Validate cheap preconditions before any clustering work is done.
    if 'lof_score' not in df.columns:
        raise ValueError("DataFrame must contain 'lof_score' column.")
    if len(k_list) == 0:
        raise ValueError("k_list must contain at least one cluster count.")

    print(f"--- Running Ensemble Globality Analysis ---")
    print(f"Averaging cluster structures for k = {k_list}")

    # ==========================================
    # 1. ENSEMBLE DISTANCE CALCULATION
    # ==========================================
    n_samples = X_normalized.shape[0]
    cumulative_dists = np.zeros(n_samples)

    for k in k_list:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(X_normalized)
        centers = kmeans.cluster_centers_

        # Distance from every sample to its closest centroid for this k.
        cumulative_dists += pairwise_distances(X_normalized, centers).min(axis=1)

    avg_dists_to_centers = cumulative_dists / len(k_list)

    # ==========================================
    # 2. FILTER CANDIDATES (top fraction by LOF)
    # ==========================================
    threshold = df['lof_score'].quantile(1 - contamination)
    # Plain boolean array so it can index both the DataFrame and the distance
    # array by position, whatever the DataFrame index looks like.
    candidate_mask = (df['lof_score'] > threshold).to_numpy()
    if not candidate_mask.any():
        raise ValueError(
            f"No sample has lof_score above the {1 - contamination:g} quantile; "
            "increase contamination."
        )

    candidate_dists = avg_dists_to_centers[candidate_mask]

    # ==========================================
    # 3. COMPUTE FINAL INDEX (scaled within the candidate subset)
    # ==========================================
    scaler_subset = MinMaxScaler()
    globality_index = scaler_subset.fit_transform(candidate_dists.reshape(-1, 1)).flatten()

    col_name = 'ensemble_globality_index'
    df[col_name] = np.nan
    df.loc[candidate_mask, col_name] = globality_index

    # ==========================================
    # 4. VISUALIZATION
    # ==========================================
    # int(contamination * 100) truncates float products such as 0.29 * 100
    # (= 28.999999999999996) to 28; general formatting gives the intended label.
    pct_label = f"{contamination * 100:g}"

    fig, ax = plt.subplots(figsize=(12, 9))

    # A. Background: every non-candidate point.
    ax.scatter(df.loc[~candidate_mask, 'attribute_1'],
               df.loc[~candidate_mask, 'attribute_2'],
               c='gainsboro', s=20, alpha=0.4, label='Normal Data')

    # B. Candidates, coloured by the scaled average centroid distance.
    scatter = ax.scatter(df.loc[candidate_mask, 'attribute_1'],
                         df.loc[candidate_mask, 'attribute_2'],
                         c=globality_index,
                         cmap='plasma',
                         s=60, alpha=0.9, edgecolors='black', linewidth=0.5,
                         label=f'Top {pct_label}% Anomalies')

    cbar = plt.colorbar(scatter, ax=ax)
    cbar.set_label(f'Ensemble Globality (Avg Dist to Centers, k={k_list})', fontsize=11)
    cbar.set_ticks([0, 0.5, 1])
    cbar.set_ticklabels(['Local (Sparse Inlier)', 'Mixed', 'Global (Isolated)'])

    ax.set_title(f'Ensemble Globality Map (Averaged over k={k_list})\n'
                 f'Showing top {pct_label}% LOF candidates',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Attribute 1')
    ax.set_ylabel('Attribute 2')
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return df

# ==========================================
# HOW TO RUN IT
# ==========================================

# Baseline run: a single clustering with k=5 (k_list has one entry, so no averaging
# across different cluster counts happens in this call).
df = analyze_ensemble_globality(df, X_normalized, k_list=[5], contamination=0.05)

# Show the five candidates with the largest globality index.
print("\nTop 5 Global Outliers (Most Isolated across all k):")
cols = ['attribute_1', 'attribute_2', 'ensemble_globality_index']
scored_candidates = df.dropna(subset=['ensemble_globality_index'])
print(scored_candidates.nlargest(5, 'ensemble_globality_index')[cols])

In [None]:
# Ensemble run: centroid distances averaged over k = 3..7, candidates above the 95% LOF quantile.
df = analyze_ensemble_globality(df, X_normalized, k_list=[3, 4, 5, 6, 7], contamination=0.05)

In [None]:
# Ensemble run with a wider range of cluster counts (k = 4..10), same 5% candidate fraction.
df = analyze_ensemble_globality(df, X_normalized, k_list=[4, 5, 6, 7, 8, 9, 10], contamination=0.05)

In [None]:
# contamination=1 puts the LOF threshold at the 0-quantile (the minimum score), so every
# point with a LOF score above the minimum is coloured as a candidate.
df = analyze_ensemble_globality(df, X_normalized, k_list=[3, 4, 5, 6, 7], contamination=1)

In [None]:
# Rank-product view: for the union of the top-c points of both detectors, colour
# each point by (mean of its two ranks) * (its LOF rank).
c = 100  # number of top-ranked points taken from each detector
top_knn_indices = df.nsmallest(c, 'knn_rank').index.values
top_lof_indices = df.nsmallest(c, 'lof_rank').index.values

all_top_indices = np.unique(np.concatenate([top_knn_indices, top_lof_indices]))

# Select the union by index label. Writing ratio_values[label] directly is only
# correct while df has a default 0..n-1 index; a boolean mask is always positional.
mask_top = df.index.isin(all_top_indices)
mask_other = ~mask_top

top_lof_ranks = df.loc[mask_top, 'lof_rank']
top_knn_ranks = df.loc[mask_top, 'knn_rank']

# Parenthesised exactly as Python evaluates the original `(lof + knn) / 2* (lof)`.
# NOTE(review): if (lof + knn) / (2 * lof) was intended instead, change the grouping
# here and update the title and colourbar label below to match.
ratio_values = np.full(len(df), np.nan)
ratio_values[mask_top] = (((top_lof_ranks + top_knn_ranks) / 2) * top_lof_ranks).to_numpy()

top_ratios = ratio_values[mask_top]
ratio_min = top_ratios.min()
ratio_max = top_ratios.max()
ratio_mean = top_ratios.mean()

# Colour limits span the observed range of the score.
vmin = ratio_min
vmax = ratio_max

fig, ax = plt.subplots(figsize=(12, 8))

scatter_other = ax.scatter(df.loc[mask_other, 'attribute_1'],
                           df.loc[mask_other, 'attribute_2'],
                           c='black', s=20, alpha=0.3, label='Other points')

scatter_top = ax.scatter(df.loc[mask_top, 'attribute_1'],
                         df.loc[mask_top, 'attribute_2'],
                         c=top_ratios, cmap='RdYlGn',
                         s=50, alpha=0.8, edgecolors='black', linewidth=0.5,
                         vmin=vmin, vmax=vmax, label='Top c points')

ax.set_xlabel('Attribute 1', fontsize=12)
ax.set_ylabel('Attribute 2', fontsize=12)
# Title and colourbar now describe the quantity that is actually plotted; the
# previous text advertised (LOF rank + p) / (k-NN rank + p) on a 0-1 scale.
ax.set_title(f'Global vs Local Outliers (c={c})\nScore = ((LOF rank + k-NN rank) / 2) * LOF rank',
             fontsize=13, fontweight='bold')
cbar = plt.colorbar(scatter_top, ax=ax)
cbar.set_label('Score = mean rank * LOF rank', fontsize=11)
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Top {c} points by k-NN rank: {len(top_knn_indices)}")
print(f"Top {c} points by LOF rank: {len(top_lof_indices)}")
print(f"Total unique top points: {len(all_top_indices)}")
print(f"\nRatio statistics for top points:")
print(f"  Mean: {ratio_mean:.4f}")
print(f"  Min: {ratio_min:.4f}")
print(f"  Max: {ratio_max:.4f}")



# Majority Voting Algorithm for Local vs Global Outlier Classification

**Key Idea:**
- If a point is in the top-c anomalies for **both k-NN and LOF** → vote for **Global**
- If a point is in the top-c anomalies for **LOF only** (not k-NN) → vote for **Local**
- Repeat for multiple c thresholds and use **majority voting** to classify each point

In [None]:
def majority_voting_classifier(
    df,
    c_values,
    knn_rank_col='knn_rank',
    lof_rank_col='lof_rank',
    assume_smaller_is_more_anomalous=True,
    weight_by_inverse_c=False,
    min_votes_to_classify=1,
    handle_knn_only_as='knn_only'  # options: 'ignore', 'global', 'knn_only'
):
    """
    Label points by voting over top-c memberships at several thresholds c.

    For every c in ``c_values`` the top-c rows of both rank columns are taken and
    each row receives one vote (weight 1, or 1/c when ``weight_by_inverse_c``):
      - in both top-c sets            -> global vote
      - in the LOF top-c set only     -> local vote
      - in the k-NN top-c set only    -> depends on ``handle_knn_only_as``:
            'knn_only' -> separate k-NN-only vote
            'global'   -> counted as a global vote
            any other value (e.g. 'ignore') -> no vote

    Resulting labels (column 'classification'):
      - 'normal'   : total votes below ``min_votes_to_classify``, or no votes at all
      - 'global' / 'local' / 'knn_only' : that category has strictly the most votes
      - 'tie'      : two or more categories share the highest non-zero vote count

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``knn_rank_col`` and ``lof_rank_col``. Not modified; a copy
        with the vote columns, 'total_votes' and 'classification' is returned.
    c_values : iterable of int
        Thresholds to vote over. Non-positive values are skipped and values
        larger than ``len(df)`` are clamped to it.
    assume_smaller_is_more_anomalous : bool
        True selects the top-c with ``nsmallest`` (rank 1 is the top); False
        uses ``nlargest`` for columns where larger means more anomalous.
    weight_by_inverse_c : bool
        With 1/c weights the vote totals are fractional, so the default
        ``min_votes_to_classify=1`` may need to be lowered.
    min_votes_to_classify : float
        Minimum total vote weight for a non-'normal' label.

    Returns
    -------
    pandas.DataFrame
    """
    df = df.copy()
    n = len(df)

    # One accumulator per category, aligned with df.index.
    votes_global = pd.Series(0.0, index=df.index)
    votes_local = pd.Series(0.0, index=df.index)
    votes_knn_only = pd.Series(0.0, index=df.index)

    for c in c_values:
        if c <= 0:
            continue
        c = min(c, n)
        weight = (1.0 / c) if weight_by_inverse_c else 1.0

        if assume_smaller_is_more_anomalous:
            top_knn_idx = df.nsmallest(c, knn_rank_col).index
            top_lof_idx = df.nsmallest(c, lof_rank_col).index
        else:
            top_knn_idx = df.nlargest(c, knn_rank_col).index
            top_lof_idx = df.nlargest(c, lof_rank_col).index

        knn_mask = df.index.isin(top_knn_idx)
        lof_mask = df.index.isin(top_lof_idx)

        both_mask = knn_mask & lof_mask
        only_lof_mask = lof_mask & ~knn_mask
        only_knn_mask = knn_mask & ~lof_mask

        votes_global.loc[both_mask] += weight
        votes_local.loc[only_lof_mask] += weight

        if handle_knn_only_as == 'global':
            votes_global.loc[only_knn_mask] += weight
        elif handle_knn_only_as == 'knn_only':
            votes_knn_only.loc[only_knn_mask] += weight
        # any other value: k-NN-only memberships cast no vote

    df['votes_global'] = votes_global
    df['votes_local'] = votes_local
    if handle_knn_only_as == 'knn_only':
        df['votes_knn_only'] = votes_knn_only
    # Sum the freshly counted votes rather than whatever 'votes_knn_only' column the
    # input frame may already have carried; the accumulator stays at zero unless
    # handle_knn_only_as == 'knn_only'.
    df['total_votes'] = votes_global + votes_local + votes_knn_only

    g = votes_global.to_numpy()
    l = votes_local.to_numpy()
    k = votes_knn_only.to_numpy()
    total = g + l + k
    top = np.maximum(np.maximum(g, l), k)
    # Number of categories that share the highest vote count for each row.
    n_at_top = (g == top).astype(int) + (l == top).astype(int) + (k == top).astype(int)

    # Conditions are checked in order and the first match wins. Any shared maximum
    # is a 'tie', including ties that involve k-NN-only votes (these previously
    # fell through to 'normal' despite having enough votes).
    df['classification'] = np.select(
        [
            total < min_votes_to_classify,
            top <= 0,
            n_at_top > 1,
            g == top,
            l == top,
        ],
        ['normal', 'normal', 'tie', 'global', 'local'],
        default='knn_only',
    )
    return df


In [None]:
# ============================
# Visualization of Results
# ============================

# Run the classifier first if no earlier cell has produced a 'classification' column.
if 'classification' not in df.columns:
    print("Classification column not found. Running majority voting classifier...")
    if 'knn_rank' not in df.columns or 'lof_rank' not in df.columns:
        raise ValueError("Missing required columns: 'knn_rank' and 'lof_rank' must exist. Please run k-NN and LOF detection first.")

    # Thresholds the vote is taken over.
    c_values = [5, 10, 15, 20, 25, 30, 35, 40, 45]

    df = majority_voting_classifier(df, c_values)
    print(f"Classification complete. Classified {len(df[df['classification'] != 'normal'])} anomalies.")

fig, axes = plt.subplots(1, 2, figsize=(18, 7))

# One colour per label the classifier can emit. 'knn_only' is its default label for
# k-NN-only points; without an entry here those points were missing from both panels.
color_map = {
    'normal': 'lightgray',
    'global': 'red',
    'local': 'blue',
    'knn_only': 'orange',
    'tie': 'purple'
}
# Labels drawn as highlighted anomalies; 'tie' and 'normal' are drawn as background.
anomaly_classes = ['global', 'local', 'knn_only']

# ---- PLOT 1: Classification Scatter ----
ax1 = axes[0]

for cls in anomaly_classes + ['tie', 'normal']:
    mask = df['classification'] == cls
    if mask.any():
        highlighted = cls in anomaly_classes
        ax1.scatter(
            df.loc[mask, 'attribute_1'],
            df.loc[mask, 'attribute_2'],
            c=color_map[cls],
            s=90 if highlighted else 35,
            alpha=0.75 if highlighted else 0.35,
            label=f"{cls.capitalize()} ({mask.sum()})",
            edgecolors='black' if highlighted else 'none',
            linewidth=0.5
        )

ax1.set_title(
    "Local vs Global Outlier Classification (Majority Voting)",
    fontsize=14, fontweight='bold'
)
ax1.set_xlabel("Attribute 1", fontsize=12)
ax1.set_ylabel("Attribute 2", fontsize=12)
ax1.grid(alpha=0.3)
ax1.legend(loc='best', fontsize=10)

# ---- PLOT 2: Total Votes Heat Scatter ----
ax2 = axes[1]

# Normal/tie as background
background_mask = df['classification'].isin(['normal', 'tie'])

ax2.scatter(
    df.loc[background_mask, 'attribute_1'],
    df.loc[background_mask, 'attribute_2'],
    c='lightgray',
    s=25,
    alpha=0.25,
    label='Normal/Tie'
)

# Every highlighted anomaly class is coloured by its total vote count.
anom_mask = df['classification'].isin(anomaly_classes)

heat = ax2.scatter(
    df.loc[anom_mask, 'attribute_1'],
    df.loc[anom_mask, 'attribute_2'],
    c=df.loc[anom_mask, 'total_votes'],
    cmap='viridis',
    s=80,
    alpha=0.85,
    edgecolors='black',
    linewidth=0.5
)

ax2.set_title(
    "Total Votes per Anomaly (Strength of Evidence)",
    fontsize=14, fontweight='bold'
)
ax2.set_xlabel("Attribute 1", fontsize=12)
ax2.set_ylabel("Attribute 2", fontsize=12)
ax2.grid(alpha=0.3)

cbar = plt.colorbar(heat, ax=ax2)
cbar.set_label("Total Votes", fontsize=11)

plt.tight_layout()
plt.show()

print("Visualization complete.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ==========================================
# FORMULA IMPLEMENTATIONS
# ==========================================

def compute_formulas(df, knn_rank_col='knn_rank', lof_rank_col='lof_rank'):
    """
    Add four rank-difference scores to ``df`` (in place) and return it.

    Every score has the sign of ``lof_rank - knn_rank``:
      - Positive -> the LOF rank number is larger than the k-NN rank number
                    (read by this notebook as a global outlier)
      - Negative -> the LOF rank number is smaller than the k-NN rank number
                    (read as a local outlier)
      - Near 0   -> both detectors rank the point similarly (ambiguous)

    Columns written:
      f1_grounded     : difference / (sum of both ranks + 1)
      f2_normalized   : difference / larger of the two ranks
      f3_percentile   : difference of the ranks after dividing each by len(df)
      f4_avg_grounded : difference / (mean of both ranks + 1)
    """
    lof = df[lof_rank_col].values
    knn = df[knn_rank_col].values
    total_rows = len(df)
    rank_gap = lof - knn

    # F1: difference scaled by the combined rank magnitude.
    df['f1_grounded'] = rank_gap / (lof + knn + 1)

    # F2: difference scaled by whichever rank is larger.
    df['f2_normalized'] = rank_gap / np.maximum(lof, knn)

    # F3: difference of ranks expressed as fractions of the dataset size.
    df['f3_percentile'] = lof / total_rows - knn / total_rows

    # F4: like F1, but scaled by the mean rank instead of the sum.
    df['f4_avg_grounded'] = rank_gap / ((lof + knn) / 2 + 1)

    return df

# ==========================================
# CLASSIFICATION FUNCTION
# ==========================================

def classify_by_formula(df, formula_col, threshold_global=0.1, threshold_local=-0.1):
    """
    Turn the scores in ``df[formula_col]`` into a list of labels, one per row:
      - score > threshold_global -> 'global'
      - score < threshold_local  -> 'local'
      - anything else            -> 'normal'

    Both comparisons are strict, so a score equal to a threshold is 'normal';
    NaN scores fail both comparisons and are 'normal' as well.
    """
    def _label(score):
        if score > threshold_global:
            return 'global'
        if score < threshold_local:
            return 'local'
        return 'normal'

    return [_label(value) for value in df[formula_col]]

# ==========================================
# EVALUATION AGAINST GROUND TRUTH
# ==========================================

def evaluate_method(df, pred_col, true_col='is_outlier'):
    """
    Score a label column against the binary ground truth.

    Every non-missing prediction other than 'normal' counts as a predicted
    outlier (1); 'normal' and missing values count as predicted inliers (0).
    This covers the 'tie' and 'knn_only' labels the majority-voting classifier
    can emit, which would otherwise be scored as inliers even though they are
    flagged points.

    Parameters
    ----------
    df : pandas.DataFrame
        Holds both the prediction and the ground-truth column.
    pred_col : str
        Column with string labels such as 'global', 'local', 'normal'.
    true_col : str
        Column with the ground truth (1 = outlier, 0 = inlier).

    Returns
    -------
    dict
        accuracy, precision, recall, f1_score; counts of 'global', 'local' and
        'normal' predictions; number of true and false positives among the
        detected points; and the percentage of true positives labelled 'global'.
    """
    y_true = df[true_col].values
    labels = df[pred_col]
    detected_mask = labels.notna() & (labels != 'normal')
    y_pred = detected_mask.astype(int).values

    # zero_division=0 keeps the metrics defined when nothing is predicted positive.
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    n_global = (labels == 'global').sum()
    n_local = (labels == 'local').sum()
    n_normal = (labels == 'normal').sum()

    # Split the detected points by whether the ground truth agrees.
    detected_outliers = df[detected_mask]
    true_positives = detected_outliers[detected_outliers[true_col] == 1]
    false_positives = detected_outliers[detected_outliers[true_col] == 0]

    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'n_global': n_global,
        'n_local': n_local,
        'n_normal': n_normal,
        'true_positives': len(true_positives),
        'false_positives': len(false_positives),
        # max(1, ...) avoids dividing by zero when there are no true positives.
        'pct_true_outliers_as_global': (true_positives[pred_col] == 'global').sum() / max(1, len(true_positives)) * 100
    }

# ==========================================
# MAIN TESTING FUNCTION
# ==========================================

def test_all_formulas(df):
    """
    Classify with each rank-difference formula, print its metrics, and return
    ``(df, results)`` where ``results`` maps a method name to its metric dict.

    If ``df`` already has a 'classification' column, it is evaluated as well
    under the name 'Majority Voting'.
    """
    def _report(metrics):
        # Shared metric printout used for every method.
        print(f"  Accuracy: {metrics['accuracy']:.3f}")
        print(f"  Precision: {metrics['precision']:.3f}")
        print(f"  Recall: {metrics['recall']:.3f}")
        print(f"  F1-Score: {metrics['f1_score']:.3f}")
        print(f"  Classified: {metrics['n_global']} global, {metrics['n_local']} local, {metrics['n_normal']} normal")
        print(f"  True outliers as global: {metrics['pct_true_outliers_as_global']:.1f}%")

    banner = "="*60
    print(banner)
    print("TESTING RANK DIFFERENCE FORMULAS")
    print(banner)

    df = compute_formulas(df)

    # (display name, score column, global threshold, local threshold)
    formula_specs = [
        ('F1: Grounded', 'f1_grounded', 0.15, -0.15),
        ('F2: Normalized', 'f2_normalized', 0.15, -0.15),
        ('F3: Percentile', 'f3_percentile', 0.05, -0.05),
        ('F4: Avg Grounded', 'f4_avg_grounded', 0.20, -0.20),
    ]

    results = {}
    for name, col, thresh_g, thresh_l in formula_specs:
        class_col = f'{col}_class'
        df[class_col] = classify_by_formula(df, col, thresh_g, thresh_l)
        results[name] = evaluate_method(df, class_col)

        print(f"\n{name} (thresholds: >{thresh_g:.2f} global, <{thresh_l:.2f} local)")
        _report(results[name])

    # The majority-voting labels are optional; compare only when present.
    if 'classification' in df.columns:
        print("\nMAJORITY VOTING (for comparison)")
        results['Majority Voting'] = evaluate_method(df, 'classification')
        _report(results['Majority Voting'])

    return df, results

# ==========================================
# VISUALIZATION: COMPARE ALL METHODS
# ==========================================

def visualize_all_methods(df):
    """
    Draw a 2x3 grid of scatter plots: the four formula classifications, the
    majority-voting classification and the ground-truth labels.

    A panel whose column is missing from ``df`` is hidden instead of being left
    as an empty set of axes. Label columns are coloured per class, including the
    'knn_only' label that the majority-voting classifier emits by default.
    """
    fig, axes = plt.subplots(2, 3, figsize=(20, 12))

    color_map = {
        'normal': 'lightgray',
        'global': 'red',
        'local': 'blue',
        'knn_only': 'orange',
        'tie': 'purple'
    }
    # Classes drawn large and outlined; everything else is drawn as background.
    highlighted_classes = ('global', 'local', 'knn_only')

    methods = [
        ('f1_grounded_class', 'F1: Grounded Rank Diff'),
        ('f2_normalized_class', 'F2: Normalized Diff'),
        ('f3_percentile_class', 'F3: Percentile Diff'),
        ('f4_avg_grounded_class', 'F4: Avg Grounded'),
        ('classification', 'Majority Voting'),
        ('is_outlier', 'Ground Truth (Labels)')
    ]

    for idx, (col, title) in enumerate(methods):
        ax = axes[idx // 3, idx % 3]

        if col not in df.columns:
            # Nothing to draw for this method; remove the blank panel.
            ax.set_visible(False)
            continue

        # Ground truth is binary (0/1), not a string label.
        if col == 'is_outlier':
            for val, color, label in [(0, 'lightgray', 'Normal'), (1, 'red', 'Outlier')]:
                mask = df[col] == val
                ax.scatter(df.loc[mask, 'attribute_1'], df.loc[mask, 'attribute_2'],
                          c=color, s=60 if val == 1 else 25, alpha=0.7 if val == 1 else 0.3,
                          label=f"{label} ({mask.sum()})", edgecolors='black' if val == 1 else 'none',
                          linewidth=0.5)
        else:
            for cls in ['global', 'local', 'knn_only', 'tie', 'normal']:
                mask = df[col] == cls
                if mask.any():
                    highlighted = cls in highlighted_classes
                    ax.scatter(df.loc[mask, 'attribute_1'], df.loc[mask, 'attribute_2'],
                              c=color_map[cls], s=70 if highlighted else 30,
                              alpha=0.75 if highlighted else 0.3,
                              label=f"{cls.capitalize()} ({mask.sum()})",
                              edgecolors='black' if highlighted else 'none',
                              linewidth=0.5)

        ax.set_title(title, fontsize=13, fontweight='bold')
        ax.set_xlabel('Attribute 1', fontsize=10)
        ax.set_ylabel('Attribute 2', fontsize=10)
        ax.legend(loc='best', fontsize=8)
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

# ==========================================
# FORMULA DISTRIBUTION VISUALIZATION
# ==========================================

def visualize_formula_distributions(df):
    """
    Plot per-formula score histograms, split by ground-truth label.

    A 2x2 grid is drawn, one panel per rank-based formula (F1-F4). Each panel
    overlays the scores of rows with ``is_outlier == 0`` (blue, 50 bins) and
    rows with ``is_outlier == 1`` (red, 30 bins), and marks each group's mean
    with a dashed vertical line.

    Parameters
    ----------
    df : DataFrame
        Must contain ``is_outlier`` and the columns ``f1_grounded``,
        ``f2_normalized``, ``f3_percentile`` and ``f4_avg_grounded``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    panels = (
        ('f1_grounded', 'F1: Grounded Rank Difference'),
        ('f2_normalized', 'F2: Normalized Difference'),
        ('f3_percentile', 'F3: Percentile Difference'),
        ('f4_avg_grounded', 'F4: Avg Grounded Difference'),
    )

    # The label split is the same for every panel, so build the row selectors once.
    is_normal = df['is_outlier'] == 0
    is_anomalous = df['is_outlier'] == 1

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # ravel() walks the grid row by row, i.e. panel i lands at (i // 2, i % 2).
    for ax, (score_col, panel_title) in zip(axes.ravel(), panels):
        inlier_scores = df.loc[is_normal, score_col]
        anomaly_scores = df.loc[is_anomalous, score_col]

        # Overlaid histograms, one per ground-truth group.
        ax.hist(inlier_scores, bins=50, alpha=0.6, color='blue',
                label=f'Normal (n={len(inlier_scores)})')
        ax.hist(anomaly_scores, bins=30, alpha=0.7, color='red',
                label=f'Outliers (n={len(anomaly_scores)})')

        # Dashed markers at each group's mean.
        inlier_mean = inlier_scores.mean()
        anomaly_mean = anomaly_scores.mean()
        ax.axvline(inlier_mean, color='blue', linestyle='--', linewidth=2,
                   label=f'Normal mean: {inlier_mean:.3f}')
        ax.axvline(anomaly_mean, color='red', linestyle='--', linewidth=2,
                   label=f'Outlier mean: {anomaly_mean:.3f}')

        ax.set_title(panel_title, fontsize=12, fontweight='bold')
        ax.set_xlabel('Formula Score', fontsize=10)
        ax.set_ylabel('Frequency', fontsize=10)
        ax.legend(loc='best', fontsize=9)
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

# ==========================================
# RESULTS COMPARISON TABLE
# ==========================================

def print_results_table(results):
    """
    Print a fixed-width comparison table, one row per method.

    Parameters
    ----------
    results : dict
        Maps a method name to a metrics dict providing the keys ``accuracy``,
        ``precision``, ``recall``, ``f1_score`` and
        ``pct_true_outliers_as_global``.

    Returns
    -------
    None
        The table is written to stdout.
    """
    heavy_rule = "=" * 90
    light_rule = "-" * 90
    header_fmt = "{:<20} {:<10} {:<11} {:<10} {:<10} {:<15}"
    row_fmt = "{:<20} {:<10.3f} {:<11.3f} {:<10.3f} {:<10.3f} {:<15.1f}"

    print("\n" + heavy_rule)
    print("COMPREHENSIVE RESULTS COMPARISON")
    print(heavy_rule)
    print(header_fmt.format('Method', 'Accuracy', 'Precision', 'Recall', 'F1',
                            '% True as Global'))
    print(light_rule)

    for method_name, method_metrics in results.items():
        print(row_fmt.format(
            method_name,
            method_metrics['accuracy'],
            method_metrics['precision'],
            method_metrics['recall'],
            method_metrics['f1_score'],
            method_metrics['pct_true_outliers_as_global'],
        ))

    print(heavy_rule)

# ==========================================
# HOW TO USE THIS CODE
# ==========================================
"""
# Assuming you have df with 'knn_rank', 'lof_rank', and 'is_outlier' columns:

# 1. Test all formulas
df, results = test_all_formulas(df)

# 2. Visualize all methods side-by-side
visualize_all_methods(df)

# 3. See formula distributions
visualize_formula_distributions(df)

# 4. Print comparison table
print_results_table(results)

# 5. Examine specific outliers
print("\nTop 10 outliers by Grounded Formula:")
print(df.nlargest(10, 'f1_grounded')[['attribute_1', 'attribute_2', 'f1_grounded', 
                                       'knn_rank', 'lof_rank', 'is_outlier']])
"""

In [None]:
# Driver cell for the formula-comparison helpers defined in the cell above.
# NOTE: `df` is rebound to the frame returned by test_all_formulas, so this cell
# changes notebook state that every later cell sees.

# Test all formulas and get results
# (`results` is the per-method metrics mapping consumed by print_results_table below)
df, results = test_all_formulas(df)

# Visualize everything side-by-side
visualize_all_methods(df)

# See how formulas separate outliers vs normal
# (histograms of F1-F4, split on the `is_outlier` ground-truth column)
visualize_formula_distributions(df)

# Print comparison table
print_results_table(results)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# ==========================================
# CANDIDATE FILTERING
# ==========================================

def get_candidate_mask(df, c=50, knn_rank_col='knn_rank', lof_rank_col='lof_rank'):
    """
    Select the rows that are in the top-c of kNN or of LOF (union).

    "Top-c" means the c smallest values of each rank column. A short summary
    (union size, size of each top-c set, overlap) is printed to stdout.

    Parameters
    ----------
    df : DataFrame
        Frame holding both rank columns.
    c : int
        Number of best-ranked rows taken from each detector.
    knn_rank_col, lof_rank_col : str
        Names of the kNN and LOF rank columns.

    Returns
    -------
    numpy.ndarray of bool
        One entry per row of ``df``; True where the row's index label belongs
        to either top-c set.
    """
    knn_top_idx = df.nsmallest(c, knn_rank_col).index
    lof_top_idx = df.nsmallest(c, lof_rank_col).index

    # Membership is tested on index labels, then OR-ed into one boolean array.
    picked_by_knn = df.index.isin(knn_top_idx)
    picked_by_lof = df.index.isin(lof_top_idx)
    candidates = np.logical_or(picked_by_knn, picked_by_lof)

    shared_labels = set(knn_top_idx).intersection(set(lof_top_idx))

    summary_lines = (
        f"Selected {candidates.sum()} candidates (union of top-{c} from kNN and LOF)",
        f"  - Top-{c} kNN: {len(knn_top_idx)}",
        f"  - Top-{c} LOF: {len(lof_top_idx)}",
        f"  - Overlap: {len(shared_labels)}",
    )
    for line in summary_lines:
        print(line)

    return candidates

# ==========================================
# FORMULA IMPLEMENTATIONS
# ==========================================

def compute_formulas(df, candidate_mask, knn_rank_col='knn_rank', lof_rank_col='lof_rank',
                     knn_score_col='knn_score', lof_score_col='lof_score'):
    """
    Compute rank-based and score-based globality formulas for candidate rows only.

    Twelve columns (``f0_simple`` ... ``f11_contrast``) are written into ``df``
    IN PLACE and the same frame is returned. Rows outside ``candidate_mask``
    are NaN in every formula column. Existing columns with these names are
    overwritten.

    Sign orientation (not uniform - read before comparing columns):
      - F0-F4, F8 (rank based, LOF_rank - kNN_rank): positive when kNN gives
        the point the smaller (more anomalous) rank, i.e. "global"; negative
        when LOF does, i.e. "local".
      - F5, F11 (kNN minus LOF on scores): positive when the kNN score
        dominates ("global").
      - F6, F7 (LOF minus kNN on scores): OPPOSITE orientation - positive when
        the LOF score dominates. F7 is the mirror image of F5.
      - F9: min-max normalized kNN score mapped to [-1, 1].
      - F10: negated min-max normalized LOF score mapped to [-1, 1].

    The min-max normalizations (F8 weight, F9, F10, F11) use the minimum and
    maximum over the CANDIDATES only, so values change with the candidate set.
    F3 divides by the length of the full frame.

    Parameters
    ----------
    df : DataFrame
        Frame holding the rank and score columns; mutated in place.
    candidate_mask : boolean array-like aligned with ``df`` rows
        Selects the rows to score.
    knn_rank_col, lof_rank_col, knn_score_col, lof_score_col : str
        Column names of the detector ranks and scores.

    Returns
    -------
    DataFrame
        The same ``df`` object with the twelve formula columns added.
    """
    # Work on plain arrays so every operation below is positional.
    knn_ranks = df.loc[candidate_mask, knn_rank_col].values
    lof_ranks = df.loc[candidate_mask, lof_rank_col].values
    knn_scores = df.loc[candidate_mask, knn_score_col].values
    lof_scores = df.loc[candidate_mask, lof_score_col].values
    n = len(df)

    # Every formula column starts as NaN; only candidate rows get filled in.
    formula_cols = ['f0_simple', 'f1_grounded', 'f2_normalized', 'f3_percentile',
                    'f4_avg_grounded', 'f5_score_ratio', 'f6_score_diff',
                    'f7_log_ratio', 'f8_hybrid', 'f9_knn_direct', 'f10_lof_inverse',
                    'f11_contrast']
    for col in formula_cols:
        df[col] = np.nan

    # No candidates: the min()/max() reductions below raise ValueError on
    # zero-size arrays, so stop here with the all-NaN columns in place.
    if knn_ranks.size == 0:
        return df

    # ==========================================
    # RANK-BASED FORMULAS
    # ==========================================

    # Shared building blocks for F0, F1, F2, F4 and F8.
    rank_gap = lof_ranks - knn_ranks
    grounded_gap = rank_gap / (lof_ranks + knn_ranks + 1)

    # F0: simple rank difference (baseline, unbounded).
    df.loc[candidate_mask, 'f0_simple'] = rank_gap

    # F1: rank difference divided by (rank sum + 1).
    df.loc[candidate_mask, 'f1_grounded'] = grounded_gap

    # F2: rank difference divided by the larger of the two ranks.
    df.loc[candidate_mask, 'f2_normalized'] = rank_gap / np.maximum(lof_ranks, knn_ranks)

    # F3: difference of rank fractions (rank / number of rows in df).
    lof_percentile = lof_ranks / n
    knn_percentile = knn_ranks / n
    df.loc[candidate_mask, 'f3_percentile'] = lof_percentile - knn_percentile

    # F4: rank difference divided by (mean rank + 1).
    avg_rank = (lof_ranks + knn_ranks) / 2
    df.loc[candidate_mask, 'f4_avg_grounded'] = rank_gap / (avg_rank + 1)

    # ==========================================
    # SCORE-BASED FORMULAS
    # ==========================================

    # F5: log(kNN / LOF), clipped to [-5, 5] and rescaled to [-1, 1].
    # Division by zero yields +/-inf, which the clip turns into +/-1;
    # 0/0 or a negative ratio yields NaN, which is kept as NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = knn_scores / lof_scores
        log_ratio = np.log(ratio)
        log_ratio = np.clip(log_ratio, -5, 5)
        df.loc[candidate_mask, 'f5_score_ratio'] = log_ratio / 5

    # F6: (LOF - kNN) / (LOF + kNN); the epsilon guards an exactly-zero sum.
    df.loc[candidate_mask, 'f6_score_diff'] = (lof_scores - knn_scores) / (lof_scores + knn_scores + 1e-10)

    # F7: log(LOF / kNN), clipped and rescaled like F5 (opposite sign to F5).
    with np.errstate(divide='ignore', invalid='ignore'):
        log_ratio = np.log(lof_scores / knn_scores)
        log_ratio = np.clip(log_ratio, -5, 5)
        df.loc[candidate_mask, 'f7_log_ratio'] = log_ratio / 5

    # F8: F1 weighted by (0.5 + w), where w is the geometric mean of the two
    # scores min-max normalized over the candidates.
    score_magnitude = np.sqrt(lof_scores * knn_scores)
    score_magnitude = (score_magnitude - score_magnitude.min()) / (score_magnitude.max() - score_magnitude.min() + 1e-10)
    df.loc[candidate_mask, 'f8_hybrid'] = grounded_gap * (0.5 + score_magnitude)

    # ==========================================
    # DIRECT SCORE INDICATORS
    # ==========================================

    # F9: kNN score alone, min-max normalized over candidates, mapped to [-1, 1].
    knn_normalized = (knn_scores - knn_scores.min()) / (knn_scores.max() - knn_scores.min() + 1e-10)
    df.loc[candidate_mask, 'f9_knn_direct'] = 2 * knn_normalized - 1

    # F10: LOF score, min-max normalized over candidates, mapped to [-1, 1] and
    # negated so that a high LOF score gives a negative value.
    lof_normalized = (lof_scores - lof_scores.min()) / (lof_scores.max() - lof_scores.min() + 1e-10)
    df.loc[candidate_mask, 'f10_lof_inverse'] = -(2 * lof_normalized - 1)

    # F11: normalized kNN score minus normalized LOF score.
    df.loc[candidate_mask, 'f11_contrast'] = knn_normalized - lof_normalized

    return df

# ==========================================
# 1. MAIN COMPARISON GRID (2×4)
# ==========================================

def plot_main_comparison_grid(df, candidate_mask):
    """
    Draw the 2x4 overview: ground truth, majority voting and six formulas.

    Every panel shows the non-candidate rows as a faint gray background and
    colors only the candidate rows:
      - 'ground_truth': candidates split on ``is_outlier`` (0 / 1).
      - 'majority_voting': candidates colored by the ``classification`` column
        (only if that column exists; otherwise the panel keeps just the
        background).
      - formula panels: candidates colored on a diverging 'RdBu_r' scale.
        ``f0_simple`` uses a symmetric range taken from its 5th/95th
        percentiles; every other formula uses the fixed range [-1, 1].

    A panel whose formula column is missing or entirely NaN still gets its
    title and axis labels, so an empty panel stays identifiable.

    Parameters
    ----------
    df : DataFrame
        Needs ``attribute_1``, ``attribute_2`` and ``is_outlier``; formula and
        ``classification`` columns are used when present.
    candidate_mask : boolean array-like aligned with ``df`` rows
        True for the rows to highlight.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, axes = plt.subplots(2, 4, figsize=(24, 12))
    axes = axes.flatten()

    # Colors for the discrete majority-voting classes.
    discrete_colors = {
        'normal': 'lightgray',
        'global': 'red',
        'local': 'blue',
        'tie': 'purple'
    }

    # Panel order, as (plot type or formula column, title).
    plots = [
        ('ground_truth', 'Ground Truth (Binary Labels)'),
        ('majority_voting', 'Majority Voting (Discrete)'),
        ('f9_knn_direct', 'F9: kNN_score (normalized) ‚≠ê'),
        ('f11_contrast', 'F11: kNN_norm - LOF_norm ‚≠ê'),
        ('f1_grounded', 'F1: (LOF_rank - kNN_rank) / (LOF_rank + kNN_rank + 1)'),
        ('f6_score_diff', 'F6: (LOF_score - kNN_score) / (LOF_score + kNN_score)'),
        ('f8_hybrid', 'F8: [(LOF_rank - kNN_rank) / Œ£ranks] √ó ‚àö(LOF¬∑kNN)'),
        ('f0_simple', 'F0: LOF_rank - kNN_rank')
    ]

    # Both slices are identical for every panel, so take them once.
    background_df = df.loc[~candidate_mask]
    cand_df = df[candidate_mask]

    for ax, (plot_type, title) in zip(axes, plots):
        # Background: all non-candidate points, kept out of the legend.
        ax.scatter(background_df['attribute_1'],
                   background_df['attribute_2'],
                   c='lightgray', s=15, alpha=0.2, label='_nolegend_')

        # --- GROUND TRUTH ---
        if plot_type == 'ground_truth':
            normal_mask = cand_df['is_outlier'] == 0
            outlier_mask = cand_df['is_outlier'] == 1

            ax.scatter(cand_df.loc[normal_mask, 'attribute_1'],
                       cand_df.loc[normal_mask, 'attribute_2'],
                       c='gray', s=80, alpha=0.6, edgecolors='black',
                       linewidth=0.5, label=f'Normal candidates ({normal_mask.sum()})')
            ax.scatter(cand_df.loc[outlier_mask, 'attribute_1'],
                       cand_df.loc[outlier_mask, 'attribute_2'],
                       c='red', s=120, alpha=0.9, edgecolors='black',
                       linewidth=0.8, label=f'True outliers ({outlier_mask.sum()})')
            ax.legend(loc='best', fontsize=9)

        # --- MAJORITY VOTING ---
        elif plot_type == 'majority_voting' and 'classification' in df.columns:
            for cls in ['normal', 'global', 'local', 'tie']:
                mask = cand_df['classification'] == cls
                if mask.any():
                    emphasized = cls in ['global', 'local']
                    ax.scatter(cand_df.loc[mask, 'attribute_1'],
                               cand_df.loc[mask, 'attribute_2'],
                               c=discrete_colors[cls],
                               s=120 if emphasized else 80,
                               alpha=0.85 if emphasized else 0.6,
                               edgecolors='black',
                               linewidth=0.7,
                               label=f"{cls.capitalize()} ({mask.sum()})")
            ax.legend(loc='best', fontsize=9)

        # --- FORMULAS (CONTINUOUS GRADIENT) ---
        elif plot_type in df.columns:
            scores = cand_df[plot_type].values
            valid_mask = ~np.isnan(scores)

            # Only draw when there is something to draw. Do NOT skip the rest
            # of the iteration: the title and labels below must still be set.
            if valid_mask.any():
                if plot_type == 'f0_simple':
                    # Unbounded formula: symmetric range from the 5th/95th percentiles.
                    vmin, vmax = np.percentile(scores[valid_mask], [5, 95])
                    vmax = max(abs(vmin), abs(vmax))
                    vmin = -vmax
                else:
                    vmin, vmax = -1, 1

                scatter = ax.scatter(cand_df.loc[valid_mask, 'attribute_1'],
                                     cand_df.loc[valid_mask, 'attribute_2'],
                                     c=scores[valid_mask], cmap='RdBu_r',
                                     s=120, alpha=0.85,
                                     edgecolors='black', linewidth=0.7,
                                     vmin=vmin, vmax=vmax)

                cbar = fig.colorbar(scatter, ax=ax)
                cbar.set_label('Global ‚Üê ‚Üí Local', fontsize=9)
                cbar.ax.tick_params(labelsize=8)

        ax.set_title(title, fontsize=11, fontweight='bold')
        ax.set_xlabel('Attribute 1', fontsize=9)
        ax.set_ylabel('Attribute 2', fontsize=9)
        ax.grid(alpha=0.2)

    plt.tight_layout()
    plt.suptitle(f'Comparison: Candidates Only (n={candidate_mask.sum()})',
                 fontsize=15, fontweight='bold', y=0.995)
    plt.show()

# ==========================================
# 2. FORMULA SHOWCASE (All Formulas)
# ==========================================

def plot_all_formulas_showcase(df, candidate_mask):
    """
    Draw one scatter panel per formula (12 panels, 3x4 grid).

    Non-candidate rows form a faint gray background; candidate rows with a
    non-NaN score are colored on the diverging 'RdBu_r' scale. ``f0_simple``
    uses a symmetric range from its 5th/95th percentiles, all other formulas
    the fixed range [-1, 1]. A panel without valid scores keeps only the
    background, title and labels.

    Parameters
    ----------
    df : DataFrame
        Needs ``attribute_1``, ``attribute_2`` and all twelve formula columns.
    candidate_mask : boolean array-like aligned with ``df`` rows
        True for the rows to highlight.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    formulas = [
        ('f9_knn_direct', 'F9: kNN_score (normalized) ‚≠ê'),
        ('f11_contrast', 'F11: kNN_norm - LOF_norm ‚≠ê'),
        ('f10_lof_inverse', 'F10: -LOF_score (normalized)'),
        ('f1_grounded', 'F1: (LOF_rank - kNN_rank) / (Œ£ranks + 1)'),
        ('f6_score_diff', 'F6: (LOF_score - kNN_score) / (Œ£scores)'),
        ('f8_hybrid', 'F8: rank_diff √ó ‚àö(LOF¬∑kNN)'),
        ('f0_simple', 'F0: LOF_rank - kNN_rank'),
        ('f2_normalized', 'F2: (LOF_rank - kNN_rank) / max(ranks)'),
        ('f3_percentile', 'F3: (LOF_rank/N) - (kNN_rank/N)'),
        ('f4_avg_grounded', 'F4: (LOF_rank - kNN_rank) / (avg_rank + 1)'),
        ('f5_score_ratio', 'F5: log(kNN_score / LOF_score) / 5'),
        ('f7_log_ratio', 'F7: log(LOF_score / kNN_score) / 5')
    ]

    fig, axes = plt.subplots(3, 4, figsize=(28, 18))
    panel_axes = axes.flatten()

    # Same rows for every panel.
    rest_df = df.loc[~candidate_mask]
    cand_df = df[candidate_mask]

    for ax, (score_col, panel_title) in zip(panel_axes, formulas):
        # Gray backdrop of everything that is not a candidate.
        ax.scatter(rest_df['attribute_1'], rest_df['attribute_2'],
                   c='lightgray', s=15, alpha=0.2)

        values = cand_df[score_col].values
        has_value = ~np.isnan(values)

        if has_value.any():
            if score_col == 'f0_simple':
                # Unbounded formula: center the color scale on zero.
                low, high = np.percentile(values[has_value], [5, 95])
                bound = max(abs(low), abs(high))
                vmin, vmax = -bound, bound
            else:
                vmin, vmax = -1, 1

            points = ax.scatter(cand_df.loc[has_value, 'attribute_1'],
                                cand_df.loc[has_value, 'attribute_2'],
                                c=values[has_value], cmap='RdBu_r',
                                s=100, alpha=0.85,
                                edgecolors='black', linewidth=0.6,
                                vmin=vmin, vmax=vmax)

            cbar = plt.colorbar(points, ax=ax)
            cbar.set_label('Score', fontsize=8)
            cbar.ax.tick_params(labelsize=7)

        ax.set_title(panel_title, fontsize=10, fontweight='bold')
        ax.set_xlabel('Attribute 1', fontsize=8)
        ax.set_ylabel('Attribute 2', fontsize=8)
        ax.grid(alpha=0.2)

    plt.tight_layout()
    plt.suptitle('All Formula Results (Candidates Only)', fontsize=15, fontweight='bold', y=0.995)
    plt.show()

# ==========================================
# 3. TOP-K TABLES
# ==========================================

def print_top_k_table(df, candidate_mask, formula_col, formula_name, k=10):
    """
    Print the k highest- and k lowest-scoring candidates for one formula.

    Candidates whose ``formula_col`` is NaN are ignored. If nothing is left, a
    single warning line is printed instead of the tables.

    Each table row shows the index label, ``attribute_1``, ``attribute_2``,
    the formula score, ``knn_rank``, ``lof_rank`` and the ground-truth label
    ('OUTLIER' when ``is_outlier == 1``, otherwise 'normal'). The rank and
    label column names are fixed.

    Values are read column by column rather than with ``iterrows()``:
    ``iterrows()`` converts each row to a single dtype, which printed integer
    ranks as floats (``5.0``) whenever the frame was all-numeric.

    Parameters
    ----------
    df : DataFrame
        Needs ``attribute_1``, ``attribute_2``, ``knn_rank``, ``lof_rank``,
        ``is_outlier`` and ``formula_col``.
    candidate_mask : boolean array-like aligned with ``df`` rows
        True for the rows to consider.
    formula_col : str
        Column holding the formula score.
    formula_name : str
        Human-readable name used in the headings.
    k : int
        Maximum number of rows per table.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    cand_df = df[candidate_mask].copy()
    cand_df = cand_df.dropna(subset=[formula_col])

    if len(cand_df) == 0:
        print(f"\n‚ö†Ô∏è  No valid data for {formula_name}")
        return

    heavy_rule = "=" * 110
    light_rule = "-" * 110
    column_header = (f"{'Index':<8} {'Attr1':<10} {'Attr2':<10} {'Score':<12} "
                     f"{'kNN_rank':<12} {'LOF_rank':<12} {'True_Label':<12}")

    def _print_section(heading, rows):
        """Print one heading plus the formatted rows of ``rows``."""
        print(heading)
        print(light_rule)
        print(column_header)
        print(light_rule)
        # Iterating each column separately keeps every value in its own dtype.
        row_values = zip(rows.index, rows['attribute_1'], rows['attribute_2'],
                         rows[formula_col], rows['knn_rank'], rows['lof_rank'],
                         rows['is_outlier'])
        for idx, attr1, attr2, score, knn_rank, lof_rank, is_outlier in row_values:
            label = 'OUTLIER' if is_outlier == 1 else 'normal'
            print(f"{idx:<8} {attr1:<10.3f} {attr2:<10.3f} "
                  f"{score:<12.4f} {knn_rank:<12} {lof_rank:<12} {label:<12}")

    print("\n" + heavy_rule)
    print(f"{formula_name.upper()} - TOP {k} CANDIDATES ANALYSIS")
    print(heavy_rule)

    n_rows = min(k, len(cand_df))

    # Highest scores first.
    _print_section(
        f"\nüî¥ TOP {k} MOST GLOBAL (Highest Scores - Isolated from Everything):",
        cand_df.nlargest(n_rows, formula_col),
    )

    # Lowest scores first.
    _print_section(
        f"\nüîµ TOP {k} MOST LOCAL (Lowest Scores - Locally Anomalous Only):",
        cand_df.nsmallest(n_rows, formula_col),
    )
    print(heavy_rule)

# ==========================================
# 4. DISTRIBUTION HISTOGRAMS (Candidates Only)
# ==========================================

def plot_candidate_distributions(df, candidate_mask):
    """
    Plot a histogram of every formula's scores over the candidate rows.

    Twelve panels (3x4 grid), 30 bins each, with a red dashed line at 0 and a
    green dashed line at the mean of the non-NaN scores. A formula without any
    non-NaN candidate score leaves its panel untouched (no title or labels).

    Parameters
    ----------
    df : DataFrame
        Needs all twelve formula columns.
    candidate_mask : boolean array-like aligned with ``df`` rows
        True for the rows to include.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    formulas = [
        ('f9_knn_direct', 'F9: kNN_score (norm) ‚≠ê'),
        ('f11_contrast', 'F11: kNN_norm - LOF_norm ‚≠ê'),
        ('f10_lof_inverse', 'F10: -LOF_score (norm)'),
        ('f1_grounded', 'F1: (LOF_r - kNN_r) / (Œ£r+1)'),
        ('f6_score_diff', 'F6: (LOF_s - kNN_s) / Œ£s'),
        ('f8_hybrid', 'F8: r_diff √ó ‚àö(LOF¬∑kNN)'),
        ('f0_simple', 'F0: LOF_rank - kNN_rank'),
        ('f2_normalized', 'F2: (LOF_r - kNN_r) / max(r)'),
        ('f3_percentile', 'F3: (LOF_r/N) - (kNN_r/N)'),
        ('f4_avg_grounded', 'F4: (LOF_r - kNN_r) / (avg+1)'),
        ('f5_score_ratio', 'F5: log(kNN_s / LOF_s) / 5'),
        ('f7_log_ratio', 'F7: log(LOF_s / kNN_s) / 5')
    ]

    fig, axes = plt.subplots(3, 4, figsize=(26, 16))
    panel_axes = axes.flatten()

    # The candidate slice does not depend on the formula.
    cand_df = df[candidate_mask]

    for ax, (score_col, panel_title) in zip(panel_axes, formulas):
        values = cand_df[score_col].dropna().values

        # Nothing to histogram for this formula.
        if values.size == 0:
            continue

        ax.hist(values, bins=30, color='steelblue', alpha=0.7, edgecolor='black')

        # Reference lines: the neutral point and the observed mean.
        ax.axvline(0, color='red', linestyle='--', linewidth=2, label='Neutral (0)')
        center = values.mean()
        ax.axvline(center, color='green', linestyle='--', linewidth=2,
                   label=f'Mean: {center:.3f}')

        ax.set_title(panel_title, fontsize=11, fontweight='bold')
        ax.set_xlabel('Score', fontsize=9)
        ax.set_ylabel('Frequency', fontsize=9)
        ax.legend(loc='best', fontsize=8)
        ax.grid(alpha=0.3, axis='y')

    plt.tight_layout()
    plt.suptitle(f'Distribution of Formula Scores (Candidates Only, n={candidate_mask.sum()})',
                 fontsize=14, fontweight='bold', y=0.995)
    plt.show()

# ==========================================
# 5. SUMMARY STATISTICS TABLE
# ==========================================

def print_formula_statistics(df, candidate_mask):
    """
    Print two summary tables for all formulas, restricted to candidate rows.

    Table 1: mean, standard deviation (NumPy default, i.e. population std),
    min, max and range of the non-NaN scores.
    Table 2: mean score of candidates with ``is_outlier == 0``, mean score of
    candidates with ``is_outlier == 1``, and outlier mean minus normal mean.

    A formula is left out of a table when it has no non-NaN score (table 1)
    or when either label group is empty (table 2).

    Parameters
    ----------
    df : DataFrame
        Needs ``is_outlier`` and all twelve formula columns.
    candidate_mask : boolean array-like aligned with ``df`` rows
        True for the rows to summarize.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # (column, display name) in print order.
    formulas = (
        ('f9_knn_direct', 'F9: kNN_score (normalized) ‚≠ê'),
        ('f11_contrast', 'F11: kNN_norm - LOF_norm ‚≠ê'),
        ('f10_lof_inverse', 'F10: -LOF_score (normalized)'),
        ('f0_simple', 'F0: LOF_rank - kNN_rank'),
        ('f1_grounded', 'F1: (LOF_rank - kNN_rank) / (Œ£ranks + 1)'),
        ('f2_normalized', 'F2: (LOF_rank - kNN_rank) / max(ranks)'),
        ('f3_percentile', 'F3: (LOF_rank/N) - (kNN_rank/N)'),
        ('f4_avg_grounded', 'F4: (LOF_rank - kNN_rank) / (avg_rank + 1)'),
        ('f5_score_ratio', 'F5: log(kNN_score / LOF_score) / 5'),
        ('f6_score_diff', 'F6: (LOF_score - kNN_score) / (Œ£scores)'),
        ('f7_log_ratio', 'F7: log(LOF_score / kNN_score) / 5'),
        ('f8_hybrid', 'F8: rank_diff √ó ‚àö(LOF¬∑kNN)'),
    )

    heavy_rule = "=" * 110
    light_rule = "-" * 110
    cand_df = df[candidate_mask]

    print("\n" + heavy_rule)
    print("FORMULA STATISTICS SUMMARY (CANDIDATES ONLY)")
    print(heavy_rule)

    # ---- Table 1: overall spread of each formula ----
    print(f"\n{'Formula':<35} {'Mean':<10} {'Std':<10} {'Min':<10} {'Max':<10} {'Range':<10}")
    print(light_rule)

    for score_col, display_name in formulas:
        values = cand_df[score_col].dropna().values
        if values.size == 0:
            continue
        lowest = values.min()
        highest = values.max()
        print(f"{display_name:<35} {values.mean():<10.4f} {values.std():<10.4f} "
              f"{lowest:<10.4f} {highest:<10.4f} {highest - lowest:<10.4f}")

    # ---- Table 2: mean score per ground-truth group ----
    print(f"\n{'Formula':<35} {'Mean (Normal)':<15} {'Mean (Outliers)':<15} {'Difference':<15}")
    print(light_rule)

    normal_rows = cand_df[cand_df['is_outlier'] == 0]
    outlier_rows = cand_df[cand_df['is_outlier'] == 1]

    for score_col, display_name in formulas:
        normal_scores = normal_rows[score_col].dropna()
        outlier_scores = outlier_rows[score_col].dropna()
        if normal_scores.empty or outlier_scores.empty:
            continue
        mean_normal = normal_scores.mean()
        mean_outlier = outlier_scores.mean()
        print(f"{display_name:<35} {mean_normal:<15.4f} {mean_outlier:<15.4f} "
              f"{mean_outlier - mean_normal:<15.4f}")

    print(heavy_rule)

# ==========================================
# MASTER FUNCTION: RUN ALL ANALYSES
# ==========================================

def run_complete_analysis(df, c=50, show_all_formulas=True, show_top_k=True, k=10):
    """
    Run the whole candidates-only analysis in one call.

    Steps, in order:
      1. ``get_candidate_mask`` - union of the top-c rows of kNN and LOF.
      2. ``compute_formulas`` - adds the formula columns for those rows.
      3. ``plot_main_comparison_grid``.
      4. ``plot_all_formulas_showcase`` (only if ``show_all_formulas``).
      5. ``print_top_k_table`` for six selected formulas (only if ``show_top_k``).
      6. ``plot_candidate_distributions``.
      7. ``print_formula_statistics``.

    Parameters
    ----------
    df : DataFrame
        Input frame, passed through ``compute_formulas``.
    c : int
        Top-c cutoff used to pick candidates.
    show_all_formulas : bool
        Whether to draw the 12-panel showcase.
    show_top_k : bool
        Whether to print the top-K tables.
    k : int
        Rows per top-K table.

    Returns
    -------
    tuple
        ``(df, candidate_mask)``: the frame returned by ``compute_formulas``
        and the candidate mask returned by ``get_candidate_mask``.
    """
    # Formulas that get a top-K table, as (column, heading name).
    top_k_formulas = (
        ('f9_knn_direct', 'F9: kNN_score (normalized)'),
        ('f11_contrast', 'F11: kNN_norm - LOF_norm'),
        ('f1_grounded', 'F1: (LOF_rank - kNN_rank) / (Œ£ranks+1)'),
        ('f6_score_diff', 'F6: (LOF_score - kNN_score) / Œ£scores'),
        ('f0_simple', 'F0: LOF_rank - kNN_rank'),
        ('f8_hybrid', 'F8: rank_diff √ó ‚àö(LOF¬∑kNN)'),
    )

    banner = "=" * 110
    print(banner)
    print("STARTING COMPREHENSIVE FORMULA ANALYSIS (CANDIDATES ONLY)")
    print(banner)

    # Step 1: pick the candidates.
    candidate_mask = get_candidate_mask(df, c=c)

    # Step 2: score them.
    df = compute_formulas(df, candidate_mask)
    print("‚úì Computed 12 formulas (5 rank-based + 4 score-based + 3 direct indicators)")
    print("  ‚≠ê NEW: F9 (kNN Direct), F10 (LOF Inverse), F11 (Contrast) - Simple score-based!")

    # Step 3: overview grid.
    print("\nüìä Generating main comparison grid...")
    plot_main_comparison_grid(df, candidate_mask)

    # Step 4: optional full showcase.
    if show_all_formulas:
        print("\nüìä Generating complete formula showcase...")
        plot_all_formulas_showcase(df, candidate_mask)

    # Step 5: optional top-K tables.
    if show_top_k:
        print("\nüìã Generating top-K tables...")
        for formula_col, formula_title in top_k_formulas:
            print_top_k_table(df, candidate_mask, formula_col, formula_title, k)

    # Step 6: score histograms.
    print("\nüìä Generating distribution histograms...")
    plot_candidate_distributions(df, candidate_mask)

    # Step 7: summary statistics.
    print_formula_statistics(df, candidate_mask)

    print("\n‚úì Complete analysis finished!")
    print(f"‚úì Analyzed {candidate_mask.sum()} candidates from {len(df)} total points")
    return df, candidate_mask

# ==========================================
# HOW TO USE THIS CODE
# ==========================================
"""
# USAGE EXAMPLE:

# Assuming you have df with these columns:
# - 'attribute_1', 'attribute_2' (features)
# - 'is_outlier' (binary ground truth: 37 outliers)
# - 'knn_rank', 'lof_rank' (from k-NN and LOF detectors)
# - 'knn_score', 'lof_score' (anomaly scores)
# - 'classification' (from majority voting - optional)

# Run the complete analysis (c=50 since we have 37 true outliers):
df, candidate_mask = run_complete_analysis(df, c=50, show_all_formulas=True, show_top_k=True, k=10)

# NOTATION USED IN PLOTS:
# - kNN_score, LOF_score: Raw anomaly scores from detectors
# - kNN_rank, LOF_rank: Ranks (1 = most anomalous)
# - kNN_norm, LOF_norm: Normalized scores to [0, 1]
# - Σranks = LOF_rank + kNN_rank
# - Σscores = LOF_score + kNN_score
# - r = rank (shortened for space)
# - s = score (shortened for space)

# NEW SIMPLE FORMULAS TO CHECK FIRST:
# - F9: kNN_score (normalized) - High kNN = far from neighbors = GLOBAL ⭐
# - F11: kNN_norm - LOF_norm - Contrast shows which detector is stronger
# - F10: -LOF_score (normalized) - Inverted LOF for comparison

# The results will show:
# - Only the top-50 candidates (union of kNN and LOF top-50)
# - All background points in light gray
# - Clear gradient coloring for candidates only
# - F9 should show clear global vs local separation!
"""

In [None]:
# Run the full candidates-only analysis on the union of the top-50 kNN and
# top-50 LOF rows. Rebinds `df` to the returned frame (which carries the
# formula columns) and keeps the candidate mask for later cells.
df, candidate_mask = run_complete_analysis(df, c=50)

# Extra Testing

In [None]:
# ======================================
# Distribution of Votes (Hist + KDE)
# ======================================
# Overlays the histograms (with KDE curves) of the `votes_global` and
# `votes_local` columns of `df` on a single axes.
import seaborn as sns
sns.set(style="whitegrid", font_scale=1.1)

fig, ax = plt.subplots(figsize=(12, 6))

# One histogram per vote column, drawn in this order on the shared axes.
for votes_col, hist_color, legend_label in (
    ('votes_global', 'red', 'Global votes'),
    ('votes_local', 'blue', 'Local votes'),
):
    sns.histplot(df[votes_col], kde=True, bins=20, color=hist_color, alpha=0.5,
                 label=legend_label, ax=ax)

ax.set_title("Distribution of Global vs Local Votes", fontsize=14, fontweight='bold')
ax.set_xlabel("Votes")
ax.set_ylabel("Count")
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()


In [None]:
def majority_voting_classifier(df, c_values, knn_rank_col='knn_rank', lof_rank_col='lof_rank'):
    """
    Classify anomalies as local or global using majority voting across multiple thresholds.

    For every threshold c, the c rows with the smallest k-NN rank and the c rows
    with the smallest LOF rank are selected. A row that appears in both
    selections receives one "global" vote; a row that appears only in the LOF
    selection receives one "local" vote; a row that appears only in the k-NN
    selection receives no vote.

    Parameters:
    -----------
    df : DataFrame
        DataFrame containing k-NN and LOF ranks. It is modified in place
        (the result columns are added to it) and also returned.
    c_values : list or array
        List of top-c thresholds to test (e.g., [10, 20, 30, 40, 50, 100])
    knn_rank_col : str
        Column name for k-NN ranks (smaller rank = selected first)
    lof_rank_col : str
        Column name for LOF ranks (smaller rank = selected first)

    Returns:
    --------
    DataFrame with added columns:
        - 'votes_global': number of global votes
        - 'votes_local': number of local votes
        - 'total_votes': votes_global + votes_local
        - 'classification': 'global', 'local', 'tie' (equal, non-zero votes)
          or 'normal' (no votes)
    """
    n_rows = len(df)
    votes_global = np.zeros(n_rows, dtype=int)
    votes_local = np.zeros(n_rows, dtype=int)

    # Row *positions* ordered by ascending rank. Positions (not index labels)
    # are used so the vote arrays line up with the rows even when the frame
    # does not carry a default 0..n-1 index. The stable sort keeps the first
    # occurrence ahead of later ones when ranks are tied.
    knn_order = np.argsort(df[knn_rank_col].to_numpy(), kind='stable')
    lof_order = np.argsort(df[lof_rank_col].to_numpy(), kind='stable')

    for c in c_values:
        top = max(int(c), 0)

        in_knn = np.zeros(n_rows, dtype=bool)
        in_lof = np.zeros(n_rows, dtype=bool)
        in_knn[knn_order[:top]] = True
        in_lof[lof_order[:top]] = True

        # In both top-c sets → global vote
        votes_global += in_knn & in_lof
        # Only in the LOF top-c set → local vote
        votes_local += in_lof & ~in_knn
        # Only in the k-NN top-c set → no vote (ambiguous case)

    # Add votes to dataframe
    df['votes_global'] = votes_global
    df['votes_local'] = votes_local
    df['total_votes'] = votes_global + votes_local

    # Majority decision; the third condition is only reached when the two
    # counts are equal, so it separates a real tie from "no votes at all".
    df['classification'] = np.select(
        [votes_global > votes_local, votes_local > votes_global, votes_global > 0],
        ['global', 'local', 'tie'],
        default='normal',
    )

    return df

# Top-c cut-offs to vote over: 5, 10, ..., 45
c_values = list(range(5, 50, 5))

# Run the voting scheme; the classifier adds its result columns to df
df = majority_voting_classifier(df, c_values)

# Headline numbers
print("=" * 80)
print("MAJORITY VOTING CLASSIFICATION RESULTS")
print("=" * 80)
print(f"\nTested thresholds: {c_values}")
print(f"Number of threshold values: {len(c_values)}")
print("\nClassification counts:")
print(df['classification'].value_counts())

print("\n" + "-" * 80)
print("Vote statistics by classification:")
print("-" * 80)
# Per-class vote averages; classes with no members are skipped
for cls in ['global', 'local', 'tie', 'normal']:
    subset = df[df['classification'] == cls]
    if subset.empty:
        continue
    print(
        f"\n{cls.upper()}:",
        f"  Count: {len(subset)}",
        f"  Avg global votes: {subset['votes_global'].mean():.2f}",
        f"  Avg local votes: {subset['votes_local'].mean():.2f}",
        sep="\n",
    )

print("\n" + "=" * 80)

In [None]:
# Visualize the classification results: categorical classes on the left,
# total vote counts for the classified anomalies on the right.
fig, axes = plt.subplots(1, 2, figsize=(18, 7))

# Define colors for each classification
color_map = {
    'normal': 'lightgray',
    'global': 'red',
    'local': 'blue',
    'tie': 'purple'
}

# Plot 1: Classification (categorical colors)
ax1 = axes[0]
for cls in ['normal', 'global', 'local', 'tie']:
    mask = df['classification'] == cls
    if mask.sum() > 0:
        # Only 'global' and 'local' points are emphasised (larger, outlined);
        # 'normal' and 'tie' points are drawn small and faint.
        ax1.scatter(
            df.loc[mask, 'attribute_1'],
            df.loc[mask, 'attribute_2'],
            c=color_map[cls],
            s=50 if cls in ['global', 'local'] else 20,
            alpha=0.7 if cls in ['global', 'local'] else 0.3,
            label=f'{cls.capitalize()} ({mask.sum()})',
            edgecolors='black' if cls in ['global', 'local'] else 'none',
            linewidth=0.5
        )

ax1.set_xlabel('Attribute 1', fontsize=12)
ax1.set_ylabel('Attribute 2', fontsize=12)
ax1.set_title('Local vs Global Outlier Classification\n(Majority Voting)', 
             fontsize=14, fontweight='bold')
ax1.legend(loc='best', fontsize=10)
ax1.grid(True, alpha=0.3)

# Plot 2: Total votes (only for classified anomalies)
ax2 = axes[1]
anomaly_mask = df['classification'].isin(['global', 'local'])
# Everything that is not 'global'/'local' (i.e. 'normal' and 'tie') is background
normal_mask = ~anomaly_mask

# Plot normal points in background
ax2.scatter(
    df.loc[normal_mask, 'attribute_1'],
    df.loc[normal_mask, 'attribute_2'],
    c='lightgray',
    s=20,
    alpha=0.3,
    label='Normal/Tie'
)

# Plot anomalies colored by total votes
scatter = ax2.scatter(
    df.loc[anomaly_mask, 'attribute_1'],
    df.loc[anomaly_mask, 'attribute_2'],
    c=df.loc[anomaly_mask, 'total_votes'],
    cmap='viridis',
    s=50,
    alpha=0.8,
    edgecolors='black',
    linewidth=0.5
)

ax2.set_xlabel('Attribute 1', fontsize=12)
ax2.set_ylabel('Attribute 2', fontsize=12)
ax2.set_title('Total Votes Received\n(across all thresholds)', 
             fontsize=14, fontweight='bold')
cbar = plt.colorbar(scatter, ax=ax2)
cbar.set_label('Total votes', fontsize=11)
# The background scatter is labelled 'Normal/Tie'; without a legend that label
# is never shown, so the gray points would go unexplained.
ax2.legend(loc='best', fontsize=10)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nVisualization complete!")

In [None]:
# Create a detailed comparison with ground truth labels (2x2 grid):
# ground truth, majority-vote classes, global-vote counts, local-vote counts.
fig, axes = plt.subplots(2, 2, figsize=(16, 14))

# Plot 1: Ground Truth Labels
ax1 = axes[0, 0]
for is_out, color, label in [(0, 'blue', 'Normal'), (1, 'red', 'Outlier')]:
    mask = df['is_outlier'] == is_out
    # 'x' is an unfilled marker: matplotlib ignores an explicit edge colour for
    # it (and warns), so edgecolors is only passed for the filled 'o' marker.
    marker_style = {'marker': 'x'} if is_out else {'marker': 'o', 'edgecolors': 'none'}
    ax1.scatter(
        df.loc[mask, 'attribute_1'],
        df.loc[mask, 'attribute_2'],
        c=color,
        s=40 if is_out else 20,
        alpha=0.6,
        label=f'{label} ({mask.sum()})',
        linewidth=0.5,
        **marker_style
    )
ax1.set_xlabel('Attribute 1', fontsize=11)
ax1.set_ylabel('Attribute 2', fontsize=11)
ax1.set_title('Ground Truth Labels', fontsize=13, fontweight='bold')
ax1.legend(loc='best', fontsize=9)
ax1.grid(True, alpha=0.3)

# Plot 2: Our Classification
ax2 = axes[0, 1]
for cls, color in [('normal', 'lightgray'), ('global', 'red'), ('local', 'blue'), ('tie', 'purple')]:
    mask = df['classification'] == cls
    if mask.sum() > 0:
        ax2.scatter(
            df.loc[mask, 'attribute_1'],
            df.loc[mask, 'attribute_2'],
            c=color,
            s=50 if cls in ['global', 'local'] else 20,
            alpha=0.7 if cls in ['global', 'local'] else 0.3,
            label=f'{cls.capitalize()} ({mask.sum()})',
            edgecolors='black' if cls in ['global', 'local'] else 'none',
            linewidth=0.5
        )
ax2.set_xlabel('Attribute 1', fontsize=11)
ax2.set_ylabel('Attribute 2', fontsize=11)
ax2.set_title('Majority Vote Classification', fontsize=13, fontweight='bold')
ax2.legend(loc='best', fontsize=9)
ax2.grid(True, alpha=0.3)

# Plot 3: Voting details - Global votes
ax3 = axes[1, 0]
anomaly_mask = df['classification'].isin(['global', 'local'])
# Background = every point not classified 'global' or 'local' ('normal' and 'tie')
normal_mask = ~anomaly_mask

ax3.scatter(df.loc[normal_mask, 'attribute_1'], df.loc[normal_mask, 'attribute_2'],
           c='lightgray', s=15, alpha=0.2)
scatter3 = ax3.scatter(
    df.loc[anomaly_mask, 'attribute_1'],
    df.loc[anomaly_mask, 'attribute_2'],
    c=df.loc[anomaly_mask, 'votes_global'],
    cmap='Reds',
    s=50,
    alpha=0.8,
    edgecolors='black',
    linewidth=0.5
)
ax3.set_xlabel('Attribute 1', fontsize=11)
ax3.set_ylabel('Attribute 2', fontsize=11)
ax3.set_title('Global Votes (higher = more global)', fontsize=13, fontweight='bold')
cbar3 = plt.colorbar(scatter3, ax=ax3)
cbar3.set_label('Global votes', fontsize=10)
ax3.grid(True, alpha=0.3)

# Plot 4: Voting details - Local votes
ax4 = axes[1, 1]
ax4.scatter(df.loc[normal_mask, 'attribute_1'], df.loc[normal_mask, 'attribute_2'],
           c='lightgray', s=15, alpha=0.2)
scatter4 = ax4.scatter(
    df.loc[anomaly_mask, 'attribute_1'],
    df.loc[anomaly_mask, 'attribute_2'],
    c=df.loc[anomaly_mask, 'votes_local'],
    cmap='Blues',
    s=50,
    alpha=0.8,
    edgecolors='black',
    linewidth=0.5
)
ax4.set_xlabel('Attribute 1', fontsize=11)
ax4.set_ylabel('Attribute 2', fontsize=11)
ax4.set_title('Local Votes (higher = more local)', fontsize=13, fontweight='bold')
cbar4 = plt.colorbar(scatter4, ax=ax4)
cbar4.set_label('Local votes', fontsize=10)
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Detailed analysis: How do our classifications align with ground truth?
print("=" * 80)
print("CLASSIFICATION vs GROUND TRUTH ANALYSIS")
print("=" * 80)

# Cross-tabulation
print("\nCross-tabulation: Classification vs Ground Truth")
print("-" * 80)
crosstab = pd.crosstab(
    df['classification'], 
    df['is_outlier'],
    margins=True,
    margins_name='Total'
)
# Rename by label instead of assigning a fixed-length list, so the cell does not
# fail with a length mismatch when only one ground-truth class is present.
# rename_axis drops the 'is_outlier' header so the printed table is unchanged.
crosstab = crosstab.rename(
    columns={0: 'Normal (GT)', 1: 'Outlier (GT)'}
).rename_axis(columns=None)
print(crosstab)

# Among ground truth outliers, how are they classified?
gt_outliers = df[df['is_outlier'] == 1]
print("\n" + "=" * 80)
print("GROUND TRUTH OUTLIERS BREAKDOWN")
print("=" * 80)
print(f"Total ground truth outliers: {len(gt_outliers)}")
print("\nHow are they classified?")
for cls in ['global', 'local', 'tie', 'normal']:
    count = (gt_outliers['classification'] == cls).sum()
    # Guard against an empty ground-truth set
    pct = count / len(gt_outliers) * 100 if len(gt_outliers) > 0 else 0
    print(f"  {cls.capitalize():10s}: {count:4d} ({pct:5.1f}%)")

# Show top examples of each type
print("\n" + "=" * 80)
print("TOP EXAMPLES BY CLASSIFICATION")
print("=" * 80)

for cls in ['global', 'local']:
    subset = df[df['classification'] == cls].nlargest(5, 'total_votes')
    if len(subset) > 0:
        print(f"\n{cls.upper()} OUTLIERS (Top 5 by total votes):")
        print("-" * 80)
        cols_to_show = ['attribute_1', 'attribute_2', 'knn_rank', 'lof_rank', 
                        'votes_global', 'votes_local', 'total_votes', 'is_outlier']
        print(subset[cols_to_show].to_string(index=True))

# Statistics on ranks
print("\n" + "=" * 80)
print("RANK STATISTICS BY CLASSIFICATION")
print("=" * 80)

for cls in ['global', 'local']:
    subset = df[df['classification'] == cls]
    if len(subset) > 0:
        print(f"\n{cls.upper()} outliers (n={len(subset)}):")
        print(f"  k-NN rank - mean: {subset['knn_rank'].mean():.1f}, median: {subset['knn_rank'].median():.1f}, min: {subset['knn_rank'].min()}, max: {subset['knn_rank'].max()}")
        print(f"  LOF rank  - mean: {subset['lof_rank'].mean():.1f}, median: {subset['lof_rank'].median():.1f}, min: {subset['lof_rank'].min()}, max: {subset['lof_rank'].max()}")
        print(f"  Rank diff (LOF-kNN) - mean: {(subset['lof_rank'] - subset['knn_rank']).mean():.1f}")

print("\n" + "=" * 80)

## Experiment with Different Threshold Ranges

Let's see how the classification changes with different ranges of c values.

In [None]:
# Compare how the classification changes under different sets of c values.
# Each strategy name maps to the list of thresholds that will be voted over.
threshold_strategies = {
    'Fine-grained (10-100)': list(range(10, 101, 10)),
    'Coarse (10-200, step 20)': list(range(10, 201, 20)),
    'Wide range (5-300)': [5, 10, 20, 30, 50, 75, 100, 150, 200, 250, 300],
    'Small thresholds (5-50)': list(range(5, 51)),
}

results_summary = []

fig, axes = plt.subplots(2, 2, figsize=(18, 14))
axes = axes.flatten()

for idx, (strategy_name, c_vals) in enumerate(threshold_strategies.items()):
    # Work on a fresh copy of the input columns so the votes stored in df are untouched
    df_temp = majority_voting_classifier(
        df[['attribute_1', 'attribute_2', 'knn_rank', 'lof_rank', 'is_outlier']].copy(),
        c_vals,
    )

    # One summary row per strategy: the thresholds plus the size of each class
    summary = {'Strategy': strategy_name, 'Thresholds': c_vals}
    for class_name in ('global', 'local', 'tie', 'normal'):
        summary[class_name.capitalize()] = (df_temp['classification'] == class_name).sum()
    results_summary.append(summary)

    # One panel per strategy; classes without members are not drawn
    ax = axes[idx]
    for cls, color in [('normal', 'lightgray'), ('global', 'red'), ('local', 'blue'), ('tie', 'purple')]:
        mask = df_temp['classification'] == cls
        if not mask.any():
            continue
        emphasized = cls in ['global', 'local']
        ax.scatter(
            df_temp.loc[mask, 'attribute_1'],
            df_temp.loc[mask, 'attribute_2'],
            c=color,
            s=50 if emphasized else 15,
            alpha=0.7 if emphasized else 0.3,
            label=f'{cls.capitalize()} ({mask.sum()})',
            edgecolors='black' if emphasized else 'none',
            linewidth=0.5
        )

    ax.set_xlabel('Attribute 1', fontsize=10)
    ax.set_ylabel('Attribute 2', fontsize=10)
    ax.set_title(
        f'{strategy_name}\n(c: {min(c_vals)}-{max(c_vals)}, n={len(c_vals)})',
        fontsize=11, fontweight='bold'
    )
    ax.legend(loc='best', fontsize=8)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Side-by-side table of the class counts per strategy
print("\n" + "=" * 80)
print("COMPARISON OF THRESHOLD STRATEGIES")
print("=" * 80)
comparison_df = pd.DataFrame(results_summary)
print(comparison_df.to_string(index=False))

print("\n" + "=" * 80)

## Save Results

Export the classified data for further analysis.

In [None]:
# Save the classified data (written relative to the current working directory)
output_file = 'dfki_classified_local_global.csv'
output_cols = [
    'attribute_1', 'attribute_2', 
    'is_outlier',  # ground truth
    'knn_score', 'knn_rank',
    'lof_score', 'lof_rank',
    'votes_global', 'votes_local', 'total_votes',
    'classification'
]

df[output_cols].to_csv(output_file, index=False)
# The check mark and bullet characters below replace mis-encoded byte sequences
# that were previously printed verbatim.
print(f"✓ Results saved to: {output_file}")
print(f"  Rows: {len(df)}")
print(f"  Columns: {len(output_cols)}")

# Summary statistics
print("\n" + "=" * 80)
print("FINAL SUMMARY")
print("=" * 80)
print(f"\nDataset: DFKI Artificial (3000 samples)")
print(f"Thresholds tested: {c_values}")
print(f"\nClassification results:")
print(df['classification'].value_counts().to_string())

print(f"\n\nKey insights:")
print(f"  • Global outliers: Points consistently ranked high by BOTH k-NN and LOF")
print(f"  • Local outliers: Points ranked high by LOF but NOT by k-NN")
print(f"  • Majority voting: Classification based on votes across all thresholds")

# Show a few example points
print("\n" + "=" * 80)
print("EXAMPLE CLASSIFICATIONS")
print("=" * 80)

# For each class that has members, show the single point with the most votes
if (df['classification'] == 'global').any():
    print("\nTop GLOBAL outlier (by total votes):")
    global_example = df[df['classification'] == 'global'].nlargest(1, 'total_votes')
    print(global_example[['attribute_1', 'attribute_2', 'knn_rank', 'lof_rank', 
                           'votes_global', 'votes_local', 'total_votes']].to_string())

if (df['classification'] == 'local').any():
    print("\nTop LOCAL outlier (by total votes):")
    local_example = df[df['classification'] == 'local'].nlargest(1, 'total_votes')
    print(local_example[['attribute_1', 'attribute_2', 'knn_rank', 'lof_rank', 
                          'votes_global', 'votes_local', 'total_votes']].to_string())

print("\n" + "=" * 80)

In [None]:
"""
Majority Voting Algorithm for Global vs Local Outlier Classification

Key Idea:
- For each threshold c (top c anomalies):
  - If point is in top-c for BOTH kNN AND LOF → vote "global"
  - If point is in top-c for LOF but NOT kNN → vote "local"
- Aggregate votes across many c values
- Classify by majority vote
"""

def majority_vote_classification(df, c_range, knn_rank_col='knn_rank', lof_rank_col='lof_rank'):
    """
    Classify outliers as global or local using majority voting across multiple thresholds.

    For every threshold c, the c rows with the smallest k-NN rank and the c rows
    with the smallest LOF rank are selected. Rows in both selections get a
    "global" vote, rows only in the LOF selection get a "local" vote, and rows
    only in the k-NN selection get no vote. A progress header and a summary are
    printed to stdout.

    Note: majority_voting_classifier in this notebook implements the same voting
    rule but writes 'votes_global'/'votes_local' columns and mutates its input.

    Parameters:
    -----------
    df : DataFrame
        Must contain knn_rank and lof_rank columns. It is not modified.
    c_range : list or range
        Threshold values to test (e.g., [10, 20, 30, ..., 100]). Any iterable
        is accepted; it is materialized once.
    knn_rank_col : str
        Column name for k-NN ranks (smaller rank = selected first)
    lof_rank_col : str
        Column name for LOF ranks (smaller rank = selected first)

    Returns:
    --------
    DataFrame (a copy of `df`) with added columns:
        - 'global_votes': number of times classified as global
        - 'local_votes': number of times classified as local
        - 'total_votes': total votes received
        - 'classification': 'global', 'local', 'tie' (equal, non-zero votes)
          or 'normal' (no votes)
        - 'confidence': ratio of majority votes to total votes
          (0.5 for ties, 0.0 for points without votes)
    """

    n_samples = len(df)

    # Materialize once: the thresholds are printed and then iterated, which
    # would exhaust a generator before any vote is cast.
    thresholds = list(c_range)

    # Initialize vote counters
    global_votes = np.zeros(n_samples, dtype=int)
    local_votes = np.zeros(n_samples, dtype=int)

    print(f"Running majority voting across c = {thresholds}")
    print(f"Total thresholds to test: {len(thresholds)}")
    print()

    # Row *positions* ordered by ascending rank. Positions (not index labels)
    # keep the vote arrays aligned with the rows for any DataFrame index.
    knn_order = np.argsort(df[knn_rank_col].to_numpy(), kind='stable')
    lof_order = np.argsort(df[lof_rank_col].to_numpy(), kind='stable')

    for c in thresholds:
        top = max(int(c), 0)

        in_knn = np.zeros(n_samples, dtype=bool)
        in_lof = np.zeros(n_samples, dtype=bool)
        in_knn[knn_order[:top]] = True
        in_lof[lof_order[:top]] = True

        # Points in both → global vote
        global_votes += in_knn & in_lof
        # Points in LOF but not kNN → local vote
        local_votes += in_lof & ~in_knn

    total_votes = global_votes + local_votes

    # Add results to a copy so the caller's frame stays untouched
    df_result = df.copy()
    df_result['global_votes'] = global_votes
    df_result['local_votes'] = local_votes
    df_result['total_votes'] = total_votes

    # Classify based on majority; anything left over has equal, non-zero votes
    classifications = np.select(
        [total_votes == 0, global_votes > local_votes, local_votes > global_votes],
        ['normal', 'global', 'local'],
        default='tie',
    )

    # Share of the winning side; stays 0.0 where no vote was cast and is
    # exactly 0.5 for ties (g / 2g).
    confidences = np.divide(
        np.maximum(global_votes, local_votes),
        total_votes,
        out=np.zeros(n_samples, dtype=float),
        where=total_votes > 0,
    )

    df_result['classification'] = classifications
    df_result['confidence'] = confidences

    # Print summary
    print("\n" + "="*60)
    print("CLASSIFICATION SUMMARY")
    print("="*60)
    print(f"Global outliers: {(classifications == 'global').sum()}")
    print(f"Local outliers:  {(classifications == 'local').sum()}")
    print(f"Ties:            {(classifications == 'tie').sum()}")
    print(f"Normal points:   {(classifications == 'normal').sum()}")
    print()

    # Statistics on confidence (only for points that received votes)
    voted_mask = classifications != 'normal'
    if voted_mask.any():
        voted_confidences = confidences[voted_mask]
        print(f"Confidence statistics (for classified points):")
        print(f"  Mean: {voted_confidences.mean():.3f}")
        print(f"  Min:  {voted_confidences.min():.3f}")
        print(f"  Max:  {voted_confidences.max():.3f}")

    return df_result


def plot_classification_results(df, max_c, attr1='attribute_1', attr2='attribute_2'):
    """
    Visualize the classification results.

    Draws two figures: (1) a pair of scatter plots over (attr1, attr2) showing
    the class of each point and the confidence of each classified point, and
    (2) a scatter of global votes against local votes for classified points.

    Parameters:
    -----------
    df : DataFrame
        Must contain 'classification', 'confidence', 'global_votes' and
        'local_votes' columns plus the two attribute columns
    max_c : int
        Maximum c value used (for plot title)
    attr1, attr2 : str
        Column names plotted on the x and y axes of the first figure
    """

    # Colours used for the vote-distribution scatter
    color_map = {
        'global': 'red',
        'local': 'blue',
        'tie': 'purple',
        'normal': 'lightgray'
    }

    # Create figure with two subplots
    fig, axes = plt.subplots(1, 2, figsize=(18, 7))

    # Plot 1: Classification with consistent colors
    ax1 = axes[0]

    # Plot normal points first (background)
    normal_mask = df['classification'] == 'normal'
    ax1.scatter(df.loc[normal_mask, attr1], 
                df.loc[normal_mask, attr2],
                c='lightgray', s=20, alpha=0.3, label='Normal', zorder=1)

    # Plot classified outliers on top of the background (higher zorder)
    for cls, color, label, marker in [
        ('global', 'red', 'Global Outlier', 'o'),
        ('local', 'blue', 'Local Outlier', '^'),
        ('tie', 'purple', 'Tie', 's')
    ]:
        mask = df['classification'] == cls
        if mask.any():
            ax1.scatter(df.loc[mask, attr1], 
                       df.loc[mask, attr2],
                       c=color, s=80, alpha=0.8, 
                       edgecolors='black', linewidth=0.5,
                       label=label, marker=marker, zorder=3)

    ax1.set_xlabel('Attribute 1', fontsize=12)
    ax1.set_ylabel('Attribute 2', fontsize=12)
    ax1.set_title(f'Global vs Local Outlier Classification\n(Majority Voting, max c={max_c})', 
                 fontsize=13, fontweight='bold')
    ax1.legend(loc='best', fontsize=10)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Confidence scores
    ax2 = axes[1]

    # Plot normal points
    ax2.scatter(df.loc[normal_mask, attr1], 
                df.loc[normal_mask, attr2],
                c='lightgray', s=20, alpha=0.3, label='Normal', zorder=1)

    # Plot classified points with confidence coloring; the colour scale is
    # pinned to [0.5, 1.0]
    classified_mask = df['classification'] != 'normal'
    if classified_mask.any():
        scatter = ax2.scatter(df.loc[classified_mask, attr1], 
                             df.loc[classified_mask, attr2],
                             c=df.loc[classified_mask, 'confidence'],
                             cmap='RdYlGn', s=80, alpha=0.8,
                             edgecolors='black', linewidth=0.5,
                             vmin=0.5, vmax=1.0, zorder=3)

        cbar = plt.colorbar(scatter, ax=ax2)
        cbar.set_label('Classification Confidence', fontsize=11)

    ax2.set_xlabel('Attribute 1', fontsize=12)
    ax2.set_ylabel('Attribute 2', fontsize=12)
    ax2.set_title(f'Classification Confidence Scores\n(Higher = stronger agreement)', 
                 fontsize=13, fontweight='bold')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Additional plot: Vote distribution
    fig, ax = plt.subplots(figsize=(10, 6))

    if classified_mask.any():
        ax.scatter(df.loc[classified_mask, 'global_votes'], 
                   df.loc[classified_mask, 'local_votes'],
                   c=df.loc[classified_mask, 'classification'].map(color_map),
                   s=60, alpha=0.7, edgecolors='black', linewidth=0.5)

        # Add diagonal line (tie line): points on it have equal vote counts
        max_votes = max(df['global_votes'].max(), df['local_votes'].max())
        tie_line, = ax.plot([0, max_votes], [0, max_votes], 'k--', alpha=0.3,
                            linewidth=1, label='Tie line')

        ax.set_xlabel('Global Votes', fontsize=12)
        ax.set_ylabel('Local Votes', fontsize=12)
        ax.set_title('Vote Distribution', fontsize=13, fontweight='bold')
        ax.grid(True, alpha=0.3)

        # Build the legend by hand: the scatter is coloured per point, so it has
        # no per-class handles. The tie line is appended explicitly because a
        # legend built from `handles=` ignores labels set on other artists.
        from matplotlib.patches import Patch
        legend_elements = [
            Patch(facecolor='red', edgecolor='black', label='Global'),
            Patch(facecolor='blue', edgecolor='black', label='Local'),
            Patch(facecolor='purple', edgecolor='black', label='Tie'),
            tie_line
        ]
        ax.legend(handles=legend_elements, loc='best', fontsize=10)

    plt.tight_layout()
    plt.show()


# Confirmation banner plus a short usage reminder, emitted in one call
print(
    "Functions loaded successfully!",
    "\nUsage example:",
    "  c_range = range(10, 101, 5)  # Test c from 10 to 100, step 5",
    "  df_classified = majority_vote_classification(df, c_range)",
    "  plot_classification_results(df_classified, max_c=100)",
    sep="\n",
)

In [None]:
# Vote over c = 5, 10, ..., 50 and plot the outcome
c_range = range(5, 51, 5)

df_classified = majority_vote_classification(df, c_range)

# The title shows the largest threshold, taken from the range itself (50)
plot_classification_results(df_classified, max_c=max(c_range))

In [None]:
# Detailed analysis: Compare with ground truth and explore different c ranges

def analyze_classification_vs_ground_truth(df_classified):
    """
    Analyze how classified outliers relate to ground truth labels.

    Prints the ground-truth class sizes, how the true outliers were classified,
    and the share of 'global' / 'local' points that are true outliers, then
    draws the ground truth next to the classification overlaid on it.

    Parameters:
    -----------
    df_classified : DataFrame
        Must contain 'is_outlier' (1 = outlier, 0 = normal), 'classification',
        'attribute_1' and 'attribute_2' columns

    Returns:
    --------
    None
    """
    print("\n" + "="*60)
    print("COMPARISON WITH GROUND TRUTH")
    print("="*60)

    # Get ground truth outliers
    true_outliers = df_classified[df_classified['is_outlier'] == 1]
    true_normals = df_classified[df_classified['is_outlier'] == 0]

    print(f"\nGround truth distribution:")
    print(f"  True outliers: {len(true_outliers)}")
    print(f"  True normals:  {len(true_normals)}")

    # Among true outliers, how are they classified?
    # (the loop body never runs when there are no true outliers)
    print(f"\nClassification of TRUE OUTLIERS:")
    outlier_classifications = true_outliers['classification'].value_counts()
    for cls, count in outlier_classifications.items():
        pct = count / len(true_outliers) * 100
        print(f"  {cls:10s}: {count:4d} ({pct:5.1f}%)")

    # Among classified global/local outliers, what % are true outliers?
    print(f"\nPrecision check:")
    for cls in ['global', 'local']:
        classified = df_classified[df_classified['classification'] == cls]
        if len(classified) > 0:
            true_positive = (classified['is_outlier'] == 1).sum()
            precision = true_positive / len(classified) * 100
            print(f"  {cls:10s}: {true_positive}/{len(classified)} are true outliers ({precision:.1f}%)")

    # Visualize ground truth vs classification
    fig, axes = plt.subplots(1, 2, figsize=(18, 7))

    # Plot 1: Ground truth
    ax1 = axes[0]
    normal_mask = df_classified['is_outlier'] == 0
    outlier_mask = df_classified['is_outlier'] == 1

    ax1.scatter(df_classified.loc[normal_mask, 'attribute_1'], 
                df_classified.loc[normal_mask, 'attribute_2'],
                c='lightgray', s=20, alpha=0.5, label='Normal')
    # 'x' is an unfilled marker: matplotlib draws it in the face colour and
    # ignores (with a warning) any edgecolors argument, so none is passed.
    ax1.scatter(df_classified.loc[outlier_mask, 'attribute_1'], 
                df_classified.loc[outlier_mask, 'attribute_2'],
                c='black', s=80, alpha=0.8, marker='x', 
                linewidth=1.5, label='True Outlier')

    ax1.set_xlabel('Attribute 1', fontsize=12)
    ax1.set_ylabel('Attribute 2', fontsize=12)
    ax1.set_title('Ground Truth Labels', fontsize=13, fontweight='bold')
    ax1.legend(loc='best', fontsize=10)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Our classification overlaid on ground truth
    ax2 = axes[1]

    # Background: ground truth
    ax2.scatter(df_classified.loc[normal_mask, 'attribute_1'], 
                df_classified.loc[normal_mask, 'attribute_2'],
                c='lightgray', s=20, alpha=0.3, label='True Normal')
    # Same unfilled-marker rule as above: colour comes from `c` only
    ax2.scatter(df_classified.loc[outlier_mask, 'attribute_1'], 
                df_classified.loc[outlier_mask, 'attribute_2'],
                c='yellow', s=80, alpha=0.5, marker='x', 
                linewidth=1, label='True Outlier')

    # Overlay: our classification ('tie' points are not overlaid)
    for cls, color, label, marker in [
        ('global', 'red', 'Classified: Global', 'o'),
        ('local', 'blue', 'Classified: Local', '^')
    ]:
        mask = df_classified['classification'] == cls
        if mask.any():
            ax2.scatter(df_classified.loc[mask, 'attribute_1'], 
                       df_classified.loc[mask, 'attribute_2'],
                       c=color, s=60, alpha=0.7, 
                       edgecolors='black', linewidth=0.5,
                       label=label, marker=marker, zorder=3)

    ax2.set_xlabel('Attribute 1', fontsize=12)
    ax2.set_ylabel('Attribute 2', fontsize=12)
    ax2.set_title('Classification Overlaid on Ground Truth', fontsize=13, fontweight='bold')
    ax2.legend(loc='best', fontsize=9)
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


# Run the analysis on the frame returned by majority_vote_classification;
# the function prints its report and shows its figures, returning nothing.
analyze_classification_vs_ground_truth(df_classified)

In [None]:
# Detailed inspection: Show top global and local outliers

def inspect_top_outliers(df_classified, n_top=10):
    """
    Display detailed information about the top global and local outliers.

    Prints the `n_top` highest-confidence rows of each class, plots and
    annotates them (labels are "G<index>" / "L<index>"), and prints summary
    statistics. A class without any members is reported as "(none)" in the
    summary instead of dividing by zero.

    Parameters:
    -----------
    df_classified : DataFrame
        Must contain 'classification', 'confidence', 'global_votes',
        'local_votes', 'knn_rank', 'lof_rank', 'is_outlier', 'attribute_1'
        and 'attribute_2' columns
    n_top : int
        Maximum number of rows shown per class

    Returns:
    --------
    None
    """
    cols_to_show = ['attribute_1', 'attribute_2', 'knn_rank', 'lof_rank', 
                    'global_votes', 'local_votes', 'confidence', 'is_outlier']

    def _top_by_confidence(cls):
        # Rows of one class, highest confidence first, truncated to n_top
        members = df_classified[df_classified['classification'] == cls]
        return members.sort_values('confidence', ascending=False).head(n_top)

    global_outliers = _top_by_confidence('global')
    local_outliers = _top_by_confidence('local')

    print("="*80)
    print(f"TOP {n_top} GLOBAL OUTLIERS (sorted by confidence)")
    print("="*80)
    print(global_outliers[cols_to_show].to_string())

    print("\n" + "="*80)
    print(f"TOP {n_top} LOCAL OUTLIERS (sorted by confidence)")
    print("="*80)
    print(local_outliers[cols_to_show].to_string())

    # Visualize these specific points
    fig, ax = plt.subplots(figsize=(12, 9))

    # Background: points classified as 'normal'
    normal_bg = df_classified['classification'] == 'normal'
    ax.scatter(df_classified.loc[normal_bg, 'attribute_1'], 
              df_classified.loc[normal_bg, 'attribute_2'],
              c='lightgray', s=15, alpha=0.2, label='Other points')

    # Top global outliers
    ax.scatter(global_outliers['attribute_1'], 
              global_outliers['attribute_2'],
              c='red', s=150, alpha=0.7, marker='o',
              edgecolors='darkred', linewidth=2,
              label=f'Top {n_top} Global Outliers')

    # Top local outliers
    ax.scatter(local_outliers['attribute_1'], 
              local_outliers['attribute_2'],
              c='blue', s=150, alpha=0.7, marker='^',
              edgecolors='darkblue', linewidth=2,
              label=f'Top {n_top} Local Outliers')

    # Add labels to points: class prefix followed by the row's index label
    for prefix, subset, text_color in (
        ('G', global_outliers, 'darkred'),
        ('L', local_outliers, 'darkblue'),
    ):
        for idx, row in subset.iterrows():
            ax.annotate(f"{prefix}{idx}", 
                       (row['attribute_1'], row['attribute_2']),
                       xytext=(5, 5), textcoords='offset points',
                       fontsize=8, color=text_color, fontweight='bold')

    ax.set_xlabel('Attribute 1', fontsize=12)
    ax.set_ylabel('Attribute 2', fontsize=12)
    ax.set_title(f'Top {n_top} Global vs Local Outliers', fontsize=14, fontweight='bold')
    ax.legend(loc='best', fontsize=10)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Summary statistics
    print("\n" + "="*80)
    print("SUMMARY STATISTICS")
    print("="*80)
    for title, subset in (
        ("Top global outliers", global_outliers),
        ("Top local outliers", local_outliers),
    ):
        print(f"\n{title}:")
        if subset.empty:
            # No rows of this class: the averages and the percentage are undefined
            print("  (none)")
            continue
        print(f"  Avg kNN rank: {subset['knn_rank'].mean():.1f}")
        print(f"  Avg LOF rank: {subset['lof_rank'].mean():.1f}")
        print(f"  % true outliers: {(subset['is_outlier'] == 1).sum() / len(subset) * 100:.1f}%")


# Run inspection: print, plot and summarize the 10 highest-confidence
# global and local outliers from df_classified
inspect_top_outliers(df_classified, n_top=10)

# Optional: Save results to CSV (left disabled; uncomment both lines to write the file)
# df_classified.to_csv('classified_outliers.csv', index=True)
# print("\nResults saved to 'classified_outliers.csv'")

In [None]:
# Optional: Test with different k values for kNN and LOF

def run_full_pipeline(df_original, k_knn=20, k_lof=20, c_range=range(10, 101, 5)):
    """
    Complete pipeline: run kNN and LOF with specified k values,
    then perform majority voting classification.

    The two attributes are min-max scaled, both detectors are fitted on the
    scaled data, and each detector's scores are converted to ranks where
    rank 1 is the highest score. The input frame is copied before any
    column is added, so ``df_original`` is not modified by this function.

    Parameters:
    -----------
    df_original : DataFrame
        Original dataset with 'attribute_1', 'attribute_2', and 'is_outlier' columns
    k_knn : int
        Number of neighbors for k-NN
    k_lof : int
        Number of neighbors for LOF
    c_range : range or list
        Range of c values for majority voting. Must not be empty.

    Returns:
    --------
    DataFrame with all scores, ranks, and classifications

    Raises:
    -------
    ValueError
        If ``c_range`` contains no values.
    """
    # Materialise once: used for validation and for the banner below.
    c_values = list(c_range)
    if not c_values:
        raise ValueError("c_range must contain at least one c value")
    # A one-shot iterator has just been exhausted by list(); in that case hand
    # the materialised values on. Re-iterable inputs are passed through as-is.
    voting_range = c_values if iter(c_range) is c_range else c_range

    print("="*80)
    print("RUNNING FULL PIPELINE")
    print("="*80)
    print(f"k-NN neighbors: {k_knn}")
    print(f"LOF neighbors:  {k_lof}")
    print(f"Voting range:   {c_values[0]} to {c_values[-1]}")
    print()

    df_work = df_original.copy()
    X = df_work[['attribute_1', 'attribute_2']].values

    # Normalize
    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)

    # Run k-NN
    # NOTE(review): contamination presumably only affects the detector's label
    # threshold and not decision_scores_ -- confirm against the PyOD docs.
    print(f"Running k-NN (k={k_knn})...")
    knn_detector = KNN(n_neighbors=k_knn, contamination=0.01)
    knn_detector.fit(X_normalized)
    knn_scores = knn_detector.decision_scores_

    df_work['knn_score'] = knn_scores
    df_work['knn_rank'] = _descending_ranks(knn_scores)

    # Run LOF
    print(f"Running LOF (k={k_lof})...")
    lof_detector = LOF(n_neighbors=k_lof, contamination=0.01)
    lof_detector.fit(X_normalized)
    lof_scores = lof_detector.decision_scores_

    df_work['lof_score'] = lof_scores
    df_work['lof_rank'] = _descending_ranks(lof_scores)

    # Majority voting classification
    print("\nRunning majority voting...")
    df_result = majority_vote_classification(df_work, voting_range)

    print("\nPipeline complete!")
    return df_result


def _descending_ranks(scores):
    """
    Convert a 1-D array of scores into ranks, highest score first.

    Returns an integer array aligned with ``scores`` in which the largest
    score receives rank 1 and the smallest receives rank ``len(scores)``.
    """
    order = np.argsort(scores)[::-1]
    ranks = np.empty_like(scores, dtype=int)
    ranks[order] = np.arange(1, len(scores) + 1)
    return ranks


# Sensitivity experiment: rerun the full pipeline with a small and a large
# neighbourhood size, plot each result, then compare the class counts.
print("\n" + "="*80, "TESTING SENSITIVITY TO k VALUES", "="*80, sep="\n")

# Test 1 -- small neighbourhood (k=10) for both detectors
print("\n### Test 1: Small k (k=10) ###")
df_k10 = run_full_pipeline(df, k_knn=10, k_lof=10, c_range=range(10, 101, 5))
plot_classification_results(df_k10, max_c=100)

# Test 2 -- large neighbourhood (k=40) for both detectors
print("\n### Test 2: Large k (k=40) ###")
df_k40 = run_full_pipeline(df, k_knn=40, k_lof=40, c_range=range(10, 101, 5))
plot_classification_results(df_k40, max_c=100)

# Side-by-side count of points labelled 'global' / 'local' in each run
print("\n" + "="*80, "COMPARISON: k=10 vs k=40", "="*80, sep="\n")
print(
    "\nWith k=10 (more sensitive to local structure):",
    f"  Global: {df_k10['classification'].eq('global').sum()}",
    f"  Local:  {df_k10['classification'].eq('local').sum()}",
    sep="\n",
)

print(
    "\nWith k=40 (more global perspective):",
    f"  Global: {df_k40['classification'].eq('global').sum()}",
    f"  Local:  {df_k40['classification'].eq('local').sum()}",
    sep="\n",
)

In [None]:
# Final Summary and Export

def create_summary_report(df_classified):
    """
    Generate a comprehensive summary report of the classification results.

    The report is printed to stdout in up to four sections:
      1. Counts and percentages per 'classification' label
         ('global', 'local', 'tie', 'normal').
      2. Confidence statistics over the points labelled 'global' or 'local'
         (skipped when there are none or the 'confidence' column is absent).
      3. Ground-truth comparison (only when an 'is_outlier' column exists).
      4. Average k-NN / LOF ranks per outlier type (only when both
         'knn_rank' and 'lof_rank' columns exist).

    Parameters:
    -----------
    df_classified : DataFrame
        Must contain a 'classification' column. 'confidence', 'is_outlier',
        'knn_rank' and 'lof_rank' are optional; the sections that need a
        missing column are skipped.

    Returns:
    --------
    None
    """
    def pct(part, whole):
        # Percentage of `part` in `whole`; 0.0 for an empty `whole` so that an
        # empty frame does not print nan% or trigger divide-by-zero warnings.
        return part / whole * 100 if whole else 0.0

    labels = df_classified['classification']
    columns = df_classified.columns

    print("\n" + "="*80)
    print("COMPREHENSIVE CLASSIFICATION REPORT")
    print("="*80)

    # Overall statistics
    total_points = len(df_classified)
    n_global = int((labels == 'global').sum())
    n_local = int((labels == 'local').sum())
    n_tie = int((labels == 'tie').sum())
    n_normal = int((labels == 'normal').sum())

    print("\n1. OVERALL CLASSIFICATION")
    print(f"   Total points:     {total_points}")
    print(f"   Global outliers:  {n_global:4d} ({pct(n_global, total_points):5.2f}%)")
    print(f"   Local outliers:   {n_local:4d} ({pct(n_local, total_points):5.2f}%)")
    print(f"   Ties:             {n_tie:4d} ({pct(n_tie, total_points):5.2f}%)")
    print(f"   Normal points:    {n_normal:4d} ({pct(n_normal, total_points):5.2f}%)")

    # Points the method flagged as an outlier of either type; reused for the
    # confidence statistics and for the precision figure below.
    classified = df_classified[labels.isin(['global', 'local'])]

    # Confidence statistics (guarded like the other optional columns)
    if len(classified) > 0 and 'confidence' in columns:
        confidence = classified['confidence']
        print("\n2. CONFIDENCE STATISTICS")
        print(f"   Mean confidence:  {confidence.mean():.3f}")
        print(f"   Median confidence: {confidence.median():.3f}")
        print(f"   Std confidence:   {confidence.std():.3f}")
        print(f"   High conf (>0.8): {(confidence > 0.8).sum()} points")
        print(f"   Low conf (<0.6):  {(confidence < 0.6).sum()} points")

    # Ground truth comparison
    if 'is_outlier' in columns:
        true_outliers = df_classified[df_classified['is_outlier'] == 1]
        n_true = len(true_outliers)

        print("\n3. GROUND TRUTH COMPARISON")
        print(f"   True outliers in dataset: {n_true}")
        print(f"   Detected by our method:   {n_global + n_local}")

        # Detection rate: share of true outliers labelled global or local
        detected_outliers = true_outliers[true_outliers['classification'].isin(['global', 'local'])]
        detected = len(detected_outliers)
        detection_rate = pct(detected, n_true)
        print(f"   Detection rate:           {detected}/{n_true} ({detection_rate:.1f}%)")

        # Precision: share of flagged points that are true outliers
        if len(classified) > 0:
            true_positives = int((classified['is_outlier'] == 1).sum())
            precision = pct(true_positives, len(classified))
            print(f"   Precision:                {true_positives}/{len(classified)} ({precision:.1f}%)")

        # Among detected true outliers, global vs local
        if detected > 0:
            n_global_true = int((detected_outliers['classification'] == 'global').sum())
            n_local_true = int((detected_outliers['classification'] == 'local').sum())
            print("\n   Among detected true outliers:")
            print(f"      Classified as global: {n_global_true} ({pct(n_global_true, detected):.1f}%)")
            print(f"      Classified as local:  {n_local_true} ({pct(n_local_true, detected):.1f}%)")

    # Ranking statistics
    if 'knn_rank' in columns and 'lof_rank' in columns:
        print("\n4. RANK STATISTICS")

        for title, label in (("Global outliers", 'global'), ("Local outliers", 'local')):
            subset = df_classified[labels == label]
            if len(subset) > 0:
                print(f"\n   {title}:")
                print(f"      Avg k-NN rank: {subset['knn_rank'].mean():6.1f}")
                print(f"      Avg LOF rank:  {subset['lof_rank'].mean():6.1f}")
                # Mean of the per-point LOF-rank / k-NN-rank ratios
                print(f"      Rank ratio:    {(subset['lof_rank'] / subset['knn_rank']).mean():.3f}")

    print("\n" + "="*80)


# Print the text report for the classified frame
create_summary_report(df_classified)

# Optional: save detailed results (scores, ranks, votes, classification) to CSV
save_results = False  # Set to True to save

if save_results:
    output_file = 'xod_majority_voting_results.csv'
    cols_to_save = ['attribute_1', 'attribute_2', 'is_outlier',
                    'knn_score', 'knn_rank', 'lof_score', 'lof_rank',
                    'global_votes', 'local_votes', 'total_votes',
                    'classification', 'confidence']
    # index=True keeps the DataFrame index as a column in the output file
    df_classified[cols_to_save].to_csv(output_file, index=True)
    # The check mark was previously stored as mis-decoded bytes ("‚úì")
    print(f"\n✓ Results saved to: {output_file}")
else:
    print("\nTo save results, set save_results = True")

In [None]:
# Additional Visualization: Rank Difference Analysis

def visualize_rank_differences(df_classified):
    """
    Visualize the relationship between k-NN and LOF ranks,
    highlighting global vs local outliers.

    Draws a 2x2 figure and then prints rank-difference statistics:
      * top-left:     k-NN rank vs LOF rank, coloured by classification
      * top-right:    histogram of rank differences for global / local points
      * bottom-left:  attribute space, non-normal points coloured by rank
                      difference (colour scale clipped to [-100, 100])
      * bottom-right: classification confidence vs rank difference

    Parameters:
    -----------
    df_classified : DataFrame
        Needs the columns 'attribute_1', 'attribute_2', 'knn_rank',
        'lof_rank', 'classification' and 'confidence'. The frame is copied,
        so the helper column added here does not reach the caller.

    Returns:
    --------
    None
    """
    # Rank difference: negative when the LOF rank number is smaller than the
    # k-NN rank number, positive when it is larger.
    df_viz = df_classified.copy()
    df_viz['rank_diff'] = df_viz['lof_rank'] - df_viz['knn_rank']

    color_map = {'global': 'red', 'local': 'blue', 'tie': 'purple', 'normal': 'lightgray'}

    # One boolean mask per class, computed once and shared by all panels.
    class_masks = {cls: df_viz['classification'] == cls for cls in color_map}

    fig, axes = plt.subplots(2, 2, figsize=(16, 14))

    # Plot 1: k-NN rank vs LOF rank
    ax1 = axes[0, 0]

    # Drawn in this order so the outlier classes sit on top of the normal points.
    for cls in ['normal', 'tie', 'local', 'global']:
        mask = class_masks[cls]
        if mask.any():
            alpha = 0.2 if cls == 'normal' else 0.7
            size = 20 if cls == 'normal' else 60
            ax1.scatter(df_viz.loc[mask, 'knn_rank'],
                        df_viz.loc[mask, 'lof_rank'],
                        c=color_map[cls], s=size, alpha=alpha,
                        label=cls.capitalize())

    # Add diagonal line (where kNN rank = LOF rank)
    max_rank = max(df_viz['knn_rank'].max(), df_viz['lof_rank'].max())
    ax1.plot([0, max_rank], [0, max_rank], 'k--', alpha=0.3, linewidth=1, label='Equal rank')

    ax1.set_xlabel('k-NN Rank', fontsize=12)
    ax1.set_ylabel('LOF Rank', fontsize=12)
    ax1.set_title('k-NN Rank vs LOF Rank', fontsize=13, fontweight='bold')
    ax1.legend(loc='best', fontsize=9)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Rank difference distribution
    ax2 = axes[0, 1]

    for cls in ['global', 'local']:
        mask = class_masks[cls]
        if mask.any():
            ax2.hist(df_viz.loc[mask, 'rank_diff'],
                     bins=30, alpha=0.6, label=cls.capitalize(),
                     color=color_map[cls], edgecolor='black')

    ax2.axvline(x=0, color='black', linestyle='--', linewidth=2, alpha=0.5, label='No difference')
    ax2.set_xlabel('Rank Difference (LOF - k-NN)', fontsize=12)
    ax2.set_ylabel('Count', fontsize=12)
    ax2.set_title('Distribution of Rank Differences', fontsize=13, fontweight='bold')
    ax2.legend(loc='best', fontsize=10)
    ax2.grid(True, alpha=0.3, axis='y')

    # Plot 3: Spatial distribution colored by rank difference
    ax3 = axes[1, 0]

    # Everything that is not 'normal' (global, local and tie) gets coloured.
    classified_mask = ~class_masks['normal']

    # Background
    ax3.scatter(df_viz.loc[~classified_mask, 'attribute_1'],
                df_viz.loc[~classified_mask, 'attribute_2'],
                c='lightgray', s=20, alpha=0.3, label='Normal')

    # Classified points
    scatter = ax3.scatter(df_viz.loc[classified_mask, 'attribute_1'],
                          df_viz.loc[classified_mask, 'attribute_2'],
                          c=df_viz.loc[classified_mask, 'rank_diff'],
                          cmap='RdBu_r', s=80, alpha=0.8,
                          edgecolors='black', linewidth=0.5,
                          vmin=-100, vmax=100)

    cbar = plt.colorbar(scatter, ax=ax3)
    cbar.set_label('Rank Difference (LOF - k-NN)', fontsize=11)

    ax3.set_xlabel('Attribute 1', fontsize=12)
    ax3.set_ylabel('Attribute 2', fontsize=12)
    ax3.set_title('Points colored by Rank Difference\n(Red: LOF>k-NN, Blue: k-NN>LOF)',
                  fontsize=13, fontweight='bold')
    ax3.grid(True, alpha=0.3)

    # Plot 4: Confidence vs rank statistics
    ax4 = axes[1, 1]

    for cls in ['global', 'local']:
        mask = class_masks[cls]
        if mask.any():
            ax4.scatter(df_viz.loc[mask, 'confidence'],
                        df_viz.loc[mask, 'rank_diff'],
                        c=color_map[cls], s=60, alpha=0.6,
                        label=cls.capitalize(), edgecolors='black', linewidth=0.5)

    ax4.axhline(y=0, color='black', linestyle='--', linewidth=1, alpha=0.5)
    ax4.set_xlabel('Classification Confidence', fontsize=12)
    ax4.set_ylabel('Rank Difference (LOF - k-NN)', fontsize=12)
    ax4.set_title('Confidence vs Rank Difference', fontsize=13, fontweight='bold')
    # The reference line on this panel has no label, so only request a legend
    # when at least one class was actually plotted.
    if ax4.get_legend_handles_labels()[0]:
        ax4.legend(loc='best', fontsize=10)
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print statistics
    print("="*70)
    print("RANK DIFFERENCE ANALYSIS")
    print("="*70)

    for cls in ['global', 'local']:
        mask = class_masks[cls]
        if mask.any():
            rank_diff = df_viz.loc[mask, 'rank_diff']
            print(f"\n{cls.upper()} outliers:")
            print(f"  Mean rank difference: {rank_diff.mean():7.1f}")
            print(f"  Median rank diff:     {rank_diff.median():7.1f}")
            print(f"  Std rank diff:        {rank_diff.std():7.1f}")

            # Count points on each side of zero (exact ties are in neither count)
            better_lof = (rank_diff < 0).sum()
            better_knn = (rank_diff > 0).sum()
            print(f"  LOF rank < k-NN rank: {better_lof}")
            print(f"  LOF rank > k-NN rank: {better_knn}")


# Draw the four-panel rank-difference figure for the classified frame and
# print the rank-difference statistics for the global and local classes.
visualize_rank_differences(df_classified)