In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset
df = pd.read_csv('../data/dfki-artificial-3000-unsupervised-ad.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nFirst few rows:")
print(df.head())
print(f"\nOutlier label distribution:")
print(df['outlier_label'].value_counts())

# Convert outlier_label to binary for plotting
df['is_outlier'] = (df['outlier_label'] == 'outlier').astype(int)

# Create scatter plot
fig, ax = plt.subplots(figsize=(10, 8))

# Plot normal points
normal_mask = df['is_outlier'] == 0
ax.scatter(df.loc[normal_mask, 'attribute_1'], 
           df.loc[normal_mask, 'attribute_2'],
           c='blue', alpha=0.5, s=20, label='Normal')

# Plot outliers
outlier_mask = df['is_outlier'] == 1
ax.scatter(df.loc[outlier_mask, 'attribute_1'], 
           df.loc[outlier_mask, 'attribute_2'],
           c='red', alpha=0.7, s=30, label='Outlier', marker='x')

ax.set_xlabel('Attribute 1', fontsize=12)
ax.set_ylabel('Attribute 2', fontsize=12)
ax.set_title('DFKI Artificial Dataset (3000 samples)', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nSummary statistics:")
print(df[['attribute_1', 'attribute_2']].describe())



In [None]:
from pyod.models.knn import KNN
from sklearn.preprocessing import MinMaxScaler

X = df[['attribute_1', 'attribute_2']].values
y = df['is_outlier'].values

scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

k = 20
knn_detector = KNN(n_neighbors=k, contamination=0.01)
knn_detector.fit(X_normalized)

scores = knn_detector.decision_scores_

sorted_indices = np.argsort(scores)[::-1]
ranks = np.empty_like(scores, dtype=int)
ranks[sorted_indices] = np.arange(1, len(scores) + 1)

df['knn_score'] = scores
df['knn_rank'] = ranks

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

ax1 = axes[0]
scatter1 = ax1.scatter(df['attribute_1'], df['attribute_2'], 
                       c=scores, cmap='viridis', 
                       s=30, alpha=0.6, edgecolors='black', linewidth=0.5)
ax1.set_xlabel('Attribute 1', fontsize=12)
ax1.set_ylabel('Attribute 2', fontsize=12)
ax1.set_title(f'Points colored by k-NN Score (k={k})', fontsize=13, fontweight='bold')
cbar1 = plt.colorbar(scatter1, ax=ax1)
cbar1.set_label('Anomaly Score', fontsize=11)
ax1.grid(True, alpha=0.3)

ax2 = axes[1]
scatter2 = ax2.scatter(df['attribute_1'], df['attribute_2'], 
                       c=ranks, cmap='plasma_r', 
                       s=30, alpha=0.6, edgecolors='black', linewidth=0.5)
ax2.set_xlabel('Attribute 1', fontsize=12)
ax2.set_ylabel('Attribute 2', fontsize=12)
ax2.set_title(f'Points colored by k-NN Rank', fontsize=13, fontweight='bold')
cbar2 = plt.colorbar(scatter2, ax=ax2)
cbar2.set_label('Rank (1 = most anomalous)', fontsize=11)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()



In [None]:
from pyod.models.lof import LOF
from sklearn.preprocessing import MinMaxScaler

X = df[['attribute_1', 'attribute_2']].values
y = df['is_outlier'].values

scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

k = 20
lof_detector = LOF(n_neighbors=k, contamination=0.01)
lof_detector.fit(X_normalized)

scores = lof_detector.decision_scores_

sorted_indices = np.argsort(scores)[::-1]
ranks = np.empty_like(scores, dtype=int)
ranks[sorted_indices] = np.arange(1, len(scores) + 1)

df['lof_score'] = scores
df['lof_rank'] = ranks

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

ax1 = axes[0]
scatter1 = ax1.scatter(df['attribute_1'], df['attribute_2'], 
                       c=scores, cmap='viridis', 
                       s=30, alpha=0.6, edgecolors='black', linewidth=0.5)
ax1.set_xlabel('Attribute 1', fontsize=12)
ax1.set_ylabel('Attribute 2', fontsize=12)
ax1.set_title(f'Points colored by LOF Score (k={k})', fontsize=13, fontweight='bold')
cbar1 = plt.colorbar(scatter1, ax=ax1)
cbar1.set_label('Anomaly Score', fontsize=11)
ax1.grid(True, alpha=0.3)

ax2 = axes[1]
scatter2 = ax2.scatter(df['attribute_1'], df['attribute_2'], 
                       c=ranks, cmap='plasma_r', 
                       s=30, alpha=0.6, edgecolors='black', linewidth=0.5)
ax2.set_xlabel('Attribute 1', fontsize=12)
ax2.set_ylabel('Attribute 2', fontsize=12)
ax2.set_title(f'Points colored by LOF Rank', fontsize=13, fontweight='bold')
cbar2 = plt.colorbar(scatter2, ax=ax2)
cbar2.set_label('Rank (1 = most anomalous)', fontsize=11)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()



In [None]:
# Euclidean combination of k-NN and LOF scores: magnitude + angle

knn_scores = df['knn_score'].values
lof_scores = df['lof_score'].values

mag = np.sqrt(knn_scores**2 + lof_scores**2)
angle = np.arctan2(lof_scores, knn_scores)  # LOF vs k-NN

# Store for later use if needed
df['euc_mag'] = mag
df['euc_angle'] = angle

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: magnitude of combined anomaly signal
ax1 = axes[0]
sc1 = ax1.scatter(
    df['attribute_1'], df['attribute_2'],
    c=mag, cmap='viridis',
    s=30, alpha=0.7, edgecolors='black', linewidth=0.5,
)
ax1.set_xlabel('Attribute 1', fontsize=12)
ax1.set_ylabel('Attribute 2', fontsize=12)
ax1.set_title('Outlier magnitude: sqrt(LOF^2 + kNN^2)', fontsize=13, fontweight='bold')
cbar1 = plt.colorbar(sc1, ax=ax1)
cbar1.set_label('Magnitude', fontsize=11)
ax1.grid(True, alpha=0.3)

# Plot 2: angle indicating locality vs globality
ax2 = axes[1]
sc2 = ax2.scatter(
    df['attribute_1'], df['attribute_2'],
    c=angle, cmap='twilight_shifted',
    s=30, alpha=0.7, edgecolors='black', linewidth=0.5,
)
ax2.set_xlabel('Attribute 1', fontsize=12)
ax2.set_ylabel('Attribute 2', fontsize=12)
ax2.set_title('Locality vs Globality: angle = arctan2(LOF, kNN)', fontsize=13, fontweight='bold')
cbar2 = plt.colorbar(sc2, ax=ax2)
cbar2.set_label('Angle (radians)', fontsize=11)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
c = 100
p = 0
top_knn_indices = df.nsmallest(c, 'knn_rank').index.values
top_lof_indices = df.nsmallest(c, 'lof_rank').index.values

all_top_indices = np.unique(np.concatenate([top_knn_indices, top_lof_indices]))

ratio_values = np.full(len(df), np.nan)
for idx in all_top_indices:
    knn_rank = df.loc[idx, 'knn_rank']
    lof_rank = df.loc[idx, 'lof_rank']
    ratio = (lof_rank + p) / (knn_rank + p)
    ratio_values[idx] = ratio

mask_top = ~np.isnan(ratio_values)
mask_other = np.isnan(ratio_values)

ratio_min = ratio_values[mask_top].min()
ratio_max = ratio_values[mask_top].max()
ratio_mean = ratio_values[mask_top].mean()

vmin = max(0.1, ratio_min * 0.9)
vmax = min(ratio_max * 1.1, ratio_max + 0.5)
vmax = 1
vmin = 0.6
fig, ax = plt.subplots(figsize=(12, 8))

scatter_other = ax.scatter(df.loc[mask_other, 'attribute_1'], 
                          df.loc[mask_other, 'attribute_2'],
                          c='black', s=20, alpha=0.3, label='Other points')

scatter_top = ax.scatter(df.loc[mask_top, 'attribute_1'], 
                         df.loc[mask_top, 'attribute_2'],
                         c=ratio_values[mask_top], cmap='RdYlGn', 
                         s=50, alpha=0.8, edgecolors='black', linewidth=0.5,
                         vmin=vmin, vmax=vmax, label='Top c points')

ax.set_xlabel('Attribute 1', fontsize=12)
ax.set_ylabel('Attribute 2', fontsize=12)
ax.set_title(f'Global vs Local Outliers (c={c})\nRatio = (LOF rank + {p}) / (k-NN rank + {p})', 
             fontsize=13, fontweight='bold')
cbar = plt.colorbar(scatter_top, ax=ax)
cbar.set_label('Ratio (1 = global outlier, 0 = local outlier)', fontsize=11)
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Top {c} points by k-NN rank: {len(top_knn_indices)}")
print(f"Top {c} points by LOF rank: {len(top_lof_indices)}")
print(f"Total unique top points: {len(all_top_indices)}")
print(f"\nRatio statistics for top points:")
print(f"  Mean: {ratio_values[mask_top].mean():.4f}")
print(f"  Min: {ratio_values[mask_top].min():.4f}")
print(f"  Max: {ratio_values[mask_top].max():.4f}")



In [None]:
c = 100
p = 20
top_knn_indices = df.nsmallest(c, 'knn_rank').index.values
top_lof_indices = df.nsmallest(c, 'lof_rank').index.values

all_top_indices = np.unique(np.concatenate([top_knn_indices, top_lof_indices]))

ratio_values = np.full(len(df), np.nan)
for idx in all_top_indices:
    knn_rank = df.loc[idx, 'knn_rank']
    lof_rank = df.loc[idx, 'lof_rank']
    ratio = (lof_rank + p) / (knn_rank + p)
    ratio_values[idx] = ratio

mask_top = ~np.isnan(ratio_values)
mask_other = np.isnan(ratio_values)

ratio_min = ratio_values[mask_top].min()
ratio_max = ratio_values[mask_top].max()
ratio_mean = ratio_values[mask_top].mean()

vmin = max(0.1, ratio_min * 0.9)
vmax = min(ratio_max * 1.1, ratio_max + 0.5)
vmax = 1
vmin = 0.6
fig, ax = plt.subplots(figsize=(12, 8))

scatter_other = ax.scatter(df.loc[mask_other, 'attribute_1'], 
                          df.loc[mask_other, 'attribute_2'],
                          c='black', s=20, alpha=0.3, label='Other points')

scatter_top = ax.scatter(df.loc[mask_top, 'attribute_1'], 
                         df.loc[mask_top, 'attribute_2'],
                         c=ratio_values[mask_top], cmap='RdYlGn', 
                         s=50, alpha=0.8, edgecolors='black', linewidth=0.5,
                         vmin=vmin, vmax=vmax, label='Top c points')

ax.set_xlabel('Attribute 1', fontsize=12)
ax.set_ylabel('Attribute 2', fontsize=12)
ax.set_title(f'Global vs Local Outliers (c={c})\nRatio = (LOF rank + {p}) / (k-NN rank + {p})', 
             fontsize=13, fontweight='bold')
cbar = plt.colorbar(scatter_top, ax=ax)
cbar.set_label('Ratio (1 = global outlier, 0 = local outlier)', fontsize=11)
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Top {c} points by k-NN rank: {len(top_knn_indices)}")
print(f"Top {c} points by LOF rank: {len(top_lof_indices)}")
print(f"Total unique top points: {len(all_top_indices)}")
print(f"\nRatio statistics for top points:")
print(f"  Mean: {ratio_values[mask_top].mean():.4f}")
print(f"  Min: {ratio_values[mask_top].min():.4f}")
print(f"  Max: {ratio_values[mask_top].max():.4f}")



In [None]:
print("k-NN mean score by label:")
print(df.groupby('is_outlier')['knn_score'].mean())
print("LOF mean score by label:")
print(df.groupby('is_outlier')['lof_score'].mean())


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np

def analyze_ensemble_globality(df, X_normalized, k_list=[3, 4, 5, 6, 7], contamination=0.05):
    """
    Computes Globality Index by averaging distances to centroids over multiple k values.
    Filters top anomalies using existing LOF scores.
    """
    
    print(f"--- Running Ensemble Globality Analysis ---")
    print(f"Averaging cluster structures for k = {k_list}")
    
    # ==========================================
    # 1. ENSEMBLE DISTANCE CALCULATION
    # ==========================================
    n_samples = X_normalized.shape[0]
    cumulative_dists = np.zeros(n_samples)
    
    # Iterate through each k in the list
    for k in k_list:
        # Fit KMeans
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(X_normalized)
        centers = kmeans.cluster_centers_
        
        # Get min distance to ANY center for this specific k
        dists = pairwise_distances(X_normalized, centers).min(axis=1)
        
        # Add to cumulative sum
        cumulative_dists += dists

    # Average the distances
    avg_dists_to_centers = cumulative_dists / len(k_list)
    
    # ==========================================
    # 2. FILTER CANDIDATES (Top C% by LOF)
    # ==========================================
    # We assume 'lof_score' already exists in df from previous steps
    if 'lof_score' not in df.columns:
        raise ValueError("DataFrame must contain 'lof_score' column.")
        
    threshold = df['lof_score'].quantile(1 - contamination)
    candidate_mask = df['lof_score'] > threshold
    
    # Extract distances only for the candidates
    candidate_dists = avg_dists_to_centers[candidate_mask]
    candidate_indices = df.index[candidate_mask]
    
    # ==========================================
    # 3. COMPUTE FINAL INDEX (Standardize Subset)
    # ==========================================
    # Normalize the average distances of the candidates to 0-1 for coloring
    scaler_subset = MinMaxScaler()
    globality_index = scaler_subset.fit_transform(candidate_dists.reshape(-1, 1)).flatten()
    
    # Store in DF
    col_name = 'ensemble_globality_index'
    df[col_name] = np.nan
    df.loc[candidate_indices, col_name] = globality_index
    
    # ==========================================
    # 4. VISUALIZATION
    # ==========================================
    fig, ax = plt.subplots(figsize=(12, 9))

    # A. Plot Background (Normal Points)
    ax.scatter(df.loc[~candidate_mask, 'attribute_1'], 
               df.loc[~candidate_mask, 'attribute_2'],
               c='gainsboro', s=20, alpha=0.4, label='Normal Data')

    # B. Plot Candidates (Colored by Ensemble Globality)
    scatter = ax.scatter(df.loc[candidate_mask, 'attribute_1'], 
                         df.loc[candidate_mask, 'attribute_2'], 
                         c=globality_index, 
                         cmap='plasma', # plasma is great for intensity
                         s=60, alpha=0.9, edgecolors='black', linewidth=0.5,
                         label=f'Top {int(contamination*100)}% Anomalies')

    # Formatting
    cbar = plt.colorbar(scatter, ax=ax)
    cbar.set_label(f'Ensemble Globality (Avg Dist to Centers, k={k_list})', fontsize=11)
    cbar.set_ticks([0, 0.5, 1])
    cbar.set_ticklabels(['Local (Sparse Inlier)', 'Mixed', 'Global (Isolated)'])

    ax.set_title(f'Ensemble Globality Map (Averaged over k={k_list})\n'
                 f'Showing top {int(contamination*100)}% LOF candidates', 
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Attribute 1')
    ax.set_ylabel('Attribute 2')
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
    
    return df

# ==========================================
# HOW TO RUN IT
# ==========================================

# We average over k=3, 4, 5, 6, 7 to account for structural ambiguity
# This makes the "Global" score much more scientifically robust
df = analyze_ensemble_globality(
    df, 
    X_normalized, 
    k_list=[5], 
    contamination=0.05
)

# Check the top results
print("\nTop 5 Global Outliers (Most Isolated across all k):")
cols = ['attribute_1', 'attribute_2', 'ensemble_globality_index']
print(df.dropna(subset=['ensemble_globality_index']).nlargest(5, 'ensemble_globality_index')[cols])

In [None]:
df = analyze_ensemble_globality(
    df, 
    X_normalized, 
    k_list=[3, 4, 5, 6, 7], 
    contamination=0.05
)

In [None]:
df = analyze_ensemble_globality(
    df, 
    X_normalized, 
    k_list=[4,5,6,7,8,9,10], 
    contamination=0.05
)

In [None]:
df = analyze_ensemble_globality(
    df, 
    X_normalized, 
    k_list=[3,4,5,6,7], 
    contamination=1
)

In [None]:
c = 100
p = 20
top_knn_indices = df.nsmallest(c, 'knn_rank').index.values
top_lof_indices = df.nsmallest(c, 'lof_rank').index.values

all_top_indices = np.unique(np.concatenate([top_knn_indices, top_lof_indices]))

ratio_values = np.full(len(df), np.nan)
for idx in all_top_indices:
    knn_rank = df.loc[idx, 'knn_rank']
    lof_rank = df.loc[idx, 'lof_rank']
    ratio = (lof_rank + knn_rank) / 2* (lof_rank)
    ratio_values[idx] = ratio

mask_top = ~np.isnan(ratio_values)
mask_other = np.isnan(ratio_values)

ratio_min = ratio_values[mask_top].min()
ratio_max = ratio_values[mask_top].max()
ratio_mean = ratio_values[mask_top].mean()

vmin = max(0.1, ratio_min * 0.9)
vmax = min(ratio_max * 1.1, ratio_max + 0.5)
vmin = ratio_min
vmax = ratio_max
fig, ax = plt.subplots(figsize=(12, 8))

scatter_other = ax.scatter(df.loc[mask_other, 'attribute_1'], 
                          df.loc[mask_other, 'attribute_2'],
                          c='black', s=20, alpha=0.3, label='Other points')

scatter_top = ax.scatter(df.loc[mask_top, 'attribute_1'], 
                         df.loc[mask_top, 'attribute_2'],
                         c=ratio_values[mask_top], cmap='RdYlGn', 
                         s=50, alpha=0.8, edgecolors='black', linewidth=0.5,
                         vmin=vmin, vmax=vmax, label='Top c points')

ax.set_xlabel('Attribute 1', fontsize=12)
ax.set_ylabel('Attribute 2', fontsize=12)
ax.set_title(f'Global vs Local Outliers (c={c})\nRatio = (LOF rank + {p}) / (k-NN rank + {p})', 
             fontsize=13, fontweight='bold')
cbar = plt.colorbar(scatter_top, ax=ax)
cbar.set_label('Ratio (1 = global outlier, 0 = local outlier)', fontsize=11)
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Top {c} points by k-NN rank: {len(top_knn_indices)}")
print(f"Top {c} points by LOF rank: {len(top_lof_indices)}")
print(f"Total unique top points: {len(all_top_indices)}")
print(f"\nRatio statistics for top points:")
print(f"  Mean: {ratio_values[mask_top].mean():.4f}")
print(f"  Min: {ratio_values[mask_top].min():.4f}")
print(f"  Max: {ratio_values[mask_top].max():.4f}")



# Majority Voting Algorithm for Local vs Global Outlier Classification

**Key Idea:**
- If a point is in the top-c anomalies for **both k-NN and LOF** → vote for **Global**
- If a point is in the top-c anomalies for **LOF only** (not k-NN) → vote for **Local**
- Repeat for multiple c thresholds and use **majority voting** to classify each point

In [None]:
def majority_voting_classifier(df, c_values, knn_rank_col='knn_rank', lof_rank_col='lof_rank'):
    """
    Classify anomalies as local or global using majority voting across multiple thresholds.
    
    Parameters:
    -----------
    df : DataFrame
        DataFrame containing k-NN and LOF ranks
    c_values : list or array
        List of top-c thresholds to test (e.g., [10, 20, 30, 40, 50, 100])
    knn_rank_col : str
        Column name for k-NN ranks
    lof_rank_col : str
        Column name for LOF ranks
    
    Returns:
    --------
    DataFrame with added columns:
        - 'votes_global': number of global votes
        - 'votes_local': number of local votes
        - 'classification': 'global', 'local', or 'normal' (no votes)
        - 'confidence': vote margin (higher = more confident)
    """
    
    # Initialize vote counters for each point
    votes_global = np.zeros(len(df), dtype=int)
    votes_local = np.zeros(len(df), dtype=int)
    
    # For each threshold c
    for c in c_values:
        # Get top c points by k-NN rank
        top_knn = set(df.nsmallest(c, knn_rank_col).index)
        
        # Get top c points by LOF rank
        top_lof = set(df.nsmallest(c, lof_rank_col).index)
        
        # Get all points that are in top-c of at least one method
        all_top = top_knn.union(top_lof)
        
        for idx in all_top:
            in_knn = idx in top_knn
            in_lof = idx in top_lof
            
            if in_knn and in_lof:
                # Point is in both → Global outlier
                votes_global[idx] += 1
            elif in_lof and not in_knn:
                # Point is only in LOF → Local outlier
                votes_local[idx] += 1
            # If only in k-NN, we don't vote (ambiguous case)
    
    # Add votes to dataframe
    df['votes_global'] = votes_global
    df['votes_local'] = votes_local
    df['total_votes'] = votes_global + votes_local
    
    # Classify based on majority
    classifications = []
    confidences = []
    
    for i in range(len(df)):
        total = votes_global[i] + votes_local[i]
        
        if total == 0:
            classifications.append('normal')
            confidences.append(0)
        elif votes_global[i] > votes_local[i]:
            classifications.append('global')
            confidences.append(votes_global[i] - votes_local[i])
        elif votes_local[i] > votes_global[i]:
            classifications.append('local')
            confidences.append(votes_local[i] - votes_global[i])
        else:  # tie
            classifications.append('tie')
            confidences.append(0)
    
    df['classification'] = classifications
    df['confidence'] = confidences
    
    return df

# Define range of thresholds to test
c_values = [5, 10, 15, 20, 25, 30, 35, 40]

# Apply majority voting
df = majority_voting_classifier(df, c_values)

# Print summary statistics
print("=" * 80)
print("MAJORITY VOTING CLASSIFICATION RESULTS")
print("=" * 80)
print(f"\nTested thresholds: {c_values}")
print(f"Number of threshold values: {len(c_values)}")
print("\nClassification counts:")
print(df['classification'].value_counts())

print("\n" + "-" * 80)
print("Vote statistics by classification:")
print("-" * 80)
for cls in ['global', 'local', 'tie', 'normal']:
    subset = df[df['classification'] == cls]
    if len(subset) > 0:
        print(f"\n{cls.upper()}:")
        print(f"  Count: {len(subset)}")
        print(f"  Avg global votes: {subset['votes_global'].mean():.2f}")
        print(f"  Avg local votes: {subset['votes_local'].mean():.2f}")
        print(f"  Avg confidence: {subset['confidence'].mean():.2f}")

print("\n" + "=" * 80)

In [None]:
# Visualize the classification results
fig, axes = plt.subplots(1, 2, figsize=(18, 7))

# Define colors for each classification
color_map = {
    'normal': 'lightgray',
    'global': 'red',
    'local': 'blue',
    'tie': 'purple'
}

# Plot 1: Classification (categorical colors)
ax1 = axes[0]
for cls in ['normal', 'global', 'local', 'tie']:
    mask = df['classification'] == cls
    if mask.sum() > 0:
        ax1.scatter(
            df.loc[mask, 'attribute_1'],
            df.loc[mask, 'attribute_2'],
            c=color_map[cls],
            s=50 if cls in ['global', 'local'] else 20,
            alpha=0.7 if cls in ['global', 'local'] else 0.3,
            label=f'{cls.capitalize()} ({mask.sum()})',
            edgecolors='black' if cls in ['global', 'local'] else 'none',
            linewidth=0.5
        )

ax1.set_xlabel('Attribute 1', fontsize=12)
ax1.set_ylabel('Attribute 2', fontsize=12)
ax1.set_title('Local vs Global Outlier Classification\n(Majority Voting)', 
             fontsize=14, fontweight='bold')
ax1.legend(loc='best', fontsize=10)
ax1.grid(True, alpha=0.3)

# Plot 2: Confidence scores (only for classified anomalies)
ax2 = axes[1]
anomaly_mask = df['classification'].isin(['global', 'local'])
normal_mask = ~anomaly_mask

# Plot normal points in background
ax2.scatter(
    df.loc[normal_mask, 'attribute_1'],
    df.loc[normal_mask, 'attribute_2'],
    c='lightgray',
    s=20,
    alpha=0.3,
    label='Normal/Tie'
)

# Plot anomalies colored by confidence
scatter = ax2.scatter(
    df.loc[anomaly_mask, 'attribute_1'],
    df.loc[anomaly_mask, 'attribute_2'],
    c=df.loc[anomaly_mask, 'confidence'],
    cmap='viridis',
    s=50,
    alpha=0.8,
    edgecolors='black',
    linewidth=0.5
)

ax2.set_xlabel('Attribute 1', fontsize=12)
ax2.set_ylabel('Attribute 2', fontsize=12)
ax2.set_title('Classification Confidence\n(Vote Margin)', 
             fontsize=14, fontweight='bold')
cbar = plt.colorbar(scatter, ax=ax2)
cbar.set_label('Confidence (vote difference)', fontsize=11)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nVisualization complete!")

In [None]:
# Create a detailed comparison with ground truth labels
fig, axes = plt.subplots(2, 2, figsize=(16, 14))

# Plot 1: Ground Truth Labels
ax1 = axes[0, 0]
for is_out, color, label in [(0, 'blue', 'Normal'), (1, 'red', 'Outlier')]:
    mask = df['is_outlier'] == is_out
    ax1.scatter(
        df.loc[mask, 'attribute_1'],
        df.loc[mask, 'attribute_2'],
        c=color,
        s=40 if is_out else 20,
        alpha=0.6,
        label=f'{label} ({mask.sum()})',
        marker='x' if is_out else 'o',
        edgecolors='black' if is_out else 'none',
        linewidth=0.5
    )
ax1.set_xlabel('Attribute 1', fontsize=11)
ax1.set_ylabel('Attribute 2', fontsize=11)
ax1.set_title('Ground Truth Labels', fontsize=13, fontweight='bold')
ax1.legend(loc='best', fontsize=9)
ax1.grid(True, alpha=0.3)

# Plot 2: Our Classification
ax2 = axes[0, 1]
for cls, color in [('normal', 'lightgray'), ('global', 'red'), ('local', 'blue'), ('tie', 'purple')]:
    mask = df['classification'] == cls
    if mask.sum() > 0:
        ax2.scatter(
            df.loc[mask, 'attribute_1'],
            df.loc[mask, 'attribute_2'],
            c=color,
            s=50 if cls in ['global', 'local'] else 20,
            alpha=0.7 if cls in ['global', 'local'] else 0.3,
            label=f'{cls.capitalize()} ({mask.sum()})',
            edgecolors='black' if cls in ['global', 'local'] else 'none',
            linewidth=0.5
        )
ax2.set_xlabel('Attribute 1', fontsize=11)
ax2.set_ylabel('Attribute 2', fontsize=11)
ax2.set_title('Majority Vote Classification', fontsize=13, fontweight='bold')
ax2.legend(loc='best', fontsize=9)
ax2.grid(True, alpha=0.3)

# Plot 3: Voting details - Global votes
ax3 = axes[1, 0]
anomaly_mask = df['classification'].isin(['global', 'local'])
normal_mask = ~anomaly_mask

ax3.scatter(df.loc[normal_mask, 'attribute_1'], df.loc[normal_mask, 'attribute_2'],
           c='lightgray', s=15, alpha=0.2)
scatter3 = ax3.scatter(
    df.loc[anomaly_mask, 'attribute_1'],
    df.loc[anomaly_mask, 'attribute_2'],
    c=df.loc[anomaly_mask, 'votes_global'],
    cmap='Reds',
    s=50,
    alpha=0.8,
    edgecolors='black',
    linewidth=0.5
)
ax3.set_xlabel('Attribute 1', fontsize=11)
ax3.set_ylabel('Attribute 2', fontsize=11)
ax3.set_title('Global Votes (higher = more global)', fontsize=13, fontweight='bold')
cbar3 = plt.colorbar(scatter3, ax=ax3)
cbar3.set_label('Global votes', fontsize=10)
ax3.grid(True, alpha=0.3)

# Plot 4: Voting details - Local votes
ax4 = axes[1, 1]
ax4.scatter(df.loc[normal_mask, 'attribute_1'], df.loc[normal_mask, 'attribute_2'],
           c='lightgray', s=15, alpha=0.2)
scatter4 = ax4.scatter(
    df.loc[anomaly_mask, 'attribute_1'],
    df.loc[anomaly_mask, 'attribute_2'],
    c=df.loc[anomaly_mask, 'votes_local'],
    cmap='Blues',
    s=50,
    alpha=0.8,
    edgecolors='black',
    linewidth=0.5
)
ax4.set_xlabel('Attribute 1', fontsize=11)
ax4.set_ylabel('Attribute 2', fontsize=11)
ax4.set_title('Local Votes (higher = more local)', fontsize=13, fontweight='bold')
cbar4 = plt.colorbar(scatter4, ax=ax4)
cbar4.set_label('Local votes', fontsize=10)
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Detailed analysis: How do our classifications align with ground truth?
print("=" * 80)
print("CLASSIFICATION vs GROUND TRUTH ANALYSIS")
print("=" * 80)

# Cross-tabulation
print("\nCross-tabulation: Classification vs Ground Truth")
print("-" * 80)
crosstab = pd.crosstab(
    df['classification'], 
    df['is_outlier'],
    margins=True,
    margins_name='Total'
)
crosstab.columns = ['Normal (GT)', 'Outlier (GT)', 'Total']
print(crosstab)

# Among ground truth outliers, how are they classified?
gt_outliers = df[df['is_outlier'] == 1]
print("\n" + "=" * 80)
print("GROUND TRUTH OUTLIERS BREAKDOWN")
print("=" * 80)
print(f"Total ground truth outliers: {len(gt_outliers)}")
print("\nHow are they classified?")
for cls in ['global', 'local', 'tie', 'normal']:
    count = (gt_outliers['classification'] == cls).sum()
    pct = count / len(gt_outliers) * 100 if len(gt_outliers) > 0 else 0
    print(f"  {cls.capitalize():10s}: {count:4d} ({pct:5.1f}%)")

# Show top examples of each type
print("\n" + "=" * 80)
print("TOP EXAMPLES BY CLASSIFICATION")
print("=" * 80)

for cls in ['global', 'local']:
    subset = df[df['classification'] == cls].nlargest(5, 'confidence')
    if len(subset) > 0:
        print(f"\n{cls.upper()} OUTLIERS (Top 5 by confidence):")
        print("-" * 80)
        cols_to_show = ['attribute_1', 'attribute_2', 'knn_rank', 'lof_rank', 
                        'votes_global', 'votes_local', 'confidence', 'is_outlier']
        print(subset[cols_to_show].to_string(index=True))

# Statistics on ranks
print("\n" + "=" * 80)
print("RANK STATISTICS BY CLASSIFICATION")
print("=" * 80)

for cls in ['global', 'local']:
    subset = df[df['classification'] == cls]
    if len(subset) > 0:
        print(f"\n{cls.upper()} outliers (n={len(subset)}):")
        print(f"  k-NN rank - mean: {subset['knn_rank'].mean():.1f}, median: {subset['knn_rank'].median():.1f}, min: {subset['knn_rank'].min()}, max: {subset['knn_rank'].max()}")
        print(f"  LOF rank  - mean: {subset['lof_rank'].mean():.1f}, median: {subset['lof_rank'].median():.1f}, min: {subset['lof_rank'].min()}, max: {subset['lof_rank'].max()}")
        print(f"  Rank diff (LOF-kNN) - mean: {(subset['lof_rank'] - subset['knn_rank']).mean():.1f}")

print("\n" + "=" * 80)

## Experiment with Different Threshold Ranges

Let's see how the classification changes with different ranges of c values.

In [None]:
# Experiment with different threshold strategies
threshold_strategies = {
    'Fine-grained (10-100)': list(range(10, 101, 10)),
    'Coarse (10-200, step 20)': list(range(10, 201, 20)),
    'Wide range (5-300)': [5, 10, 20, 30, 50, 75, 100, 150, 200, 250, 300],
    'Small thresholds (5-50)': list(range(5, 51, 5)),
}

results_summary = []

fig, axes = plt.subplots(2, 2, figsize=(18, 14))
axes = axes.flatten()

for idx, (strategy_name, c_vals) in enumerate(threshold_strategies.items()):
    # Create a copy of original dataframe
    df_temp = df[['attribute_1', 'attribute_2', 'knn_rank', 'lof_rank', 'is_outlier']].copy()
    
    # Apply classification
    df_temp = majority_voting_classifier(df_temp, c_vals)
    
    # Store summary
    summary = {
        'Strategy': strategy_name,
        'Thresholds': c_vals,
        'Global': (df_temp['classification'] == 'global').sum(),
        'Local': (df_temp['classification'] == 'local').sum(),
        'Tie': (df_temp['classification'] == 'tie').sum(),
        'Normal': (df_temp['classification'] == 'normal').sum(),
    }
    results_summary.append(summary)
    
    # Plot
    ax = axes[idx]
    for cls, color in [('normal', 'lightgray'), ('global', 'red'), ('local', 'blue'), ('tie', 'purple')]:
        mask = df_temp['classification'] == cls
        if mask.sum() > 0:
            ax.scatter(
                df_temp.loc[mask, 'attribute_1'],
                df_temp.loc[mask, 'attribute_2'],
                c=color,
                s=50 if cls in ['global', 'local'] else 15,
                alpha=0.7 if cls in ['global', 'local'] else 0.3,
                label=f'{cls.capitalize()} ({mask.sum()})',
                edgecolors='black' if cls in ['global', 'local'] else 'none',
                linewidth=0.5
            )
    
    ax.set_xlabel('Attribute 1', fontsize=10)
    ax.set_ylabel('Attribute 2', fontsize=10)
    ax.set_title(f'{strategy_name}\n(c: {min(c_vals)}-{max(c_vals)}, n={len(c_vals)})', 
                 fontsize=11, fontweight='bold')
    ax.legend(loc='best', fontsize=8)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print comparison table
print("\n" + "=" * 80)
print("COMPARISON OF THRESHOLD STRATEGIES")
print("=" * 80)
comparison_df = pd.DataFrame(results_summary)
print(comparison_df.to_string(index=False))

print("\n" + "=" * 80)

## Save Results

Export the classified data for further analysis.

In [None]:
# Save the classified data
output_file = 'dfki_classified_local_global.csv'
output_cols = [
    'attribute_1', 'attribute_2', 
    'is_outlier',  # ground truth
    'knn_score', 'knn_rank',
    'lof_score', 'lof_rank',
    'votes_global', 'votes_local', 'total_votes',
    'classification', 'confidence'
]

df[output_cols].to_csv(output_file, index=False)
print(f"✓ Results saved to: {output_file}")
print(f"  Rows: {len(df)}")
print(f"  Columns: {len(output_cols)}")

# Summary statistics
print("\n" + "=" * 80)
print("FINAL SUMMARY")
print("=" * 80)
print(f"\nDataset: DFKI Artificial (3000 samples)")
print(f"Thresholds tested: {c_values}")
print(f"\nClassification results:")
print(df['classification'].value_counts().to_string())

print(f"\n\nKey insights:")
print(f"  • Global outliers: Points consistently ranked high by BOTH k-NN and LOF")
print(f"  • Local outliers: Points ranked high by LOF but NOT by k-NN")
print(f"  • Confidence: Measured by vote margin (higher = more certain)")

# Show a few example points
print("\n" + "=" * 80)
print("EXAMPLE CLASSIFICATIONS")
print("=" * 80)

if (df['classification'] == 'global').any():
    print("\nMost confident GLOBAL outlier:")
    global_example = df[df['classification'] == 'global'].nlargest(1, 'confidence')
    print(global_example[['attribute_1', 'attribute_2', 'knn_rank', 'lof_rank', 
                           'votes_global', 'votes_local', 'confidence']].to_string())

if (df['classification'] == 'local').any():
    print("\nMost confident LOCAL outlier:")
    local_example = df[df['classification'] == 'local'].nlargest(1, 'confidence')
    print(local_example[['attribute_1', 'attribute_2', 'knn_rank', 'lof_rank', 
                          'votes_global', 'votes_local', 'confidence']].to_string())

print("\n" + "=" * 80)

# Cleanup and New Synthetic Dataset

**Note:** The previous cells (4-12) contained repetitive code and clustering approaches. 

**What to keep:**
- Cell 0: Dataset loading and visualization ✓
- Cell 2: k-NN analysis ✓  
- Cell 3: LOF analysis ✓
- Cell 7: Score comparison ✓
- Cells 14-21: Majority voting algorithm ✓

**What was removed:**
- Cells 4-6: Repetitive ratio analyses
- Cells 8-12: Clustering-based approaches (not suitable for our problem)
- Empty cells

---

## Creating a Better Synthetic Dataset

Now we'll generate a synthetic dataset with **very clear local and global outliers** to properly test our majority voting classifier.

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic dataset with clear local and global outliers
def generate_synthetic_dataset(n_normal=1000, n_global=20, n_local=30):
    """
    Generate a synthetic 2D dataset with:
    - Normal points in 3 main clusters
    - Global outliers: far from all clusters
    - Local outliers: close to a cluster but anomalous within it
    """
    
    # Main clusters (normal points)
    cluster1 = np.random.randn(n_normal//3, 2) * 0.5 + np.array([0, 0])
    cluster2 = np.random.randn(n_normal//3, 2) * 0.6 + np.array([5, 2])
    cluster3 = np.random.randn(n_normal - 2*(n_normal//3), 2) * 0.55 + np.array([2, -3])
    normal_points = np.vstack([cluster1, cluster2, cluster3])
    
    # Global outliers: far from all clusters, scattered
    global_outliers = np.random.uniform(low=-8, high=10, size=(n_global, 2))
    # Make sure they're actually far from clusters
    mask = (np.abs(global_outliers[:, 0]) > 6) | (np.abs(global_outliers[:, 1]) > 5)
    global_outliers = global_outliers[mask][:n_global]
    if len(global_outliers) < n_global:
        extra = np.random.uniform(low=[-8, 7], high=[-6, 10], size=(n_global - len(global_outliers), 2))
        global_outliers = np.vstack([global_outliers, extra])
    
    # Local outliers: near cluster2 but anomalous
    # Create points that are close to cluster2 but in low-density regions
    local_outliers = np.random.randn(n_local, 2) * 1.5 + np.array([5, 2])
    # Push them to the edges of the cluster
    angles = np.linspace(0, 2*np.pi, n_local)
    radial_offset = np.random.uniform(1.5, 2.5, n_local)
    local_outliers[:, 0] += radial_offset * np.cos(angles)
    local_outliers[:, 1] += radial_offset * np.sin(angles)
    
    # Combine all points
    X = np.vstack([normal_points, global_outliers, local_outliers])
    
    # Create labels
    labels = np.array(['normal'] * n_normal + 
                      ['global'] * n_global + 
                      ['local'] * n_local)
    
    # Create DataFrame
    df = pd.DataFrame({
        'attribute_1': X[:, 0],
        'attribute_2': X[:, 1],
        'true_label': labels,
        'is_outlier': (labels != 'normal').astype(int)
    })
    
    return df

# Generate the dataset
df_synthetic = generate_synthetic_dataset(n_normal=1000, n_global=20, n_local=30)

print("=" * 80)
print("SYNTHETIC DATASET GENERATED")
print("=" * 80)
print(f"\nDataset shape: {df_synthetic.shape}")
print(f"\nClass distribution:")
print(df_synthetic['true_label'].value_counts())
print(f"\nTotal outliers: {df_synthetic['is_outlier'].sum()}")

# Visualize the synthetic dataset
fig, ax = plt.subplots(figsize=(12, 9))

colors = {'normal': 'lightblue', 'global': 'red', 'local': 'orange'}
markers = {'normal': 'o', 'global': 'x', 'local': '^'}
sizes = {'normal': 20, 'global': 100, 'local': 80}

for label in ['normal', 'global', 'local']:
    mask = df_synthetic['true_label'] == label
    ax.scatter(
        df_synthetic.loc[mask, 'attribute_1'],
        df_synthetic.loc[mask, 'attribute_2'],
        c=colors[label],
        marker=markers[label],
        s=sizes[label],
        alpha=0.7 if label == 'normal' else 0.9,
        label=f'{label.capitalize()} ({mask.sum()})',
        edgecolors='black' if label != 'normal' else 'none',
        linewidth=1.5 if label != 'normal' else 0
    )

ax.set_xlabel('Attribute 1', fontsize=13)
ax.set_ylabel('Attribute 2', fontsize=13)
ax.set_title('Synthetic Dataset with Clear Local and Global Outliers', 
             fontsize=15, fontweight='bold')
ax.legend(loc='best', fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n" + "=" * 80)

In [None]:
# Apply k-NN and LOF to the synthetic dataset
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from sklearn.preprocessing import MinMaxScaler

# Prepare data
X_synthetic = df_synthetic[['attribute_1', 'attribute_2']].values
scaler = MinMaxScaler()
X_synthetic_normalized = scaler.fit_transform(X_synthetic)

# Run k-NN
k = 20
print(f"Running k-NN with k={k}...")
knn_detector = KNN(n_neighbors=k, contamination=0.05)
knn_detector.fit(X_synthetic_normalized)
knn_scores = knn_detector.decision_scores_

# Calculate ranks
sorted_indices = np.argsort(knn_scores)[::-1]
knn_ranks = np.empty_like(knn_scores, dtype=int)
knn_ranks[sorted_indices] = np.arange(1, len(knn_scores) + 1)

df_synthetic['knn_score'] = knn_scores
df_synthetic['knn_rank'] = knn_ranks

# Run LOF
print(f"Running LOF with k={k}...")
lof_detector = LOF(n_neighbors=k, contamination=0.05)
lof_detector.fit(X_synthetic_normalized)
lof_scores = lof_detector.decision_scores_

# Calculate ranks
sorted_indices = np.argsort(lof_scores)[::-1]
lof_ranks = np.empty_like(lof_scores, dtype=int)
lof_ranks[sorted_indices] = np.arange(1, len(lof_scores) + 1)

df_synthetic['lof_score'] = lof_scores
df_synthetic['lof_rank'] = lof_ranks

print("✓ k-NN and LOF analysis complete")

# Visualize scores
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# k-NN scores
ax1 = axes[0]
for label in ['normal', 'global', 'local']:
    mask = df_synthetic['true_label'] == label
    scatter = ax1.scatter(
        df_synthetic.loc[mask, 'attribute_1'],
        df_synthetic.loc[mask, 'attribute_2'],
        c=df_synthetic.loc[mask, 'knn_score'],
        cmap='viridis',
        s=50 if label != 'normal' else 20,
        alpha=0.7,
        edgecolors='black' if label != 'normal' else 'none',
        linewidth=1 if label != 'normal' else 0,
        vmin=knn_scores.min(),
        vmax=knn_scores.max()
    )
ax1.set_xlabel('Attribute 1', fontsize=11)
ax1.set_ylabel('Attribute 2', fontsize=11)
ax1.set_title(f'k-NN Scores (k={k})', fontsize=13, fontweight='bold')
cbar1 = plt.colorbar(scatter, ax=ax1)
cbar1.set_label('Anomaly Score', fontsize=10)
ax1.grid(True, alpha=0.3)

# LOF scores
ax2 = axes[1]
for label in ['normal', 'global', 'local']:
    mask = df_synthetic['true_label'] == label
    scatter = ax2.scatter(
        df_synthetic.loc[mask, 'attribute_1'],
        df_synthetic.loc[mask, 'attribute_2'],
        c=df_synthetic.loc[mask, 'lof_score'],
        cmap='viridis',
        s=50 if label != 'normal' else 20,
        alpha=0.7,
        edgecolors='black' if label != 'normal' else 'none',
        linewidth=1 if label != 'normal' else 0,
        vmin=lof_scores.min(),
        vmax=lof_scores.max()
    )
ax2.set_xlabel('Attribute 1', fontsize=11)
ax2.set_ylabel('Attribute 2', fontsize=11)
ax2.set_title(f'LOF Scores (k={k})', fontsize=13, fontweight='bold')
cbar2 = plt.colorbar(scatter, ax=ax2)
cbar2.set_label('Anomaly Score', fontsize=10)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Apply majority voting classifier to synthetic dataset
# Since we have 50 outliers (20 global + 30 local), use thresholds around that range

# Define thresholds centered around the true number of outliers
c_values_synthetic = [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 70, 80, 100]

print("=" * 80)
print("APPLYING MAJORITY VOTING TO SYNTHETIC DATASET")
print("=" * 80)
print(f"\nTrue outliers: {df_synthetic['is_outlier'].sum()}")
print(f"  - Global: {(df_synthetic['true_label'] == 'global').sum()}")
print(f"  - Local: {(df_synthetic['true_label'] == 'local').sum()}")
print(f"\nTesting thresholds: {c_values_synthetic}")
print()

# Apply the classifier
df_synthetic = majority_voting_classifier(df_synthetic, c_values_synthetic)

# Print summary
print("\n" + "=" * 80)
print("CLASSIFICATION RESULTS")
print("=" * 80)
print(f"\nClassification counts:")
print(df_synthetic['classification'].value_counts())

print("\n" + "-" * 80)
print("Vote statistics by classification:")
print("-" * 80)
for cls in ['global', 'local', 'tie', 'normal']:
    subset = df_synthetic[df_synthetic['classification'] == cls]
    if len(subset) > 0:
        print(f"\n{cls.upper()}:")
        print(f"  Count: {len(subset)}")
        print(f"  Avg global votes: {subset['votes_global'].mean():.2f}")
        print(f"  Avg local votes: {subset['votes_local'].mean():.2f}")
        print(f"  Avg confidence: {subset['confidence'].mean():.2f}")

# Compare with ground truth
print("\n" + "=" * 80)
print("GROUND TRUTH COMPARISON")
print("=" * 80)

# Cross-tabulation
print("\nClassification vs True Label:")
crosstab = pd.crosstab(
    df_synthetic['classification'], 
    df_synthetic['true_label'],
    margins=True,
    margins_name='Total'
)
print(crosstab)

# Check how well we identified each type
print("\n" + "-" * 80)
print("Performance by True Label:")
print("-" * 80)

for true_label in ['global', 'local', 'normal']:
    subset = df_synthetic[df_synthetic['true_label'] == true_label]
    print(f"\n{true_label.upper()} (n={len(subset)}):")
    if len(subset) > 0:
        for cls in ['global', 'local', 'tie', 'normal']:
            count = (subset['classification'] == cls).sum()
            pct = count / len(subset) * 100
            if count > 0:
                print(f"  Classified as {cls}: {count} ({pct:.1f}%)")

print("\n" + "=" * 80)

In [None]:
# Visualize classification results on synthetic dataset
fig, axes = plt.subplots(2, 2, figsize=(18, 16))

# Plot 1: Ground Truth
ax1 = axes[0, 0]
for label, color, marker, size in [
    ('normal', 'lightblue', 'o', 20),
    ('global', 'red', 'x', 100),
    ('local', 'orange', '^', 80)
]:
    mask = df_synthetic['true_label'] == label
    ax1.scatter(
        df_synthetic.loc[mask, 'attribute_1'],
        df_synthetic.loc[mask, 'attribute_2'],
        c=color,
        marker=marker,
        s=size,
        alpha=0.7 if label == 'normal' else 0.9,
        label=f'{label.capitalize()} ({mask.sum()})',
        edgecolors='black' if label != 'normal' else 'none',
        linewidth=1.5 if label != 'normal' else 0
    )
ax1.set_xlabel('Attribute 1', fontsize=11)
ax1.set_ylabel('Attribute 2', fontsize=11)
ax1.set_title('Ground Truth Labels', fontsize=13, fontweight='bold')
ax1.legend(loc='best', fontsize=10)
ax1.grid(True, alpha=0.3)

# Plot 2: Our Classification
ax2 = axes[0, 1]
color_map = {'normal': 'lightgray', 'global': 'red', 'local': 'blue', 'tie': 'purple'}
for cls in ['normal', 'global', 'local', 'tie']:
    mask = df_synthetic['classification'] == cls
    if mask.sum() > 0:
        ax2.scatter(
            df_synthetic.loc[mask, 'attribute_1'],
            df_synthetic.loc[mask, 'attribute_2'],
            c=color_map[cls],
            s=50 if cls in ['global', 'local'] else 20,
            alpha=0.7 if cls in ['global', 'local'] else 0.3,
            label=f'{cls.capitalize()} ({mask.sum()})',
            edgecolors='black' if cls in ['global', 'local'] else 'none',
            linewidth=0.5
        )
ax2.set_xlabel('Attribute 1', fontsize=11)
ax2.set_ylabel('Attribute 2', fontsize=11)
ax2.set_title('Majority Vote Classification', fontsize=13, fontweight='bold')
ax2.legend(loc='best', fontsize=10)
ax2.grid(True, alpha=0.3)

# Plot 3: Global Votes
ax3 = axes[1, 0]
anomaly_mask = df_synthetic['classification'].isin(['global', 'local'])
normal_mask = ~anomaly_mask

ax3.scatter(df_synthetic.loc[normal_mask, 'attribute_1'], 
            df_synthetic.loc[normal_mask, 'attribute_2'],
           c='lightgray', s=15, alpha=0.2)
scatter3 = ax3.scatter(
    df_synthetic.loc[anomaly_mask, 'attribute_1'],
    df_synthetic.loc[anomaly_mask, 'attribute_2'],
    c=df_synthetic.loc[anomaly_mask, 'votes_global'],
    cmap='Reds',
    s=60,
    alpha=0.8,
    edgecolors='black',
    linewidth=0.5
)
ax3.set_xlabel('Attribute 1', fontsize=11)
ax3.set_ylabel('Attribute 2', fontsize=11)
ax3.set_title('Global Votes (higher = more global)', fontsize=13, fontweight='bold')
cbar3 = plt.colorbar(scatter3, ax=ax3)
cbar3.set_label('Global votes', fontsize=10)
ax3.grid(True, alpha=0.3)

# Plot 4: Local Votes
ax4 = axes[1, 1]
ax4.scatter(df_synthetic.loc[normal_mask, 'attribute_1'], 
            df_synthetic.loc[normal_mask, 'attribute_2'],
           c='lightgray', s=15, alpha=0.2)
scatter4 = ax4.scatter(
    df_synthetic.loc[anomaly_mask, 'attribute_1'],
    df_synthetic.loc[anomaly_mask, 'attribute_2'],
    c=df_synthetic.loc[anomaly_mask, 'votes_local'],
    cmap='Blues',
    s=60,
    alpha=0.8,
    edgecolors='black',
    linewidth=0.5
)
ax4.set_xlabel('Attribute 1', fontsize=11)
ax4.set_ylabel('Attribute 2', fontsize=11)
ax4.set_title('Local Votes (higher = more local)', fontsize=13, fontweight='bold')
cbar4 = plt.colorbar(scatter4, ax=ax4)
cbar4.set_label('Local votes', fontsize=10)
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Show some example classifications
print("\n" + "=" * 80)
print("EXAMPLE CLASSIFICATIONS")
print("=" * 80)

for true_label in ['global', 'local']:
    print(f"\n{true_label.upper()} OUTLIERS (showing top 5 by confidence):")
    print("-" * 80)
    subset = df_synthetic[df_synthetic['true_label'] == true_label].nlargest(5, 'confidence')
    cols = ['attribute_1', 'attribute_2', 'true_label', 'classification', 
            'knn_rank', 'lof_rank', 'votes_global', 'votes_local', 'confidence']
    print(subset[cols].to_string(index=False))
    print()

print("=" * 80)

In [None]:
# Save results of synthetic dataset
output_file_synthetic = 'synthetic_classified_local_global.csv'
output_cols = [
    'attribute_1', 'attribute_2',
    'true_label', 'is_outlier',
    'knn_score', 'knn_rank',
    'lof_score', 'lof_rank',
    'votes_global', 'votes_local', 'total_votes',
    'classification', 'confidence'
]

df_synthetic[output_cols].to_csv(output_file_synthetic, index=False)

print("=" * 80)
print("FINAL RESULTS - SYNTHETIC DATASET")
print("=" * 80)
print(f"\n✓ Results saved to: {output_file_synthetic}")
print(f"  Rows: {len(df_synthetic)}")
print(f"  Columns: {len(output_cols)}")

# Calculate accuracy metrics
print("\n" + "=" * 80)
print("CLASSIFICATION ACCURACY")
print("=" * 80)

# For global outliers
global_true = df_synthetic['true_label'] == 'global'
global_pred = df_synthetic['classification'] == 'global'
global_correct = (global_true & global_pred).sum()
global_total = global_true.sum()
global_accuracy = (global_correct / global_total * 100) if global_total > 0 else 0

print(f"\nGlobal Outliers:")
print(f"  True positives: {global_correct}/{global_total} ({global_accuracy:.1f}%)")
print(f"  Misclassified as local: {(global_true & (df_synthetic['classification'] == 'local')).sum()}")
print(f"  Misclassified as normal: {(global_true & (df_synthetic['classification'] == 'normal')).sum()}")

# For local outliers
local_true = df_synthetic['true_label'] == 'local'
local_pred = df_synthetic['classification'] == 'local'
local_correct = (local_true & local_pred).sum()
local_total = local_true.sum()
local_accuracy = (local_correct / local_total * 100) if local_total > 0 else 0

print(f"\nLocal Outliers:")
print(f"  True positives: {local_correct}/{local_total} ({local_accuracy:.1f}%)")
print(f"  Misclassified as global: {(local_true & (df_synthetic['classification'] == 'global')).sum()}")
print(f"  Misclassified as normal: {(local_true & (df_synthetic['classification'] == 'normal')).sum()}")

# Overall accuracy
outlier_true = df_synthetic['is_outlier'] == 1
outlier_detected = df_synthetic['classification'].isin(['global', 'local'])
outlier_correct = (outlier_true & outlier_detected).sum()
outlier_total = outlier_true.sum()
outlier_accuracy = (outlier_correct / outlier_total * 100) if outlier_total > 0 else 0

print(f"\nOverall Outlier Detection:")
print(f"  Detected: {outlier_correct}/{outlier_total} ({outlier_accuracy:.1f}%)")

# Normal points
normal_true = df_synthetic['true_label'] == 'normal'
normal_pred = df_synthetic['classification'] == 'normal'
normal_correct = (normal_true & normal_pred).sum()
normal_total = normal_true.sum()
normal_accuracy = (normal_correct / normal_total * 100) if normal_total > 0 else 0

print(f"\nNormal Points:")
print(f"  Correctly classified: {normal_correct}/{normal_total} ({normal_accuracy:.1f}%)")
print(f"  False positives (as global): {(normal_true & (df_synthetic['classification'] == 'global')).sum()}")
print(f"  False positives (as local): {(normal_true & (df_synthetic['classification'] == 'local')).sum()}")

print("\n" + "=" * 80)
print("\n✓ Analysis complete! The majority voting algorithm successfully distinguishes")
print("  between local and global outliers.")
print("\n" + "=" * 80)

## Summary and Next Steps

### What Changed:

1. **Removed redundant code**: The voting algorithm is now defined once (Cell 14) and reused
2. **Removed clustering approaches**: Cells 4-6, 8-12 contained clustering-based methods that assume spherical data
3. **Optimized thresholds**: Using ranges close to the true number of outliers for better accuracy
4. **Created synthetic dataset**: With clear local (near clusters) and global (far from all) outliers

### Key Findings:

✓ **Global outliers**: Detected by BOTH k-NN and LOF (far from everything)  
✓ **Local outliers**: Detected by LOF only (anomalous within their neighborhood)  
✓ **Majority voting**: Robust across multiple thresholds, reduces single-threshold sensitivity

### Cells to Delete Manually:

You can now safely delete these redundant cells:
- **Cell 1**: Empty
- **Cells 4-6**: Repetitive ratio analyses (replaced by cells 14-21)
- **Cells 8-12**: Clustering approaches (not suitable)
- **Cell 13**: Empty

### Keep These Cells:

- **Cell 0**: Dataset loading ✓
- **Cell 2**: k-NN analysis ✓
- **Cell 3**: LOF analysis ✓  
- **Cell 7**: Score comparison ✓
- **Cells 14-21**: Majority voting algorithm (DFKI dataset) ✓
- **Cells 22-28**: Synthetic dataset + majority voting (NEW) ✓

In [None]:
"""
Majority Voting Algorithm for Global vs Local Outlier Classification

Key Idea:
- For each threshold c (top c anomalies):
  - If point is in top-c for BOTH kNN AND LOF → vote "global"
  - If point is in top-c for LOF but NOT kNN → vote "local"
- Aggregate votes across many c values
- Classify by majority vote
"""

def majority_vote_classification(df, c_range, knn_rank_col='knn_rank', lof_rank_col='lof_rank'):
    """
    Classify outliers as global or local using majority voting across multiple thresholds.
    
    Parameters:
    -----------
    df : DataFrame
        Must contain knn_rank and lof_rank columns
    c_range : list or range
        List of threshold values to test (e.g., [10, 20, 30, ..., 100])
    knn_rank_col : str
        Column name for k-NN ranks
    lof_rank_col : str
        Column name for LOF ranks
    
    Returns:
    --------
    DataFrame with added columns:
        - 'global_votes': number of times classified as global
        - 'local_votes': number of times classified as local
        - 'total_votes': total votes received
        - 'classification': 'global', 'local', or 'normal' (no votes)
        - 'confidence': ratio of majority votes to total votes
    """
    
    n_samples = len(df)
    
    # Initialize vote counters
    global_votes = np.zeros(n_samples, dtype=int)
    local_votes = np.zeros(n_samples, dtype=int)
    
    print(f"Running majority voting across c = {list(c_range)}")
    print(f"Total thresholds to test: {len(list(c_range))}")
    print()
    
    # Iterate through each threshold value
    for c in c_range:
        # Get top c indices for each algorithm
        top_knn = set(df.nsmallest(c, knn_rank_col).index)
        top_lof = set(df.nsmallest(c, lof_rank_col).index)
        
        # Points in both → global vote
        global_outliers = top_knn & top_lof
        
        # Points in LOF but not kNN → local vote
        local_outliers = top_lof - top_knn
        
        # Record votes
        for idx in global_outliers:
            global_votes[idx] += 1
        
        for idx in local_outliers:
            local_votes[idx] += 1
    
    # Add results to dataframe
    df_result = df.copy()
    df_result['global_votes'] = global_votes
    df_result['local_votes'] = local_votes
    df_result['total_votes'] = global_votes + local_votes
    
    # Classify based on majority
    classifications = []
    confidences = []
    
    for i in range(n_samples):
        g_votes = global_votes[i]
        l_votes = local_votes[i]
        total = g_votes + l_votes
        
        if total == 0:
            classifications.append('normal')
            confidences.append(0.0)
        elif g_votes > l_votes:
            classifications.append('global')
            confidences.append(g_votes / total)
        elif l_votes > g_votes:
            classifications.append('local')
            confidences.append(l_votes / total)
        else:  # tie
            classifications.append('tie')
            confidences.append(0.5)
    
    df_result['classification'] = classifications
    df_result['confidence'] = confidences
    
    # Print summary
    print("\n" + "="*60)
    print("CLASSIFICATION SUMMARY")
    print("="*60)
    print(f"Global outliers: {sum(np.array(classifications) == 'global')}")
    print(f"Local outliers:  {sum(np.array(classifications) == 'local')}")
    print(f"Ties:            {sum(np.array(classifications) == 'tie')}")
    print(f"Normal points:   {sum(np.array(classifications) == 'normal')}")
    print()
    
    # Statistics on confidence
    voted_mask = np.array(classifications) != 'normal'
    if voted_mask.any():
        voted_confidences = np.array(confidences)[voted_mask]
        print(f"Confidence statistics (for classified points):")
        print(f"  Mean: {voted_confidences.mean():.3f}")
        print(f"  Min:  {voted_confidences.min():.3f}")
        print(f"  Max:  {voted_confidences.max():.3f}")
    
    return df_result


def plot_classification_results(df, max_c, attr1='attribute_1', attr2='attribute_2'):
    """
    Visualize the classification results.
    
    Parameters:
    -----------
    df : DataFrame
        Must contain 'classification' and 'confidence' columns
    max_c : int
        Maximum c value used (for plot title)
    """
    
    # Create color map
    color_map = {
        'global': 'red',
        'local': 'blue',
        'tie': 'purple',
        'normal': 'lightgray'
    }
    
    colors = [color_map[c] for c in df['classification']]
    
    # Create figure with two subplots
    fig, axes = plt.subplots(1, 2, figsize=(18, 7))
    
    # Plot 1: Classification with consistent colors
    ax1 = axes[0]
    
    # Plot normal points first (background)
    normal_mask = df['classification'] == 'normal'
    ax1.scatter(df.loc[normal_mask, attr1], 
                df.loc[normal_mask, attr2],
                c='lightgray', s=20, alpha=0.3, label='Normal', zorder=1)
    
    # Plot classified outliers
    for cls, color, label, marker in [
        ('global', 'red', 'Global Outlier', 'o'),
        ('local', 'blue', 'Local Outlier', '^'),
        ('tie', 'purple', 'Tie', 's')
    ]:
        mask = df['classification'] == cls
        if mask.any():
            ax1.scatter(df.loc[mask, attr1], 
                       df.loc[mask, attr2],
                       c=color, s=80, alpha=0.8, 
                       edgecolors='black', linewidth=0.5,
                       label=label, marker=marker, zorder=3)
    
    ax1.set_xlabel('Attribute 1', fontsize=12)
    ax1.set_ylabel('Attribute 2', fontsize=12)
    ax1.set_title(f'Global vs Local Outlier Classification\n(Majority Voting, max c={max_c})', 
                 fontsize=13, fontweight='bold')
    ax1.legend(loc='best', fontsize=10)
    ax1.grid(True, alpha=0.3)
    
    # Plot 2: Confidence scores
    ax2 = axes[1]
    
    # Plot normal points
    ax2.scatter(df.loc[normal_mask, attr1], 
                df.loc[normal_mask, attr2],
                c='lightgray', s=20, alpha=0.3, label='Normal', zorder=1)
    
    # Plot classified points with confidence coloring
    classified_mask = df['classification'] != 'normal'
    if classified_mask.any():
        scatter = ax2.scatter(df.loc[classified_mask, attr1], 
                             df.loc[classified_mask, attr2],
                             c=df.loc[classified_mask, 'confidence'],
                             cmap='RdYlGn', s=80, alpha=0.8,
                             edgecolors='black', linewidth=0.5,
                             vmin=0.5, vmax=1.0, zorder=3)
        
        cbar = plt.colorbar(scatter, ax=ax2)
        cbar.set_label('Classification Confidence', fontsize=11)
    
    ax2.set_xlabel('Attribute 1', fontsize=12)
    ax2.set_ylabel('Attribute 2', fontsize=12)
    ax2.set_title(f'Classification Confidence Scores\n(Higher = stronger agreement)', 
                 fontsize=13, fontweight='bold')
    ax2.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Additional plot: Vote distribution
    fig, ax = plt.subplots(figsize=(10, 6))
    
    classified_mask = df['classification'] != 'normal'
    if classified_mask.any():
        scatter = ax.scatter(df.loc[classified_mask, 'global_votes'], 
                           df.loc[classified_mask, 'local_votes'],
                           c=df.loc[classified_mask, 'classification'].map(color_map),
                           s=60, alpha=0.7, edgecolors='black', linewidth=0.5)
        
        # Add diagonal line (tie line)
        max_votes = max(df['global_votes'].max(), df['local_votes'].max())
        ax.plot([0, max_votes], [0, max_votes], 'k--', alpha=0.3, linewidth=1, label='Tie line')
        
        ax.set_xlabel('Global Votes', fontsize=12)
        ax.set_ylabel('Local Votes', fontsize=12)
        ax.set_title('Vote Distribution', fontsize=13, fontweight='bold')
        ax.grid(True, alpha=0.3)
        
        # Add legend manually
        from matplotlib.patches import Patch
        legend_elements = [
            Patch(facecolor='red', edgecolor='black', label='Global'),
            Patch(facecolor='blue', edgecolor='black', label='Local'),
            Patch(facecolor='purple', edgecolor='black', label='Tie')
        ]
        ax.legend(handles=legend_elements, loc='best', fontsize=10)
    
    plt.tight_layout()
    plt.show()


print("Functions loaded successfully!")
print("\nUsage example:")
print("  c_range = range(10, 101, 5)  # Test c from 10 to 100, step 5")
print("  df_classified = majority_vote_classification(df, c_range)")
print("  plot_classification_results(df_classified, max_c=100)")

In [None]:
# Run the majority voting classification
# Test with c values from 10 to 150 in steps of 5

c_range = range(10, 151, 5)  # [10, 15, 20, ..., 150]

df_classified = majority_vote_classification(df, c_range)

# Visualize results
plot_classification_results(df_classified, max_c=150)

In [None]:
# Detailed analysis: Compare with ground truth and explore different c ranges

def analyze_classification_vs_ground_truth(df_classified):
    """
    Analyze how classified outliers relate to ground truth labels.
    """
    print("\n" + "="*60)
    print("COMPARISON WITH GROUND TRUTH")
    print("="*60)
    
    # Get ground truth outliers
    true_outliers = df_classified[df_classified['is_outlier'] == 1]
    true_normals = df_classified[df_classified['is_outlier'] == 0]
    
    print(f"\nGround truth distribution:")
    print(f"  True outliers: {len(true_outliers)}")
    print(f"  True normals:  {len(true_normals)}")
    
    # Among true outliers, how are they classified?
    print(f"\nClassification of TRUE OUTLIERS:")
    outlier_classifications = true_outliers['classification'].value_counts()
    for cls, count in outlier_classifications.items():
        pct = count / len(true_outliers) * 100
        print(f"  {cls:10s}: {count:4d} ({pct:5.1f}%)")
    
    # Among classified global outliers, what % are true outliers?
    print(f"\nPrecision check:")
    for cls in ['global', 'local']:
        classified = df_classified[df_classified['classification'] == cls]
        if len(classified) > 0:
            true_positive = (classified['is_outlier'] == 1).sum()
            precision = true_positive / len(classified) * 100
            print(f"  {cls:10s}: {true_positive}/{len(classified)} are true outliers ({precision:.1f}%)")
    
    # Visualize ground truth vs classification
    fig, axes = plt.subplots(1, 2, figsize=(18, 7))
    
    # Plot 1: Ground truth
    ax1 = axes[0]
    normal_mask = df_classified['is_outlier'] == 0
    outlier_mask = df_classified['is_outlier'] == 1
    
    ax1.scatter(df_classified.loc[normal_mask, 'attribute_1'], 
                df_classified.loc[normal_mask, 'attribute_2'],
                c='lightgray', s=20, alpha=0.5, label='Normal')
    ax1.scatter(df_classified.loc[outlier_mask, 'attribute_1'], 
                df_classified.loc[outlier_mask, 'attribute_2'],
                c='black', s=80, alpha=0.8, marker='x', 
                edgecolors='red', linewidth=1.5, label='True Outlier')
    
    ax1.set_xlabel('Attribute 1', fontsize=12)
    ax1.set_ylabel('Attribute 2', fontsize=12)
    ax1.set_title('Ground Truth Labels', fontsize=13, fontweight='bold')
    ax1.legend(loc='best', fontsize=10)
    ax1.grid(True, alpha=0.3)
    
    # Plot 2: Our classification overlaid on ground truth
    ax2 = axes[1]
    
    # Background: ground truth
    ax2.scatter(df_classified.loc[normal_mask, 'attribute_1'], 
                df_classified.loc[normal_mask, 'attribute_2'],
                c='lightgray', s=20, alpha=0.3, label='True Normal')
    ax2.scatter(df_classified.loc[outlier_mask, 'attribute_1'], 
                df_classified.loc[outlier_mask, 'attribute_2'],
                c='yellow', s=80, alpha=0.5, marker='x', 
                edgecolors='orange', linewidth=1, label='True Outlier')
    
    # Overlay: our classification
    for cls, color, label, marker in [
        ('global', 'red', 'Classified: Global', 'o'),
        ('local', 'blue', 'Classified: Local', '^')
    ]:
        mask = df_classified['classification'] == cls
        if mask.any():
            ax2.scatter(df_classified.loc[mask, 'attribute_1'], 
                       df_classified.loc[mask, 'attribute_2'],
                       c=color, s=60, alpha=0.7, 
                       edgecolors='black', linewidth=0.5,
                       label=label, marker=marker, zorder=3)
    
    ax2.set_xlabel('Attribute 1', fontsize=12)
    ax2.set_ylabel('Attribute 2', fontsize=12)
    ax2.set_title('Classification Overlaid on Ground Truth', fontsize=13, fontweight='bold')
    ax2.legend(loc='best', fontsize=9)
    ax2.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()


# Run the analysis
analyze_classification_vs_ground_truth(df_classified)

In [None]:
# Experiment with different c ranges to test sensitivity

print("="*70)
print("TESTING DIFFERENT C RANGES")
print("="*70)

test_configs = [
    {"name": "Small range (10-50)", "c_range": range(10, 51, 5)},
    {"name": "Medium range (10-100)", "c_range": range(10, 101, 5)},
    {"name": "Large range (10-200)", "c_range": range(10, 201, 10)},
    {"name": "Fine-grained (5-100, step=1)", "c_range": range(5, 101, 1)},
]

results_comparison = []

for config in test_configs:
    print(f"\n{'='*70}")
    print(f"Testing: {config['name']}")
    print(f"{'='*70}")
    
    df_test = majority_vote_classification(df, config['c_range'])
    
    # Store summary
    results_comparison.append({
        'name': config['name'],
        'n_thresholds': len(list(config['c_range'])),
        'global': (df_test['classification'] == 'global').sum(),
        'local': (df_test['classification'] == 'local').sum(),
        'tie': (df_test['classification'] == 'tie').sum(),
        'normal': (df_test['classification'] == 'normal').sum(),
        'avg_confidence': df_test[df_test['classification'] != 'normal']['confidence'].mean()
    })

# Display comparison table
print("\n" + "="*70)
print("COMPARISON ACROSS DIFFERENT C RANGES")
print("="*70)
comparison_df = pd.DataFrame(results_comparison)
print(comparison_df.to_string(index=False))

# Visualize one of the configurations in detail
print("\n\nVisualizing: Medium range (10-100)")
df_medium = majority_vote_classification(df, range(10, 101, 5))
plot_classification_results(df_medium, max_c=100)

In [None]:
# Detailed inspection: Show top global and local outliers

def inspect_top_outliers(df_classified, n_top=10):
    """
    Display detailed information about the top global and local outliers.
    """
    print("="*80)
    print(f"TOP {n_top} GLOBAL OUTLIERS (sorted by confidence)")
    print("="*80)
    
    global_outliers = df_classified[df_classified['classification'] == 'global'].copy()
    global_outliers = global_outliers.sort_values('confidence', ascending=False).head(n_top)
    
    cols_to_show = ['attribute_1', 'attribute_2', 'knn_rank', 'lof_rank', 
                    'global_votes', 'local_votes', 'confidence', 'is_outlier']
    print(global_outliers[cols_to_show].to_string())
    
    print("\n" + "="*80)
    print(f"TOP {n_top} LOCAL OUTLIERS (sorted by confidence)")
    print("="*80)
    
    local_outliers = df_classified[df_classified['classification'] == 'local'].copy()
    local_outliers = local_outliers.sort_values('confidence', ascending=False).head(n_top)
    
    print(local_outliers[cols_to_show].to_string())
    
    # Visualize these specific points
    fig, ax = plt.subplots(figsize=(12, 9))
    
    # Background: all points
    normal_bg = df_classified['classification'] == 'normal'
    ax.scatter(df_classified.loc[normal_bg, 'attribute_1'], 
              df_classified.loc[normal_bg, 'attribute_2'],
              c='lightgray', s=15, alpha=0.2, label='Other points')
    
    # Top global outliers
    ax.scatter(global_outliers['attribute_1'], 
              global_outliers['attribute_2'],
              c='red', s=150, alpha=0.7, marker='o',
              edgecolors='darkred', linewidth=2,
              label=f'Top {n_top} Global Outliers')
    
    # Top local outliers
    ax.scatter(local_outliers['attribute_1'], 
              local_outliers['attribute_2'],
              c='blue', s=150, alpha=0.7, marker='^',
              edgecolors='darkblue', linewidth=2,
              label=f'Top {n_top} Local Outliers')
    
    # Add labels to points
    for idx, row in global_outliers.iterrows():
        ax.annotate(f"G{idx}", 
                   (row['attribute_1'], row['attribute_2']),
                   xytext=(5, 5), textcoords='offset points',
                   fontsize=8, color='darkred', fontweight='bold')
    
    for idx, row in local_outliers.iterrows():
        ax.annotate(f"L{idx}", 
                   (row['attribute_1'], row['attribute_2']),
                   xytext=(5, 5), textcoords='offset points',
                   fontsize=8, color='darkblue', fontweight='bold')
    
    ax.set_xlabel('Attribute 1', fontsize=12)
    ax.set_ylabel('Attribute 2', fontsize=12)
    ax.set_title(f'Top {n_top} Global vs Local Outliers', fontsize=14, fontweight='bold')
    ax.legend(loc='best', fontsize=10)
    ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Summary statistics
    print("\n" + "="*80)
    print("SUMMARY STATISTICS")
    print("="*80)
    print(f"\nTop global outliers:")
    print(f"  Avg kNN rank: {global_outliers['knn_rank'].mean():.1f}")
    print(f"  Avg LOF rank: {global_outliers['lof_rank'].mean():.1f}")
    print(f"  % true outliers: {(global_outliers['is_outlier'] == 1).sum() / len(global_outliers) * 100:.1f}%")
    
    print(f"\nTop local outliers:")
    print(f"  Avg kNN rank: {local_outliers['knn_rank'].mean():.1f}")
    print(f"  Avg LOF rank: {local_outliers['lof_rank'].mean():.1f}")
    print(f"  % true outliers: {(local_outliers['is_outlier'] == 1).sum() / len(local_outliers) * 100:.1f}%")


# Run inspection
inspect_top_outliers(df_classified, n_top=10)

# Optional: Save results to CSV
# df_classified.to_csv('classified_outliers.csv', index=True)
# print("\nResults saved to 'classified_outliers.csv'")

In [None]:
# Optional: Test with different k values for kNN and LOF

def run_full_pipeline(df_original, k_knn=20, k_lof=20, c_range=range(10, 101, 5)):
    """
    Complete pipeline: run kNN and LOF with specified k values, 
    then perform majority voting classification.
    
    Parameters:
    -----------
    df_original : DataFrame
        Original dataset with 'attribute_1', 'attribute_2', and 'is_outlier' columns
    k_knn : int
        Number of neighbors for k-NN
    k_lof : int
        Number of neighbors for LOF
    c_range : range or list
        Range of c values for majority voting
    
    Returns:
    --------
    DataFrame with all scores, ranks, and classifications
    """
    print("="*80)
    print(f"RUNNING FULL PIPELINE")
    print("="*80)
    print(f"k-NN neighbors: {k_knn}")
    print(f"LOF neighbors:  {k_lof}")
    print(f"Voting range:   {list(c_range)[0]} to {list(c_range)[-1]}")
    print()
    
    df_work = df_original.copy()
    X = df_work[['attribute_1', 'attribute_2']].values
    
    # Normalize
    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)
    
    # Run k-NN
    print(f"Running k-NN (k={k_knn})...")
    knn_detector = KNN(n_neighbors=k_knn, contamination=0.01)
    knn_detector.fit(X_normalized)
    knn_scores = knn_detector.decision_scores_
    
    sorted_indices = np.argsort(knn_scores)[::-1]
    knn_ranks = np.empty_like(knn_scores, dtype=int)
    knn_ranks[sorted_indices] = np.arange(1, len(knn_scores) + 1)
    
    df_work['knn_score'] = knn_scores
    df_work['knn_rank'] = knn_ranks
    
    # Run LOF
    print(f"Running LOF (k={k_lof})...")
    lof_detector = LOF(n_neighbors=k_lof, contamination=0.01)
    lof_detector.fit(X_normalized)
    lof_scores = lof_detector.decision_scores_
    
    sorted_indices = np.argsort(lof_scores)[::-1]
    lof_ranks = np.empty_like(lof_scores, dtype=int)
    lof_ranks[sorted_indices] = np.arange(1, len(lof_scores) + 1)
    
    df_work['lof_score'] = lof_scores
    df_work['lof_rank'] = lof_ranks
    
    # Majority voting classification
    print(f"\nRunning majority voting...")
    df_result = majority_vote_classification(df_work, c_range)
    
    print("\nPipeline complete!")
    return df_result


# Example: Test with different k values
print("\n" + "="*80)
print("TESTING SENSITIVITY TO k VALUES")
print("="*80)

# Test 1: Small k values (more local)
print("\n### Test 1: Small k (k=10) ###")
df_k10 = run_full_pipeline(df, k_knn=10, k_lof=10, c_range=range(10, 101, 5))
plot_classification_results(df_k10, max_c=100)

# Test 2: Larger k values (more global)
print("\n### Test 2: Large k (k=40) ###")
df_k40 = run_full_pipeline(df, k_knn=40, k_lof=40, c_range=range(10, 101, 5))
plot_classification_results(df_k40, max_c=100)

# Compare the two
print("\n" + "="*80)
print("COMPARISON: k=10 vs k=40")
print("="*80)
print("\nWith k=10 (more sensitive to local structure):")
print(f"  Global: {(df_k10['classification'] == 'global').sum()}")
print(f"  Local:  {(df_k10['classification'] == 'local').sum()}")

print("\nWith k=40 (more global perspective):")
print(f"  Global: {(df_k40['classification'] == 'global').sum()}")
print(f"  Local:  {(df_k40['classification'] == 'local').sum()}")

In [None]:
# Final Summary and Export

def create_summary_report(df_classified):
    """
    Generate a comprehensive summary report of the classification results.
    """
    print("\n" + "="*80)
    print("COMPREHENSIVE CLASSIFICATION REPORT")
    print("="*80)
    
    # Overall statistics
    total_points = len(df_classified)
    n_global = (df_classified['classification'] == 'global').sum()
    n_local = (df_classified['classification'] == 'local').sum()
    n_tie = (df_classified['classification'] == 'tie').sum()
    n_normal = (df_classified['classification'] == 'normal').sum()
    
    print(f"\n1. OVERALL CLASSIFICATION")
    print(f"   Total points:     {total_points}")
    print(f"   Global outliers:  {n_global:4d} ({n_global/total_points*100:5.2f}%)")
    print(f"   Local outliers:   {n_local:4d} ({n_local/total_points*100:5.2f}%)")
    print(f"   Ties:             {n_tie:4d} ({n_tie/total_points*100:5.2f}%)")
    print(f"   Normal points:    {n_normal:4d} ({n_normal/total_points*100:5.2f}%)")
    
    # Confidence statistics
    classified = df_classified[df_classified['classification'].isin(['global', 'local'])]
    if len(classified) > 0:
        print(f"\n2. CONFIDENCE STATISTICS")
        print(f"   Mean confidence:  {classified['confidence'].mean():.3f}")
        print(f"   Median confidence: {classified['confidence'].median():.3f}")
        print(f"   Std confidence:   {classified['confidence'].std():.3f}")
        print(f"   High conf (>0.8): {(classified['confidence'] > 0.8).sum()} points")
        print(f"   Low conf (<0.6):  {(classified['confidence'] < 0.6).sum()} points")
    
    # Ground truth comparison
    if 'is_outlier' in df_classified.columns:
        true_outliers = df_classified[df_classified['is_outlier'] == 1]
        
        print(f"\n3. GROUND TRUTH COMPARISON")
        print(f"   True outliers in dataset: {len(true_outliers)}")
        print(f"   Detected by our method:   {n_global + n_local}")
        
        # Detection rate
        detected = true_outliers['classification'].isin(['global', 'local']).sum()
        detection_rate = detected / len(true_outliers) * 100 if len(true_outliers) > 0 else 0
        print(f"   Detection rate:           {detected}/{len(true_outliers)} ({detection_rate:.1f}%)")
        
        # Precision
        our_outliers = df_classified[df_classified['classification'].isin(['global', 'local'])]
        if len(our_outliers) > 0:
            true_positives = (our_outliers['is_outlier'] == 1).sum()
            precision = true_positives / len(our_outliers) * 100
            print(f"   Precision:                {true_positives}/{len(our_outliers)} ({precision:.1f}%)")
        
        # Among detected true outliers, global vs local
        detected_outliers = true_outliers[true_outliers['classification'].isin(['global', 'local'])]
        if len(detected_outliers) > 0:
            n_global_true = (detected_outliers['classification'] == 'global').sum()
            n_local_true = (detected_outliers['classification'] == 'local').sum()
            print(f"\n   Among detected true outliers:")
            print(f"      Classified as global: {n_global_true} ({n_global_true/len(detected_outliers)*100:.1f}%)")
            print(f"      Classified as local:  {n_local_true} ({n_local_true/len(detected_outliers)*100:.1f}%)")
    
    # Ranking statistics
    if 'knn_rank' in df_classified.columns and 'lof_rank' in df_classified.columns:
        print(f"\n4. RANK STATISTICS")
        
        global_outliers = df_classified[df_classified['classification'] == 'global']
        local_outliers = df_classified[df_classified['classification'] == 'local']
        
        if len(global_outliers) > 0:
            print(f"\n   Global outliers:")
            print(f"      Avg k-NN rank: {global_outliers['knn_rank'].mean():6.1f}")
            print(f"      Avg LOF rank:  {global_outliers['lof_rank'].mean():6.1f}")
            print(f"      Rank ratio:    {(global_outliers['lof_rank'] / global_outliers['knn_rank']).mean():.3f}")
        
        if len(local_outliers) > 0:
            print(f"\n   Local outliers:")
            print(f"      Avg k-NN rank: {local_outliers['knn_rank'].mean():6.1f}")
            print(f"      Avg LOF rank:  {local_outliers['lof_rank'].mean():6.1f}")
            print(f"      Rank ratio:    {(local_outliers['lof_rank'] / local_outliers['knn_rank']).mean():.3f}")
    
    print("\n" + "="*80)


# Generate report for our classification
create_summary_report(df_classified)

# Optional: Save detailed results
save_results = False  # Set to True to save

if save_results:
    output_file = 'xod_majority_voting_results.csv'
    cols_to_save = ['attribute_1', 'attribute_2', 'is_outlier',
                    'knn_score', 'knn_rank', 'lof_score', 'lof_rank',
                    'global_votes', 'local_votes', 'total_votes',
                    'classification', 'confidence']
    df_classified[cols_to_save].to_csv(output_file, index=True)
    print(f"\n✓ Results saved to: {output_file}")
else:
    print(f"\nTo save results, set save_results = True")

In [None]:
# Additional Visualization: Rank Difference Analysis

def visualize_rank_differences(df_classified):
    """
    Visualize the relationship between k-NN and LOF ranks,
    highlighting global vs local outliers.
    """
    
    # Calculate rank difference
    df_viz = df_classified.copy()
    df_viz['rank_diff'] = df_viz['lof_rank'] - df_viz['knn_rank']
    df_viz['rank_ratio'] = df_viz['lof_rank'] / (df_viz['knn_rank'] + 1)  # +1 to avoid division by zero
    
    fig, axes = plt.subplots(2, 2, figsize=(16, 14))
    
    # Plot 1: k-NN rank vs LOF rank
    ax1 = axes[0, 0]
    
    color_map = {'global': 'red', 'local': 'blue', 'tie': 'purple', 'normal': 'lightgray'}
    
    for cls in ['normal', 'tie', 'local', 'global']:
        mask = df_viz['classification'] == cls
        if mask.any():
            alpha = 0.2 if cls == 'normal' else 0.7
            size = 20 if cls == 'normal' else 60
            ax1.scatter(df_viz.loc[mask, 'knn_rank'], 
                       df_viz.loc[mask, 'lof_rank'],
                       c=color_map[cls], s=size, alpha=alpha, 
                       label=cls.capitalize())
    
    # Add diagonal line (where kNN rank = LOF rank)
    max_rank = max(df_viz['knn_rank'].max(), df_viz['lof_rank'].max())
    ax1.plot([0, max_rank], [0, max_rank], 'k--', alpha=0.3, linewidth=1, label='Equal rank')
    
    ax1.set_xlabel('k-NN Rank', fontsize=12)
    ax1.set_ylabel('LOF Rank', fontsize=12)
    ax1.set_title('k-NN Rank vs LOF Rank', fontsize=13, fontweight='bold')
    ax1.legend(loc='best', fontsize=9)
    ax1.grid(True, alpha=0.3)
    
    # Plot 2: Rank difference distribution
    ax2 = axes[0, 1]
    
    for cls in ['global', 'local']:
        mask = df_viz['classification'] == cls
        if mask.any():
            ax2.hist(df_viz.loc[mask, 'rank_diff'], 
                    bins=30, alpha=0.6, label=cls.capitalize(),
                    color=color_map[cls], edgecolor='black')
    
    ax2.axvline(x=0, color='black', linestyle='--', linewidth=2, alpha=0.5, label='No difference')
    ax2.set_xlabel('Rank Difference (LOF - k-NN)', fontsize=12)
    ax2.set_ylabel('Count', fontsize=12)
    ax2.set_title('Distribution of Rank Differences', fontsize=13, fontweight='bold')
    ax2.legend(loc='best', fontsize=10)
    ax2.grid(True, alpha=0.3, axis='y')
    
    # Plot 3: Spatial distribution colored by rank difference
    ax3 = axes[1, 0]
    
    classified_mask = df_viz['classification'] != 'normal'
    
    # Background
    ax3.scatter(df_viz.loc[~classified_mask, 'attribute_1'],
               df_viz.loc[~classified_mask, 'attribute_2'],
               c='lightgray', s=20, alpha=0.3, label='Normal')
    
    # Classified points
    scatter = ax3.scatter(df_viz.loc[classified_mask, 'attribute_1'],
                         df_viz.loc[classified_mask, 'attribute_2'],
                         c=df_viz.loc[classified_mask, 'rank_diff'],
                         cmap='RdBu_r', s=80, alpha=0.8,
                         edgecolors='black', linewidth=0.5,
                         vmin=-100, vmax=100)
    
    cbar = plt.colorbar(scatter, ax=ax3)
    cbar.set_label('Rank Difference (LOF - k-NN)', fontsize=11)
    
    ax3.set_xlabel('Attribute 1', fontsize=12)
    ax3.set_ylabel('Attribute 2', fontsize=12)
    ax3.set_title('Points colored by Rank Difference\n(Red: LOF>k-NN, Blue: k-NN>LOF)', 
                 fontsize=13, fontweight='bold')
    ax3.grid(True, alpha=0.3)
    
    # Plot 4: Confidence vs rank statistics
    ax4 = axes[1, 1]
    
    for cls in ['global', 'local']:
        mask = df_viz['classification'] == cls
        if mask.any():
            ax4.scatter(df_viz.loc[mask, 'confidence'],
                       df_viz.loc[mask, 'rank_diff'],
                       c=color_map[cls], s=60, alpha=0.6,
                       label=cls.capitalize(), edgecolors='black', linewidth=0.5)
    
    ax4.axhline(y=0, color='black', linestyle='--', linewidth=1, alpha=0.5)
    ax4.set_xlabel('Classification Confidence', fontsize=12)
    ax4.set_ylabel('Rank Difference (LOF - k-NN)', fontsize=12)
    ax4.set_title('Confidence vs Rank Difference', fontsize=13, fontweight='bold')
    ax4.legend(loc='best', fontsize=10)
    ax4.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Print statistics
    print("="*70)
    print("RANK DIFFERENCE ANALYSIS")
    print("="*70)
    
    for cls in ['global', 'local']:
        mask = df_viz['classification'] == cls
        if mask.any():
            print(f"\n{cls.upper()} outliers:")
            print(f"  Mean rank difference: {df_viz.loc[mask, 'rank_diff'].mean():7.1f}")
            print(f"  Median rank diff:     {df_viz.loc[mask, 'rank_diff'].median():7.1f}")
            print(f"  Std rank diff:        {df_viz.loc[mask, 'rank_diff'].std():7.1f}")
            
            # Count how many have LOF rank better than kNN rank
            better_lof = (df_viz.loc[mask, 'rank_diff'] < 0).sum()
            better_knn = (df_viz.loc[mask, 'rank_diff'] > 0).sum()
            print(f"  LOF rank < k-NN rank: {better_lof}")
            print(f"  LOF rank > k-NN rank: {better_knn}")


# Run the analysis
visualize_rank_differences(df_classified)