In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the labelled 2-D benchmark file into a DataFrame.
df = pd.read_csv('../data/dfki-artificial-3000-unsupervised-ad.csv')

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nOutlier label distribution:")
print(df['outlier_label'].value_counts())

# 1 where the label column reads 'outlier', 0 otherwise.
df['is_outlier'] = (df['outlier_label'] == 'outlier').astype(int)

# One scatter layer per class; the layers are drawn in list order, so the
# outlier markers end up on top of the normal points.
class_layers = [
    (0, dict(c='blue', alpha=0.5, s=20, label='Normal')),
    (1, dict(c='red', alpha=0.7, s=30, label='Outlier', marker='x')),
]

fig, ax = plt.subplots(figsize=(10, 8))
for class_flag, marker_style in class_layers:
    class_rows = df['is_outlier'] == class_flag
    ax.scatter(df.loc[class_rows, 'attribute_1'],
               df.loc[class_rows, 'attribute_2'],
               **marker_style)

ax.set_xlabel('Attribute 1', fontsize=12)
ax.set_ylabel('Attribute 2', fontsize=12)
ax.set_title('DFKI Artificial Dataset (3000 samples)', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nSummary statistics:")
print(df[['attribute_1', 'attribute_2']].describe())



In [None]:
from pyod.models.knn import KNN
from sklearn.preprocessing import MinMaxScaler

# Feature matrix and label vector. The labels are not passed to the detector.
X = df[['attribute_1', 'attribute_2']].values
y = df['is_outlier'].values

# Min-max scale the two attributes before computing neighbour distances.
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

k = 20
knn_detector = KNN(n_neighbors=k, contamination=0.01)
knn_detector.fit(X_normalized)
scores = knn_detector.decision_scores_

# Rank 1 is assigned to the largest score, rank n to the smallest.
descending_order = np.argsort(scores)[::-1]
ranks = np.empty_like(scores, dtype=int)
ranks[descending_order] = np.arange(1, len(scores) + 1)

df['knn_score'] = scores
df['knn_rank'] = ranks

# Left panel: raw score. Right panel: rank derived from the score.
panel_specs = (
    (scores, 'viridis', f'Points colored by k-NN Score (k={k})', 'Anomaly Score'),
    (ranks, 'plasma_r', 'Points colored by k-NN Rank', 'Rank (1 = most anomalous)'),
)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for panel_ax, (color_values, cmap_name, panel_title, cbar_text) in zip(axes, panel_specs):
    points = panel_ax.scatter(df['attribute_1'], df['attribute_2'],
                              c=color_values, cmap=cmap_name,
                              s=30, alpha=0.6, edgecolors='black', linewidth=0.5)
    panel_ax.set_xlabel('Attribute 1', fontsize=12)
    panel_ax.set_ylabel('Attribute 2', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    panel_cbar = plt.colorbar(points, ax=panel_ax)
    panel_cbar.set_label(cbar_text, fontsize=11)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()



In [None]:
from pyod.models.lof import LOF
from sklearn.preprocessing import MinMaxScaler

# Feature matrix and label vector. The labels are not passed to the detector.
X = df[['attribute_1', 'attribute_2']].values
y = df['is_outlier'].values

# Min-max scale the two attributes before fitting.
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

k = 20
lof_detector = LOF(n_neighbors=k, contamination=0.01)
lof_detector.fit(X_normalized)
scores = lof_detector.decision_scores_

# Rank 1 is assigned to the largest score, rank n to the smallest.
descending_order = np.argsort(scores)[::-1]
ranks = np.empty_like(scores, dtype=int)
ranks[descending_order] = np.arange(1, len(scores) + 1)

df['lof_score'] = scores
df['lof_rank'] = ranks

# Left panel: raw score. Right panel: rank derived from the score.
panel_specs = (
    (scores, 'viridis', f'Points colored by LOF Score (k={k})', 'Anomaly Score'),
    (ranks, 'plasma_r', 'Points colored by LOF Rank', 'Rank (1 = most anomalous)'),
)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for panel_ax, (color_values, cmap_name, panel_title, cbar_text) in zip(axes, panel_specs):
    points = panel_ax.scatter(df['attribute_1'], df['attribute_2'],
                              c=color_values, cmap=cmap_name,
                              s=30, alpha=0.6, edgecolors='black', linewidth=0.5)
    panel_ax.set_xlabel('Attribute 1', fontsize=12)
    panel_ax.set_ylabel('Attribute 2', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    panel_cbar = plt.colorbar(points, ax=panel_ax)
    panel_cbar.set_label(cbar_text, fontsize=11)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()



In [None]:
# Treat each point's (k-NN score, LOF score) pair as a 2-D vector and describe
# it in polar form: vector length (magnitude) and direction (angle).
knn_scores = df['knn_score'].values
lof_scores = df['lof_score'].values

mag = np.sqrt(knn_scores**2 + lof_scores**2)
# arctan2(y=LOF, x=k-NN): the angle grows as LOF dominates the k-NN score.
angle = np.arctan2(lof_scores, knn_scores)

# Keep both quantities on the frame for later inspection.
df['euc_mag'] = mag
df['euc_angle'] = angle

# Left panel: combined magnitude. Right panel: LOF-vs-k-NN angle.
panel_specs = (
    (mag, 'viridis', 'Outlier magnitude: sqrt(LOF^2 + kNN^2)', 'Magnitude'),
    (angle, 'twilight_shifted', 'Locality vs Globality: angle = arctan2(LOF, kNN)', 'Angle (radians)'),
)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for panel_ax, (color_values, cmap_name, panel_title, cbar_text) in zip(axes, panel_specs):
    points = panel_ax.scatter(df['attribute_1'], df['attribute_2'],
                              c=color_values, cmap=cmap_name,
                              s=30, alpha=0.7, edgecolors='black', linewidth=0.5)
    panel_ax.set_xlabel('Attribute 1', fontsize=12)
    panel_ax.set_ylabel('Attribute 2', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    panel_cbar = plt.colorbar(points, ax=panel_ax)
    panel_cbar.set_label(cbar_text, fontsize=11)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Compare the LOF and k-NN rankings of the most anomalous points.
c = 100  # number of top-ranked points taken from each detector
p = 0    # offset added to both ranks before taking the ratio

top_knn_indices = df.nsmallest(c, 'knn_rank').index.values
top_lof_indices = df.nsmallest(c, 'lof_rank').index.values

# Union of both top-c sets (index labels, duplicates removed).
all_top_indices = np.unique(np.concatenate([top_knn_indices, top_lof_indices]))

# Rank ratio for the selected points, NaN everywhere else. The ratio is
# computed for the whole frame at once and then masked, which stays aligned
# with df's index instead of using index labels as array positions.
is_top = df.index.isin(all_top_indices)
ratio_values = ((df['lof_rank'] + p) / (df['knn_rank'] + p)).where(is_top).to_numpy()

mask_top = ~np.isnan(ratio_values)
mask_other = ~mask_top

ratio_min = ratio_values[mask_top].min()
ratio_max = ratio_values[mask_top].max()
ratio_mean = ratio_values[mask_top].mean()

# Fixed colour limits; ratios outside [vmin, vmax] are drawn in the end colours.
vmin = 0.6
vmax = 1

fig, ax = plt.subplots(figsize=(12, 8))

scatter_other = ax.scatter(df.loc[mask_other, 'attribute_1'],
                           df.loc[mask_other, 'attribute_2'],
                           c='black', s=20, alpha=0.3, label='Other points')

scatter_top = ax.scatter(df.loc[mask_top, 'attribute_1'],
                         df.loc[mask_top, 'attribute_2'],
                         c=ratio_values[mask_top], cmap='RdYlGn',
                         s=50, alpha=0.8, edgecolors='black', linewidth=0.5,
                         vmin=vmin, vmax=vmax, label='Top c points')

ax.set_xlabel('Attribute 1', fontsize=12)
ax.set_ylabel('Attribute 2', fontsize=12)
ax.set_title(f'Global vs Local Outliers (c={c})\nRatio = (LOF rank + {p}) / (k-NN rank + {p})',
             fontsize=13, fontweight='bold')
cbar = plt.colorbar(scatter_top, ax=ax)
cbar.set_label('Ratio (1 = global outlier, 0 = local outlier)', fontsize=11)
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Top {c} points by k-NN rank: {len(top_knn_indices)}")
print(f"Top {c} points by LOF rank: {len(top_lof_indices)}")
print(f"Total unique top points: {len(all_top_indices)}")
print("\nRatio statistics for top points:")
print(f"  Mean: {ratio_mean:.4f}")
print(f"  Min: {ratio_min:.4f}")
print(f"  Max: {ratio_max:.4f}")



In [None]:
# Same rank-ratio comparison, but with an offset of 20 added to both ranks.
c = 100  # number of top-ranked points taken from each detector
p = 20   # offset added to both ranks before taking the ratio

top_knn_indices = df.nsmallest(c, 'knn_rank').index.values
top_lof_indices = df.nsmallest(c, 'lof_rank').index.values

# Union of both top-c sets (index labels, duplicates removed).
all_top_indices = np.unique(np.concatenate([top_knn_indices, top_lof_indices]))

# Rank ratio for the selected points, NaN everywhere else. The ratio is
# computed for the whole frame at once and then masked, which stays aligned
# with df's index instead of using index labels as array positions.
is_top = df.index.isin(all_top_indices)
ratio_values = ((df['lof_rank'] + p) / (df['knn_rank'] + p)).where(is_top).to_numpy()

mask_top = ~np.isnan(ratio_values)
mask_other = ~mask_top

ratio_min = ratio_values[mask_top].min()
ratio_max = ratio_values[mask_top].max()
ratio_mean = ratio_values[mask_top].mean()

# Fixed colour limits; ratios outside [vmin, vmax] are drawn in the end colours.
vmin = 0.6
vmax = 1

fig, ax = plt.subplots(figsize=(12, 8))

scatter_other = ax.scatter(df.loc[mask_other, 'attribute_1'],
                           df.loc[mask_other, 'attribute_2'],
                           c='black', s=20, alpha=0.3, label='Other points')

scatter_top = ax.scatter(df.loc[mask_top, 'attribute_1'],
                         df.loc[mask_top, 'attribute_2'],
                         c=ratio_values[mask_top], cmap='RdYlGn',
                         s=50, alpha=0.8, edgecolors='black', linewidth=0.5,
                         vmin=vmin, vmax=vmax, label='Top c points')

ax.set_xlabel('Attribute 1', fontsize=12)
ax.set_ylabel('Attribute 2', fontsize=12)
ax.set_title(f'Global vs Local Outliers (c={c})\nRatio = (LOF rank + {p}) / (k-NN rank + {p})',
             fontsize=13, fontweight='bold')
cbar = plt.colorbar(scatter_top, ax=ax)
cbar.set_label('Ratio (1 = global outlier, 0 = local outlier)', fontsize=11)
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Top {c} points by k-NN rank: {len(top_knn_indices)}")
print(f"Top {c} points by LOF rank: {len(top_lof_indices)}")
print(f"Total unique top points: {len(all_top_indices)}")
print("\nRatio statistics for top points:")
print(f"  Mean: {ratio_mean:.4f}")
print(f"  Min: {ratio_min:.4f}")
print(f"  Max: {ratio_max:.4f}")



In [None]:
# Mean detector score within each label group (is_outlier = 0 / 1).
by_label = df.groupby('is_outlier')
for heading, score_column in (("k-NN mean score by label:", 'knn_score'),
                              ("LOF mean score by label:", 'lof_score')):
    print(heading)
    print(by_label[score_column].mean())


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np

def analyze_ensemble_globality(df, X_normalized, k_list=None, contamination=0.05):
    """
    Score the top LOF candidates by how far they lie from KMeans centroids.

    For every ``k`` in ``k_list`` a KMeans model (``random_state=42``) is fitted
    on ``X_normalized`` and each sample's distance to its nearest centroid is
    recorded. These distances are averaged over all ``k``. Rows whose
    ``lof_score`` is strictly above the ``1 - contamination`` quantile are the
    candidates; their averaged distances are min-max scaled to [0, 1] and stored
    as the "globality index". A scatter plot of the result is shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'lof_score'``, ``'attribute_1'`` and
        ``'attribute_2'``, with one row per row of ``X_normalized`` in the
        same order.
    X_normalized : array with a ``.shape`` attribute, (n_samples, n_features)
        Feature matrix that KMeans is fitted on.
    k_list : iterable of int, optional
        Cluster counts to average over. Defaults to ``[3, 4, 5, 6, 7]``.
    contamination : float, default 0.05
        Fraction used to place the LOF threshold at the
        ``1 - contamination`` quantile.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place: the column
        ``'ensemble_globality_index'`` is created or overwritten, holding the
        index for candidates and NaN for every other row.

    Raises
    ------
    ValueError
        If ``'lof_score'`` is missing, ``k_list`` is empty, ``df`` and
        ``X_normalized`` have different row counts, or no row exceeds the
        LOF threshold.
    """
    # ==========================================
    # 0. VALIDATE INPUTS (before any expensive work)
    # ==========================================
    if 'lof_score' not in df.columns:
        raise ValueError("DataFrame must contain 'lof_score' column.")

    # A None default avoids sharing one mutable list between calls.
    k_list = [3, 4, 5, 6, 7] if k_list is None else list(k_list)
    if not k_list:
        raise ValueError("k_list must contain at least one cluster count.")

    n_samples = X_normalized.shape[0]
    if len(df) != n_samples:
        raise ValueError(
            f"df has {len(df)} rows but X_normalized has {n_samples}; "
            "they must describe the same samples."
        )

    # ==========================================
    # 1. FILTER CANDIDATES (top fraction by LOF)
    # ==========================================
    threshold = df['lof_score'].quantile(1 - contamination)
    candidate_mask = df['lof_score'] > threshold
    if not candidate_mask.any():
        raise ValueError(
            "No rows have a 'lof_score' above the "
            f"{1 - contamination:g} quantile; nothing to score."
        )

    print(f"--- Running Ensemble Globality Analysis ---")
    print(f"Averaging cluster structures for k = {k_list}")

    # ==========================================
    # 2. ENSEMBLE DISTANCE CALCULATION
    # ==========================================
    cumulative_dists = np.zeros(n_samples)
    for k in k_list:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(X_normalized)
        # Distance from every sample to its nearest centroid for this k.
        cumulative_dists += pairwise_distances(X_normalized, kmeans.cluster_centers_).min(axis=1)

    avg_dists_to_centers = cumulative_dists / len(k_list)

    # ==========================================
    # 3. COMPUTE FINAL INDEX (scale the candidate subset)
    # ==========================================
    # Index the NumPy array with a plain boolean array (positional), not the
    # Series, so the selection never depends on df's index labels.
    candidate_dists = avg_dists_to_centers[candidate_mask.to_numpy()]
    globality_index = MinMaxScaler().fit_transform(candidate_dists.reshape(-1, 1)).ravel()

    col_name = 'ensemble_globality_index'
    df[col_name] = np.nan
    df.loc[candidate_mask, col_name] = globality_index

    # ==========================================
    # 4. VISUALIZATION
    # ==========================================
    # ':g' formatting avoids the truncation of int(contamination * 100),
    # e.g. int(0.29 * 100) == 28.
    pct_label = f"{contamination * 100:g}"

    fig, ax = plt.subplots(figsize=(12, 9))

    # A. Background: every row that is not a candidate.
    ax.scatter(df.loc[~candidate_mask, 'attribute_1'],
               df.loc[~candidate_mask, 'attribute_2'],
               c='gainsboro', s=20, alpha=0.4, label='Normal Data')

    # B. Candidates, coloured by the scaled averaged centroid distance.
    scatter = ax.scatter(df.loc[candidate_mask, 'attribute_1'],
                         df.loc[candidate_mask, 'attribute_2'],
                         c=globality_index,
                         cmap='plasma',
                         s=60, alpha=0.9, edgecolors='black', linewidth=0.5,
                         label=f'Top {pct_label}% Anomalies')

    cbar = plt.colorbar(scatter, ax=ax)
    cbar.set_label(f'Ensemble Globality (Avg Dist to Centers, k={k_list})', fontsize=11)
    cbar.set_ticks([0, 0.5, 1])
    cbar.set_ticklabels(['Local (Sparse Inlier)', 'Mixed', 'Global (Isolated)'])

    ax.set_title(f'Ensemble Globality Map (Averaged over k={k_list})\n'
                 f'Showing top {pct_label}% LOF candidates',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Attribute 1')
    ax.set_ylabel('Attribute 2')
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return df

# ------------------------------------------
# Example run
# ------------------------------------------
# This call passes a single cluster count (k_list=[5]), so the "average" is
# taken over one KMeans fit only. Candidates are the rows above the 95th
# percentile of lof_score (contamination=0.05).
df = analyze_ensemble_globality(df, X_normalized, k_list=[5], contamination=0.05)

# Show the five candidates with the largest globality index.
print("\nTop 5 Global Outliers (Most Isolated across all k):")
cols = ['attribute_1', 'attribute_2', 'ensemble_globality_index']
scored_candidates = df.dropna(subset=['ensemble_globality_index'])
print(scored_candidates.nlargest(5, 'ensemble_globality_index')[cols])

In [None]:
# Globality map averaged over k = 3, 4, 5, 6, 7 with contamination 0.05.
df = analyze_ensemble_globality(df, X_normalized, k_list=list(range(3, 8)), contamination=0.05)

In [None]:
# Globality map averaged over k = 4 .. 10 with contamination 0.05.
df = analyze_ensemble_globality(df, X_normalized, k_list=list(range(4, 11)), contamination=0.05)

In [None]:
# Globality map averaged over k = 3 .. 7 with contamination=1: the LOF
# threshold becomes the 0-quantile (the minimum lof_score), so every row with
# a score strictly above that minimum is treated as a candidate.
df = analyze_ensemble_globality(df, X_normalized, k_list=list(range(3, 8)), contamination=1)

In [None]:
# Experimental rank score for the top-c points of either detector:
#     score = (LOF rank + k-NN rank) / 2 * LOF rank
# i.e. the mean of the two ranks multiplied by the LOF rank.
# NOTE(review): the original expression was written `/ 2* (lof_rank)`, which
# Python evaluates left to right as ((LOF + kNN) / 2) * LOF. If the intent was
# (LOF + kNN) / (2 * LOF), parenthesise the denominator. The numeric behaviour
# is left unchanged here; only the plot text was corrected to describe what is
# actually computed (it previously showed the (LOF + p) / (kNN + p) formula).
c = 100  # number of top-ranked points taken from each detector

top_knn_indices = df.nsmallest(c, 'knn_rank').index.values
top_lof_indices = df.nsmallest(c, 'lof_rank').index.values

# Union of both top-c sets (index labels, duplicates removed).
all_top_indices = np.unique(np.concatenate([top_knn_indices, top_lof_indices]))

# Score for the selected points, NaN everywhere else. Computed for the whole
# frame at once and then masked, which stays aligned with df's index instead
# of using index labels as array positions.
is_top = df.index.isin(all_top_indices)
mean_rank = (df['lof_rank'] + df['knn_rank']) / 2
ratio_values = (mean_rank * df['lof_rank']).where(is_top).to_numpy()

mask_top = ~np.isnan(ratio_values)
mask_other = ~mask_top

ratio_min = ratio_values[mask_top].min()
ratio_max = ratio_values[mask_top].max()
ratio_mean = ratio_values[mask_top].mean()

# Colour limits span the observed range of the score.
vmin = ratio_min
vmax = ratio_max

fig, ax = plt.subplots(figsize=(12, 8))

scatter_other = ax.scatter(df.loc[mask_other, 'attribute_1'],
                           df.loc[mask_other, 'attribute_2'],
                           c='black', s=20, alpha=0.3, label='Other points')

scatter_top = ax.scatter(df.loc[mask_top, 'attribute_1'],
                         df.loc[mask_top, 'attribute_2'],
                         c=ratio_values[mask_top], cmap='RdYlGn',
                         s=50, alpha=0.8, edgecolors='black', linewidth=0.5,
                         vmin=vmin, vmax=vmax, label='Top c points')

ax.set_xlabel('Attribute 1', fontsize=12)
ax.set_ylabel('Attribute 2', fontsize=12)
ax.set_title(f'Global vs Local Outliers (c={c})\nScore = (LOF rank + k-NN rank) / 2 * LOF rank',
             fontsize=13, fontweight='bold')
cbar = plt.colorbar(scatter_top, ax=ax)
cbar.set_label('Score = mean rank * LOF rank', fontsize=11)
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Top {c} points by k-NN rank: {len(top_knn_indices)}")
print(f"Top {c} points by LOF rank: {len(top_lof_indices)}")
print(f"Total unique top points: {len(all_top_indices)}")
print("\nScore statistics for top points:")
print(f"  Mean: {ratio_mean:.4f}")
print(f"  Min: {ratio_min:.4f}")
print(f"  Max: {ratio_max:.4f}")

