In [9]:
import os
import numpy as np
import polars as pl

In [10]:
# Relative path of the folder holding the preprocessed tabular splits.
data_dir = os.path.join("data", "tabular")
# One CSV per split, listed in the order they are later read: train, validation, test.
files = [f"{split}_processed.csv" for split in ("train", "validation", "test")]

In [11]:
# Resolve every split file against the data directory, read each CSV into its
# own frame, then concatenate the frames (in `files` order) into one table.
csv_paths = [os.path.join(data_dir, name) for name in files]
dfs = [pl.read_csv(path) for path in csv_paths]
full_df = pl.concat(dfs)

In [12]:
# Number of standard deviations above a class's mean centroid distance
# beyond which a sample is flagged as an outlier.
OUTLIER_STD_MULT = 2

# Split the combined frame into features (every column but the last) and the
# last column, which is treated as the class label below.
data = full_df.to_numpy()
X = data[:, :-1].astype(float)
Y = data[:, -1]

# Z-score each feature column. Columns with zero spread get a divisor of 1 so
# they map to 0 instead of dividing by zero.
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
std[std == 0] = 1.0
X_norm = (X - mean) / std

classes = np.unique(Y)
centroids = {}         # class label -> centroid in standardized feature space
intra_class_dist = {}  # class label -> mean Euclidean distance to its centroid
outliers = []          # (class label, row index into X_norm, distance) tuples

print("--- Intra-Class Distances (Compactness) ---")
for cls in classes:
    # Row positions of this class within the combined data.
    indices = np.where(Y == cls)[0]
    X_cls = X_norm[indices]

    centroid = np.mean(X_cls, axis=0)
    centroids[cls] = centroid

    # Euclidean distance of every class member to the class centroid.
    dists = np.linalg.norm(X_cls - centroid, axis=1)
    avg_dist = np.mean(dists)
    intra_class_dist[cls] = avg_dist

    print(f"Class {cls:<10} | Mean Dist to Centroid: {avg_dist:.4f}")

    # Flag samples whose distance exceeds mean + k * std of this class's
    # distances. `dists` is aligned with `indices`, so one boolean mask selects
    # both the row ids and their distances without re-searching `indices`.
    threshold = avg_dist + OUTLIER_STD_MULT * np.std(dists)
    is_outlier = dists > threshold
    outlier_indices = indices[is_outlier]
    outliers.extend(
        (cls, row, row_dist)
        for row, row_dist in zip(outlier_indices, dists[is_outlier])
    )

print("\n--- Inter-Class Distances (Separability) ---")
# Report the centroid-to-centroid distance once per unordered pair of classes.
for i in range(len(classes)):
    for j in range(i + 1, len(classes)):
        c1 = classes[i]
        c2 = classes[j]
        dist = np.linalg.norm(centroids[c1] - centroids[c2])
        print(f"{c1:<10} vs {c2:<10} | Distance: {dist:.4f}")

print(f"\n--- Outlier Detection (Threshold: Mean + {OUTLIER_STD_MULT}*Std) ---")
print(f"Total Outliers Detected: {len(outliers)}")

# Only preview the first few flagged rows to keep the output short.
for cls, idx, dist in outliers[:5]:
    print(f"Row {idx:<5} (Class: {cls}) is an outlier with distance {dist:.4f}")

--- Intra-Class Distances (Compactness) ---
Class banana     | Mean Dist to Centroid: 9.0917
Class carrot     | Mean Dist to Centroid: 9.4193
Class cucumber   | Mean Dist to Centroid: 10.0131
Class mandarin   | Mean Dist to Centroid: 9.1239
Class tomato     | Mean Dist to Centroid: 10.0760

--- Inter-Class Distances (Separability) ---
banana     vs carrot     | Distance: 4.2851
banana     vs cucumber   | Distance: 5.4098
banana     vs mandarin   | Distance: 3.8911
banana     vs tomato     | Distance: 7.0378
carrot     vs cucumber   | Distance: 3.8810
carrot     vs mandarin   | Distance: 3.6012
carrot     vs tomato     | Distance: 5.0954
cucumber   vs mandarin   | Distance: 5.3242
cucumber   vs tomato     | Distance: 5.5608
mandarin   vs tomato     | Distance: 4.9923

--- Outlier Detection (Threshold: Mean + 2*Std) ---
Total Outliers Detected: 297
Row 2902  (Class: banana) is an outlier with distance 13.5958
Row 2935  (Class: banana) is an outlier with distance 13.3796
Row 2959  (Class: