In [2]:
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
import pandas as pd


data = pd.read_csv('data.csv')
print(data.shape)

(1199, 10)


In [8]:
from sklearn.metrics.pairwise import euclidean_distances
# Compute pairwise Euclidean distances
print("Pairwise-euclidean distances")
distances = euclidean_distances(data)
print(distances)

# Create similarity matrix
print("")
print("The similarity matrix is as follows:")
similarity_matrix = np.exp(-distances ** 2 / (2 * np.median(distances) ** 2))
print(similarity_matrix)

Pairwise-euclidean distances
[[0.         2.55321047 1.28491948 ... 2.35400784 2.95822089 3.02254211]
 [2.55321047 0.         2.45589395 ... 0.48878187 0.91783996 0.81913286]
 [1.28491948 2.45589395 0.         ... 2.31467907 2.8495437  2.97139086]
 ...
 [2.35400784 0.48878187 2.31467907 ... 0.         1.07927509 0.94287448]
 [2.95822089 0.91783996 2.8495437  ... 1.07927509 0.         0.84881266]
 [3.02254211 0.81913286 2.97139086 ... 0.94287448 0.84881266 0.        ]]

The similarity matrix is as follows:
[[1.         0.26517316 0.7144948  ... 0.32357388 0.16831986 0.15563845]
 [0.26517316 1.         0.29284434 ... 0.95251799 0.84237056 0.87229782]
 [0.7144948  0.29284434 1.         ... 0.33590044 0.19140373 0.16566454]
 ...
 [0.32357388 0.95251799 0.33590044 ... 1.         0.78884681 0.83441843]
 [0.16831986 0.84237056 0.19140373 ... 0.78884681 1.         0.86354919]
 [0.15563845 0.87229782 0.16566454 ... 0.83441843 0.86354919 1.        ]]


In [13]:
# Compute degree matrix
print("Printing the degree matrix ")
degree_matrix = np.diag(np.sum(similarity_matrix, axis=1))
print(degree_matrix)
print("")

# Compute Laplacian matrix
print("Printing the laplacian matrix ")
laplacian_matrix = degree_matrix - similarity_matrix
print(laplacian_matrix)
print("laplacian matrix printed")

Printing the degree matrix 
[[512.97743832   0.           0.         ...   0.           0.
    0.        ]
 [  0.         778.27691037   0.         ...   0.           0.
    0.        ]
 [  0.           0.         505.98795913 ...   0.           0.
    0.        ]
 ...
 [  0.           0.           0.         ... 794.95832305   0.
    0.        ]
 [  0.           0.           0.         ...   0.         764.07285623
    0.        ]
 [  0.           0.           0.         ...   0.           0.
  701.58724021]]

Printing the laplacian matrix 
[[ 5.11977438e+02 -2.65173164e-01 -7.14494795e-01 ... -3.23573877e-01
  -1.68319862e-01 -1.55638452e-01]
 [-2.65173164e-01  7.77276910e+02 -2.92844340e-01 ... -9.52517992e-01
  -8.42370563e-01 -8.72297822e-01]
 [-7.14494795e-01 -2.92844340e-01  5.04987959e+02 ... -3.35900435e-01
  -1.91403732e-01 -1.65664538e-01]
 ...
 [-3.23573877e-01 -9.52517992e-01 -3.35900435e-01 ...  7.93958323e+02
  -7.88846814e-01 -8.34418430e-01]
 [-1.68319862e-01 -8.423705

In [10]:
# Compute eigenvectors and eigenvalues
eigenvalues, eigenvectors = np.linalg.eig(laplacian_matrix)
print("Computing Eigen values and eigen vectors from the laplacian matrix")
# Sort eigenvectors and eigenvalues
sorted_indices = np.argsort(eigenvalues)
print("Eigen values are")
eigenvalues = eigenvalues[sorted_indices]
print(eigenvalues)
print("")
print("Eigen vectors are:")
eigenvectors = eigenvectors[:, sorted_indices]
print(eigenvectors)



Computing Eigen values and eigen vectors from the laplacian matrix
Eigen values are
[-1.17662791e-12 -1.11642603e-14  3.31755020e-13 ...  8.63576961e+02
  8.65206129e+02  8.67143491e+02]

Eigen vectors are:
[[-1.03601430e-16 -2.89036657e-02 -2.29234990e-16 ... -1.96365632e-05
  -8.94201499e-05 -1.07590559e-04]
 [-1.80615588e-17 -2.89036657e-02  5.64991844e-17 ...  5.89178888e-05
  -7.88403573e-04 -6.89402720e-04]
 [-4.63299564e-17 -2.89036657e-02 -2.77341555e-17 ...  3.21093927e-05
  -1.51818831e-04 -9.05045813e-05]
 ...
 [-2.01347001e-17 -2.89036657e-02  1.75432666e-17 ... -1.27789039e-04
  -8.29770436e-04 -9.79182672e-04]
 [-2.07567808e-17 -2.89036657e-02  3.00545836e-17 ...  1.33184266e-04
  -6.66267707e-04 -5.88058743e-04]
 [-3.26162446e-17 -2.89036657e-02  1.99071124e-18 ...  3.15098516e-06
  -1.29858601e-04 -1.51623711e-04]]


In [6]:
# Determine number of clusters
k = 3  # Number of clusters
from sklearn.cluster import KMeans
# Cluster data points using K-means
kmeans = KMeans(n_clusters=k, random_state=42)
projection = eigenvectors[:, 1:k+1]
cluster_assignments = kmeans.fit_predict(projection)

# Identify outliers
outliers = []
for cluster_idx in range(k):
    cluster_data = data[cluster_assignments == cluster_idx]
    cluster_center = np.mean(cluster_data, axis=0)
    cluster_distances = np.linalg.norm(cluster_data - cluster_center, axis=1)
    threshold = np.percentile(cluster_distances, 98)  # Adjust percentile as 98 % 
    cluster_outliers = np.where(cluster_distances > threshold)[0]
    outliers.extend(cluster_outliers)

outliers = np.array(outliers)
print(f"total number of outliers =",outliers.size)
print("Outliers:", outliers)




total number of outliers = 24
Outliers: [  25   44  113  158  201  224  248  277  340  435  494  503  525  609
  623  671  696  898 1003 1026 1042 1109 1119 1181]
