In [1]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


In [5]:
# Step 2: Load dataset
# The CSV is expected next to the notebook; when running on Google Colab,
# upload it first or point DATA_PATH at its location on Google Drive.
DATA_PATH = 'synthetic_anomaly_detection.csv'
data = pd.read_csv(DATA_PATH)

# Fail fast if the file was not split into the expected columns (for example
# when each row is quoted and the whole header is parsed as a single column);
# later cells select these feature columns by name.
required_columns = ['feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise ValueError(
        f"{DATA_PATH!r} is missing expected columns {missing_columns}; "
        f"parsed columns were {list(data.columns)}. Check the file's delimiter and quoting."
    )

In [3]:
# Step 3: Quick data inspection
# Show a preview of the leading rows, then pandas' summary of the frame.
preview_rows = data.head()
summary_stats = data.describe()
print("First few rows of the dataset:", preview_rows, sep="\n")
print("\nDataset description:", summary_stats, sep="\n")

First few rows of the dataset:
  feature_1,feature_2,feature_3,feature_4,feature_5,label
0  1.764052345967664,0.4001572083672233,0.9787379...     
1  -0.977277879876411,0.9500884175255894,-0.15135...     
2  0.144043571160878,1.454273506962975,0.76103772...     
3  0.33367432737426683,1.4940790731576061,-0.2051...     
4  -2.5529898158340787,0.6536185954403606,0.86443...     

Dataset description:
       feature_1,feature_2,feature_3,feature_4,feature_5,label
count                                                 500     
unique                                                500     
top     1.764052345967664,0.4001572083672233,0.9787379...     
freq                                                    1     


In [6]:
# Step 4: Preprocessing
# Select the five feature columns (the label column is not included) and
# fit/transform them with a StandardScaler in a single pass.
feature_columns = [
    'feature_1',
    'feature_2',
    'feature_3',
    'feature_4',
    'feature_5',
]
features = data.loc[:, feature_columns]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [7]:
# Step 5: Apply K-Means clustering with n_clusters=3 and random_state=42
# Fit the model on the scaled features, then store each row's assigned
# cluster index (the fitted labels_ are what fit_predict returns).
kmeans = KMeans(random_state=42, n_clusters=3)
kmeans.fit(scaled_features)
data['cluster'] = kmeans.labels_


In [8]:
# Step 6: Calculate distance to the nearest cluster center for each point
# Look up the center of the cluster each row was assigned to, then take the
# row-wise Euclidean norm of the offset in scaled feature space.
assigned_centers = kmeans.cluster_centers_[data['cluster'].to_numpy()]
offsets = scaled_features - assigned_centers
data['distance_to_center'] = np.linalg.norm(offsets, axis=1)

In [9]:
# Step 7: Define a threshold for anomaly detection (mean + 2*std deviation)
# The cutoff sits two standard deviations above the mean distance.
distances = data['distance_to_center']
mean_distance = distances.mean()
std_distance = distances.std()
threshold = mean_distance + 2 * std_distance

In [10]:
# Mark anomalies: the flag is True where distance_to_center exceeds the threshold
data['anomaly'] = data['distance_to_center'].gt(threshold)

In [13]:
# Step 8: Select the flagged rows and report how many there are
is_anomaly = data['anomaly'].eq(1)
anomalies = data.loc[is_anomaly]
num_anomalies = anomalies.shape[0]
print(f"Number of anomalies detected: {num_anomalies}")

Number of anomalies detected: 18


In [12]:
# Display anomalies
# Print a heading followed by the full table of flagged rows.
print("\nAnomalous data points:", anomalies, sep="\n")


Anomalous data points:
     feature_1  feature_2  feature_3  feature_4  feature_5  label  cluster  \
0     1.764052   0.400157   0.978738   2.240893   1.867558    0.0        2   
4    -2.552990   0.653619   0.864436  -0.742165   2.269755    0.0        0   
93    2.412454  -0.960504  -0.793117  -2.288620   0.251484    0.0        0   
98   -1.029935  -0.349943   1.100284   1.298022   2.696224    0.0        0   
117  -0.470638  -0.216950   0.445393  -0.392389  -3.046143    0.0        0   
137  -2.834555   2.116791  -1.610878  -0.035768   2.380745    0.0        2   
179   1.997956  -0.856549  -1.541587   2.594425  -0.404032    0.0        2   
210  -0.719941  -0.893574  -0.156024   1.049093   3.170975    0.0        2   
270  -0.919651   2.642936   0.540123   2.290467   1.600268    0.0        2   
283  -1.744188   1.660608  -1.416603  -2.802203  -1.188424    0.0        2   
289   0.715939  -2.994613   0.880938   1.808132   0.436638    0.0        0   
346  -0.971171   1.426317   2.488442   1