<a href="https://colab.research.google.com/github/TadGreen/datamining/blob/main/Lab_DBSCAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Generate a 2D dataset with non-convex clusters


In [None]:
# Sample 300 points from two interleaving half circles (a non-convex shape).
# A fixed random_state keeps the sample identical on every re-run.
X, y = make_moons(n_samples=300, noise=0.05, random_state=42)

# Standardize the features so that a single eps radius is applied on
# comparable scales in both dimensions.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Quick look at the standardized points before any clustering is applied.
fig, ax = plt.subplots()
ax.scatter(*X_scaled.T)
ax.set_title("Dataset for DBSCAN")
plt.show()

# Applying DBSCAN

In [None]:
# Cluster with a small neighbourhood radius. fit_predict returns one label
# per sample; DBSCAN uses the label -1 for samples it treats as noise.
dbscan_0_1 = DBSCAN(eps=0.1, min_samples=5)
labels_0_1 = dbscan_0_1.fit_predict(X_scaled)

# Boolean mask of the noise samples, shared by the count and the overlay.
is_noise_0_1 = labels_0_1 == -1
noise_points_0_1 = np.sum(is_noise_0_1)

fig, (ax_clusters, ax_noise) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: every sample coloured by its cluster label.
ax_clusters.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels_0_1, cmap='plasma')
ax_clusters.set_title("DBSCAN Clustering with eps=0.1")

# Right panel: same colouring, with the noise samples overdrawn as red crosses.
ax_noise.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels_0_1, cmap='plasma')
ax_noise.scatter(X_scaled[is_noise_0_1, 0], X_scaled[is_noise_0_1, 1], c='red', marker='x', s=100, label='Noise')
ax_noise.set_title(f"Noise points with eps=0.1: {noise_points_0_1}")
ax_noise.legend()
plt.show()

# Can you change the `eps` and `min_samples` values to observe how things change? For instance:

* Try eps=0.1, eps=0.5 and observe the difference.
* What happens when you change min_samples to 3 or 10?
* Can you also identify the noise points for each trial? (Hint: use `np.sum(labels == -1)` after you fit your clusters.)

In [None]:
# Same experiment with a larger neighbourhood radius (eps=0.5) and the same
# min_samples, to compare against the eps=0.1 run.
dbscan_0_5 = DBSCAN(eps=0.5, min_samples=5)
labels_0_5 = dbscan_0_5.fit_predict(X_scaled)

# DBSCAN labels noise samples -1; build the mask once and reuse it below.
is_noise_0_5 = labels_0_5 == -1
noise_points_0_5 = np.sum(is_noise_0_5)

fig, (ax_clusters, ax_noise) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: every sample coloured by its cluster label.
ax_clusters.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels_0_5, cmap='plasma')
ax_clusters.set_title("DBSCAN Clustering with eps=0.5")

# Right panel: same colouring, with the noise samples overdrawn as red crosses.
ax_noise.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels_0_5, cmap='plasma')
ax_noise.scatter(X_scaled[is_noise_0_5, 0], X_scaled[is_noise_0_5, 1], c='red', marker='x', s=100, label='Noise')
ax_noise.set_title(f"Noise points with eps=0.5: {noise_points_0_5}")
ax_noise.legend()
plt.show()

In [None]:
# Keep eps at 0.5 but lower min_samples to 3, to see how the density
# threshold alone changes the clustering.
dbscan_min_samples_3 = DBSCAN(eps=0.5, min_samples=3)
labels_min_samples_3 = dbscan_min_samples_3.fit_predict(X_scaled)

# DBSCAN labels noise samples -1; build the mask once and reuse it below.
is_noise_min_samples_3 = labels_min_samples_3 == -1
noise_points_min_samples_3 = np.sum(is_noise_min_samples_3)

fig, (ax_clusters, ax_noise) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: every sample coloured by its cluster label.
ax_clusters.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels_min_samples_3, cmap='plasma')
ax_clusters.set_title("DBSCAN Clustering with min_samples=3 (eps=0.5)")

# Right panel: same colouring, with the noise samples overdrawn as red crosses.
ax_noise.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels_min_samples_3, cmap='plasma')
ax_noise.scatter(X_scaled[is_noise_min_samples_3, 0], X_scaled[is_noise_min_samples_3, 1], c='red', marker='x', s=100, label='Noise')
ax_noise.set_title(f"Noise points with min_samples=3: {noise_points_min_samples_3}")
ax_noise.legend()
plt.show()

# Use a real dataset

* Try `load_iris()` from `sklearn.datasets`.
* Apply DBSCAN.

In [None]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import numpy as np

# Load the Iris measurements. The species labels (iris.target) are
# deliberately ignored: DBSCAN is unsupervised and only sees the features.
iris = load_iris()
X_iris = iris.data

# Standardize the features so that one eps radius is applied on comparable
# scales across every feature.
scaler_iris = StandardScaler()
X_iris_scaled = scaler_iris.fit_transform(X_iris)

# Apply DBSCAN. eps and min_samples are starting values chosen for a
# readable first plot; they may need tuning for better results.
dbscan_iris = DBSCAN(eps=0.7, min_samples=5)
labels_iris = dbscan_iris.fit_predict(X_iris_scaled)

# DBSCAN labels noise samples -1. Build the mask once and reuse it for the
# counts and for the plot overlay.
noise_mask_iris = labels_iris == -1
noise_points_iris = np.sum(noise_mask_iris)
# Number of distinct labels once the noise label has been excluded.
n_clusters_iris = np.unique(labels_iris[~noise_mask_iris]).size

# The plotted values are standardized, so the "(cm)" unit carried by the
# original feature names no longer applies. Strip it and say what the axes
# actually show.
x_label_iris = iris.feature_names[0].replace(" (cm)", "") + " (standardized)"
y_label_iris = iris.feature_names[1].replace(" (cm)", "") + " (standardized)"

# Visualize the clusters using only the first two features for simplicity;
# the clustering itself was computed on all features.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_iris_scaled[:, 0], X_iris_scaled[:, 1], c=labels_iris, cmap='viridis')
# Overdraw the noise samples as red crosses so they stand out.
ax.scatter(X_iris_scaled[noise_mask_iris, 0], X_iris_scaled[noise_mask_iris, 1],
           c='red', marker='x', s=100, label='Noise')
ax.set_title("DBSCAN Clustering on Iris Dataset (first two features)")
ax.set_xlabel(x_label_iris)
ax.set_ylabel(y_label_iris)
ax.legend()
plt.show()

print(f"Number of clusters found: {n_clusters_iris}")
print(f"Number of noise points: {noise_points_iris}")