In [None]:
import os
# Limit OpenMP to 3 threads. This cell runs before the numeric libraries are
# imported in the next cell, so the setting is in place when they load.
# NOTE(review): presumably added to silence scikit-learn's KMeans threading
# warning -- confirm the value 3 is still what this machine needs.
os.environ["OMP_NUM_THREADS"] = "3"

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, homogeneity_score
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import homogeneity_score, completeness_score, adjusted_rand_score

In [None]:
# Read the dataset and discard the two columns that are not features: the
# record identifier and the "Unnamed: 32" column.
df = pd.read_csv("Cancer_Data.csv").drop(columns=["id", "Unnamed: 32"])

# Turn the diagnosis column into integer codes; these serve as the ground
# truth when the clusterings are evaluated further down.
label_encoder = LabelEncoder().fit(df["diagnosis"])
diagnosis_labels = label_encoder.transform(df["diagnosis"])

In [None]:
# Feature matrix: every remaining column except the target.
X = df.drop("diagnosis", axis=1)

# Standardise each feature (zero mean, unit variance) so the distance-based
# algorithms below are not dominated by large-valued columns.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Fit KMeans for k = 2..10 and record, for each k, the inertia (for the elbow
# plot) and the silhouette score of the resulting labelling.
k_range = range(2, 11)
inertias, silhouette_scores = [], []

for k in k_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

In [None]:
# Two side-by-side diagnostics for choosing k: inertia (elbow) on the left,
# silhouette score on the right.
fig, (ax_inertia, ax_silhouette) = plt.subplots(1, 2, figsize=(14, 5))

ax_inertia.plot(k_range, inertias, marker='o')
ax_inertia.set_title("Elbow Method (Inertia)")
ax_inertia.set_xlabel("k (Number of Clusters)")
ax_inertia.set_ylabel("Inertia")

ax_silhouette.plot(k_range, silhouette_scores, marker='o', color='green')
ax_silhouette.set_title("Silhouette Scores")
ax_silhouette.set_xlabel("k (Number of Clusters)")
ax_silhouette.set_ylabel("Silhouette Score")

fig.tight_layout()
plt.show()

In [None]:
# Final model with two clusters.
# NOTE(review): seed and n_init here (11, 5) differ from the values used in
# the k sweep (42, 10) -- confirm this is intentional.
kmeans_final = KMeans(n_clusters=2, n_init=5, random_state=11).fit(X_scaled)
kmeans_labels = kmeans_final.labels_

In [None]:
# Score the k=2 KMeans labelling against the diagnosis labels with the same
# three external metrics reported for DBSCAN, so the two can be compared.
# Homogeneity alone says nothing about whether a class was split across
# clusters; completeness and ARI cover that.
kmeans_homogeneity = homogeneity_score(diagnosis_labels, kmeans_labels)
kmeans_completeness = completeness_score(diagnosis_labels, kmeans_labels)
kmeans_ari = adjusted_rand_score(diagnosis_labels, kmeans_labels)

print(f"KMeans Homogeneity Score (k=2): {kmeans_homogeneity:.3f}")
print(f"KMeans Completeness Score (k=2): {kmeans_completeness:.3f}")
print(f"KMeans Adjusted Rand Index (ARI) (k=2): {kmeans_ari:.3f}")

In [None]:
neighbors = NearestNeighbors(n_neighbors=10)
neighbors_fit = neighbors.fit(X_scaled)
distances, indices = neighbors_fit.kneighbors(X_scaled)
k_distances = np.sort(distances[:, 4])

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(k_distances)
plt.title("k-Distance Graph (k=5) for DBSCAN")
plt.xlabel("Data Points sorted by distance")
plt.ylabel("5th Nearest Neighbor Distance")
plt.grid(True)
plt.show()

In [None]:
# Density-based clustering on the standardised features.
dbscan = DBSCAN(eps=2.28, min_samples=10)
dbscan_labels = dbscan.fit(X_scaled).labels_

# DBSCAN marks noise with the label -1; count it separately and exclude it
# from the number of clusters.
noise_mask = dbscan_labels == -1
n_noise = int(noise_mask.sum())
n_clusters = len(np.unique(dbscan_labels[~noise_mask]))

print("Clusters found:", n_clusters)
print("Noise points:", n_noise)

In [None]:
# Evaluate DBSCAN against the diagnosis labels on clustered points only:
# noise points (label -1) are dropped instead of being scored as a cluster.
filtered_indices = dbscan_labels != -1
n_evaluated = int(filtered_indices.sum())

# Report coverage so the scores below are not mistaken for whole-dataset
# results (unlike KMeans, DBSCAN may leave many points unassigned).
print(f"DBSCAN points evaluated (non-noise): {n_evaluated} of {len(dbscan_labels)}")

if n_evaluated == 0:
    # Every point was labelled noise: there is nothing to score, and metrics
    # computed over an empty selection would be meaningless.
    dbscan_homogeneity = dbscan_completeness = dbscan_ari = np.nan
else:
    true_clustered = diagnosis_labels[filtered_indices]
    pred_clustered = dbscan_labels[filtered_indices]
    dbscan_homogeneity = homogeneity_score(true_clustered, pred_clustered)
    dbscan_completeness = completeness_score(true_clustered, pred_clustered)
    dbscan_ari = adjusted_rand_score(true_clustered, pred_clustered)

print(f"DBSCAN Homogeneity Score: {dbscan_homogeneity:.3f}")
print(f"DBSCAN Completeness Score: {dbscan_completeness:.3f}")
print(f"DBSCAN Adjusted Rand Index (ARI): {dbscan_ari:.3f}")