In [9]:
import os
os.environ["OMP_NUM_THREADS"]="3"
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix


# Read the CSV and discard every row that contains a missing value
data = pd.read_csv("wisc_bc_ContinuousVar.csv").dropna()

# Clustering inputs are all columns except the "diagnosis" label.
# NOTE(review): assumes every remaining column is a numeric measurement
# (e.g. no id column) -- confirm against the CSV.
features = data.drop(columns="diagnosis")

# Standardize the feature columns with StandardScaler's default settings
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)


def _tabulate_against_diagnosis(diagnosis, labels):
    """Pair each row's diagnosis with its cluster label and count the pairs.

    Returns a tuple ``(paired, counts)``: ``paired`` is a two-column frame
    ("diagnosis", "cluster") and ``counts`` is the diagnosis-by-cluster
    cross-tabulation of that frame.
    """
    paired = diagnosis.to_frame().assign(cluster=labels)
    counts = pd.crosstab(index=paired["diagnosis"], columns=paired["cluster"])
    return paired, counts


# 9.1 Hierarchical Clustering
# Complete-linkage tree over the scaled rows, cut into two clusters.
# cut_tree yields one column per requested cut, so collapse it to 1-D.
hclust_result = linkage(scaled_features, method='complete')
cut_tree_result = cut_tree(hclust_result, n_clusters=2).ravel()

# Cross-tabulate the hierarchical cluster labels against "diagnosis"
hclust_clusters, confusion_matrix_hclust = _tabulate_against_diagnosis(
    data["diagnosis"], cut_tree_result
)
print("Hierarchical Clustering:", confusion_matrix_hclust, sep="\n")

# 9.2 K-means Clustering
# Two clusters; the seed is fixed so repeated runs give the same labels.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans_result = kmeans.fit_predict(scaled_features)

# Cross-tabulate the k-means cluster labels against "diagnosis"
kmeans_clusters, confusion_matrix_kmeans = _tabulate_against_diagnosis(
    data["diagnosis"], kmeans_result
)
print("\nK-means Clustering:", confusion_matrix_kmeans, sep="\n")


Hierarchical Clustering:
cluster      0  1
diagnosis        
B          357  0
M          210  2





K-means Clustering:
cluster      0    1
diagnosis          
B          343   14
M           37  175
