## Starter file for assignment on Clustering
### Author: Tri Lam
### UH ID: 1916079

In [8]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from collections import Counter
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split 


### Function for MinMax feature normalization
The input `x` is the raw data in a 2-D array of the shape `(number of data points, number of features)`.

The output `x_norm` is the normalized data of the input `x` with the same shape as the input.

This function will be used for normalizing data before using DBSCAN for clustering.


In [9]:
def feature_norm(x):
    """Rescale each feature (column) of ``x`` with MinMax normalization.

    ``x`` is a 2-D array of the shape (number of data points, number of features).
    Every column is shifted by its minimum and divided by its range, so values
    land in roughly [0, 1]. Machine epsilon is added to the range so that a
    constant column (range 0) yields zeros instead of a division by zero.

    Returns an array with the same shape as ``x``.
    """
    col_min = x.min(0)
    col_range = x.max(0) - col_min
    tiny = np.finfo(float).eps

    # expand_dims turns the per-column vectors into a single row so they
    # broadcast across all data points.
    shifted = x - np.expand_dims(col_min, axis=0)
    return shifted / (np.expand_dims(col_range, axis=0) + tiny)


### Import the dataset


In [10]:
# Load the clinical records CSV into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
data_path = 'clinical_records_dataset.csv'  # Make sure this path is correct
data = pd.read_csv(data_path)

# Display the first few rows to verify loading (head() returns the first 5 rows by default)
print(data.head())

    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      7861         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           129    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00               2.7           116    0   

   smoking  time  DEATH_EVENT  
0        0     4            1  
1        0     6            1  
2       

### Task 1: Function for computing purity
This function computes the purity of a clustering result.

The indices of the clusters in `y_true` and `y_pred` start from 0 in `compute_purity`, e.g., [1, 1, 0, 0, 2, 2, 2].

`y_true` is the array of true class indices of all data points, `len(y_true)=number of data points`.

`y_pred` is the array of cluster indices of all data points, `len(y_pred)=number of data points`.

In [11]:
def compute_purity(y_true, y_pred):
    """Compute the purity of a clustering against ground-truth classes.

    For every distinct label in ``y_pred``, the size of the most frequent true
    class among that cluster's members is taken; purity is the sum of those
    sizes divided by the total number of data points.

    Every distinct value in ``y_pred`` is treated as its own cluster, so a
    noise label such as DBSCAN's -1 is scored like any other cluster.

    Parameters
    ----------
    y_true : array-like, len(y_true) = number of data points
        True class index of each data point.
    y_pred : array-like, len(y_pred) = number of data points
        Cluster index assigned to each data point.

    Returns
    -------
    float
        Purity in the interval (0, 1].

    Raises
    ------
    ValueError
        If the two inputs differ in length or are empty.
    """
    # Work on positional NumPy arrays: plain lists become indexable with a
    # boolean mask, and a pandas Series with a non-default index is not
    # label-indexed by accident.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    total_points = y_true.shape[0]
    if total_points != y_pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {total_points} and {y_pred.shape[0]}"
        )
    if total_points == 0:
        raise ValueError("purity is undefined for empty input")

    majority_class_count = 0
    for cluster in np.unique(y_pred):
        # Count how often each true class occurs inside this cluster.
        _, class_counts = np.unique(y_true[y_pred == cluster], return_counts=True)
        majority_class_count += int(class_counts.max())

    # int / int keeps the result a built-in float, as before.
    return majority_class_count / total_points


Test for task 1

In [17]:
# Small hand-checkable example: 16 points, 3 true classes, 3 clusters.
# Note: these toy arrays reuse the names y_true / y_pred, which the Task 2 cell
# reassigns from the dataset.
y_pred = np.array([2, 2, 1, 2, 2, 2, 0, 0, 0, 1, 2, 1, 1, 1, 1, 1])
y_true = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])

purity = compute_purity(y_true, y_pred)
print("Purity:", purity)

# Majority-class sizes are 3 (cluster 0), 5 (cluster 1) and 4 (cluster 2),
# i.e. 12 of the 16 points, so the expected purity is 12 / 16 = 0.75.
assert np.isclose(purity, 12 / 16), f"expected purity 0.75, got {purity}"


Purity: 0.75


### Task 2

Run K-means on the dataset with k=2. Use the default parameters in the function provided by scikit-learn for the algorithm. What percentage of the data points were assigned to each of the two clusters? Compute the purity of the clustering result. Then compute the purity of the clustering result for each of the two clusters. Which cluster has the highest purity?

In [12]:
# Extract features and target, then normalize the features
X = data.drop(columns=['time', 'DEATH_EVENT']).values  # Use the exact column names
y_true = data['DEATH_EVENT'].values  # Extract the target column

X_normalized = feature_norm(X)

# Run K-Means clustering with k=2; the seed is fixed so reruns give the same split
kmeans = KMeans(n_clusters=2, random_state=42)
y_pred = kmeans.fit_predict(X_normalized)

# Calculate overall purity for the K-Means clustering
overall_purity = compute_purity(y_true, y_pred)
print("Overall Purity of K-Means clustering (k=2):", overall_purity)

# Calculate the percentage of data points in each cluster.
# .tolist() converts the labels to built-in ints so the dictionaries below print
# as {0: ..., 1: ...} rather than with numpy scalar reprs; sorted() makes the
# key order independent of which label happens to appear first.
cluster_counts = Counter(y_pred.tolist())
total_points = len(y_pred)
cluster_percentages = {
    cluster: cluster_counts[cluster] / total_points * 100
    for cluster in sorted(cluster_counts)
}
print("Percentage of data points in each cluster:", cluster_percentages)

# Calculate purity for each individual cluster by applying compute_purity to
# that cluster's members only (a single cluster's purity is its majority-class
# share), instead of duplicating the counting logic here.
cluster_purities = {}
for cluster in sorted(cluster_counts):
    in_cluster = y_pred == cluster
    cluster_purities[cluster] = compute_purity(y_true[in_cluster], y_pred[in_cluster])

print("Purity for each cluster:", cluster_purities)

# Identify which cluster has the highest purity
highest_purity_cluster = max(cluster_purities, key=cluster_purities.get)
print(f"Cluster {highest_purity_cluster} has the highest purity with a score of {cluster_purities[highest_purity_cluster]}")


Overall Purity of K-Means clustering (k=2): 0.6789297658862876
Percentage of data points in each cluster: {np.int32(0): 51.50501672240802, np.int32(1): 48.49498327759198}
Purity for each cluster: {np.int32(0): 0.6558441558441559, np.int32(1): 0.7034482758620689}
Cluster 1 has the highest purity with a score of 0.7034482758620689


### Task 3

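Run K-means on the dataset with k=2, 10, 30, 50, 100. For each value of k, the cell below repeats the clustering with 10 different random seeds and reports the average purity and the average silhouette coefficient.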

In [13]:
k_values = [2, 10, 30, 50, 100]
results = []

# For every k, average both metrics over 10 differently seeded K-Means runs
for k in k_values:
    purities, silhouettes = [], []

    for run in range(10):
        # The run index doubles as the seed, so each repetition is reproducible
        kmeans = KMeans(n_clusters=k, random_state=run)
        y_pred = kmeans.fit_predict(X_normalized)

        purities.append(compute_purity(y_true, y_pred))
        silhouettes.append(silhouette_score(X_normalized, y_pred, metric='euclidean'))

    results.append({
        'k': k,
        'avg_purity': np.mean(purities),
        'avg_silhouette': np.mean(silhouettes),
    })

results_df = pd.DataFrame(results)
print(results_df)

# Row labels of the best average score for each metric
top_purity_row = results_df['avg_purity'].idxmax()
top_silhouette_row = results_df['avg_silhouette'].idxmax()

best_purity_k = results_df.loc[top_purity_row, 'k']
best_silhouette_k = results_df.loc[top_silhouette_row, 'k']

print(f"Best k for purity: {best_purity_k}")
print(f"Best k for silhouette coefficient: {best_silhouette_k}")

     k  avg_purity  avg_silhouette
0    2    0.678930        0.197685
1   10    0.678930        0.311660
2   30    0.703679        0.472695
3   50    0.733445        0.364854
4  100    0.796321        0.232349
Best k for purity: 100
Best k for silhouette coefficient: 30


### Task 4

First, apply MinMax normalization we have used in our first EDA assignment on this dataset (a normalization function "feature_norm" has been included in the shell python notebook).

Then run DBSCAN on the normalized data with eps=0.3, 0.5, 0.7, and fix minPts=5, metric=Euclidean distance, and other default parameter values. Count the total number of clusters and the total number of anomalies generated by DBSCAN, calculate the purity of the clustering, and generate a table, as given below. Which value of eps gives the best clustering result in terms of purity?



In [15]:
eps_values = [0.3, 0.5, 0.7]  # Neighbourhood radii to compare
task_4_results = []

for eps in eps_values:
    # minPts stays fixed at 5 with Euclidean distance; only eps varies
    dbscan = DBSCAN(eps=eps, min_samples=5, metric='euclidean')
    y_pred = dbscan.fit_predict(X_normalized)

    # DBSCAN labels anomalies (noise points) with -1
    is_noise = y_pred == -1
    n_anomalies = np.count_nonzero(is_noise)
    n_clusters = np.unique(y_pred[~is_noise]).size

    # The full label array is passed on, so the -1 points are scored by
    # compute_purity as one more group alongside the real clusters.
    if n_clusters > 0:
        purity = compute_purity(y_true, y_pred)
    else:
        purity = 0

    task_4_results.append({
        'eps': eps,
        'Number of Clusters': n_clusters,
        'Number of Anomalies': n_anomalies,
        'Purity': purity,
    })

task_4_results_df = pd.DataFrame(task_4_results)
print(task_4_results_df)


   eps  Number of Clusters  Number of Anomalies    Purity
0  0.3                  18                  146  0.688963
1  0.5                  22                   21  0.688963
2  0.7                  22                   13  0.695652
