In [None]:
import pandas as pd
import numpy as np


from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score

import gower

from rapidfuzz import process, fuzz

import matplotlib.pyplot as plt
import seaborn as sns

In [532]:
# Load the raw ("dirty") hospital CSV; every later cell derives its frames from `dataset`.
dataset = pd.read_csv("./datasets/hospital/dirty.csv")

In [533]:
# Function to determine if a column is categorical based on frequency of unique values
def is_categorical(column, threshold=0.2):
    """Heuristically decide whether a column holds categorical data.

    A column counts as categorical when the ratio of distinct (non-null)
    values to the total number of rows is strictly below ``threshold``.

    Args:
        column: pandas Series to inspect.
        threshold: Exclusive upper bound on the unique-to-total ratio.

    Returns:
        bool: True when the ratio is below ``threshold``; False otherwise,
        including for an empty column.
    """
    total_values = len(column)
    if total_values == 0:
        # An empty column has no ratio to measure; avoid dividing by zero.
        return False

    return (column.nunique() / total_values) < threshold


# Function to extract initial categories based on frequency threshold
def extract_categories(column, frequency_threshold=0.02):
    """List the values of ``column`` that occur often enough to count as categories.

    Args:
        column: pandas Series of raw values.
        frequency_threshold: Minimum share of the counted values (a ratio,
            not a percentage) a value needs in order to be kept.

    Returns:
        list: Values whose relative frequency is at least
        ``frequency_threshold``, in ``value_counts`` order.
    """
    relative_freq = column.value_counts(normalize=True)
    is_frequent = relative_freq >= frequency_threshold
    frequent_values = relative_freq.loc[is_frequent]
    return frequent_values.index.tolist()


# Function to clean categorical values using fuzzy matching
def clean_categorical_values(column, known_categories, similarity_threshold=85):
    """Snap near-miss spellings in ``column`` onto the known categories.

    Each non-missing value is compared against ``known_categories`` with
    ``fuzz.ratio``; the best-scoring category replaces the value when its
    score reaches ``similarity_threshold``.

    Args:
        column: Iterable of raw values (e.g. a pandas Series).
        known_categories: Candidate category values to match against.
        similarity_threshold: Minimum score for a replacement to happen.

    Returns:
        list: One entry per input value, in the same order.
    """

    def _resolve(raw):
        # Missing values pass through untouched.
        if pd.isna(raw):
            return raw

        best = process.extractOne(raw, known_categories, scorer=fuzz.ratio)
        if not best:
            # No candidate at all: keep the original value.
            return raw

        candidate, score, _ = best
        # Only trust the candidate when it is similar enough.
        return candidate if score >= similarity_threshold else raw

    return [_resolve(raw) for raw in column]


# Function to scale numerical columns between 0 and 1
def scale_numerical_columns(column):
    """Min-max scale a single numeric column.

    Args:
        column: pandas Series of numeric values.

    Returns:
        numpy.ndarray: 1-D array of the scaled values, same length as ``column``.
    """
    # MinMaxScaler works on 2-D input, so present the column as one feature.
    single_feature = column.values.reshape(-1, 1)
    rescaled = MinMaxScaler().fit_transform(single_feature)
    return rescaled.flatten()


# Function to process the entire dataframe
def clean_and_encode_df(df):
    """Return a cleaned copy of ``df`` without label-encoding the text columns.

    Per column:
      * numeric columns are min-max scaled;
      * object (text) columns judged categorical have their values snapped to
        the frequent categories via fuzzy matching;
      * object columns judged free-form text are dropped;
      * any other dtype is left as it is.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        pandas.DataFrame: The cleaned copy.
    """
    result = df.copy()

    for name in df.columns:
        series = df[name]

        if pd.api.types.is_numeric_dtype(series):
            result[name] = scale_numerical_columns(series)
            continue

        if series.dtype != "object":
            # Neither numeric nor text: nothing to do for this column.
            continue

        if not is_categorical(series):
            # Free-form text is removed from the result.
            result = result.drop(columns=[name])
            continue

        frequent_categories = extract_categories(series)
        result[name] = clean_categorical_values(series, frequent_categories)

    return result


# Clean the raw dataset (scale numeric columns, normalise categorical spellings, drop
# free-text columns) and write the result to disk for inspection.
cleaned_df = clean_and_encode_df(dataset)
# NOTE(review): the file name is spelled "prerpoccessed"; left as is in case other tooling reads this path.
cleaned_df.to_csv("./clustering/prerpoccessed.csv")

### Approach 1 - Cluster Data using DBSCAN and visualize with PCA, t-SNE


In [534]:
# Function to apply DBSCAN to a dataset
def apply_dbscan(df, eps, min_samples, dist_metric=None):
    """Cluster the rows of ``df`` with DBSCAN and store the labels in ``df["Cluster"]``.

    Args:
        df: DataFrame to label. Modified in place: a "Cluster" column is
            added or overwritten.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN minimum neighbourhood size.
        dist_metric: Optional precomputed pairwise distance matrix. When it is
            given and non-empty, DBSCAN runs on it with ``metric="precomputed"``.
            Otherwise the columns of ``df`` are min-max scaled and clustered
            with DBSCAN's default metric.

    Returns:
        tuple: ``(df, dbscan)`` - the same DataFrame object with the
        "Cluster" column set, and the fitted DBSCAN estimator.
    """
    # The default is None, so test for it before touching ``.shape``.
    use_precomputed = dist_metric is not None and dist_metric.shape[0] > 0

    if use_precomputed:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")
        labels = dbscan.fit_predict(dist_metric)
    else:
        # Leave out labels from an earlier run so they are not treated as a feature.
        features = df.drop(columns="Cluster", errors="ignore")
        scaled_features = MinMaxScaler().fit_transform(features)
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(scaled_features)

    df["Cluster"] = labels
    return df, dbscan


def apply_kmeans(df, n_clusters):
    """Cluster the rows of ``df`` with K-Means and store the labels in ``df["Cluster"]``.

    Features are min-max scaled before fitting. A "Cluster" column already
    present in ``df`` (for example labels written by an earlier clustering
    step) is left out of the feature matrix so old labels cannot influence
    the new clustering; it is then overwritten with the new labels.

    Args:
        df: DataFrame of numeric features. Modified in place.
        n_clusters: Number of clusters to fit.

    Returns:
        tuple: ``(df, kmeans)`` - the same DataFrame object with the
        "Cluster" column set, and the fitted KMeans estimator.
    """
    features = df.drop(columns="Cluster", errors="ignore")
    scaled_features = MinMaxScaler().fit_transform(features)

    # Fixed seed keeps the cluster assignment reproducible between runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df["Cluster"] = kmeans.fit_predict(scaled_features)

    return df, kmeans


# After determining the optimal number of clusters from the elbow plot (for example, if k=3):

In [535]:
# Function to visualize clusters using PCA
def visualize_clusters_pca(df, cluster_column):
    """Scatter-plot the first two principal components, coloured by cluster label.

    Args:
        df: DataFrame holding the features plus the label column.
        cluster_column: Name of the column with the cluster labels. It is
            excluded from the PCA input and used only as the hue.
    """
    # Only numeric, non-label columns are projected.
    numeric_features = df.drop(columns=[cluster_column]).select_dtypes(
        include="number"
    )
    projection = PCA(n_components=2).fit_transform(numeric_features)

    plot_frame = pd.DataFrame(data=projection, columns=["PCA1", "PCA2"])
    plot_frame[cluster_column] = df[cluster_column].values

    plt.figure(figsize=(10, 7))
    sns.scatterplot(
        data=plot_frame,
        x="PCA1",
        y="PCA2",
        hue=cluster_column,
        palette="viridis",
        s=100,
    )
    plt.title(f"PCA of Clusters ({cluster_column})")
    plt.legend(title=cluster_column)
    plt.show()


# Function to visualize clusters using t-SNE
def visualize_clusters_tsne(df, cluster_column):
    """Scatter-plot a 2-D t-SNE embedding of the features, coloured by cluster label.

    Args:
        df: DataFrame holding the features plus the label column.
        cluster_column: Name of the column with the cluster labels. It is
            excluded from the t-SNE input and used only as the hue.
    """
    # Only numeric, non-label columns are embedded.
    numeric_features = df.drop(columns=[cluster_column]).select_dtypes(
        include="number"
    )
    embedder = TSNE(n_components=2, perplexity=30, random_state=42)
    embedding = embedder.fit_transform(numeric_features)

    plot_frame = pd.DataFrame(data=embedding, columns=["TSNE1", "TSNE2"])
    plot_frame[cluster_column] = df[cluster_column].values

    plt.figure(figsize=(10, 7))
    sns.scatterplot(
        data=plot_frame,
        x="TSNE1",
        y="TSNE2",
        hue=cluster_column,
        palette="viridis",
        s=100,
    )
    plt.title(f"t-SNE of Clusters ({cluster_column})")
    plt.legend(title=cluster_column)
    plt.show()

Visualize DBSCAN clusters using PCA and t-SNE


In [536]:
# Function to process the entire dataframe
def clean_and_encode_df_tsne(df):
    """Return a cleaned, fully numeric-coded copy of ``df``.

    Per column:
      * numeric columns are min-max scaled;
      * object (text) columns judged categorical have their values snapped to
        the frequent categories via fuzzy matching and are then label-encoded;
      * object columns judged free-form text are dropped;
      * any other dtype is left as it is.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        pandas.DataFrame: The cleaned and encoded copy.
    """
    result = df.copy()

    for name in df.columns:
        series = df[name]

        if pd.api.types.is_numeric_dtype(series):
            result[name] = scale_numerical_columns(series)
            continue

        if series.dtype != "object":
            # Neither numeric nor text: nothing to do for this column.
            continue

        if not is_categorical(series):
            # Free-form text is removed from the result.
            result = result.drop(columns=[name])
            continue

        frequent_categories = extract_categories(series)
        result[name] = clean_categorical_values(series, frequent_categories)

        # Encode the cleaned column as stored in the frame, replacing text with integer codes.
        encoder = LabelEncoder()
        result[name] = encoder.fit_transform(result[name])

    return result


# Build the label-encoded frame. This must use clean_and_encode_df_tsne (the variant that
# label-encodes categorical columns); clean_and_encode_df leaves them as text.
cleaned_df_tsne = clean_and_encode_df_tsne(dataset)

Apply DBSCAN to data


In [539]:
# Drop incomplete rows. Also discard any "Cluster" column left behind by a previous run of this
# cell (apply_dbscan writes its labels into the frame it is given), so that re-running the cell
# does not feed old labels into the Gower distance. The copy makes the column assignment below
# act on an independent frame.
cleaned_df = cleaned_df.dropna().drop(columns="Cluster", errors="ignore").copy()
# Convert numeric columns to float
numeric_cols = cleaned_df.select_dtypes(include=[np.number]).columns
cleaned_df[numeric_cols] = cleaned_df[numeric_cols].astype(np.float64)

# Pairwise Gower distances over the cleaned columns, handed to DBSCAN as a precomputed matrix.
gower_dist = gower.gower_matrix(cleaned_df)
df_dbscan, dbscan_model = apply_dbscan(
    cleaned_df, eps=0.1, min_samples=5, dist_metric=gower_dist
)

df_dbscan.to_csv("./clustering/dbscan.csv", index=False)

In [None]:
# visualize_clusters_pca(df_dbscan, "Cluster")
# Rebuild the label-encoded frame, attach the DBSCAN labels and plot them with t-SNE.
cleaned_df_tsne = clean_and_encode_df_tsne(dataset)
# NOTE(review): this assignment aligns on the index; df_dbscan was produced after dropna(), so
# any rows dropped there would end up with NaN here - confirm that is acceptable.
cleaned_df_tsne["Cluster"] = df_dbscan["Cluster"]
visualize_clusters_tsne(cleaned_df_tsne, "Cluster")

Visualize k-means clusters using PCA, t-SNE


In [None]:
# Run k-means with six clusters on the label-encoded frame and save the labelled rows.
df_kmeans, kmeans_model = apply_kmeans(cleaned_df_tsne, n_clusters=6)

df_kmeans.to_csv("./clustering/k-means.csv", index=False)

In [None]:
# Show the k-means labels in both a PCA and a t-SNE projection.
visualize_clusters_pca(df_kmeans, "Cluster")
visualize_clusters_tsne(df_kmeans, "Cluster")

### Approach 2 - Apply PCA and then perform Clustering


In [507]:
# Function to apply PCA to the dataset
def apply_pca(data, n_components=2):
    """Project ``data`` onto its leading principal components.

    Prints the explained-variance ratio of each component as a side effect.

    Args:
        data: Numeric feature matrix or DataFrame.
        n_components: Number of principal components to keep.

    Returns:
        tuple: ``(pca_df, pca)`` - a DataFrame with columns ``PC1`` ... ``PCn``
        and the fitted PCA estimator.
    """
    pca = PCA(n_components=n_components)
    component_names = [f"PC{i+1}" for i in range(n_components)]

    projected = pca.fit_transform(data)
    pca_df = pd.DataFrame(data=projected, columns=component_names)

    print(
        f"Explained variance by each principal component: {pca.explained_variance_ratio_}"
    )
    return pca_df, pca


def plot_clusters(data):
    """Scatter PC1 against PC2, colouring each point by its "Cluster" label.

    Args:
        data: DataFrame with "PC1", "PC2" and "Cluster" columns.
    """
    plt.figure(figsize=(8, 6))

    # One point per row; the colour encodes the cluster label.
    point_style = {"cmap": "plasma", "s": 50, "alpha": 0.7}
    plt.scatter(data["PC1"], data["PC2"], c=data["Cluster"], **point_style)

    plt.title("DBSCAN Clusters after PCA")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.colorbar(label="Cluster")

    plt.show()

In [508]:
def apply_dbscan(df, eps, min_samples, dist_metric=None):
    """Cluster the rows of ``df`` with DBSCAN and store the labels in ``df["Cluster"]``.

    This definition replaces the earlier ``apply_dbscan`` in the notebook, so
    it keeps that version's optional ``dist_metric`` argument; cells that pass
    a precomputed distance matrix keep working after this cell has run.

    Args:
        df: DataFrame to label. Modified in place: a "Cluster" column is
            added or overwritten.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN minimum neighbourhood size.
        dist_metric: Optional precomputed pairwise distance matrix. When it is
            given and non-empty, DBSCAN runs on it with ``metric="precomputed"``.
            Otherwise the columns of ``df`` are min-max scaled and clustered
            with DBSCAN's default metric.

    Returns:
        tuple: ``(df, dbscan)`` - the same DataFrame object with the
        "Cluster" column set, and the fitted DBSCAN estimator.
    """
    if dist_metric is not None and dist_metric.shape[0] > 0:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")
        labels = dbscan.fit_predict(dist_metric)
    else:
        # Leave out labels from an earlier run so they are not treated as a feature.
        features = df.drop(columns="Cluster", errors="ignore")
        scaled_features = MinMaxScaler().fit_transform(features)
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(scaled_features)

    df["Cluster"] = labels
    return df, dbscan

In [None]:
# Generate Principal Components from the data.
# Earlier cells store cluster labels in cleaned_df_tsne["Cluster"]; leave that column out so the
# components are computed from the features only.
pca_df, pca_model = apply_pca(
    cleaned_df_tsne.drop(columns="Cluster", errors="ignore"), n_components=2
)

# Cluster Using DBSCAN
pca_dbscan, pca_dbscan_model = apply_dbscan(pca_df, eps=0.05, min_samples=20)

# Plot the clusters
plot_clusters(pca_dbscan)

# Optionally, save the clustered data to a CSV
pca_dbscan.to_csv("./pca_dbscan.csv", index=False)

In [510]:
# Function to plot elbow
def elbow_method(data, max_clusters=10):
    """Plot the K-Means within-cluster sum of squares (WCSS) for k = 1..max_clusters.

    Args:
        data: DataFrame of features. A "Cluster" column, if present, is
            ignored; frames that were never clustered are accepted as well.
        max_clusters: Largest number of clusters to evaluate.
    """
    # errors="ignore" lets the function run on data that has no labels yet.
    features = data.drop(columns="Cluster", errors="ignore")
    cluster_counts = range(1, max_clusters + 1)

    # inertia_ of a fitted KMeans is the WCSS for that k.
    wcss = [
        KMeans(n_clusters=k, random_state=42).fit(features).inertia_
        for k in cluster_counts
    ]

    # Plot the elbow graph
    plt.figure(figsize=(8, 6))
    plt.plot(cluster_counts, wcss, marker="o", linestyle="-", color="b")
    plt.xlabel("Number of Clusters")
    plt.ylabel("WCSS (Within-Cluster Sum of Squares)")
    plt.title("Elbow Method for Optimal K")
    plt.xticks(cluster_counts)
    plt.grid(True)
    plt.show()


# Visualize k-means
def plot_clusters(data, kmeans_model):
    """Scatter PC1 against PC2, colouring each point by its K-Means "Cluster" label.

    Args:
        data: DataFrame with "PC1", "PC2" and "Cluster" columns.
        kmeans_model: Fitted KMeans estimator; only ``n_clusters`` is read,
            for the figure title.
    """
    plt.figure(figsize=(8, 6))

    # Scatter plot with different clusters
    plt.scatter(
        data["PC1"], data["PC2"], c=data["Cluster"], cmap="viridis", s=50, alpha=0.7
    )

    # Centroids are not drawn: if the model was fitted on rescaled features (as apply_kmeans
    # does), cluster_centers_ are not in the same coordinates as the PC1/PC2 values shown here.

    # Add labels and title
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.title(f"KMeans Clusters (k={kmeans_model.n_clusters}) after PCA")
    # No plt.legend(): nothing in this figure carries a label, so the call only produced a
    # "no artists with labels" warning. The colorbar identifies the clusters instead.
    plt.colorbar(label="Cluster")

    plt.show()


# Plot the clusters (using k=3 in this example)

In [None]:
# Plot WCSS for k = 1..10 (the function's default range) on the PCA-reduced data.
elbow_method(pca_df)

In [None]:
# Cluster the PCA-reduced data with k-means (k=3) and save the labelled components.
pca_kmeans, kmeans_model = apply_kmeans(pca_df, n_clusters=3)

pca_kmeans.to_csv("./clustering/pca_k-means.csv", index=False)

In [None]:
# Scatter the principal components coloured by their k-means label.
plot_clusters(pca_kmeans, kmeans_model)

In [514]:
# def grid_search_dbscan(df, eps_values, min_samples_values):
#     best_score = -1
#     best_params = None

#     # Iterate over all combinations of eps and min_samples
#     for eps in eps_values:
#         for min_samples in min_samples_values:
#             print(f"Evaluating DBSCAN with eps={eps}, min_samples={min_samples}")

#             # Apply DBSCAN
#             clustered_df, _ = apply_dbscan(df, eps=eps, min_samples=min_samples)
#             labels = clustered_df["DBSCAN_Cluster"]

#             # Check if there are enough clusters to calculate silhouette score
#             if len(set(labels)) > 1 and len(set(labels)) < len(labels):
#                 score = silhouette_score(df, labels)
#                 print(f"Silhouette Score: {score:.3f}")

#                 # Update the best score and parameters if the current score is better
#                 if score > best_score:
#                     best_score = score
#                     best_params = (eps, min_samples)

#     return best_params, best_score


# # Define the parameter grid
# eps_values = np.logspace(-2, 0, num=10)  # eps from 0.01 to 1.0
# min_samples_values = range(2, 10)  # min_samples from 2 to 9

# # Apply the grid search
# best_params, best_score = grid_search_dbscan(cleaned_df, eps_values, min_samples_values)

# # Output the best parameters
# print("Best Parameters:", best_params)
# print("Best Silhouette Score:", best_score)

In [515]:
# from sklearn.metrics import davies_bouldin_score


# def grid_search_dbscan(df, eps_values, min_samples_values):
#     best_score = float("inf")  # Initialize to infinity for Davies-Bouldin
#     best_params = None

#     # Iterate over all combinations of eps and min_samples
#     for eps in eps_values:
#         for min_samples in min_samples_values:
#             print(f"Evaluating DBSCAN with eps={eps}, min_samples={min_samples}")

#             # Apply DBSCAN
#             clustered_df, _ = apply_dbscan(df, eps=eps, min_samples=min_samples)
#             labels = clustered_df["DBSCAN_Cluster"]

#             # Check if there are enough clusters to calculate Davies-Bouldin Index
#             if len(set(labels)) > 1 and len(set(labels)) < len(labels):
#                 # Calculate Davies-Bouldin Index
#                 score = davies_bouldin_score(df, labels)
#                 print(f"Davies-Bouldin Index: {score:.3f}")

#                 # Update the best score and parameters if the current score is better (lower is better)
#                 if score < best_score:
#                     best_score = score
#                     best_params = (eps, min_samples)

#     return best_params, best_score


# # Define the parameter grid
# eps_values = np.logspace(-2, 0, num=10)  # eps from 0.01 to 1.0
# min_samples_values = range(2, 10)  # min_samples from 2 to 9

# # Apply the grid search
# best_params, best_score = grid_search_dbscan(cleaned_df, eps_values, min_samples_values)

# # Output the best parameters
# print("Best Parameters:", best_params)
# print("Best Davies-Bouldin Score:", best_score)

In [516]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import KernelPCA


# Function to apply Kernel PCA to the dataset
def apply_kpca(data, n_components=2, kernel="rbf", gamma=None):
    """Project ``data`` with Kernel PCA.

    Args:
        data: Numeric feature matrix or DataFrame.
        n_components: Number of components to keep.
        kernel: Kernel name forwarded to ``KernelPCA``.
        gamma: Kernel coefficient forwarded to ``KernelPCA``.

    Returns:
        tuple: ``(kpca_df, kpca)`` - a DataFrame with columns ``PC1`` ... ``PCn``
        and the fitted KernelPCA estimator.
    """
    kpca = KernelPCA(n_components=n_components, kernel=kernel, gamma=gamma)
    component_names = [f"PC{i+1}" for i in range(n_components)]

    projected = kpca.fit_transform(data)
    kpca_df = pd.DataFrame(data=projected, columns=component_names)

    return kpca_df, kpca


def plot_clusters_kpca(data):
    """Scatter the first two Kernel PCA components, coloured by "Cluster" label.

    Args:
        data: DataFrame with "PC1", "PC2" and "Cluster" columns.
    """
    plt.figure(figsize=(8, 6))

    # One point per row; the colour encodes the cluster label.
    point_style = {"cmap": "plasma", "s": 50, "alpha": 0.7}
    plt.scatter(data["PC1"], data["PC2"], c=data["Cluster"], **point_style)

    plt.title("DBSCAN Clusters after Kernel PCA")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.colorbar(label="Cluster")

    plt.show()

In [517]:
def plot_clusters(data):
    """Scatter PC1 against PC2, colouring each point by its "Cluster" label.

    Args:
        data: DataFrame with "PC1", "PC2" and "Cluster" columns.
    """
    plt.figure(figsize=(8, 6))

    # Pull the plotted columns out first, then draw one point per row.
    x_values = data["PC1"]
    y_values = data["PC2"]
    cluster_labels = data["Cluster"]
    plt.scatter(x_values, y_values, c=cluster_labels, cmap="plasma", s=50, alpha=0.7)

    plt.ylabel("Principal Component 2")
    plt.xlabel("Principal Component 1")
    plt.title("DBSCAN Clusters after PCA")
    plt.colorbar(label="Cluster")

    plt.show()

In [None]:
n_components = 2  # Number of components to keep
# Earlier cells store cluster labels in cleaned_df_tsne["Cluster"]; leave that column out so the
# projection is computed from the features only.
kpca_df, kpca_model = apply_kpca(
    cleaned_df_tsne.drop(columns="Cluster", errors="ignore"),
    n_components=n_components,
)
# print(kpca_df)

# Cluster Using DBSCAN
kpca_dbscan, kpca_dbscan_model = apply_dbscan(kpca_df, eps=0.2, min_samples=2)

# Plot with the Kernel-PCA helper so the figure title says "Kernel PCA" rather than "PCA"
plot_clusters_kpca(kpca_dbscan)

# Optionally, save the clustered data to a CSV
# kpca_dbscan.to_csv("./kpca_dbscan.csv", index=False)