In [17]:
import pandas as pd

file_path = r"C:/Users/reigi/OneDrive/Desktop/School/CS 3203N/Assignment_2_Clustering/diabetes.csv"

df = pd.read_csv(file_path)

print("First 5 records:", df.head())

First 5 records:    encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      No    

In [16]:
import numpy as np
import pandas as pd

def simulate_adherence_data(n_individuals=1000, days=720):
    """
    Simulates adherence data for six predefined groups.
    
    Parameters:
        n_individuals (int): Number of individuals to simulate.
        days (int): Observation period in days (default: 720 days = 2 years).
    
    Returns:
        pd.DataFrame: Simulated adherence data.
    """
    def simulate_pattern(n, days, pattern):
        adherence = np.zeros((n, days))
        for i in range(n):
            if pattern == "high":
                adherence[i] = np.clip(np.random.normal(0.95, 0.05, days), 0, 1)
            elif pattern == "erratic":
                adherence[i] = np.clip(np.random.uniform(0.5, 0.9, days), 0, 1)
            elif pattern == "gradual_decline":
                adherence[i] = np.clip(np.linspace(1, 0.1, days) + np.random.normal(0, 0.1, days), 0, 1)
            elif pattern == "intermittent":
                adherence[i] = np.clip((np.sin(np.linspace(0, 4 * np.pi, days)) + 1) / 2 + np.random.normal(0, 0.1, days), 0, 1)
            elif pattern == "partial_drop_off":
                adherence[i] = np.clip(np.ones(days) * 0.9 - np.heaviside(np.arange(days) - 365, 1) * 0.5 + np.random.normal(0, 0.1, days), 0, 1)
            elif pattern == "non_persistence":
                adherence[i] = np.clip(np.heaviside(90 - np.arange(days), 1) + np.random.normal(0, 0.1, days), 0, 1)
        return adherence

    patterns = ["high", "erratic", "gradual_decline", "intermittent", "partial_drop_off", "non_persistence"]
    group_sizes = [n_individuals // 6] * 6
    labels = np.repeat(range(len(patterns)), group_sizes)

    data = np.vstack([simulate_pattern(size, days, p) for size, p in zip(group_sizes, patterns)])
    df = pd.DataFrame(data.T, columns=[f"Patient_{i}" for i in range(data.shape[0])])
    df["Day"] = range(days)
    df = df.melt(id_vars="Day", var_name="Patient", value_name="Adherence")
    return df, labels

simulated_data, true_labels = simulate_adherence_data()
print("Simulated Data Shape:", simulated_data.shape)

Simulated Data Shape: (717120, 3)


In [18]:
def sliding_window(data, window_size=30, overlap=15):
    """
    Applies a sliding window to the data.
    
    Parameters:
        data (pd.DataFrame): Simulated adherence data.
        window_size (int): Size of the sliding window in days.
        overlap (int): Overlap between consecutive windows in days.
    
    Returns:
        pd.DataFrame: Data with sliding windows applied.
    """
    step = window_size - overlap
    windows = []
    for start in range(0, data["Day"].max(), step):
        end = start + window_size
        window_data = data[(data["Day"] >= start) & (data["Day"] < end)]
        if not window_data.empty:
            window_data = window_data.groupby("Patient")["Adherence"].mean().reset_index()
            window_data["Window"] = f"{start}-{end}"
            windows.append(window_data)
    return pd.concat(windows, ignore_index=True)

# Apply sliding windows
windowed_data = sliding_window(simulated_data, window_size=30, overlap=15)
print("Windowed Data Shape:", windowed_data.shape)

Windowed Data Shape: (47808, 3)


In [19]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

def kmeans_clustering(data, n_clusters=6):
    """
    Applies K-Means clustering to the data.
    
    Parameters:
        data (pd.DataFrame): Windowed adherence data.
        n_clusters (int): Number of clusters.
    
    Returns:
        np.ndarray: Cluster assignments.
    """
    X = data.pivot(index="Patient", columns="Window", values="Adherence").fillna(0).values
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(X)
    return clusters

# Perform K-Means clustering
kmeans_clusters = kmeans_clustering(windowed_data)
ari_kmeans = adjusted_rand_score(true_labels, kmeans_clusters)
print(f"Adjusted Rand Index (K-Means): {ari_kmeans}")

Adjusted Rand Index (K-Means): 0.48081387250352264


In [20]:
from sklearn.cluster import DBSCAN

def dbscan_clustering(data, eps=0.5, min_samples=5):
    """
    Applies DBSCAN clustering to the data.
    
    Parameters:
        data (pd.DataFrame): Windowed adherence data.
        eps (float): Maximum distance for two points to be considered neighbors.
        min_samples (int): Minimum number of samples in a neighborhood.
    
    Returns:
        np.ndarray: Cluster assignments.
    """
    X = data.pivot(index="Patient", columns="Window", values="Adherence").fillna(0).values
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    clusters = dbscan.fit_predict(X)
    return clusters

# Perform DBSCAN clustering
dbscan_clusters = dbscan_clustering(windowed_data)
valid_indices = dbscan_clusters != -1  # Exclude noise points
ari_dbscan = adjusted_rand_score(true_labels[valid_indices], dbscan_clusters[valid_indices])
print(f"Adjusted Rand Index (DBSCAN): {ari_dbscan}")

Adjusted Rand Index (DBSCAN): 0.48081387250352264


In [25]:
def compare_results(kmeans_ari, dbscan_ari):
    """
    Compares the results of K-Means and DBSCAN clustering.
    
    Parameters:
        kmeans_ari (float): ARI for K-Means clustering.
        dbscan_ari (float): ARI for DBSCAN clustering.
    """
    print("Comparison of Clustering Algorithms:")
    print(f"K-Means ARI: {kmeans_ari}")
    print(f"DBSCAN ARI: {dbscan_ari}")

# Compare results
compare_results(ari_kmeans, ari_dbscan)

Comparison of Clustering Algorithms:
K-Means ARI: 0.48081387250352264
DBSCAN ARI: 0.48081387250352264


In [None]:
def run_analysis():
    # Step 1: Simulate data
    simulated_data, true_labels = simulate_adherence_data()
    
    # Step 2: Apply sliding windows
    windowed_data = sliding_window(simulated_data, window_size=30, overlap=15)
    
    # Step 3: K-Means clustering
    kmeans_clusters = kmeans_clustering(windowed_data)
    ari_kmeans = adjusted_rand_score(true_labels, kmeans_clusters)
    
    # Step 4: DBSCAN clustering
    dbscan_clusters = dbscan_clustering(windowed_data)
    valid_indices = dbscan_clusters != -1
    ari_dbscan = adjusted_rand_score(true_labels[valid_indices], dbscan_clusters[valid_indices])
    
    # Step 5: Compare results
    compare_results(ari_kmeans, ari_dbscan)

# Run analysis
run_analysis()

Comparison of Clustering Algorithms:
K-Means ARI: 0.48081387250352264
DBSCAN ARI: 0.48081387250352264
