In [1]:
import numpy as np
import pandas as pd
import pickle

def preprocess_new_record(new_record, pipeline_path='Data/preprocessing_pipeline.pkl'):
    """
    Apply the saved training-time preprocessing to new records.

    The steps mirror the code below: normalise "unknown"-like answers, map
    binary columns, ordinal-encode, one-hot encode (aligned to the dummy
    columns recorded at training time), log-transform skewed columns, add any
    training column that is still missing (filled with 0), reorder to the
    training column order, and finally apply the fitted scaler.

    Parameters:
    -----------
    new_record : dict or pd.DataFrame
        New record(s) to preprocess. A dict is treated as a single row; a
        DataFrame is copied, so the caller's object is not modified.
    pipeline_path : str
        Path to the pickled preprocessing pipeline. The pipeline is read as a
        mapping with the keys 'binary_cols', 'binary_mappings',
        'ordinal_encoders', 'ohe_cols', 'ohe_column_names', 'skewed_cols',
        'all_columns', 'cols_to_scale' and 'scaler'.

    Returns:
    --------
    pd.DataFrame : Preprocessed record(s) whose columns are exactly
        pipeline['all_columns'], in that order.

    Raises:
    -------
    OSError
        If the pipeline file cannot be opened.
    KeyError
        If the pipeline is missing one of the keys listed above.
    """

    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only point pipeline_path at files produced by a trusted training run.
    with open(pipeline_path, 'rb') as f:
        pipeline = pickle.load(f)

    # Convert to DataFrame if needed (copy so the caller's frame stays untouched)
    if isinstance(new_record, dict):
        df_new = pd.DataFrame([new_record])
    else:
        df_new = new_record.copy()

    # Step 1: Normalize Unknown-like responses
    df_new = df_new.replace({
        "Don't know": "Unknown", "Refused": "Unknown",
        "Not Applicable": "Unknown", "N/A": "Unknown",
        "Unknown/NA": "Unknown"
    })

    # Step 2: Binary encoding
    binary_mappings = pipeline['binary_mappings']
    columns_with_own_mapping = ("Has_diabetes", "Received_Hepatitis_A_Vaccine")
    for col in pipeline['binary_cols']:
        if col not in df_new.columns:
            continue

        # Only these two columns carry a dedicated mapping; all others share "default".
        if col in columns_with_own_mapping:
            mapping = binary_mappings[col]
        else:
            mapping = binary_mappings["default"]

        # Values absent from the mapping become NaN (Series.map semantics).
        df_new[col] = df_new[col].map(mapping)

    # Step 3: Ordinal encoding
    for col, encoder in pipeline['ordinal_encoders'].items():
        if col in df_new.columns:
            df_new[col] = encoder.transform(df_new[[col]])

    # Step 4: One-hot encoding, aligned to the dummy columns seen in training
    ohe_cols = list(pipeline['ohe_cols'])
    ohe_column_names = list(pipeline['ohe_column_names'])

    def _owning_base(train_col):
        # A training dummy column belongs to the base column whose "<base>_"
        # prefix it starts with. A plain substring test would let e.g. "Group"
        # claim "Age_Group_X" and create duplicate columns. When several
        # prefixes match, the longest (most specific) base wins.
        matches = [base for base in ohe_cols if train_col.startswith(f"{base}_")]
        return max(matches, key=len) if matches else None

    for base_col in ohe_cols:
        if base_col not in df_new.columns:
            continue

        dummies = pd.get_dummies(df_new[base_col], prefix=base_col, dtype=int)

        # reindex drops categories unseen in training and adds the training
        # categories absent from this batch as all-zero columns.
        expected = [name for name in ohe_column_names if _owning_base(name) == base_col]
        dummies = dummies.reindex(columns=expected, fill_value=0)

        df_new = pd.concat([df_new.drop(columns=[base_col]), dummies], axis=1)

    # Step 5: Apply log transformation to skewed columns
    # (negative values are clipped to 0 so log1p stays defined)
    for col in pipeline['skewed_cols']:
        if col in df_new.columns:
            df_new[col] = np.log1p(df_new[col].clip(lower=0))

    # Step 6: Ensure all columns from training exist
    for col in pipeline['all_columns']:
        if col not in df_new.columns:
            df_new[col] = 0  # Add missing columns with default value

    # Step 7: Reorder columns to match training data
    # (.copy() so the scaling assignment below writes to an independent frame)
    df_new = df_new[pipeline['all_columns']].copy()

    # Step 8: Apply scaling
    df_new[pipeline['cols_to_scale']] = pipeline['scaler'].transform(
        df_new[pipeline['cols_to_scale']]
    )

    return df_new

In [2]:
# Load the held-out test set, plus the scaled training frame that serves
# as the reference for the expected number of columns.
test_df = pd.read_csv('Data/test_dataset.csv')
df_scaled = pd.read_csv('Data/df_scaled.csv').drop(columns='Unnamed: 0')

# Run every test row through the saved preprocessing pipeline
processed_patient = preprocess_new_record(test_df)

# Sanity check: the processed frame must have as many columns as the training frame
n_processed_cols = processed_patient.shape[1]
n_training_cols = df_scaled.shape[1]
print(f"Shape of processed record: {processed_patient.shape}")
print(f"Matches training data shape: {n_processed_cols == n_training_cols}")

Shape of processed record: (1050, 55)
Matches training data shape: True


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


# Dim Reduction UMAP

In [4]:
# !pip install umap-learn


In [5]:
import joblib
# Project the preprocessed test records into the embedding space of the saved
# UMAP model.
# SECURITY: joblib.load unpickles the file; only load models from a trusted source.
# NOTE(review): the variable name `umap` would shadow the `umap` package if that
# package is imported later in this notebook; kept because other cells may use it.
umap = joblib.load('Data/umap_model.pkl')
umap_test = umap.transform(processed_patient)

# One column per embedding dimension, named UMAP1..UMAPk
umap_test_df = pd.DataFrame(
    umap_test,
    columns=[f'UMAP{k}' for k in range(1, umap_test.shape[1] + 1)],
)

# (A bare `umap_test_df.head()` used to sit here; mid-cell its result was
# discarded, so it has been removed.)
print(umap_test_df.shape)
print(umap_test_df.head())

Sat Nov 15 12:48:00 2025 Building and compiling search function


Epochs completed:   0%|            0/100 [00:00]

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
(1050, 20)
       UMAP1      UMAP2     UMAP3     UMAP4     UMAP5     UMAP6     UMAP7  \
0  12.941374  10.881688  4.414238  6.736259  4.754117  5.308563  5.453377   
1  -4.995172   4.374187  3.285458  1.752989  5.818086  5.274156  3.320810   
2  12.947042  10.455590  4.470318  5.716773  4.547424  6.158661  5.278790   
3  12.701849   7.337795  4.251548  5.761917  5.017106  5.032408  5.027735   
4  -5.042682   4.320285  3.256649  1.827046  5.751108  5.249582  3.307297   

      UMAP8     UMAP9    UMAP10    UMAP11    UMAP12    UMAP13    UMAP14  \
0  4.772562  3.324705  4.982484  6.221325  4.482080  4.723325  5.903697   
1  5.957243  5.620238  3.837717  8.396414  1.570166  7.361269  8.890088   
2 

# KMeans

In [7]:
import joblib
import pandas as pd
import numpy as np

# ============================================================
# LOAD CLUSTERING MODELS
# ============================================================
print("="*70)
print("LOADING CLUSTERING MODELS")
print("="*70)

# SECURITY: joblib.load unpickles these files; only load models from a trusted source.
kmeans_final = joblib.load('Models/kmeans_umap_initial4_model.pkl')
kmeans_c0    = joblib.load('Models/kmeans_umap_c0.pkl')
kmeans_c3    = joblib.load('Models/kmeans_umap_c3.pkl')

# Number of subclusters that initial clusters 0 and 3 are split into
k_split_c0 = 3
k_split_c3 = 2

print("✓ All models loaded successfully")

# ============================================================
# ASSIGN CLUSTERS TO ALL TEST PATIENTS
# ============================================================
print("\n" + "="*70)
print("ASSIGNING CLUSTERS TO TEST PATIENTS")
print("="*70)

print(f"Test data shape: {umap_test_df.shape}")

# Make sure columns match what KMeans was trained on (optional but safer)
if hasattr(kmeans_final, "feature_names_in_"):
    expected_cols = list(kmeans_final.feature_names_in_)
    umap_test_df = umap_test_df[expected_cols]

# Cast to the same dtype as the model's cluster centers so predict() sees
# the dtype the model was fitted with
dtype = kmeans_final.cluster_centers_.dtype
X_test = umap_test_df.to_numpy().astype(dtype)

# Step 1: Get initial cluster assignments (4 clusters)
initial_clusters = kmeans_final.predict(X_test)

# Step 2: Apply hierarchical logic to get FINAL clusters (0..6)
# Fail loudly on a label the mapping below does not cover, instead of
# silently leaving rows unassigned.
unexpected_labels = np.setdiff1d(initial_clusters, [0, 1, 2, 3])
if unexpected_labels.size:
    raise ValueError(
        f"Unexpected initial cluster label(s): {unexpected_labels.tolist()}"
    )

n_test = X_test.shape[0]
final_clusters = np.empty(n_test, dtype=int)
# NaN marks "no subclustering" (initial clusters 1 and 2)
subcluster_assignments = np.full(n_test, np.nan)

# Initial clusters 1 and 2 are kept whole → final clusters 3 and 4
final_clusters[initial_clusters == 1] = 3
final_clusters[initial_clusters == 2] = 4

# Initial cluster 0 splits into final 0,1,2; initial cluster 3 into final 5,6.
# Each sub-model is called once on all of its members rather than once per row.
for main_cluster, sub_model, final_offset in ((0, kmeans_c0, 0), (3, kmeans_c3, 5)):
    members = initial_clusters == main_cluster
    if not members.any():
        continue  # nothing to predict for this branch
    subcluster_ids = sub_model.predict(X_test[members])
    subcluster_assignments[members] = subcluster_ids
    final_clusters[members] = final_offset + subcluster_ids

# Attach results back to a DataFrame (we can reuse umap_test_df)
umap_test_df_assigned = umap_test_df.copy()
umap_test_df_assigned['Initial_Cluster_UMAP'] = initial_clusters
umap_test_df_assigned['Subcluster_UMAP']      = subcluster_assignments
umap_test_df_assigned['Final_Cluster_UMAP']   = final_clusters

# ============================================================
# DISPLAY RESULTS
# ============================================================
print("\n" + "="*70)
print("UMAP CLUSTER ASSIGNMENT SUMMARY")
print("="*70)

print("\nInitial cluster distribution (4 clusters):")
print(umap_test_df_assigned['Initial_Cluster_UMAP'].value_counts().sort_index())

print("\nFinal cluster distribution (7 clusters):")
print(umap_test_df_assigned['Final_Cluster_UMAP'].value_counts().sort_index())

print("\nCluster mapping (UMAP-based):")
print("  Final Cluster 0: Initial Cluster 0 → Subcluster 0")
print("  Final Cluster 1: Initial Cluster 0 → Subcluster 1")
print("  Final Cluster 2: Initial Cluster 0 → Subcluster 2")
print("  Final Cluster 3: Initial Cluster 1 (no subclustering)")
print("  Final Cluster 4: Initial Cluster 2 (no subclustering)")
print("  Final Cluster 5: Initial Cluster 3 → Subcluster 0")
print("  Final Cluster 6: Initial Cluster 3 → Subcluster 1")

print("\nFirst 20 patient assignments:")
print(umap_test_df_assigned[['Initial_Cluster_UMAP', 'Subcluster_UMAP', 'Final_Cluster_UMAP']].head(20))

# Save results
umap_test_df_assigned.to_csv('Data/test_patients_with_clusters_kmeans_umap.csv', index=False)
print("\n✓ Results saved to 'Data/test_patients_with_clusters_kmeans_umap.csv'")


LOADING CLUSTERING MODELS
✓ All models loaded successfully

ASSIGNING CLUSTERS TO TEST PATIENTS
Test data shape: (1050, 20)





UMAP CLUSTER ASSIGNMENT SUMMARY

Initial cluster distribution (4 clusters):
Initial_Cluster_UMAP
0    383
1    173
2    153
3    341
Name: count, dtype: int64

Final cluster distribution (7 clusters):
Final_Cluster_UMAP
0    195
1     56
2    132
3    173
4    153
5    212
6    129
Name: count, dtype: int64

Cluster mapping (UMAP-based):
  Final Cluster 0: Initial Cluster 0 → Subcluster 0
  Final Cluster 1: Initial Cluster 0 → Subcluster 1
  Final Cluster 2: Initial Cluster 0 → Subcluster 2
  Final Cluster 3: Initial Cluster 1 (no subclustering)
  Final Cluster 4: Initial Cluster 2 (no subclustering)
  Final Cluster 5: Initial Cluster 3 → Subcluster 0
  Final Cluster 6: Initial Cluster 3 → Subcluster 1

First 20 patient assignments:
    Initial_Cluster_UMAP  Subcluster_UMAP  Final_Cluster_UMAP
0                      0              2.0                   2
1                      1              NaN                   3
2                      3              1.0                   6
3       



# Cluster Validation

In [9]:
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Section banner for the validation part of the notebook
print("="*70, "CLUSTER ASSIGNMENT VALIDATION", "="*70, sep="\n")

CLUSTER ASSIGNMENT VALIDATION


In [10]:
# Load the training-set UMAP embedding (one column per UMAP dimension)
df_umap = pd.read_csv('Data/umap_dataset.csv')

# Quick look at what was loaded: dimensions and column names
print(f"Training data loaded: {df_umap.shape}")
print(f"Columns: {list(df_umap.columns)}")

Training data loaded: (9442, 20)
Columns: ['UMAP1', 'UMAP2', 'UMAP3', 'UMAP4', 'UMAP5', 'UMAP6', 'UMAP7', 'UMAP8', 'UMAP9', 'UMAP10', 'UMAP11', 'UMAP12', 'UMAP13', 'UMAP14', 'UMAP15', 'UMAP16', 'UMAP17', 'UMAP18', 'UMAP19', 'UMAP20']


In [11]:
# Header for the first validation metric
print("\n[Metric 1] Within-Cluster Variance Stability", "-"*70, sep="\n")

def calculate_within_cluster_variance(data, labels, centroids):
    """
    Mean squared distance of each cluster's points to a supplied centroid.

    For every distinct value in `labels`, the rows of `data` carrying that
    label are compared with `centroids[label]`; the squared differences are
    summed per row and averaged over the cluster.

    Returns a dict mapping each label to that average.
    """
    def _mean_sq_dist(points, center):
        return np.mean(np.sum((points - center)**2, axis=1))

    return {
        label: _mean_sq_dist(data[labels == label], centroids[label])
        for label in np.unique(labels)
    }

# Training side: labels and centroids are read from the fitted 4-cluster model
train_initial_labels = kmeans_final.labels_
train_centroids = kmeans_final.cluster_centers_

# Drop any cluster-label columns (if present) before taking the raw values
train_variances = calculate_within_cluster_variance(
    df_umap.drop(
        columns=['Cluster', 'Cluster_Original', 'Cluster_Refined'],
        errors='ignore',
    ).values,
    train_initial_labels,
    train_centroids,
)

# Test side: predicted labels, measured against the TRAINING centroids
test_initial_labels = initial_clusters
# Prefer the DataFrame's values; fall back to the raw UMAP array if the frame is gone
if 'umap_test_df' in locals():
    test_data = umap_test_df.values
else:
    test_data = umap_test

test_variances = calculate_within_cluster_variance(
    test_data,
    test_initial_labels,
    train_centroids,
)

print("\nWithin-cluster variance comparison (Initial 4 clusters):")
print(f"{'Cluster':<10} {'Train Variance':<20} {'Test Variance':<20} {'% Change':<15}")
print("-"*70)

for cluster_id in sorted(train_variances):
    train_var = train_variances[cluster_id]
    test_var = test_variances.get(cluster_id, np.nan)

    # Cluster received no test points: report that and move on
    if np.isnan(test_var):
        print(f"{cluster_id:<10} {train_var:<20.4f} {'No test samples':<20} {'N/A':<15}")
        continue

    # Relative change of test variance vs. training variance; under 20% counts as stable
    pct_change = ((test_var - train_var) / train_var) * 100
    status = "✓ Stable" if abs(pct_change) < 20 else "⚠ Check"
    print(f"{cluster_id:<10} {train_var:<20.4f} {test_var:<20.4f} {pct_change:>+.2f}%  {status}")



[Metric 1] Within-Cluster Variance Stability
----------------------------------------------------------------------

Within-cluster variance comparison (Initial 4 clusters):
Cluster    Train Variance       Test Variance        % Change       
----------------------------------------------------------------------
0          1.6258               1.5085               -7.21%  ✓ Stable
1          1.6861               2.0534               +21.79%  ⚠ Check
2          0.9152               0.9261               +1.18%  ✓ Stable
3          1.8813               2.0945               +11.33%  ✓ Stable


In [13]:
import numpy as np

def compute_centroids_and_variances(X, labels):
    """
    Per-cluster centroid and within-cluster variance, both derived from X.

    For each distinct label, the centroid is the mean of that label's rows
    and the variance is the mean squared distance of those rows to it.

    Returns a pair of dicts (centroids, variances), both keyed by label.
    """
    centroids, variances = {}, {}
    for label in np.unique(labels):
        members = X[labels == label]
        if members.size:  # skip selections that contain no elements
            center = members.mean(axis=0)
            sq_dist = np.sum((members - center)**2, axis=1)
            centroids[label] = center
            variances[label] = sq_dist.mean()
    return centroids, variances

# ============================================================
# BUILD TRAIN UMAP MATRIX + FINAL CLUSTER LABELS
# ============================================================
def _assign_final_umap_clusters(X, initial_labels):
    """
    Map initial 4-cluster labels to the 7 final clusters (0..6).

    Initial 0 → final 0,1,2 (via kmeans_c0), initial 1 → final 3,
    initial 2 → final 4, initial 3 → final 5,6 (via kmeans_c3).
    Each sub-model is called once on all rows of its branch.

    Raises ValueError if a label outside 0..3 is present, so that a row can
    never silently inherit another row's cluster.
    """
    initial_labels = np.asarray(initial_labels)
    unexpected_labels = np.setdiff1d(initial_labels, [0, 1, 2, 3])
    if unexpected_labels.size:
        raise ValueError(
            f"Unexpected initial cluster label(s): {unexpected_labels.tolist()}"
        )

    final_labels = np.empty_like(initial_labels)
    final_labels[initial_labels == 1] = 3
    final_labels[initial_labels == 2] = 4
    for main, sub_model, final_offset in ((0, kmeans_c0, 0), (3, kmeans_c3, 5)):
        members = initial_labels == main
        if members.any():  # skip empty branches; nothing to predict
            final_labels[members] = final_offset + sub_model.predict(X[members])
    return final_labels


umap_cols = [c for c in df_umap.columns if c.startswith("UMAP")]
train_umap_df = df_umap[umap_cols]

# Ensure same column order as used to train kmeans_final
if hasattr(kmeans_final, "feature_names_in_"):
    expected_cols = list(kmeans_final.feature_names_in_)
    train_umap_df = train_umap_df[expected_cols]

# Match dtype to model
X_train = train_umap_df.to_numpy().astype(kmeans_final.cluster_centers_.dtype)
# NOTE(review): assumes the rows of df_umap are in the same order as the data
# kmeans_final was fitted on, so labels_ lines up row-for-row — confirm.
train_initial = kmeans_final.labels_  # 0..3 on train

train_final = _assign_final_umap_clusters(X_train, train_initial)

# ============================================================
# TRAIN VARIANCES (FINAL CLUSTERS)
# ============================================================
cent_train, var_train = compute_centroids_and_variances(X_train, train_final)

# ============================================================
# TEST VARIANCES (FINAL CLUSTERS)
# ============================================================
test_umap_df = umap_test_df_assigned[umap_cols]
if hasattr(kmeans_final, "feature_names_in_"):
    test_umap_df = test_umap_df[expected_cols]

X_test = test_umap_df.to_numpy().astype(kmeans_final.cluster_centers_.dtype)
test_final = umap_test_df_assigned["Final_Cluster_UMAP"].to_numpy()

# NOTE(review): the test variance here is measured around each cluster's own
# TEST centroid (the returned centroids are discarded), whereas the Metric 1
# cell measures test points against the TRAINING centroids. Confirm this
# difference is intended before comparing the two tables.
_, var_test = compute_centroids_and_variances(X_test, test_final)

# ============================================================
# COMPARISON: TRAIN vs TEST WITH % CHANGE
# ============================================================
print("\nWithin-cluster variance comparison (Final UMAP clusters):")
print(f"{'Cluster':<8} {'Train Var':<18} {'Test Var':<18} {'% Change':<12}")
print("-"*60)

for cid in sorted(var_train.keys()):
    tv = var_train[cid]
    sv = var_test.get(cid, np.nan)

    if np.isnan(sv):
        print(f"{cid:<8} {tv:<18.4f} {'No test pts':<18} {'N/A':<12}")
    else:
        # Guard against a zero training variance; under 25% change counts as stable
        pct = ((sv - tv) / tv * 100) if tv != 0 else np.nan
        tag = "✓ Stable" if (not np.isnan(pct) and abs(pct) < 25) else "⚠ Check"
        print(f"{cid:<8} {tv:<18.4f} {sv:<18.4f} {pct:>+8.2f}%  {tag}")



Within-cluster variance comparison (Final UMAP clusters):
Cluster  Train Var          Test Var           % Change    
------------------------------------------------------------
0        0.5796             0.6788               +17.11%  ✓ Stable
1        1.6254             1.0621               -34.66%  ⚠ Check
2        0.7910             0.8524                +7.76%  ✓ Stable
3        1.6861             2.0374               +20.84%  ✓ Stable
4        0.9152             0.9194                +0.46%  ✓ Stable
5        0.8915             0.9679                +8.58%  ✓ Stable
6        0.7602             0.7789                +2.47%  ✓ Stable
