In [6]:
import numpy as np
import pandas as pd
import pickle

def preprocess_new_record(new_record, pipeline_path='Data/preprocessing_pipeline.pkl'):
    """
    Apply the same preprocessing to new records
    
    Parameters:
    -----------
    new_record : dict or pd.DataFrame
        New record(s) to preprocess
    pipeline_path : str
        Path to saved preprocessing pipeline
        
    Returns:
    --------
    pd.DataFrame : Preprocessed record matching training data format
    """
    
    # Load pipeline
    with open(pipeline_path, 'rb') as f:
        pipeline = pickle.load(f)
    
    # Convert to DataFrame if needed
    if isinstance(new_record, dict):
        df_new = pd.DataFrame([new_record])
    else:
        df_new = new_record.copy()
    
    # Step 1: Normalize Unknown-like responses
    df_new = df_new.replace({
        "Don't know": "Unknown", "Refused": "Unknown", 
        "Not Applicable": "Unknown", "N/A": "Unknown", 
        "Unknown/NA": "Unknown"
    })
    
    # Step 2: Binary encoding
    for col in pipeline['binary_cols']:
        if col not in df_new.columns:
            continue
        
        if col == "Has_diabetes":
            mapping = pipeline['binary_mappings']["Has_diabetes"]
        elif col == "Received_Hepatitis_A_Vaccine":
            mapping = pipeline['binary_mappings']["Received_Hepatitis_A_Vaccine"]
        else:
            mapping = pipeline['binary_mappings']["default"]
        
        df_new[col] = df_new[col].map(mapping)
    
    # Step 3: Ordinal encoding
    for col, encoder in pipeline['ordinal_encoders'].items():
        if col in df_new.columns:
            df_new[col] = encoder.transform(df_new[[col]])
    
    # Step 4: One-hot encoding
    for base_col in pipeline['ohe_cols']:
        if base_col in df_new.columns:
            # Get dummies for this column
            dummies = pd.get_dummies(df_new[base_col], prefix=base_col, dtype=int)
            
            # Add any missing columns from training
            for train_col in pipeline['ohe_column_names']:
                if base_col in train_col and train_col not in dummies.columns:
                    dummies[train_col] = 0
            
            # Remove extra columns not in training
            cols_to_keep = [col for col in dummies.columns 
                           if col in pipeline['ohe_column_names']]
            dummies = dummies[cols_to_keep]
            
            # Add to dataframe
            df_new = pd.concat([df_new.drop(columns=[base_col]), dummies], axis=1)
    
    # Step 5: Apply log transformation to skewed columns
    for col in pipeline['skewed_cols']:
        if col in df_new.columns:
            df_new[col] = np.log1p(df_new[col].clip(lower=0))
    
    # Step 6: Ensure all columns from training exist
    for col in pipeline['all_columns']:
        if col not in df_new.columns:
            df_new[col] = 0  # Add missing columns with default value
    
    # Step 7: Reorder columns to match training data
    df_new = df_new[pipeline['all_columns']]
    
    # Step 8: Apply scaling
    df_new[pipeline['cols_to_scale']] = pipeline['scaler'].transform(
        df_new[pipeline['cols_to_scale']]
    )
    
    return df_new

In [7]:
# Example new patient record
test_df = pd.read_csv('Data/test_dataset.csv')
df_scaled = pd.read_csv('Data/df_scaled.csv')
df_scaled.drop(columns='Unnamed: 0', inplace=True)

# Preprocess the new record
processed_patient = preprocess_new_record(test_df)

print(f"Shape of processed record: {processed_patient.shape}")
print(f"Matches training data shape: {processed_patient.shape[1] == df_scaled.shape[1]}")

Shape of processed record: (1050, 55)
Matches training data shape: True


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


# Dim Reduction UMAP

In [8]:
import joblib
umap = joblib.load('Data/umap_model.pkl')
umap_test = umap.transform(processed_patient)
umap_test_df = pd.DataFrame(umap_test, columns=[f'PC{i+1}' for i in range(umap_test.shape[1])])
umap_test_df.head()

print(umap_test_df.shape)
print(umap_test_df.head())

(1050, 15)
        PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0  1.113172 -1.533393  0.758375 -3.098632  0.347120 -2.097073 -0.264846   
1 -5.575248  2.321419 -0.286932  0.385995  0.209788  0.111568 -0.681061   
2  1.076240 -1.809363 -1.359945  1.711623  2.563675 -0.929339  3.114684   
3  3.553449  3.006607  0.907166  1.245952 -1.023796  1.045440 -0.585587   
4 -5.434803  2.295273  0.305454  1.150900  0.947202 -0.680264  0.273486   

        PC8       PC9      PC10      PC11      PC12      PC13      PC14  \
0 -1.656722  0.981297 -1.057477  0.448272  0.554567 -0.070196  0.073864   
1 -0.206276  0.706200 -0.047509 -0.798700  0.577931  0.197507  0.301032   
2  1.257197  0.498221 -1.351363 -0.206696 -2.048451  0.044042 -0.820650   
3  0.675189  1.765725  0.710279  0.598552  0.093380  0.288720  0.813715   
4  0.302993  0.038955 -1.227088 -1.357034  1.056881 -0.174455  0.533989   

       PC15  
0 -0.988990  
1  0.512794  
2 -1.680909  
3  0.179806  
4 -0.408397  


# KMeans

In [9]:
import joblib
import pandas as pd

# ============================================================
# LOAD CLUSTERING MODELS
# ============================================================
print("="*70)
print("LOADING CLUSTERING MODELS")
print("="*70)

# Load clustering models
kmeans_final = joblib.load('Models/kmeans_umap_initial4_model.pkl')
kmeans_c0 = joblib.load('Models/kmeans_umap_c0.pkl')  
kmeans_c3 = joblib.load('Models/kmeans_umap_c3.pkl')

k_split_c0 = 3
k_split_c3 = 2

print("✓ All models loaded successfully")

# ============================================================
# ASSIGN CLUSTERS TO ALL TEST PATIENTS
# ============================================================
print("\n" + "="*70)
print("ASSIGNING CLUSTERS TO TEST PATIENTS")
print("="*70)

print(f"Test data shape: {umap_test_df.shape}")

# Step 1: Get initial cluster assignments (4 clusters)
initial_clusters = kmeans_final.predict(umap_test_df)

# Step 2: Apply hierarchical logic to get FINAL clusters (0..6)
final_clusters = []
subcluster_assignments = []

for idx in range(len(umap_test_df)):
    main_cluster = initial_clusters[idx]
    patient_features = umap_test_df.iloc[[idx]]   # keep as DataFrame for .predict
    
    if main_cluster == 0:
        # Initial Cluster 0 splits into subclusters → Final clusters 0, 1, 2
        subcluster_id = kmeans_c0.predict(patient_features)[0]  # {0,1,2}
        final_cluster = 0 + subcluster_id                      # 0,1,2
        final_clusters.append(int(final_cluster))
        subcluster_assignments.append(int(subcluster_id))
        
    elif main_cluster == 1:
        # Initial Cluster 1 stays as final cluster 3
        final_clusters.append(3)
        subcluster_assignments.append(None)
        
    elif main_cluster == 2:
        # Initial Cluster 2 stays as final cluster 4
        final_clusters.append(4)
        subcluster_assignments.append(None)
        
    elif main_cluster == 3:
        # Initial Cluster 3 splits into subclusters → Final clusters 5, 6
        subcluster_id = kmeans_c3.predict(patient_features)[0] 
        final_cluster = 5 + subcluster_id                      #
        final_clusters.append(int(final_cluster))
        subcluster_assignments.append(int(subcluster_id))

# Attach results to the test UMAP DataFrame
umap_test_df_assigned = umap_test_df.copy()
umap_test_df_assigned['Initial_Cluster_UMAP'] = initial_clusters
umap_test_df_assigned['Subcluster_UMAP']      = subcluster_assignments
umap_test_df_assigned['Final_Cluster_UMAP']   = final_clusters

# ============================================================
# DISPLAY RESULTS
# ============================================================
print("\n" + "="*70)
print("UMAP CLUSTER ASSIGNMENT SUMMARY")
print("="*70)

print("\nInitial cluster distribution (4 clusters):")
print(umap_test_df_assigned['Initial_Cluster_UMAP'].value_counts().sort_index())

print("\nFinal cluster distribution (7 clusters):")
print(umap_test_df_assigned['Final_Cluster_UMAP'].value_counts().sort_index())

print("\nCluster mapping (UMAP-based):")
print("  Final Cluster 0: Initial Cluster 0 → Subcluster 0")
print("  Final Cluster 1: Initial Cluster 0 → Subcluster 1")
print("  Final Cluster 2: Initial Cluster 0 → Subcluster 2")
print("  Final Cluster 3: Initial Cluster 1 (no subclustering)")
print("  Final Cluster 4: Initial Cluster 2 (no subclustering)")
print("  Final Cluster 5: Initial Cluster 3 → Subcluster 0")
print("  Final Cluster 6: Initial Cluster 3 → Subcluster 1")

print("\nFirst 20 patient assignments:")
print(umap_test_df_assigned[['Initial_Cluster_UMAP', 'Subcluster_UMAP', 'Final_Cluster_UMAP']].head(20))

# Save results
umap_test_df_assigned.to_csv('Data/test_patients_with_clusters_kmeans_umap.csv', index=False)
print("\n✓ Results saved to 'Data/test_patients_with_clusters_kmeans_umap.csv'")

LOADING CLUSTERING MODELS
✓ All models loaded successfully

ASSIGNING CLUSTERS TO TEST PATIENTS
Test data shape: (1050, 15)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- PC1
- PC10
- PC11
- PC12
- PC13
- ...
Feature names seen at fit time, yet now missing:
- UMAP1
- UMAP10
- UMAP11
- UMAP12
- UMAP13
- ...
