In [1]:
import numpy as np
import pandas as pd
import pickle

def preprocess_new_record(new_record, pipeline_path='Data/preprocessing_pipeline.pkl'):
    """
    Apply the saved training-time preprocessing to new record(s).

    The pipeline file is a pickled dict that is read with these keys:
    'binary_cols', 'binary_mappings', 'ordinal_encoders', 'ohe_cols',
    'ohe_column_names', 'skewed_cols', 'all_columns', 'cols_to_scale'
    and 'scaler'.

    Parameters:
    -----------
    new_record : dict or pd.DataFrame
        New record(s) to preprocess. A dict is treated as a single row;
        a DataFrame is copied, so the caller's object is not modified.
    pipeline_path : str
        Path to saved preprocessing pipeline

    Returns:
    --------
    pd.DataFrame : Preprocessed record(s) whose columns are exactly
        pipeline['all_columns'], in that order.

    Warns:
    ------
    UserWarning
        When one or more training columns are absent after encoding and
        had to be filled with 0.
    """
    import warnings  # local import: only needed for the missing-column notice

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point pipeline_path at artifacts produced by this project.
    with open(pipeline_path, 'rb') as f:
        pipeline = pickle.load(f)

    # Convert to DataFrame if needed (always work on a private copy)
    if isinstance(new_record, dict):
        df_new = pd.DataFrame([new_record])
    else:
        df_new = new_record.copy()

    # Step 1: Normalize Unknown-like responses
    df_new = df_new.replace({
        "Don't know": "Unknown", "Refused": "Unknown",
        "Not Applicable": "Unknown", "N/A": "Unknown",
        "Unknown/NA": "Unknown"
    })

    # Step 2: Binary encoding
    # Only these two columns have a dedicated mapping; every other binary
    # column uses the "default" mapping. Values missing from the mapping
    # become NaN (Series.map semantics).
    special_binary_cols = ("Has_diabetes", "Received_Hepatitis_A_Vaccine")
    for col in pipeline['binary_cols']:
        if col not in df_new.columns:
            continue
        mapping_key = col if col in special_binary_cols else "default"
        df_new[col] = df_new[col].map(pipeline['binary_mappings'][mapping_key])

    # Step 3: Ordinal encoding
    for col, encoder in pipeline['ordinal_encoders'].items():
        if col in df_new.columns:
            df_new[col] = encoder.transform(df_new[[col]])

    # Step 4: One-hot encoding
    # Decide which base column each training dummy belongs to. A dummy is
    # owned by the base column whose "<name>_" is its longest matching
    # prefix. Plain substring matching would hand 'Marital_Status_X' to a
    # base column called 'Status' as well, producing duplicate columns.
    ohe_cols = list(pipeline['ohe_cols'])
    owner_of = {}
    for train_col in pipeline['ohe_column_names']:
        prefixes = [c for c in ohe_cols if train_col.startswith(f"{c}_")]
        if prefixes:
            owner_of[train_col] = max(prefixes, key=len)

    for base_col in ohe_cols:
        if base_col not in df_new.columns:
            continue

        dummies = pd.get_dummies(df_new[base_col], prefix=base_col, dtype=int)

        # Align to the training dummies of this base column: categories not
        # present in the input are added as 0, categories unseen in
        # training are dropped.
        expected = [c for c, owner in owner_of.items() if owner == base_col]
        dummies = dummies.reindex(columns=expected, fill_value=0)

        df_new = pd.concat([df_new.drop(columns=[base_col]), dummies], axis=1)

    # Step 5: Apply log transformation to skewed columns
    # Negative values are clipped to 0 first so log1p stays defined.
    for col in pipeline['skewed_cols']:
        if col in df_new.columns:
            df_new[col] = np.log1p(df_new[col].clip(lower=0))

    # Step 6: Ensure all columns from training exist
    missing = [col for col in pipeline['all_columns'] if col not in df_new.columns]
    if missing:
        # Zero-filling keeps the output shape valid but can hide a schema
        # mismatch in the input, so make it visible to the caller.
        warnings.warn(
            f"{len(missing)} training column(s) absent from the input were "
            f"filled with 0: {missing}",
            stacklevel=2,
        )
        for col in missing:
            df_new[col] = 0  # Add missing columns with default value

    # Step 7: Reorder columns to match training data
    df_new = df_new[pipeline['all_columns']]

    # Step 8: Apply scaling
    df_new[pipeline['cols_to_scale']] = pipeline['scaler'].transform(
        df_new[pipeline['cols_to_scale']]
    )

    return df_new

In [2]:
# Read the test records and the scaled training frame used as reference.
test_df = pd.read_csv('Data/test_dataset.csv')
# 'Unnamed: 0' is discarded right after loading (presumably a saved index column).
df_scaled = pd.read_csv('Data/df_scaled.csv').drop(columns='Unnamed: 0')

# Run every test record through the saved preprocessing pipeline
processed_patient = preprocess_new_record(test_df)

# Report the resulting shape and whether its column count equals the training frame's
print(f"Shape of processed record: {processed_patient.shape}")
print(f"Matches training data shape: {len(processed_patient.columns) == len(df_scaled.columns)}")

Shape of processed record: (1050, 55)
Matches training data shape: True




In [3]:
# Show the scaled training frame for visual comparison with the processed test frame.
df_scaled

Unnamed: 0,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Age,Education_Level,Household_Size,Income_to_Poverty_Ratio,Has_diabetes,...,Race_Ethnicity_Other Hispanic,Race_Ethnicity_Other Race - Including Multi-Racial,Gender_Female,Gender_Male,Country_of_Birth_Born in 50 US states or Washington,Country_of_Birth_Unknown,Marital_Status_Married/Living with partner,Marital_Status_Never married,Marital_Status_Unknown,Marital_Status_Widowed/Divorced/Separated
0,0.0,1.0,-0.126195,0.071429,-0.206814,0.575,-0.5,-1.356915,-0.571698,0.0,...,0,0,1,0,1,0,0,1,0,0
1,0.0,0.0,0.061874,0.000000,0.133994,-0.400,-0.5,1.095512,-0.752830,0.0,...,0,0,0,1,0,1,1,0,0,0
2,0.0,1.0,-0.126195,0.071429,-0.206814,0.425,-0.5,-1.356915,-0.209434,0.0,...,0,0,1,0,1,0,0,1,0,0
3,-1.0,1.0,-0.126195,0.071429,-0.206814,0.650,0.0,-0.563171,-0.511321,0.0,...,0,0,0,1,1,0,0,0,0,1
4,0.0,2.0,1.395017,-0.714286,1.991652,-1.000,1.0,1.095512,-0.481132,0.0,...,0,1,0,1,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9437,1.0,1.0,2.245940,-0.785714,0.809071,0.725,-0.5,-1.356915,-0.458491,1.0,...,0,0,1,0,1,0,0,0,0,1
9438,-1.0,2.0,0.301680,-1.071429,0.595486,-0.800,1.0,0.436829,0.711321,0.0,...,0,0,1,0,1,0,0,0,1,0
9439,0.0,1.0,-0.158270,-0.357143,-0.491044,0.775,-0.5,0.000000,-0.730189,0.0,...,0,0,1,0,1,0,0,0,0,1
9440,0.0,0.0,0.530063,0.571429,0.458580,0.325,0.0,0.000000,0.152830,0.0,...,0,1,1,0,1,0,0,0,0,1


In [4]:
# Show the preprocessed test records.
# NOTE(review): in the rendered output every displayed one-hot column is 0,
# including both Gender_* columns — confirm the test CSV carries the raw
# categorical columns the pipeline expects.
processed_patient

Unnamed: 0,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Age,Education_Level,Household_Size,Income_to_Poverty_Ratio,Has_diabetes,...,Race_Ethnicity_Other Hispanic,Race_Ethnicity_Other Race - Including Multi-Racial,Gender_Female,Gender_Male,Country_of_Birth_Born in 50 US states or Washington,Country_of_Birth_Unknown,Marital_Status_Married/Living with partner,Marital_Status_Never married,Marital_Status_Unknown,Marital_Status_Widowed/Divorced/Separated
0,1.0,0.0,1.441348,0.285714,0.967019,-0.750,0.0,0.000000,0.096226,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,2.0,0.061874,-0.571429,0.368728,-1.050,1.0,1.356915,-0.454717,0.0,...,0,0,0,0,0,0,0,0,0,0
2,-1.0,1.0,1.753699,-0.071429,1.021824,0.425,0.5,-1.356915,0.900000,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,-2.266201,0.142857,-0.834974,0.675,-0.5,0.793745,-0.013208,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,2.0,1.057532,-0.428571,1.529152,-0.975,1.0,1.095512,-0.481132,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,0.0,0.0,-0.526021,-0.571429,-1.174664,0.325,-0.5,-0.563171,-0.654717,0.0,...,0,0,0,0,0,0,0,0,0,0
1046,0.0,0.0,-0.886727,-2.785714,0.933739,0.025,0.0,0.000000,-0.658491,0.0,...,0,0,0,0,0,0,0,0,0,0
1047,0.0,0.0,1.532686,-1.000000,-0.237332,-0.450,-0.5,0.000000,-0.575472,0.0,...,0,0,0,0,0,0,0,0,0,0
1048,0.0,1.0,-0.126195,0.071429,-0.206814,0.325,0.5,-0.563171,0.900000,0.0,...,0,0,0,0,0,0,0,0,0,0


### Apply the same scaling, encoding, and PCA projection as the training data

In [5]:
import joblib
# Load the saved PCA object and print its type as a quick sanity check.
pca_obj = joblib.load('Data/pca_model.pkl')
print(type(pca_obj))


<class 'sklearn.decomposition._pca.PCA'>


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
# Project the preprocessed test records with the saved PCA model.
pca = joblib.load('Data/pca_model.pkl')

# The cluster-assignment cell appends 'Initial_Cluster', 'Subcluster' and
# 'Final_Cluster' to processed_patient. Drop them if present so this cell
# can be re-run afterwards without feeding extra features to the PCA.
pca_test = pca.transform(
    processed_patient.drop(
        columns=['Initial_Cluster', 'Subcluster', 'Final_Cluster'],
        errors='ignore',
    )
)

# Keep the row index of the source frame so components stay aligned with
# the records they were computed from.
pca_test_df = pd.DataFrame(
    pca_test,
    columns=[f'PC{i+1}' for i in range(pca_test.shape[1])],
    index=processed_patient.index,
)
pca_test_df.head()


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15
0,1.113172,-1.533393,0.758377,3.098627,0.34712,-2.097137,0.26474,1.656713,0.981266,-1.057706,0.448963,0.552467,-0.069899,0.074268,0.989813
1,-5.575248,2.321419,-0.286932,-0.385998,0.209792,0.111523,0.680988,0.206258,0.706181,-0.047649,-0.79797,0.577711,0.197332,0.301666,-0.512739
2,1.07624,-1.809363,-1.359951,-1.711618,2.563668,-0.929185,-3.114504,-1.257207,0.498358,-1.350981,-0.207992,-2.04696,0.044664,-0.824568,1.680211
3,3.553449,3.006607,0.907166,-1.245962,-1.023825,1.045297,0.585414,-0.675157,1.765589,0.710134,0.599043,0.089877,0.292051,0.81546,-0.178022
4,-5.434803,2.295273,0.305455,-1.150903,0.94721,-0.680308,-0.273558,-0.30302,0.038937,-1.227287,-1.356188,1.057583,-0.175903,0.536208,0.407955


In [None]:
# Sanity check: number of projected records and number of principal components.
print(pca_test_df.shape)

(1050, 15)


In [12]:
import joblib
import pandas as pd

# ============================================================
# LOAD CLUSTERING MODELS
# ============================================================
print("="*70)
print("LOADING CLUSTERING MODELS")
print("="*70)

# Load clustering models: one top-level model plus one second-stage model
# for each initial cluster that is split further.
kmeans_final = joblib.load('Models/kmeans_initial.pkl')
# NOTE(review): the file is named "cluster0_split" but the model is applied
# to initial cluster 2 below — confirm this is the intended artifact.
kmeans_c2 = joblib.load('Models/kmeans_cluster0_split.pkl')  # Originally cluster 0, now splitting cluster 2
kmeans_c3 = joblib.load('Models/kmeans_cluster3_split.pkl')

# Number of subclusters each split produces; used to derive the final-label offsets.
k_split_c2 = 3
k_split_c3 = 3

print("✓ All models loaded successfully")

# ============================================================
# ASSIGN CLUSTERS TO ALL TEST PATIENTS
# ============================================================
print("\n" + "="*70)
print("ASSIGNING CLUSTERS TO TEST PATIENTS")
print("="*70)

print(f"Test data shape: {pca_test_df.shape}")

# Step 1: Get initial cluster assignments (4 clusters)
initial_clusters = kmeans_final.predict(pca_test_df)

# Fail loudly on a label the hierarchy below does not cover, instead of
# producing misaligned or colliding final labels.
unexpected_labels = sorted(set(np.unique(initial_clusters).tolist()) - {0, 1, 2, 3})
if unexpected_labels:
    raise ValueError(f"Unexpected initial cluster labels: {unexpected_labels}")

# Step 2: Apply hierarchical logic to get final clusters
# Clusters 0 and 1 keep their label and have no subcluster (NaN).
final_clusters = initial_clusters.astype(int)
subcluster_assignments = np.full(len(pca_test_df), np.nan)

# (initial cluster, second-stage model, first final label of that split)
#   Original Cluster 2 → Final clusters 2, 3, 4
#   Original Cluster 3 → Final clusters 5, 6, 7
split_models = (
    (2, kmeans_c2, 2),
    (3, kmeans_c3, 2 + k_split_c2),
)
for main_cluster, sub_model, first_final_label in split_models:
    members = initial_clusters == main_cluster
    if not members.any():
        continue  # nothing to split; predict() cannot take an empty frame
    # One batched predict per branch instead of one call per patient
    subcluster_ids = sub_model.predict(pca_test_df[members])
    subcluster_assignments[members] = subcluster_ids
    final_clusters[members] = first_final_label + subcluster_ids

# Add results to original processed_patient dataframe
processed_patient['Initial_Cluster'] = initial_clusters
processed_patient['Subcluster'] = subcluster_assignments
processed_patient['Final_Cluster'] = final_clusters

# ============================================================
# DISPLAY RESULTS
# ============================================================
print("\n" + "="*70)
print("CLUSTER ASSIGNMENT SUMMARY")
print("="*70)

print("\nInitial cluster distribution (4 clusters):")
print(processed_patient['Initial_Cluster'].value_counts().sort_index())

print("\nFinal cluster distribution (8 clusters):")
print(processed_patient['Final_Cluster'].value_counts().sort_index())

print("\nCluster mapping:")
print("  Cluster 0: Original Cluster 0 (no subclustering)")
print("  Cluster 1: Original Cluster 1 (no subclustering)")
print("  Cluster 2: Original Cluster 2 → Subcluster 0")
print("  Cluster 3: Original Cluster 2 → Subcluster 1")
print("  Cluster 4: Original Cluster 2 → Subcluster 2")
print("  Cluster 5: Original Cluster 3 → Subcluster 0")
print("  Cluster 6: Original Cluster 3 → Subcluster 1")
print("  Cluster 7: Original Cluster 3 → Subcluster 2")

# Show first 20 assignments
print("\nFirst 20 patient assignments:")
print(processed_patient[['Initial_Cluster', 'Subcluster', 'Final_Cluster']].head(20))

# Save results; the confirmation message reports the path actually written
results_path = 'Data/test_patients_with_clusters_kmeans_pca.csv'
processed_patient.to_csv(results_path, index=False)
print(f"\n✓ Results saved to '{results_path}'")

LOADING CLUSTERING MODELS
✓ All models loaded successfully

ASSIGNING CLUSTERS TO TEST PATIENTS
Test data shape: (1050, 15)

CLUSTER ASSIGNMENT SUMMARY

Initial cluster distribution (4 clusters):
Initial_Cluster
0    356
1    171
2     97
3    426
Name: count, dtype: int64

Final cluster distribution (8 clusters):
Final_Cluster
0    356
1    171
2     36
3     54
4      7
5    137
6    118
7    171
Name: count, dtype: int64

Cluster mapping:
  Cluster 0: Original Cluster 0 (no subclustering)
  Cluster 1: Original Cluster 1 (no subclustering)
  Cluster 2: Original Cluster 2 → Subcluster 0
  Cluster 3: Original Cluster 2 → Subcluster 1
  Cluster 4: Original Cluster 2 → Subcluster 2
  Cluster 5: Original Cluster 3 → Subcluster 0
  Cluster 6: Original Cluster 3 → Subcluster 1
  Cluster 7: Original Cluster 3 → Subcluster 2

First 20 patient assignments:
    Initial_Cluster  Subcluster  Final_Cluster
0                 3         2.0              7
1                 1         NaN              