In [None]:
import numpy as np
import pandas as pd
import pickle

def _expected_dummy_columns(base_col, ohe_cols, ohe_column_names):
    """Return the training one-hot column names that belong to ``base_col``.

    A training column belongs to ``base_col`` when it starts with
    ``"<base_col>_"`` and is not claimed by a longer one-hot base column that
    shares that prefix (e.g. ``Sex_partner_Yes`` belongs to ``Sex_partner``,
    not to ``Sex``). The order of ``ohe_column_names`` is preserved.
    """
    prefix = f"{base_col}_"
    # Prefixes of other one-hot bases that would also match this prefix.
    longer_prefixes = [
        f"{other}_" for other in ohe_cols
        if other != base_col and str(other).startswith(prefix)
    ]
    return [
        name for name in ohe_column_names
        if str(name).startswith(prefix)
        and not any(str(name).startswith(lp) for lp in longer_prefixes)
    ]


def preprocess_new_record(new_record, pipeline_path='Data/preprocessing_pipeline.pkl',
                          pipeline=None):
    """
    Apply the same preprocessing to new records

    Parameters:
    -----------
    new_record : dict or pd.DataFrame
        New record(s) to preprocess. A dict is treated as a single row; a
        DataFrame is copied, so the caller's frame is not modified.
    pipeline_path : str
        Path to saved preprocessing pipeline (a pickled dict). Only read when
        ``pipeline`` is None.
    pipeline : dict, optional
        Already-loaded pipeline. Pass it to avoid re-reading the pickle on
        every call. The dict must provide the keys used below:
        'binary_cols', 'binary_mappings', 'ordinal_encoders', 'ohe_cols',
        'ohe_column_names', 'skewed_cols', 'all_columns', 'cols_to_scale'
        and 'scaler'.

    Returns:
    --------
    pd.DataFrame : Preprocessed record(s) whose columns are exactly
        ``pipeline['all_columns']``, in that order.
    """

    # Load pipeline
    if pipeline is None:
        # SECURITY: pickle.load can execute arbitrary code. Only point
        # pipeline_path at files produced by a trusted training run.
        with open(pipeline_path, 'rb') as f:
            pipeline = pickle.load(f)

    # Convert to DataFrame if needed
    if isinstance(new_record, dict):
        df_new = pd.DataFrame([new_record])
    else:
        df_new = new_record.copy()

    # Step 1: Normalize Unknown-like responses
    df_new = df_new.replace({
        "Don't know": "Unknown", "Refused": "Unknown",
        "Not Applicable": "Unknown", "N/A": "Unknown",
        "Unknown/NA": "Unknown"
    })

    # Step 2: Binary encoding (column-specific mapping if present, else default)
    binary_mappings = pipeline['binary_mappings']
    for col in pipeline['binary_cols']:
        if col not in df_new.columns:
            continue
        mapping = binary_mappings[col] if col in binary_mappings else binary_mappings['default']
        # Values absent from the mapping become NaN (Series.map semantics).
        df_new[col] = df_new[col].map(mapping)

    # Step 3: Ordinal encoding
    for col, encoder in pipeline['ordinal_encoders'].items():
        if col in df_new.columns:
            # Flatten the encoder output to 1-D before assigning to the column.
            df_new[col] = np.asarray(encoder.transform(df_new[[col]])).ravel()

    # Step 4: One-hot encoding
    ohe_cols = list(pipeline['ohe_cols'])
    ohe_column_names = list(pipeline['ohe_column_names'])
    for base_col in ohe_cols:
        if base_col not in df_new.columns:
            continue

        # Only the training dummies that belong to this base column. A plain
        # substring test would also pick up other features' dummies and later
        # produce duplicate columns.
        expected = _expected_dummy_columns(base_col, ohe_cols, ohe_column_names)

        dummies = pd.get_dummies(df_new[base_col], prefix=base_col, dtype=int)

        # Add categories seen in training but absent from the new data
        for train_col in expected:
            if train_col not in dummies.columns:
                dummies[train_col] = 0

        # Drop categories unseen in training and use the training order
        dummies = dummies[expected]

        df_new = pd.concat([df_new.drop(columns=[base_col]), dummies], axis=1)

    # Step 5: Apply log transformation to skewed columns
    for col in pipeline['skewed_cols']:
        if col in df_new.columns:
            # Negative values are clipped to 0 before log1p.
            df_new[col] = np.log1p(df_new[col].clip(lower=0))

    # Step 6: Ensure all columns from training exist (missing ones default to 0).
    # Added in a single concat to avoid fragmenting the frame.
    all_columns = list(pipeline['all_columns'])
    missing = [col for col in all_columns if col not in df_new.columns]
    if missing:
        filler = pd.DataFrame(0, index=df_new.index, columns=missing)
        df_new = pd.concat([df_new, filler], axis=1)

    # Step 7: Reorder columns to match training data
    df_new = df_new[all_columns]

    # Step 8: Apply scaling (skipped when there is nothing to scale)
    cols_to_scale = list(pipeline['cols_to_scale'])
    if cols_to_scale:
        df_new[cols_to_scale] = pipeline['scaler'].transform(df_new[cols_to_scale])

    return df_new

In [None]:
# Load the records to preprocess, plus the saved scaled frame that serves as
# the reference layout (presumably the scaled training data; confirm upstream).
test_df = pd.read_csv('Data/test_dataset.csv')
df_scaled = pd.read_csv('Data/df_scaled.csv').drop(columns='Unnamed: 0')

# Run every test record through the saved preprocessing pipeline
processed_patient = preprocess_new_record(test_df)

# Report the output shape and whether its column count equals the reference frame's
print(f"Shape of processed record: {processed_patient.shape}")
print(f"Matches training data shape: {processed_patient.shape[1] == df_scaled.shape[1]}")

# Dimensionality Reduction with UMAP

In [None]:
import joblib

# Load the serialized UMAP model from disk
umap = joblib.load('Data/umap_model.pkl')

# Project the preprocessed records with the loaded model
umap_test = umap.transform(processed_patient)

# One column per output dimension, labelled PC1..PCk
umap_test_df = pd.DataFrame(
    umap_test,
    columns=[f'PC{idx}' for idx in range(1, umap_test.shape[1] + 1)],
)
umap_test_df.head()

