In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error, accuracy_score

In [17]:
metadata_df = pd.read_csv('../data/cancer_metadata.csv', index_col=0)
expression = pd.read_csv('../data/cancer_only_hvg.csv', index_col=0)
metadata_df = metadata_df.drop(['grade'], axis=1)

In [18]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error, accuracy_score

# Ensure matching indices
common_idx = metadata_df.index.intersection(expression.index)
metadata_df = metadata_df.loc[common_idx].copy()
expression = expression.loc[common_idx].copy()

# Identify columns with missing values
missing_cols = metadata_df.columns[metadata_df.isnull().any()].tolist()

# Separate into categorical and numeric
cat_cols = metadata_df[missing_cols].select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = metadata_df[missing_cols].select_dtypes(include=['number']).columns.tolist()

# Encode categorical columns
encoders = {}
for col in cat_cols:
    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    reshaped = metadata_df[[col]].astype(str)
    metadata_df[col + '_enc'] = enc.fit_transform(reshaped)
    encoders[col] = enc

encoded_cat_cols = [col + '_enc' for col in cat_cols]
all_impute_cols = num_cols + encoded_cat_cols

# Build input for imputation
# Expression = used to compute distances
# Metadata targets = columns to impute
targets = metadata_df[all_impute_cols].copy()

# Combine expression with targets (they must be together in one matrix)
combined_data = pd.concat([expression, targets], axis=1)

# Create a copy for evaluation (true values before masking)
true_targets = targets.copy()

# Randomly mask 10% of observed values for error checking
mask_fraction = 0.1
np.random.seed(42)
impute_mask = targets.notna()
masked_data = combined_data.copy()

for col in all_impute_cols:
    observed_indices = impute_mask[col][impute_mask[col]].index
    n_mask = int(len(observed_indices) * mask_fraction)
    if n_mask == 0:
        continue
    mask_indices = np.random.choice(observed_indices, n_mask, replace=False)
    masked_data.loc[mask_indices, col] = np.nan

# Optimize K
k_values = [3, 5, 7, 10]
errors = []

for k in k_values:
    imputer = KNNImputer(n_neighbors=k)
    imputed_array = imputer.fit_transform(masked_data)
    imputed_df = pd.DataFrame(imputed_array, columns=combined_data.columns, index=combined_data.index)

    # Extract imputed targets
    imputed_targets = imputed_df[all_impute_cols]

    total_error = 0
    for col in all_impute_cols:
        mask_col = true_targets[col].notna()
        true_vals = true_targets.loc[mask_col, col]
        imputed_vals = imputed_targets.loc[mask_col, col]

        if col in encoded_cat_cols:
            acc = accuracy_score(true_vals.round().astype(int), imputed_vals.round().astype(int))
            error = 1 - acc
        else:
            error = np.sqrt(mean_squared_error(true_vals, imputed_vals))

        total_error += error

    errors.append((k, total_error))
    print(f"k={k}: total error = {total_error:.4f}")

# Select best k
best_k = min(errors, key=lambda x: x[1])[0]
print(f"\nSelected best k = {best_k}")

# Final imputation with best k
final_imputer = KNNImputer(n_neighbors=best_k)
final_imputed = final_imputer.fit_transform(combined_data)
final_imputed_df = pd.DataFrame(final_imputed, columns=combined_data.columns, index=combined_data.index)

# Extract imputed targets
final_targets = final_imputed_df[all_impute_cols]

# Write back into metadata_df
for col in num_cols:
    metadata_df[col] = final_targets[col]

for col in cat_cols:
    col_enc = col + '_enc'
    decoder = encoders[col]
    decoded_vals = decoder.inverse_transform(final_targets[[col_enc]].round())
    metadata_df[col] = decoded_vals.ravel()

# Drop temporary encoded columns
metadata_df.drop(columns=encoded_cat_cols, inplace=True)

print("\n✅ KNN imputation using expression-based neighbors completed.")


k=3: total error = 0.5709
k=5: total error = 0.5764
k=7: total error = 0.5631
k=10: total error = 0.5606

Selected best k = 10

✅ KNN imputation using expression-based neighbors completed.
