# Using K-Means and Elbow Method


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
from sklearn.metrics import mean_absolute_error, mean_squared_error

RANDOM_STATE = 42

## 1) Load data


In [None]:
# File locations default to the competition CSVs next to the notebook and can
# be overridden through the TRAIN_PATH / TEST_PATH environment variables.
TRAIN_PATH = os.getenv("TRAIN_PATH", "MiNDAT.csv")
TEST_PATH = os.getenv("TEST_PATH", "MiNDAT_UNK.csv")

# Read both tables in one pass; the generator keeps no helper name around.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

## 2) Setting identifiers


In [None]:
# Column holding the value to predict, and column holding the row identifier.
TARGET_COL, ID_COL = "CORRUCYSTIC_DENSITY", "LOCAL_IDENTIFIER"

# Independent copy of the test identifiers, kept for the submission file.
test_ids = test.loc[:, ID_COL].copy()

## 3) Select numeric features, impute medians, and robust-scale


In [None]:
# Candidate features are the numeric train columns, with four exclusions:
#   * the target itself,
#   * the row identifier (a numeric ID carries no signal and would distort
#     the distance-based clustering),
#   * columns that are missing or non-numeric in test (transform would fail),
#   * columns that are entirely NaN in train (SimpleImputer drops those by
#     default, which would break the DataFrame re-wrapping below).
excluded_cols = {TARGET_COL, ID_COL}
numeric_cols = [
    c
    for c in train.select_dtypes(include=[np.number]).columns
    if c not in excluded_cols
    and c in test.columns
    and pd.api.types.is_numeric_dtype(test[c])
    and train[c].notna().any()
]
if not numeric_cols:
    raise ValueError(
        "No usable numeric feature columns are shared by train and test."
    )

# Separate X/y
X_train_raw = train[numeric_cols].copy()
y_train = train[TARGET_COL].copy()
X_test_raw = test[numeric_cols].copy()

# Fill NaNs with per-column medians fit on train only (no test leakage)
imputer = SimpleImputer(strategy="median")
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train_raw), columns=numeric_cols, index=X_train_raw.index
)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test_raw), columns=numeric_cols, index=X_test_raw.index
)

# Robust scaling (median / IQR) limits the influence of outliers; the scaler
# is likewise fit on train and only applied to test.
scaler = RobustScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_imputed),
    columns=numeric_cols,
    index=X_train_imputed.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_imputed), columns=numeric_cols, index=X_test_imputed.index
)

## 4) Feature filtering by correlation with the target


In [None]:
# Pearson correlation of every imputed feature with the target.
# Only rows with a known target are used: np.corrcoef returns NaN for every
# feature as soon as the target contains a single NaN, which silently emptied
# the selection. The target is read from the raw frame so this cell gives the
# same result no matter which later cells have already run.
target_for_corr = train[TARGET_COL]
has_target = target_for_corr.notna()

# Constant features have zero variance and yield NaN (0/0); they are dropped
# right after, so the numpy warnings about it are just noise.
with np.errstate(divide="ignore", invalid="ignore"):
    raw_corr = X_train_imputed.loc[has_target, numeric_cols].corrwith(
        target_for_corr[has_target]
    )

corr_series = raw_corr.dropna().sort_values(key=lambda s: s.abs(), ascending=False)

# Keep 80% of the ranked features, but never fewer than 10 (when that many
# exist) and never more than 50.
TOP_N = min(50, max(10, int(0.8 * len(corr_series))))
selected_features = corr_series.head(TOP_N).index.tolist()

print("Top correlations (sign kept, sorted by |corr|):")
display(corr_series.head(20))

print(f"Selected {len(selected_features)} features for clustering.")

# Clustering works on the scaled version of the selected columns.
Xtr = X_train_scaled[selected_features].copy()
Xte = X_test_scaled[selected_features].copy()

## 5) Preparing X for clustering

In [None]:
def _prepare_X_for_clustering(X):
    """Return a clean 2-D numeric array built from ``X``.

    Accepts a DataFrame, Series, list or array-like. For DataFrames only the
    numeric columns are kept. Rows that are entirely NaN are removed, columns
    that are entirely NaN are removed, and every remaining NaN is replaced by
    the mean of its column.

    Note that removing rows/columns means the result is not guaranteed to be
    row- or column-aligned with the input.

    Raises:
        ValueError: if ``X`` is None, a scalar, has no numeric columns, or is
            empty (no rows / no features) after cleaning.
    """
    if X is None:
        raise ValueError("X is None. Provide a non-empty array or DataFrame.")

    # A Series is handled through the DataFrame path as a one-column frame.
    if isinstance(X, pd.Series):
        X = X.to_frame()

    if isinstance(X, pd.DataFrame):
        numeric_part = X.select_dtypes(include=[np.number])
        if not numeric_part.shape[1]:
            raise ValueError("X has no numeric columns after dtype filtering.")
        matrix = numeric_part.dropna(how="all").to_numpy()
    else:
        # Lists and any other array-like input.
        matrix = np.asarray(X)

    if matrix.ndim == 0:
        raise ValueError(
            "X is a scalar; expected array-like with at least 1 dimension."
        )
    if matrix.ndim == 1:
        # Treat a flat vector as a single feature column.
        matrix = matrix.reshape(-1, 1)

    # Remove rows without a single observed value (boolean indexing copies,
    # so later writes never touch the caller's data).
    missing = np.isnan(matrix)
    if missing.any():
        matrix = matrix[~missing.all(axis=1)]

    n_samples, n_features = matrix.shape
    if n_samples == 0:
        raise ValueError("X has 0 rows after cleaning. Provide non-empty data.")
    if n_features == 0:
        raise ValueError("X has 0 features after cleaning. Provide numeric features.")

    if not np.isnan(matrix).any():
        return matrix

    # Mean-impute what is left; a column whose mean is NaN has no observed
    # value at all and is dropped instead.
    col_means = np.nanmean(matrix, axis=0)
    empty_cols = np.isnan(col_means)
    if empty_cols.any():
        matrix = matrix[:, ~empty_cols]
        if matrix.shape[1] == 0:
            raise ValueError("All features are NaN; cannot proceed.")
        col_means = np.nanmean(matrix, axis=0)

    return np.where(np.isnan(matrix), col_means, matrix)

## 6) Elbow method to choose K

In [None]:
def compute_elbow_inertia(X, k_min=1, k_max=12, random_state=RANDOM_STATE, n_init=10):
    """Fit one K-Means model per candidate K and collect the inertias.

    Args:
        X: data accepted by ``_prepare_X_for_clustering``.
        k_min: smallest K to try; values below 1 are raised to 1.
        k_max: largest K to try; capped at the number of cleaned rows.
        random_state: seed forwarded to every ``KMeans``.
        n_init: number of initialisations per K (an integer, which works on
            every scikit-learn version).

    Returns:
        ``(ks, inertias)``: the list of tried K values and the matching list
        of within-cluster sums of squares as Python floats.

    Raises:
        ValueError: if the adjusted range is empty (``k_max < k_min``).
    """
    X_arr = _prepare_X_for_clustering(X)
    n_samples = X_arr.shape[0]

    # Clamp the range: at least one cluster, at most one cluster per row.
    k_min = 1 if k_min < 1 else k_min
    k_max = min(int(k_max), n_samples)
    if k_max < k_min:
        raise ValueError(
            f"Invalid k range after adjustment: k_min={k_min}, k_max={k_max}, n_samples={n_samples}"
        )

    ks = list(range(k_min, k_max + 1))
    inertias = [
        float(
            KMeans(n_clusters=k, n_init=n_init, random_state=random_state)
            .fit(X_arr)
            .inertia_
        )
        for k in ks
    ]
    return ks, inertias


def choose_k_by_knee(ks, inertias):
    """Pick the elbow of an inertia curve by maximum distance to its chord.

    The chord is the straight line from the first ``(k, inertia)`` point to
    the last one; the knee is the point with the largest perpendicular
    distance to that line. Distances are computed on the raw (unnormalised)
    k and inertia values.

    Args:
        ks: sequence of tried cluster counts.
        inertias: sequence of inertias, same length as ``ks``.

    Returns:
        ``(best_k, distances)`` where ``best_k`` is always a Python ``int``
        and ``distances`` is a float array with one entry per input point.

    Raises:
        ValueError: if either input is empty or the lengths differ.
    """
    if len(ks) == 0 or len(inertias) == 0:
        raise ValueError("ks/inertias must be non-empty.")
    if len(ks) != len(inertias):
        raise ValueError("ks and inertias must have the same length.")

    x = np.asarray(ks, dtype=float)
    y = np.asarray(inertias, dtype=float)

    # Direction of the chord from the first to the last point.
    dx = x[-1] - x[0]
    dy = y[-1] - y[0]
    chord_len = np.hypot(dx, dy)
    if chord_len == 0.0:
        # First and last point coincide (e.g. a single candidate K), so there
        # is no chord to measure against; fall back to the first K.
        return int(ks[0]), np.zeros_like(x)

    # |2-D cross product| / |chord| is the perpendicular distance. It is
    # written out explicitly because np.cross on 2-D vectors is deprecated
    # since NumPy 2.0.
    distances = np.abs(dx * (y - y[0]) - dy * (x - x[0])) / chord_len
    knee_idx = int(np.argmax(distances))
    return int(ks[knee_idx]), distances


def auto_choose_k_and_fit(Xtr, random_state=42, n_init=10):
    """Choose K with the elbow/knee rule and fit the final K-Means model.

    Args:
        Xtr: training data accepted by ``_prepare_X_for_clustering``.
        random_state: seed used for the elbow search and the final model.
        n_init: number of K-Means initialisations per fit.

    Returns:
        dict with keys ``best_k``, ``ks``, ``inertias``, ``labels``,
        ``centers`` and ``model``. Labels refer to the cleaned array, which
        may have fewer rows than ``Xtr``.
    """
    data = _prepare_X_for_clustering(Xtr)
    row_count = data.shape[0]

    # Search up to roughly one cluster per 50 rows: at least 3, at most 15,
    # and never more clusters than rows.
    upper_k = min(15, max(3, row_count // 50), row_count)

    ks, inertias = compute_elbow_inertia(
        data, k_min=1, k_max=upper_k, random_state=random_state, n_init=n_init
    )
    best_k = choose_k_by_knee(ks, inertias)[0]

    # Refit at the chosen K; the models from the search are not retained.
    final_model = KMeans(n_clusters=best_k, n_init=n_init, random_state=random_state)
    final_model.fit(data)

    return dict(
        best_k=best_k,
        ks=ks,
        inertias=inertias,
        labels=final_model.labels_,
        centers=final_model.cluster_centers_,
        model=final_model,
    )


# Safety net: when the correlation filter produced no columns, cluster on the
# highest-variance scaled features shared by train and test instead.
if isinstance(Xtr, pd.DataFrame) and Xtr.shape[1] == 0:
    common_cols = X_train_scaled.columns.intersection(X_test_scaled.columns).tolist()
    if not common_cols:
        raise ValueError(
            "No common numeric features available in train/test after preprocessing."
        )
    var_series = X_train_scaled[common_cols].var().sort_values(ascending=False)
    # Capped at 20; slicing below simply yields fewer when fewer exist.
    fallback_n = min(20, max(5, len(var_series)))
    selected_features = var_series.head(fallback_n).index.tolist()
    print(
        f"No features selected by correlation; falling back to {len(selected_features)} top-variance features."
    )
    Xtr = X_train_scaled[selected_features].copy()
    Xte = X_test_scaled[selected_features].copy()

# Run the elbow search, then recompute the knee to obtain the per-K distances.
result = auto_choose_k_and_fit(Xtr, random_state=RANDOM_STATE, n_init=10)
ks = result["ks"]
inertias = result["inertias"]
auto_k, distances = choose_k_by_knee(ks, inertias)

print("Elbow Ks:", ks)
print("Inertias:", [round(value, 2) for value in inertias])
print(f"Auto-selected K (knee): {auto_k}")

# Inertia curve, with every point labelled by its K.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(ks, inertias, marker="o")
ax.set_title("Elbow Method (Inertia vs K)")
ax.set_xlabel("K")
ax.set_ylabel("Inertia (Within-Cluster SSE)")
for k, inertia in zip(ks, inertias):
    ax.annotate(
        str(k),
        (k, inertia),
        textcoords="offset points",
        xytext=(0, 5),
        ha="center",
        fontsize=8,
    )
ax.grid(True, alpha=0.3)
plt.show()

## 7) Train K-Means and map clusters to target means


In [None]:
K = int(auto_k)

# Same integer n_init as the elbow search that selected K. n_init="auto"
# would run a single initialisation and is rejected by scikit-learn < 1.2.
kmeans = KMeans(n_clusters=K, n_init=10, random_state=RANDOM_STATE)
train_labels = np.asarray(kmeans.fit_predict(Xtr), dtype=int)

# Take the target straight from the raw frame instead of the y_train global.
# This cell overwrites y_train with a filtered array, so reading it back on a
# re-run would pair labels with the wrong targets.
y_all = train[TARGET_COL].to_numpy(dtype=float, na_value=np.nan).reshape(-1)
if len(y_all) != len(train_labels):
    # Fail loudly rather than truncating to a common length, which would hide
    # a row misalignment between features and target.
    raise ValueError(
        f"Row mismatch: {len(train_labels)} cluster labels vs {len(y_all)} "
        "target values; re-run the notebook from the preprocessing step."
    )

# Rows without a target cannot contribute to (or be scored against) a mean.
valid_mask = ~np.isnan(y_all)
if not valid_mask.any():
    raise ValueError("Target column has no non-NaN values; cannot compute cluster means.")

# Keep the three row-aligned globals filtered to the valid rows. The length
# guard makes a re-run a no-op for X_train_scaled once it has been filtered.
if len(X_train_scaled) == len(valid_mask):
    X_train_scaled = X_train_scaled[valid_mask]
y_train = y_all[valid_mask]
train_labels = train_labels[valid_mask]

# Mean target per cluster, with the global mean as the fallback value.
global_mean = float(np.mean(y_train))
df_tmp = pd.DataFrame({"label": train_labels, "y": y_train})
cluster_to_mean = df_tmp.groupby("label")["y"].mean()

# Dense lookup indexed by cluster id 0..K-1. A cluster that lost all of its
# rows to the NaN filter has no mean of its own and gets the global mean.
means_array = (
    cluster_to_mean.reindex(range(K)).fillna(global_mean).to_numpy(dtype=float)
)

# KMeans labels are always in 0..K-1, so plain fancy indexing is sufficient.
y_pred_train = means_array[train_labels]

# In-sample error of the "predict the cluster mean" model.
mae = mean_absolute_error(y_train, y_pred_train)
rmse = mean_squared_error(y_train, y_pred_train) ** 0.5
print(f"In-sample MAE: {mae:.5f}, RMSE: {rmse:.5f}")

## 8) Predict on test and build submission


In [None]:
test_labels = kmeans.predict(Xte)

# Fallback value for clusters that have no training mean.
global_mean = float(np.nanmean(np.asarray(y_train, dtype=float)))

# One mean per cluster id 0..K-1, in id order. reindex keeps position equal
# to cluster id; appending missing ids at the end and then indexing by
# position (the previous approach) handed test rows the mean of the wrong
# cluster whenever a cluster id was absent from cluster_to_mean.
cluster_means_filled = cluster_to_mean.reindex(range(K)).fillna(global_mean)

# Position == cluster id after the reindex, so index the values directly.
test_pred = cluster_means_filled.to_numpy()[test_labels]

submission = pd.DataFrame({ID_COL: test_ids, TARGET_COL: test_pred})

# Sort by ID when the IDs are mutually comparable; mixed-type IDs raise
# TypeError, in which case the original row order is kept.
try:
    submission = submission.sort_values(by=ID_COL)
except TypeError:
    print(f"Could not sort by {ID_COL}; keeping original row order.")

out_path = "submission.csv"
submission.to_csv(out_path, index=False)
print(f"Saved submission to {out_path}")