Using PCA to clean data and reduce dimensionality, then linear regression with cross-validation

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

train_path = "MiNDAT.csv"
test_path = "MiNDAT_UNK.csv"
id_col = "LOCAL_IDENTIFIER"
target_col = "CORRUCYSTIC_DENSITY"
variance_to_keep = 0.001

# 1) Load
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# 2) Normalize column names (strip whitespace)
train.columns = train.columns.str.strip()
test.columns = test.columns.str.strip()

# 3) Separate out ID and target

y = train[target_col]
train_ids = train[id_col]
test_ids = test[id_col]

# 4) Select numeric feature set (exclude ID and target)
X_train = train.drop(columns=[c for c in [id_col, target_col] if c in train.columns])
X_test = test.drop(columns=[id_col], errors="ignore")

# Keep only numeric columns
X_train = X_train.select_dtypes(include=[np.number]).copy()
X_test = X_test.select_dtypes(include=[np.number]).copy()

# Align test columns to train columns
X_test = X_test.reindex(columns=X_train.columns, fill_value=np.nan)

# 5) Impute missing values using training medians
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# 6) Standardize (fit on train, transform both)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# 7) Choose the number of components to retain the desired variance
#    Fit a full PCA first to inspect cumulative explained variance
pca_full = PCA(svd_solver="full")
pca_full.fit(X_train_std)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)
k = int(np.searchsorted(cum_var, variance_to_keep) + 1)
k = max(1, min(k, X_train_std.shape[1]))  # safety bounds

print(f"Total numeric features: {X_train_std.shape[1]}")
print(f"Chosen components (k) for >= {variance_to_keep*100:.1f}% variance: {k}")
print(f"Actual variance retained: {cum_var[k-1]*100:.2f}%")

# 8) Fit PCA with k components on train, transform both
pca = PCA(n_components=k, svd_solver="full")
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# 9) Build reduced DataFrames
pc_cols = [f"PC{i}" for i in range(1, k + 1)]
train_reduced = pd.DataFrame(X_train_pca, columns=pc_cols)
test_reduced = pd.DataFrame(X_test_pca, columns=pc_cols)

train_out = pd.concat(
    [
        train_ids.reset_index(drop=True),
        y.reset_index(drop=True),
        train_reduced.reset_index(drop=True),
    ],
    axis=1,
)
test_out = pd.concat(
    [test_ids.reset_index(drop=True), test_reduced.reset_index(drop=True)], axis=1
)

# 10) Save
train_out.to_csv("MiNDAT_pca.csv", index=False)
test_out.to_csv("MiNDAT_UNK_pca.csv", index=False)

print("Saved:")
print(" - MiNDAT_pca.csv  [LOCAL_IDENTIFIER, CORRUCYSTIC_DENSITY, PC1..PCk]")
print(" - MiNDAT_UNK_pca.csv  [LOCAL_IDENTIFIER, PC1..PCk]")

Total numeric features: 43
Chosen components (k) for >= 0.1% variance: 1
Actual variance retained: 8.16%
Saved:
 - MiNDAT_pca.csv  [LOCAL_IDENTIFIER, CORRUCYSTIC_DENSITY, PC1..PCk]
 - MiNDAT_UNK_pca.csv  [LOCAL_IDENTIFIER, PC1..PCk]


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error

train_path = "MiNDAT_pca.csv"
test_path = "MiNDAT_UNK_pca.csv"
id_col = "LOCAL_IDENTIFIER"
target_col = "CORRUCYSTIC_DENSITY"

# 1) Load
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# 2) Normalize column names (strip whitespace just in case)
train.columns = train.columns.str.strip()
test.columns = test.columns.str.strip()

# 3) Separate target and IDs
y = pd.to_numeric(train[target_col], errors="coerce")
train_ids = train[id_col]
test_ids = test[id_col]

# 4) Select numeric features (excluding ID and target), coerce to numeric
X_train = train.drop(columns=[c for c in [id_col, target_col] if c in train.columns])
X_test = test.drop(columns=[id_col], errors="ignore")

# Coerce to numeric (non-convertible -> NaN)
for df in (X_train, X_test):
    for c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Keep only numeric dtypes
X_train = X_train.select_dtypes(include=[np.number]).copy()
X_test = X_test.select_dtypes(include=[np.number]).copy()

# Align columns between train and test
X_test = X_test.reindex(columns=X_train.columns, fill_value=np.nan)

# 5) Remove/clean NaNs:
mask_y = y.notna()
X_train = X_train.loc[mask_y].copy()
y = y.loc[mask_y].copy()

# - Fill remaining NaNs in features with training medians
train_medians = X_train.median(numeric_only=True).fillna(0.0)
X_train = X_train.fillna(train_medians)

# - As a final safeguard, drop any training rows that still contain NaN
row_mask = X_train.notna().all(axis=1)
X_train, y = X_train.loc[row_mask], y.loc[row_mask]

# - Clean test features using the same median (do not drop IDs)
X_test = X_test.fillna(train_medians).fillna(0.0)

# 6) Persist-cleaned files
clean_train = pd.concat(
    [
        train_ids.loc[y.index].reset_index(drop=True),
        y.reset_index(drop=True),
        X_train.reset_index(drop=True),
    ],
    axis=1,
)
clean_test = pd.concat(
    [test_ids.reset_index(drop=True), X_test.reset_index(drop=True)], axis=1
)
clean_train.to_csv("MiNDAT_clean.csv", index=False)
clean_test.to_csv("MiNDAT_UNK_clean.csv", index=False)
print("Saved cleaned datasets:")
print(" - MiNDAT_clean.csv")
print(" - MiNDAT_UNK_clean.csv")

# 7) Build model: Standardize -> LinearRegression
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("linreg", LinearRegression()),
    ]
)


def rmse(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


# 8) Cross-validation (no grid search)
rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
rmse_scorer = make_scorer(rmse, greater_is_better=False)

r2_scores = cross_val_score(pipe, X_train, y, scoring="r2", cv=rkf, n_jobs=None)
rmse_scores = cross_val_score(
    pipe, X_train, y, scoring=rmse_scorer, cv=rkf, n_jobs=None
)

print(f"CV R²: mean={np.mean(r2_scores):.4f}, std={np.std(r2_scores):.4f}")
print(f"CV RMSE: mean={np.mean(rmse_scores):.4f}, std={np.std(rmse_scores):.4f}")

# 9) Fit on full cleaned training data
pipe.fit(X_train, y)

# Show learned coefficients
lin = pipe.named_steps["linreg"]
coefs = pd.Series(lin.coef_, index=X_train.columns, name="coefficient").sort_values(
    ascending=False
)
print("\nLearned coefficients (after standardization):")
print(coefs.head(15))
print("\nMost negative coefficients:")
print(coefs.tail(15))

# 10) Prediction for test and build submission
preds = pipe.predict(X_test)
submission = pd.DataFrame({id_col: test_ids.values, target_col: preds})[
    [id_col, target_col]
]

out_file = "corrucystic_density_predictions9.csv"
submission.to_csv(out_file, index=False)
print(f"\nWrote submission file: {out_file}")

Saved cleaned datasets:
 - MiNDAT_clean.csv
 - MiNDAT_UNK_clean.csv
CV R²: mean=-0.0009, std=0.0007
CV RMSE: mean=-188.5723, std=1.9885

Learned coefficients (after standardization):
PC1   -0.608233
Name: coefficient, dtype: float64

Most negative coefficients:
PC1   -0.608233
Name: coefficient, dtype: float64

Wrote submission file: corrucystic_density_predictions9.csv
