## RunModelsWithMeanEmbedding
This script:
- Reads in the dataframe containing the per-LSOA summary info (e.g. mean/max/min embedding, mean/max/min embedding per cluster grouping, % of images in each cluster in each LSOA)
- Compares model performance:
    - When using mean/max/min embedding
    - When including the variables recording the % of images in each cluster
- Properly fits and tests model using cross-validation

In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message=".*glibc.*")

import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import spearmanr
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
import json
import multiprocessing
import joblib

data_dir = os.path.join("../../../../data/embeddings/")
imd_file = os.path.join("../../../../", "data", "imd", "File_2_-_IoD2025_Domains_of_Deprivation.xlsx")

var_to_predict = 'imd_rank'

def evaluate_imd_model(y_true, y_pred, column,num_in_class, num_lsoas, plot=True, plot_collectively = False, ax=False ):

    # Metrics
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    spearman_corr, _ = spearmanr(y_true, y_pred)

    metrics = {
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'Spearman_rank_corr': spearman_corr}

    # Print metrics
    #print("Model evaluation metrics:")
    #for k, v in metrics.items():
    #    print(f"{k}: {v:.3f}")

    # Scatter plot
    if plot:
        plt.figure(figsize=(6,6))
        plt.scatter(y_true, y_pred, alpha=0.7)
        plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], 'r--', lw=2)
        plt.xlabel(f"True {column}")
        plt.ylabel(f"Predicted {column}")
        plt.title(f"{column}")
        plt.grid(True)
        plt.show()

    # Scatter plot
    if plot_collectively:
        ax.scatter(y_true, y_pred, alpha=0.7)
        ax.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], 'r--', lw=2)
        ax.set_xlabel(f"True")
        ax.set_ylabel(f"Predicted")
        ax.set_title(f"{column}: \n{num_in_class} images\n {num_lsoas} LSOAs", fontsize=15)
        ax.grid(True)
        ax.text(
            0.01, 0.99,  # x=1% from left, y=99% from bottom
            f'R² = {r2:.2f}, RMSE = {rmse:.0f}',
            transform=ax.transAxes,  # use axes coordinates
            fontsize=12,
            verticalalignment='top',  # anchor to top so it doesn’t go outside
            horizontalalignment='left')
    
    return metrics

In [47]:
statistic = 'median'

### Get imd data

In [2]:
imd = pd.read_excel(imd_file, sheet_name="IoD2025 Domains", header=0)

# Rename columns to simpler versions
imd_col_map = {"LSOA code (2021)": "LSOA21CD",
    "Index of Multiple Deprivation (IMD) Rank (where 1 is most deprived)": "imd_rank",
    "Index of Multiple Deprivation (IMD) Decile (where 1 is most deprived 10% of LSOAs)": "imd_decile"}
imd = imd.rename(columns=imd_col_map)

### Get embedding data

In [3]:
# file_ending = 'kmeanscluster7' 
# categories_dict = {'kmeanscluster7': ['1', '2', '3', '4', '5',  '6',  '7']}
# category_pct_cols = categories_dict[file_ending]

In [49]:
big_summary_df = pd.read_pickle(data_dir + f"per_lsoa_embedding_summaries/{statistic}_embedding.pkl")

big_summary_df_with_imd = pd.merge(left=big_summary_df, right=imd[['LSOA21CD', 'imd_rank']], on="LSOA21CD")
big_summary_df_with_imd['imd_rerank'] = big_summary_df_with_imd['imd_rank'].rank(method='first').astype(int)

# Fit model properly
Fit several different models with cross validation and hyper-parameter tuning, in order to work out the best model and parameters.  

SHOULD WE DO THIS FIRST AND THEN USE THE ANSWER IN THE ABOVE TESTS?

In [54]:
X = np.stack(big_summary_df_with_imd[f"{statistic}_embedding"].values)     
y = big_summary_df_with_imd['imd_rerank'].values

# -------------------------
# Split data into training and test sets
# -------------------------
X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(
    X, y, np.arange(X.shape[0]), test_size=0.2, random_state=42)
print(f"Training points: {X_train.shape[0]}, Test points: {X_test.shape[0]}")

# -------------------------
# Define model pipelines and parameter grids
# -------------------------
models = []
param_grids = []

# # 1. Linear Regression
# pipe_linear = Pipeline([
#     ('scaler', StandardScaler()),
#     ('reg', LinearRegression())])
# models.append(pipe_linear)
# param_grids.append({})

# # 2. Random Forest
# pipe_rf = Pipeline([
#     ('scaler', StandardScaler()),
#     ('reg', RandomForestRegressor(random_state=42))])
# models.append(pipe_rf)
# param_grids.append({
#     'reg__n_estimators': [100, 200],
#     'reg__max_depth': [None, 10, 20]})

# 4. XGBoost Regressor
pipe_xgb = Pipeline([
    ('scaler', StandardScaler()),  # scaling not strictly necessary but fine to keep
    ('reg', XGBRegressor(
        objective='reg:squarederror',  # standard regression objective
        random_state=42,
        n_jobs=-1,
        verbosity=0))])
models.append(pipe_xgb)
param_grids.append({
    'reg__n_estimators': [100, 300],
    'reg__max_depth': [3, 6, 10],
    'reg__learning_rate': [0.01, 0.1, 0.3],
    'reg__subsample': [0.8, 1.0]})

# # 3. Neural Network (MLPRegressor)
# pipe_mlp = Pipeline([
#     ('scaler', StandardScaler()),
#     ('reg', MLPRegressor(max_iter=1500,
#                              activation='relu',           # default, works well for embeddings
#     solver='adam',               # stochastic gradient optimizer
#     learning_rate_init=1e-3,     # small learning rate improves convergence on high-dim data
#     early_stopping=True,         # stop automatically if validation score doesn’t improve
#     n_iter_no_change=50,         # patience for early stopping
#     random_state=42,
#     verbose=False))])
# models.append(pipe_mlp)
# param_grids.append({
#     'reg__hidden_layer_sizes': [(100,), (100, 50)],
#     'reg__alpha': [1e-4, 1e-3]})

# -------------------------
# Perform cross-validation
# -------------------------
best_model = None
best_score = -np.inf
best_model_name = None
best_params = {}

print("Training models")
cv = KFold(n_splits=5, shuffle=True, random_state=42)
ncores = min(multiprocessing.cpu_count() - 1, 100)
print(f"Using {ncores} cores.")

for model, param_grid, name in zip(
    models, param_grids, ["XGBoost"]): #"LinearReg", "RandomForest", "NeuralNet", 
    print(f"\nTraining: {name}...")
    if param_grid:
        grid = GridSearchCV(model, param_grid, cv=cv, scoring='r2', n_jobs=ncores)
        grid.fit(X_train, y_train)
        cv_score = grid.best_score_
        model_best = grid.best_estimator_
        params_best = grid.best_params_
    else:
        scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')
        cv_score = np.mean(scores)
        model.fit(X_train, y_train)
        model_best = model
        params_best = {}

    print(f"{name} CV mean R² = {cv_score:.3f} {('(best params: '+str(params_best)+')') if params_best else ''}")

    # Track the best
    if cv_score > best_score:
        best_score = cv_score
        best_model = model_best
        best_model_name = name
        best_params = params_best.copy()

print(f"\nBest model: {best_model_name} (R² = {best_score:.3f})")
print(f"Best hyperparameters: {best_params}")

print(f"Best model from CV: {best_model_name} with R^2 = {best_score:.3f} and params {best_params}")

# Save the meta-data so the model can be trained on the different domains of deprivation
best_model_info = {
    "model_name": best_model_name,
    "best_score": float(best_score),
    "best_params": best_params}

Training points: 1356, Test points: 339
Training models
Using 31 cores.

Training: XGBoost...


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a rece

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a rece

XGBoost CV mean R² = 0.649 (best params: {'reg__learning_rate': 0.1, 'reg__max_depth': 3, 'reg__n_estimators': 300, 'reg__subsample': 0.8})

Best model: XGBoost (R² = 0.649)
Best hyperparameters: {'reg__learning_rate': 0.1, 'reg__max_depth': 3, 'reg__n_estimators': 300, 'reg__subsample': 0.8}
Best model from CV: XGBoost with R^2 = 0.649 and params {'reg__learning_rate': 0.1, 'reg__max_depth': 3, 'reg__n_estimators': 300, 'reg__subsample': 0.8}


In [66]:
# ---- SAVE the model ----
bundle = {
   "model": best_model,                 # the fitted Pipeline
   "name": best_model_name,             # e.g., "RandomForest"
   "cv_score_r2": float(best_score), }  # ensure JSON-safe
 #  "trained_at": datetime.utcnow().isoformat() + "Z",
#   "sklearn_version": sklearn.__version__

joblib.dump(bundle, os.path.join(data_dir, "5-imd_best_model_bundle.joblib"))
print("Saved to best_model_bundle.joblib")

Saved to best_model_bundle.joblib


### Final test

In [68]:
# predict on held-out test data
y_pred_test = best_model.predict(X_test)

# compute metrics
test_r2 = r2_score(y_test, y_pred_test)
test_rmse = mean_squared_error(y_test, y_pred_test)

print(f"Test R² = {test_r2:.3f}")
print(f"Test RMSE = {test_rmse:.3f}")


Test R² = 0.678
Test RMSE = 72270.953
