In [1]:
# %%
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, log_loss, roc_auc_score,
    roc_curve, brier_score_loss
)
from sklearn.calibration import calibration_curve

# Import custom models
# Removed old path hack; using package imports))
from llm_prior_project.priors.target_informed_model import TargetInformedModel
from llm_prior_project.priors.target_elicitor import LLMTargetElicitor
from llm_prior_project.models.target_model import SklearnTargetModel

np.random.seed(42)


In [2]:
# %%
# loading the data 
def load_heart_dataset(path, features, outcome="num"):
    columns = [
        "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
        "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
    ]
    df = pd.read_csv(path, header=None, names=columns, na_values="?")
    df[outcome] = (df[outcome] > 0).astype(int)
    df = df[features + [outcome]].dropna()
    return df[features], df[outcome]

features = ["age", "sex", "trestbps", "chol", "thalach", "oldpeak"]

X, y = load_heart_dataset("data/heart+disease/processed.hungarian.data", features)
X_cleveland, y_cleveland = load_heart_dataset("data/heart+disease/processed.cleveland.data", features)

print("Hungarian:", X.shape, "Cleveland:", X_cleveland.shape)


Hungarian: (270, 6) Cleveland: (303, 6)


In [8]:
# %%
custom_prompt = custom_prompt = """

You are an expert in **biostatistics and cardiovascular epidemiology**.  
For the **target-informed logistic regression model** provided below, which predicts coronary artery disease (CAD), your task is to propose and justify suitable target values for the regression coefficients.  

Unlike standard ridge regression, which shrinks coefficients toward **zero**, this method shrinks coefficients toward **pre-specified targets** $\mu = (\mu_1, \ldots, \mu_p)$, based on domain knowledge.  

---

**Model Details:**  
- Response: $y \in \{0,1\}$ (0 = healthy, 1 = CAD)  
- Linear Predictor:  

\[
\eta = \beta_0 + \beta_1 \cdot \text{age} + \beta_2 \cdot \text{sex} + \beta_3 \cdot \text{trestbps} + \beta_4 \cdot \text{chol} + \beta_5 \cdot \text{thalach} + \beta_6 \cdot \text{oldpeak}
\]

- Prediction:  

\[
P(y=1|X) = \frac{1}{1 + \exp(-\eta)}
\]

- Objective Function (Target-Informed Ridge Penalty):  

\[
\min_\beta \; - \log L(\beta \,|\, X,y) + \alpha \sum_{j=1}^p (\beta_j - \mu_j)^2
\]

where $\mu_j$ are the **target values** you will propose.  

---

**Predictor Details:**  
- `age`: in years  
- `sex`: categorical (1 = male; 0 = female)  
- `trestbps`: resting blood pressure (mm Hg on admission)  
- `chol`: serum cholesterol (mg/dl)  
- `thalach`: maximum heart rate achieved (bpm)  
- `oldpeak`: ST depression induced by exercise relative to rest (mm)  

---

**Your Response Should:**  

1. **Leverage Knowledge & Simulate Literature Use**  
   - Briefly state how you use your epidemiological knowledge (e.g., Framingham Study, MONICA project, meta-analyses) to inform coefficient expectations.  
   - Do *not* rely on the Cleveland Heart Disease dataset, but instead general domain knowledge.  

2. **Propose Target Coefficients ($\mu_j$)**  
   - Provide target values $\mu_j$ for each predictor, representing the expected effect on the log-odds of CAD.  
   - Justify the direction (positive/negative) and plausible magnitude of each coefficient in natural language.  

3. **Rationale for Each Target**  
   - For each predictor, explain why the coefficient should be positive/negative, and roughly how large, based on prior evidence.  
   - For $\beta_0$, clarify its interpretation (baseline log-odds when all predictors are zero) and how you approximate it.  

4. **Uncertainty & Strength of Belief**  
   - Discuss how confident you are about each target. Which ones are well-established (e.g., age, sex), and which are more uncertain (e.g., cholesterol in small hospital samples)?  
   - If useful, you may also describe an alternative “weaker” set of targets closer to zero, to represent more cautious shrinkage.  

---

**Output Format:**  
After providing your reasoning in detail, end your answer with a JSON object summarizing the chosen target values only:  

```json
{
  "targets": {
    "age": ... ,
    "sex": ... ,
    "trestbps": ... ,
    "chol": ... ,
    "thalach": ... ,
    "oldpeak": ...
  }
}
```

---


"""

elicitor = LLMTargetElicitor(model_name="gpt-4")
result = elicitor.get_targets_with_prompt(custom_prompt, features)

if result:
    targets = result["targets"]
    print("Extracted targets:", targets)
else:
    targets = [0.0] * len(features)


Using user-crafted prompt
Feature names expected: ['age', 'sex', 'trestbps', 'chol', 'thalach', 'oldpeak']
Prompt length: 2934 characters
Prompt validation: PASSED
Calling LLM API...
Got LLM response (2630 characters)
Response preview: 1. **Leverage Knowledge & Simulate Literature Use**  
   The Framingham Heart Study, MONICA project, and various meta-analyses have provided valuable insights into the risk factors for coronary artery...
Parsing LLM response...
Successfully extracted targets: [0.05, 0.5, 0.02, 0.002, -0.01, 0.4]
Extracted targets: [0.05, 0.5, 0.02, 0.002, -0.01, 0.4]


In [9]:
print(targets)

[0.05, 0.5, 0.02, 0.002, -0.01, 0.4]


In [10]:
# %%
def evaluate_models(model, baseline, X, y):
    """
    Evaluate baseline and informed models on given data.
    Works with either numpy arrays or pandas DataFrames.
    """
    # Ensure numpy arrays
    X = np.asarray(X)
    y = np.asarray(y)

    # Baseline (sklearn logistic regression)
    baseline_probs = baseline.predict_proba(X)[:, 1]

    # Informed model already returns probabilities
    informed_probs = model.predict(X)

    return {
        "baseline_accuracy": accuracy_score(y, (baseline_probs > 0.5).astype(int)),
        "baseline_log_loss": log_loss(y, baseline_probs),
        "baseline_auc": roc_auc_score(y, baseline_probs),
        "informed_accuracy": accuracy_score(y, (informed_probs > 0.5).astype(int)),
        "informed_log_loss": log_loss(y, informed_probs),
        "informed_auc": roc_auc_score(y, informed_probs)
    }


def cross_val_evaluate(X, y, feature_names, targets, alpha=1.0, n_splits=5):
    """
    Compare baseline vs target-informed logistic regression using cross-validation.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = []
    
    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        # Baseline logistic regression
        baseline = LogisticRegression(max_iter=1000).fit(X_train, y_train)
        
        # Target-informed logistic regression
        informed = TargetInformedModel(alpha=alpha, model_type="logistic").fit(
            X_train, y_train, feature_names=feature_names, targets=targets
        )
        
        results.append(evaluate_models(informed, baseline, X_test, y_test))
    
    return pd.DataFrame(results)


def cross_val_grid_alphas(X, y, feature_names, targets, alphas, n_splits=5):
    """
    Run cross-validation across multiple alpha values for TargetInformedModel.
    Returns per-fold results, a summary table, and the best alpha.
    """
    all_results = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    for alpha in alphas:
        cv_results = cross_val_evaluate(
            X, y, feature_names, targets, alpha=alpha, n_splits=n_splits
        )
        cv_results["alpha"] = alpha
        all_results.append(cv_results)

    results = pd.concat(all_results, ignore_index=True)
    summary = results.groupby("alpha").mean()

    # Select alpha with lowest informed log_loss
    best_alpha = summary["informed_log_loss"].idxmin()
    print("\nCross-validation summary:")
    print(summary)
    print("\nSelected alpha (by log_loss):", best_alpha)

    return results, summary, best_alpha


In [11]:
# %%
alphas = [0.01, 0.1, 0.5, 1, 2, 5, 10]
grid_results, summary, best_alpha = cross_val_grid_alphas(
    X.values, y.values, features, targets, alphas, n_splits=5
)



Cross-validation summary:
       baseline_accuracy  baseline_log_loss  baseline_auc  informed_accuracy  \
alpha                                                                          
0.01            0.814815           0.450018      0.849259           0.811111   
0.10            0.814815           0.450018      0.849259           0.811111   
0.50            0.814815           0.450018      0.849259           0.811111   
1.00            0.814815           0.450018      0.849259           0.814815   
2.00            0.814815           0.450018      0.849259           0.814815   
5.00            0.814815           0.450018      0.849259           0.825926   
10.00           0.814815           0.450018      0.849259           0.822222   

       informed_log_loss  informed_auc  
alpha                                   
0.01            0.452301      0.848666  
0.10            0.451825      0.849548  
0.50            0.450621      0.851307  
1.00            0.449237      0.850719  
2.00  

In [12]:
# %%
from sklearn.linear_model import LogisticRegression

# --- Fit models ---
# 1. Standard logistic regression (no penalty)
log_reg = LogisticRegression(
    penalty=None, max_iter=1000, solver="lbfgs"
).fit(X, y)

# 2. Logistic regression with L2 ridge penalty
ridge_reg = LogisticRegression(
    penalty="l2", C=1.0, max_iter=1000, solver="lbfgs"
).fit(X, y)

# 3. Target-informed logistic regression (new sklearn hack model)
informed_reg = SklearnTargetModel(
    alpha=best_alpha, model_type="logistic", targets=targets
).fit(X.values, y.values, feature_names=features)

# --- Evaluate ---
def evaluate_three_models(X, y, log_reg, ridge_reg, informed_reg):
    results = {}

    # Standard logistic regression
    probs_log = log_reg.predict_proba(X)[:, 1]
    results["logistic"] = {
        "accuracy": accuracy_score(y, (probs_log > 0.5)),
        "log_loss": log_loss(y, probs_log),
        "auc": roc_auc_score(y, probs_log)
    }

    # Ridge logistic regression
    probs_ridge = ridge_reg.predict_proba(X)[:, 1]
    results["ridge"] = {
        "accuracy": accuracy_score(y, (probs_ridge > 0.5)),
        "log_loss": log_loss(y, probs_ridge),
        "auc": roc_auc_score(y, probs_ridge)
    }

    # Target-informed logistic regression
    probs_inf = informed_reg.predict(X.values)
    results["informed"] = {
        "accuracy": accuracy_score(y, (probs_inf > 0.5)),
        "log_loss": log_loss(y, probs_inf),
        "auc": roc_auc_score(y, probs_inf)
    }

    return pd.DataFrame(results).T

# Run evaluation on Hungarian dataset
metrics_three = evaluate_three_models(X, y, log_reg, ridge_reg, informed_reg)
print("Hungarian dataset comparison:")
print(metrics_three)


Hungarian dataset comparison:
          accuracy  log_loss       auc
logistic  0.818519  0.424243  0.862441
ridge     0.825926  0.425129  0.861269
informed  0.674074  0.602512  0.860214


In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

def cross_val_search_targets(X, y, features, target_candidates, alphas, n_splits=5, random_state=42):
    """
    Cross-validate to find best (targets, alpha) combination for Target-Informed Logistic Regression.
    
    Parameters
    ----------
    X : array-like
        Feature matrix
    y : array-like
        Binary target vector
    features : list of str
        Feature names
    target_candidates : list of np.array
        List of candidate target vectors (μ) to try
    alphas : list of float
        Regularisation strengths
    n_splits : int
        Number of CV folds
    random_state : int
        Random seed for reproducibility
    
    Returns
    -------
    results : pd.DataFrame
        Per-fold metrics for all (targets, alpha) combos
    summary : pd.DataFrame
        Mean metrics per (targets, alpha) combo
    best_combo : dict
        Dictionary with best targets, alpha, and their metrics
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    all_results = []

    for mu in target_candidates:
        for alpha in alphas:
            fold_metrics = []
            for train_idx, test_idx in skf.split(X, y):
                X_train, X_test = X[train_idx], X[test_idx]
                y_train, y_test = y[train_idx], y[test_idx]

                model = SklearnTargetModel(alpha=alpha, model_type="logistic", targets=mu).fit(
                    X_train, y_train, feature_names=features
                )
                probs = model.predict(X_test)

                fold_metrics.append({
                    "alpha": alpha,
                    "targets": tuple(mu.round(3)),  # store as tuple for readability
                    "log_loss": log_loss(y_test, probs),
                    "accuracy": accuracy_score(y_test, (probs > 0.5)),
                    "auc": roc_auc_score(y_test, probs),
                })
            all_results.extend(fold_metrics)

    results = pd.DataFrame(all_results)
    summary = results.groupby(["alpha", "targets"]).mean().reset_index()

    # Best by log_loss
    best_idx = summary["log_loss"].idxmin()
    best_combo = summary.iloc[best_idx].to_dict()

    print("Best combination found:")
    print(best_combo)

    return results, summary, best_combo


In [14]:
# Candidate target sets
target_candidates = [
    np.zeros(len(features)),             # shrink to zero
    np.array([0.3, 0.5, 0.1, 0.05, -0.3, 0.7]),  # literature-based example
    LogisticRegression(penalty=None, max_iter=1000).fit(X, y).coef_[0]  # MLE from data
]

alphas = [0.01, 0.1, 0.5, 1, 2, 5]

cv_results, cv_summary, best = cross_val_search_targets(
    X.values, y.values, features, target_candidates, alphas, n_splits=5
)

print("\nCV summary:")
print(cv_summary.sort_values("log_loss").head())


Best combination found:
{'alpha': 1.0, 'targets': (np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0)), 'log_loss': 0.4500173516917004, 'accuracy': 0.8148148148148149, 'auc': 0.849259400729989}

CV summary:
    alpha                         targets  log_loss  accuracy       auc
10   1.00  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)  0.450017  0.814815  0.849259
13   2.00  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)  0.450462  0.825926  0.848671
7    0.50  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)  0.450688  0.814815  0.851019
4    0.10  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)  0.452181  0.807407  0.849548
1    0.01  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)  0.452713  0.807407  0.848666


In [15]:
def cross_val_search_and_generalize(
    X_train, y_train, X_test, y_test, features,
    target_candidates, alphas, n_splits=5, random_state=42
):
    """
    1. Cross-validate to find best (targets, alpha) on training set.
    2. Refit best model on full training set.
    3. Evaluate on external test set.

    Returns
    -------
    results : pd.DataFrame
        Per-fold metrics for all (targets, alpha) combos.
    summary : pd.DataFrame
        Mean CV metrics per (targets, alpha).
    best_combo : dict
        Best combo from CV (alpha, targets).
    external_metrics : dict
        Evaluation metrics on external test set.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    all_results = []

    for mu in target_candidates:
        for alpha in alphas:
            fold_metrics = []
            for train_idx, val_idx in skf.split(X_train, y_train):
                X_tr, X_val = X_train[train_idx], X_train[val_idx]
                y_tr, y_val = y_train[train_idx], y_train[val_idx]

                model = SklearnTargetModel(alpha=alpha, model_type="logistic", targets=mu).fit(
                    X_tr, y_tr, feature_names=features
                )
                probs = model.predict(X_val)

                fold_metrics.append({
                    "alpha": alpha,
                    "targets": tuple(mu.round(3)),
                    "log_loss": log_loss(y_val, probs),
                    "accuracy": accuracy_score(y_val, (probs > 0.5)),
                    "auc": roc_auc_score(y_val, probs),
                })
            all_results.extend(fold_metrics)

    results = pd.DataFrame(all_results)
    summary = results.groupby(["alpha", "targets"]).mean().reset_index()

    # Best by log_loss
    best_idx = summary["log_loss"].idxmin()
    best_combo = summary.iloc[best_idx].to_dict()

    print("Best combination found (CV):")
    print(best_combo)

    # --- Refit best model on full training set ---
    best_alpha = best_combo["alpha"]
    best_targets = np.array(best_combo["targets"], dtype=float)

    final_model = SklearnTargetModel(alpha=best_alpha, model_type="logistic", targets=best_targets).fit(
        X_train, y_train, feature_names=features
    )

    # --- External test evaluation ---
    probs_test = final_model.predict(X_test)
    external_metrics = {
        "accuracy": accuracy_score(y_test, (probs_test > 0.5)),
        "log_loss": log_loss(y_test, probs_test),
        "auc": roc_auc_score(y_test, probs_test),
    }

    print("\nExternal test set evaluation (Cleveland):")
    print(external_metrics)

    return results, summary, best_combo, external_metrics


In [16]:
target_candidates = [
    np.zeros(len(features)),  # zero target (ridge)
    LogisticRegression(penalty=None, max_iter=1000).fit(X, y).coef_[0],  # MLE targets
    np.array([0.3, 0.5, 0.1, 0.05, -0.3, 0.7])  # example domain-informed
]

alphas = [0.01, 0.1, 0.5, 1, 2, 5]

cv_results, cv_summary, best, external_metrics = cross_val_search_and_generalize(
    X.values, y.values,
    X_cleveland.values, y_cleveland.values,
    features, target_candidates, alphas,
    n_splits=5
)

print("\nTop CV combos:")
print(cv_summary.sort_values("log_loss").head())


Best combination found (CV):
{'alpha': 1.0, 'targets': (np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0)), 'log_loss': 0.4500173516917004, 'accuracy': 0.8148148148148149, 'auc': 0.849259400729989}

External test set evaluation (Cleveland):
{'accuracy': 0.7491749174917491, 'log_loss': 0.5577224833451117, 'auc': 0.8054921916125636}

Top CV combos:
    alpha                         targets  log_loss  accuracy       auc
10   1.00  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)  0.450017  0.814815  0.849259
13   2.00  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)  0.450462  0.825926  0.848671
7    0.50  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)  0.450688  0.814815  0.851019
4    0.10  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)  0.452181  0.807407  0.849548
1    0.01  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)  0.452713  0.807407  0.848666


In [17]:
from sklearn.utils import resample

def low_data_experiment(
    X_train, y_train, X_test, y_test, features,
    targets_prior, alphas=[1.0], fractions=[0.1, 0.2, 0.3], random_state=42
):
    """
    Test generalisation with small subsets of training data.
    
    Parameters
    ----------
    X_train, y_train : Hungarian data
    X_test, y_test   : Cleveland data
    targets_prior    : np.array
        Expert-informed targets for informed model
    alphas : list
        Regularisation strengths to try for ridge and informed models
    fractions : list
        Fractions of training data to subsample
    """
    results = []

    for frac in fractions:
        # Subsample Hungarian data
        X_sub, y_sub = resample(
            X_train, y_train,
            replace=False,
            n_samples=int(frac * len(y_train)),
            random_state=random_state
        )

        # --- Standard logistic regression ---
        log_reg = LogisticRegression(
            penalty=None, max_iter=1000, solver="lbfgs"
        ).fit(X_sub, y_sub)
        probs_log = log_reg.predict_proba(X_test)[:, 1]

        results.append({
            "train_frac": frac,
            "model": "logistic",
            "accuracy": accuracy_score(y_test, (probs_log > 0.5)),
            "log_loss": log_loss(y_test, probs_log),
            "auc": roc_auc_score(y_test, probs_log),
        })

        # --- Ridge logistic regression ---
        for alpha in alphas:
            ridge_reg = LogisticRegression(
                penalty="l2", C=1.0/alpha, max_iter=1000, solver="lbfgs"
            ).fit(X_sub, y_sub)
            probs_ridge = ridge_reg.predict_proba(X_test)[:, 1]

            results.append({
                "train_frac": frac,
                "model": f"ridge (alpha={alpha})",
                "accuracy": accuracy_score(y_test, (probs_ridge > 0.5)),
                "log_loss": log_loss(y_test, probs_ridge),
                "auc": roc_auc_score(y_test, probs_ridge),
            })

        # --- Target-informed logistic regression ---
        for alpha in alphas:
            informed_reg = SklearnTargetModel(
                alpha=alpha, model_type="logistic", targets=targets_prior
            ).fit(X_sub, y_sub, feature_names=features)
            probs_inf = informed_reg.predict(X_test)

            results.append({
                "train_frac": frac,
                "model": f"informed (alpha={alpha})",
                "accuracy": accuracy_score(y_test, (probs_inf > 0.5)),
                "log_loss": log_loss(y_test, probs_inf),
                "auc": roc_auc_score(y_test, probs_inf),
            })

    return pd.DataFrame(results)


In [18]:
# Example prior (make this from domain knowledge!)
expert_targets = np.array([0.03, 0.5, 0.1, 0.05, -0.4, 0.7])

fractions = [0.1, 0.2, 0.3]
alphas = [0.5, 1.0, 2.0]

low_data_results = low_data_experiment(
    X.values, y.values,
    X_cleveland.values, y_cleveland.values,
    features, targets_prior=expert_targets,
    alphas=alphas, fractions=fractions
)

print(low_data_results.sort_values(["train_frac", "model"]))


    train_frac                 model  accuracy  log_loss       auc
4          0.1  informed (alpha=0.5)  0.567657  4.147692  0.797157
5          0.1  informed (alpha=1.0)  0.554455  5.914845  0.787024
6          0.1  informed (alpha=2.0)  0.547855  7.621389  0.779654
0          0.1              logistic  0.739274  0.570479  0.824574
1          0.1     ridge (alpha=0.5)  0.716172  0.584779  0.800579
2          0.1     ridge (alpha=1.0)  0.699670  0.607948  0.785533
3          0.1     ridge (alpha=2.0)  0.689769  0.638301  0.769082
11         0.2  informed (alpha=0.5)  0.458746  1.991015  0.789700
12         0.2  informed (alpha=1.0)  0.742574  0.772251  0.820319
13         0.2  informed (alpha=2.0)  0.600660  2.715390  0.806019
7          0.2              logistic  0.745875  0.688388  0.826505
8          0.2     ridge (alpha=0.5)  0.745875  0.587038  0.820758
9          0.2     ridge (alpha=1.0)  0.752475  0.575426  0.817512
10         0.2     ridge (alpha=2.0)  0.739274  0.573787  0.81

In [19]:
def brute_force_target_search(
    X_train, y_train, X_test, y_test, features,
    alphas=[0.5, 1.0, 2.0], multipliers=[0.5, 0.75, 1.0, 1.25, 1.5]
):
    """
    Brute force search: perturb baseline logistic regression coefficients
    and test as priors for target-informed regression.
    """
    results = []

    # Step 1: Fit baseline logistic regression (MLE)
    log_reg = LogisticRegression(penalty=None, max_iter=1000, solver="lbfgs").fit(X_train, y_train)
    base_coef = log_reg.coef_[0]

    print("Baseline logistic coefficients (MLE):", base_coef)

    # Step 2: Generate perturbed targets
    for alpha in alphas:
        for m in multipliers:
            mu = base_coef * m

            # Step 3: Fit informed regression
            informed = SklearnTargetModel(alpha=alpha, model_type="logistic", targets=mu).fit(
                X_train, y_train, feature_names=features
            )
            probs_inf = informed.predict(X_test)

            # Step 4: Evaluate on Cleveland
            metrics = {
                "alpha": alpha,
                "multiplier": m,
                "log_loss": log_loss(y_test, probs_inf),
                "accuracy": accuracy_score(y_test, (probs_inf > 0.5)),
                "auc": roc_auc_score(y_test, probs_inf),
            }
            results.append(metrics)

    return pd.DataFrame(results)


In [20]:
alphas = [0.1, 0.5, 1.0, 2.0, 5.0]
multipliers = [0.5, 0.75, 1.0, 1.25, 1.5]

brute_results = brute_force_target_search(
    X.values, y.values,
    X_cleveland.values, y_cleveland.values,
    features, alphas=alphas, multipliers=multipliers
)

print(brute_results.sort_values("log_loss").head())


Baseline logistic coefficients (MLE): [-0.01179657  1.56139634 -0.00438747  0.00643127 -0.023387    1.62953566]
    alpha  multiplier  log_loss  accuracy       auc
20    5.0        0.50  0.568531  0.712871  0.806852
21    5.0        0.75  0.575824  0.719472  0.806106
16    2.0        0.75  0.584278  0.719472  0.806589
15    2.0        0.50  0.584994  0.719472  0.805931
11    1.0        0.75  0.587695  0.719472  0.806589


In [21]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

def brute_force_random_targets(
    X_train, y_train, X_test, y_test, features,
    alphas=[0.1, 0.5, 1.0, 2.0],
    n_samples=1000000,  # up to "millions"
    report_every=50000,
    noise_scale=0.5,
    random_state=42
):
    rng = np.random.default_rng(random_state)
    results = []

    # Step 1: Baseline MLE coefficients
    log_reg = LogisticRegression(penalty=None, max_iter=1000, solver="lbfgs").fit(X_train, y_train)
    base_coef = log_reg.coef_[0]
    print("Baseline logistic coefficients (MLE):", base_coef)

    best_logloss = float("inf")
    best_combo = None

    # Step 2: Random sampling of target priors
    for i in range(1, n_samples + 1):
        # Random perturbation around baseline
        noise = rng.normal(0, noise_scale, size=base_coef.shape)
        mu = base_coef + noise

        for alpha in alphas:
            informed = SklearnTargetModel(alpha=alpha, model_type="logistic", targets=mu).fit(
                X_train, y_train, feature_names=features
            )
            probs_inf = informed.predict(X_test)

            ll = log_loss(y_test, probs_inf)
            auc = roc_auc_score(y_test, probs_inf)

            if ll < best_logloss:
                best_logloss = ll
                best_combo = {
                    "iter": i,
                    "alpha": alpha,
                    "log_loss": ll,
                    "auc": auc,
                    "accuracy": accuracy_score(y_test, (probs_inf > 0.5)),
                    "targets": mu.round(3).tolist()
                }

            if i % report_every == 0 and alpha == alphas[0]:
                print(f"[Iteration {i}] Best so far: log_loss={best_logloss:.4f}, "
                      f"AUC={best_combo['auc']:.3f}, alpha={best_combo['alpha']}, "
                      f"targets={best_combo['targets']}")

    print("\n=== Final Best Combo ===")
    print(best_combo)
    return best_combo


In [22]:
best_prior = brute_force_random_targets(
    X.values, y.values,
    X_cleveland.values, y_cleveland.values,
    features,
    alphas=[0.1, 0.5, 1.0, 2.0],
    n_samples=200000,    # adjust upward if you want millions
    report_every=20000,  # progress updates
    noise_scale=0.3      # controls how far we wander from MLE
)


Baseline logistic coefficients (MLE): [-0.01179657  1.56139634 -0.00438747  0.00643127 -0.023387    1.62953566]
[Iteration 20000] Best so far: log_loss=0.5572, AUC=0.808, alpha=0.1, targets=[0.091, 1.16, -0.069, 0.076, 0.08, 0.939]
[Iteration 40000] Best so far: log_loss=0.5544, AUC=0.813, alpha=2.0, targets=[0.016, 1.33, 0.046, -0.029, -0.073, 0.919]
[Iteration 60000] Best so far: log_loss=0.5544, AUC=0.813, alpha=2.0, targets=[0.016, 1.33, 0.046, -0.029, -0.073, 0.919]
[Iteration 80000] Best so far: log_loss=0.5544, AUC=0.813, alpha=2.0, targets=[0.016, 1.33, 0.046, -0.029, -0.073, 0.919]
[Iteration 100000] Best so far: log_loss=0.5544, AUC=0.813, alpha=2.0, targets=[0.016, 1.33, 0.046, -0.029, -0.073, 0.919]
[Iteration 120000] Best so far: log_loss=0.5526, AUC=0.811, alpha=2.0, targets=[0.078, 1.01, 0.037, -0.025, -0.152, 1.05]
[Iteration 140000] Best so far: log_loss=0.5526, AUC=0.811, alpha=2.0, targets=[0.078, 1.01, 0.037, -0.025, -0.152, 1.05]
[Iteration 160000] Best so far: log