Learning to rank.


In [None]:
from wvpy.jtools import declare_task_variables

# set up for external override
with declare_task_variables(globals()):
    rand_seed = 2024
    do_display = True
    result_fname = ""

In [None]:
# set up Python
import logging
import numpy as np
import pandas as pd
import json
from sklearn.linear_model import LogisticRegression
from cmdstanpy import CmdStanModel
from plotnine import *
from rank_plotting_fns import plot_rank_performance, run_stan_model

# quiet down Stan
logger = logging.getLogger("cmdstanpy")
logger.addHandler(logging.NullHandler())

In [None]:
rng = np.random.default_rng(rand_seed)

In [None]:
example_name = "uci wine example"

In [None]:
if do_display:
    print(example_name)

In [None]:
# choose the data files and the simulation scale for the selected example
if example_name == "uci wine example":
    # read data and set scale of system
    features_frame = pd.read_csv("uci_wine_example_features.csv")
    features_scores = pd.read_csv("uci_wine_example_scores.csv")
    score_name = "logistic_score"  # column of features_scores used as the hidden score
    m_examples: int = 100  # number of simulated panels
    noise_scale = 3.87  # multiplier on the standard normal observation noise
    position_penalty_scale = -2.7123  # additive penalty per step in display position
elif example_name == "sklearn wine example":
    # read data and set scale of system
    features_frame = pd.read_csv("sklearn_wine_example_features.csv")
    features_scores = pd.read_csv("sklearn_wine_example_scores.csv")
    score_name = "score"  # column of features_scores used as the hidden score
    m_examples: int = 100  # number of simulated panels
    noise_scale = 18.7  # multiplier on the standard normal observation noise
    position_penalty_scale = -13.123  # additive penalty per step in display position
else:
    # a bare string cannot be raised (doing so is itself a TypeError),
    # so raise a real exception that names the unrecognized option
    raise ValueError(f"bad option: {example_name!r}")

# when True the hidden score is added to the score comparison frame
know_score: bool = True

In [None]:
features_frame.head()

In [None]:
features_frame.shape

In [None]:
collected_stats = []

In [None]:
n_alternatives: int = 5

In [None]:
n_vars = features_frame.shape[1] + n_alternatives

In [None]:
position_penalties = [position_penalty_scale * i for i in range(n_alternatives)]

position_penalties

In [None]:
# assemble panels of observations with top scoring entry picked
# Each panel has n_alternatives slots; per slot we record the display position,
# the sampled item id, a (temporary) noisy score, and a 0/1 pick indicator.
# NOTE: the order of the rng calls below determines the simulated data for a given seed.
observations = dict()
for sel_i in range(n_alternatives):
    observations[f"display_position_{sel_i}"] = [sel_i] * m_examples
    # item row numbers for this slot, sampled with replacement
    selected_examples = rng.choice(
        features_frame.shape[0], size=m_examples, replace=True
    )
    observations[f"item_id_{sel_i}"] = selected_examples
    observations[f"score_value_{sel_i}"] = (
        [  # noisy observation of score plus position penalty
            features_scores.loc[int(selected_examples[i]), score_name]  # item score
            + position_penalties[sel_i]  # positional penalty
            + noise_scale * rng.normal(size=1)[0]  # observation noise
            for i in range(m_examples)
        ]
    )
    # picks start at 0 and are filled in once all slots are scored
    observations[f"pick_value_{sel_i}"] = [0] * m_examples
observations = pd.DataFrame(observations)
# mark selections
# the pick is the slot with the largest noisy score; the strict ">" keeps
# the earliest slot when scores tie
for i in range(m_examples):
    best_j = 0
    for j in range(1, n_alternatives):
        if (
            observations[f"score_value_{j}"][i]
            > observations[f"score_value_{best_j}"][i]
        ):
            best_j = j
    observations.loc[i, f"pick_value_{best_j}"] = 1
# make sure we don't have a column we would not know in practice
# (the noisy scores are dropped; only positions, item ids, and picks remain)
observations = observations.loc[
    :, [c for c in observations.columns if not c.startswith("score_value_")]
].reset_index(drop=True, inplace=False)

observations.head()

In [None]:
observations.loc[
    :, [c for c in observations.columns if c.startswith("pick_value_")]
].mean(axis=0)

In [None]:
# This is the data
observations[
    [c for c in observations.columns if not c.startswith("display_position_")]
].head(10)

In [None]:
# start from a frame with one row per item and no columns yet
score_compare_frame = pd.DataFrame([[] for _ in range(len(features_frame))])
if know_score:
    # would not know this for non-synthetic data
    score_compare_frame["hidden concept"] = features_scores[score_name]


Try a Stan model.


In [None]:
# move each panel's picked alternative into slot 0 by exchanging it with
# whatever currently occupies slot 0
observations_sorted = observations.copy()
for passed_i in range(1, n_alternatives):
    for row_i in range(m_examples):
        if not (observations_sorted.loc[row_i, f"pick_value_{passed_i}"] > 0):
            continue
        # exchange the slot-0 and slot-passed_i values of every per-slot field
        for field in ("display_position", "item_id", "pick_value"):
            dest_col, source_col = f"{field}_0", f"{field}_{passed_i}"
            v_source = observations_sorted.loc[row_i, source_col]
            v_dest = observations_sorted.loc[row_i, dest_col]
            observations_sorted.loc[row_i, source_col] = v_dest
            observations_sorted.loc[row_i, dest_col] = v_source
# the original display position travels with the item; expose it under the name encoding_<slot>
observations_sorted = observations_sorted.rename(
    columns={f"display_position_{i}": f"encoding_{i}" for i in range(n_alternatives)}
)

In [None]:
observations_sorted.head(10)

In [None]:
observations_sorted[
    [
        c
        for c in observations_sorted.columns
        if not c.startswith("pick_value_")
    ]
].head(10)

In [None]:
# after the swap every pick must sit in slot 0 and in no other slot
assert (observations_sorted["pick_value_0"] == 1).all()
for sel_i in range(1, n_alternatives):
    slot_picks = observations_sorted[f"pick_value_{sel_i}"]
    assert (slot_picks == 0).all()

In [None]:
# Stan program text for the per-panel model, assembled from fixed pieces plus
# one generated line per passed-over slot (slots 1 .. n_alternatives - 1).
stan_model_panel_src = (
    # data block header: sizes and the design matrix of the picked items
    """
data {
  int<lower=1> n_vars;                     // number of variables per alternative
  int<lower=1> m_examples;                 // number of examples
  matrix[m_examples, n_vars] x_picked;     // character of picked examples
"""
    # one design matrix declaration per passed-over slot
    + "".join(
        [
            f"""  matrix[m_examples, n_vars] x_passed_{i};   // character of passed examples
"""
            for i in range(1, n_alternatives)
        ]
    )
    # parameters: coefficients plus an explicit per-panel noise term on the picked score
    + """}
parameters {
  vector[n_vars] beta;                      // model parameters
  vector[m_examples] error_picked;          // reified noise term on picks (the secret sauce!)
}
transformed parameters {
  vector[m_examples] expect_picked;
  vector[m_examples] v_picked;
"""
    # declare the expected score vector of each passed-over slot
    + "".join(
        [
            f"""  vector[m_examples] expect_passed_{i};
"""
            for i in range(1, n_alternatives)
        ]
    )
    # expected and realized (expected + noise) score of the picked item
    + """  expect_picked = x_picked * beta;          // modeled expected score of picked item
  v_picked = expect_picked + error_picked;  // reified actual score of picked item
"""
    # expected score of each passed-over slot
    + "".join(
        [
            f"""  expect_passed_{i} = x_passed_{i} * beta;      // modeled expected score of passed item
"""
            for i in range(1, n_alternatives)
        ]
    )
    # model block: normal(0, 10) priors on beta and on the picked-item noise
    + """}
model {
    // basic priors
  beta ~ normal(0, 10);
  error_picked ~ normal(0, 10);
    // log probability of observed ordering as a function of parameters
    // terms are independent conditioned on knowing value of v_picked!
"""
    # one normal_lcdf term per passed-over slot, evaluated at the picked item's realized score
    + "".join(
        [
            f"""  target += normal_lcdf( v_picked | expect_passed_{i}, 10);
"""
            for i in range(1, n_alternatives)
        ]
    )
    + """}
"""
)

if do_display:
    print(stan_model_panel_src)

In [None]:
def fmt_array(a) -> str:
    """Return the JSON text of the elements of iterable `a`, encoded as a JSON array."""
    elements = list(a)
    return json.dumps(elements)


def mk_posn_indicator(posn: int, n_positions=None) -> list:
    """
    Build a one-hot indicator list marking a display position.

    :param posn: position to mark; ordinary list indexing applies, so a value
                 at or beyond the list length raises IndexError.
    :param n_positions: length of the indicator list; when omitted the
                        notebook-level n_alternatives is used.
    :return: list of n_positions integers, 1 at index posn and 0 elsewhere.
    """
    if n_positions is None:
        n_positions = n_alternatives
    posn_indicators = [0] * n_positions
    posn_indicators[posn] = 1
    return posn_indicators


def f_i(sel_i: int) -> str:
    """
    Return the JSON text of the design matrix for panel slot `sel_i`.

    Each row is the feature row of the item held in that slot of
    observations_sorted, followed by the position indicator for the slot's
    encoding_<sel_i> value.
    """
    item_ids = observations_sorted[f"item_id_{sel_i}"]
    positions = observations_sorted[f"encoding_{sel_i}"]
    design_rows = []
    for item_id, posn in zip(item_ids, positions):
        feature_values = list(features_frame.loc[int(item_id), :])
        design_rows.append(feature_values + mk_posn_indicator(int(posn)))
    return fmt_array(design_rows)


# JSON text of the Stan data: the two sizes, the picked-item design matrix, and
# one passed-item design matrix per remaining slot.
# NOTE: the comma after "x_picked" is always emitted, so the text is only valid
# JSON when at least one x_passed_* entry follows (n_alternatives >= 2).
data_str = (
    f"""
{{
 "n_vars" : {n_vars},
 "m_examples" : {m_examples},
 "x_picked" : {f_i(0)},
"""
    # passed-over slots, separated by ",\n"
    + """,
""".join(
        [f""" "x_passed_{i}" : {f_i(i)}""" for i in range(1, n_alternatives)]
    )
    + """
}
"""
)

In [None]:
fit = run_stan_model(
    stan_model_src=stan_model_panel_src,
    data_str=data_str,
)

In [None]:
# get implied sample weights from chain
wt_frame = fit.draws_pd(vars=["lp__"])

wt_frame

In [None]:
if do_display:
    stddev = np.sqrt(np.var(wt_frame['lp__']))
    log_samples = np.log(wt_frame.shape[0])
    (
        ggplot(
            data=wt_frame,
            mapping=aes(x="lp__"),
        )
        + geom_density(fill="gray", alpha=0.7)
        + ggtitle(f"{example_name} Stan lp__ value on panel draws\nstandard deviation: {stddev:.2f}, log samples = {log_samples:.2f}")
    ).show()

In [None]:
beta_draws = fit.draws_pd(vars=["beta"])
# relabel a copy of the draws for display: feature names first, then one
# position effect per panel slot
position_effect_names = [
    f"position_effect_{sel_i}" for sel_i in range(n_alternatives)
]
beta_draws_display = beta_draws.copy()
beta_draws_display.columns = [*features_frame.columns, *position_effect_names]

beta_draws_display

In [None]:
# this is the entirety of what we pull out of Stan - per modeled preference cohort
# from now on we do not use Stan
# estimate: mean of the draws whose lp__ is at or above the 0.9 quantile of lp__
lp_cutoff = np.quantile(wt_frame["lp__"], 0.9)
in_top_lp = wt_frame["lp__"] >= lp_cutoff
estimated_beta_Stan = beta_draws_display.loc[in_top_lp, :].mean()
# alternative (mean over all draws): estimated_beta_Stan = beta_draws_display.mean()
estimated_beta_Stan

In [None]:
# quartiles of the position coefficients (the columns after the feature columns),
# one row per position, with the quantile levels as string column names
first_position_col = features_frame.shape[1]
position_quantiles = (
    beta_draws.iloc[:, first_position_col:n_vars]
    .quantile((0.25, 0.5, 0.75))
    .T.reset_index(drop=True)
    .rename(columns=str)
)

In [None]:
stat_pull = plot_rank_performance(
    estimated_beta=estimated_beta_Stan,  # estimated coefficients
    example_name=example_name,  # name of data set
    n_vars=n_vars,  # number of variables (including position variables)
    n_alternatives=n_alternatives,  # size of panels
    features_frame=features_frame,  # features by row id
    observations=observations,  # observations layout frame
    estimate_name="Stan panel model",  # display name of estimate
    position_quantiles=position_quantiles,  # quantiles of estimated positions
    position_penalties=position_penalties,  # ideal position penalties
    score_compare_frame=score_compare_frame,  # score comparison frame (altered by call)
    rng=rng,  # pseudo random source
    show_plots=do_display,
)
collected_stats.append(stat_pull)

Try to approximate the Stan model with a logistic model with similar error structure.
Consider each pair of panel entries with a different outcome as an observation and try to
build a model that reproduces the observed outcomes.
The extra trick is: repeat the whole data frame negated, with the outcomes reversed (so
we don't define a problem with all positive or all negative outcomes).


In [None]:
# Encode every (picked, passed) pair within a panel as one row:
# the feature difference picked - passed, followed by a position vector holding
# +1 at the picked item's slot and -1 at the passed item's slot.
feature_names = list(features_frame.columns) + [
    f"position_{sel_i}" for sel_i in range(n_alternatives)
]
pick_columns = [f"pick_value_{sel_i}" for sel_i in range(n_alternatives)]
encoded_rows = []
for row_i in range(observations.shape[0]):
    feature_row = observations.loc[row_i, :]
    # slot of the picked item (first slot with the largest pick indicator)
    sel_pick = int(np.argmax(feature_row[pick_columns].to_numpy()))
    picked_features = features_frame.loc[feature_row[f"item_id_{sel_pick}"], :]
    for sel_i in range(n_alternatives):
        if sel_i == sel_pick:
            continue  # an item is not compared against itself
        posn_vec = [0.0] * n_alternatives
        posn_vec[sel_pick] = 1.0
        posn_vec[sel_i] = -1.0
        passed_features = features_frame.loc[feature_row[f"item_id_{sel_i}"], :]
        encoded_rows.append(list(picked_features - passed_features) + posn_vec)
# build the frame once from plain rows instead of concatenating many one-row frames;
# explicit columns make a row/name length mismatch an error instead of silent truncation
enc_frame = pd.DataFrame(encoded_rows, columns=feature_names)

In [None]:
enc_frame.head()

In [None]:
# fit on the comparison rows (labelled True) stacked on their negations
# (labelled False) so both outcome classes are present
n_comparisons = enc_frame.shape[0]
logistic_model = LogisticRegression(max_iter=10000, solver="newton-cholesky")
logistic_model.fit(
    pd.concat([enc_frame, -enc_frame], ignore_index=True),
    [True] * n_comparisons + [False] * n_comparisons,
)

In [None]:
stat_pull = plot_rank_performance(
    estimated_beta=logistic_model.coef_[0],  # estimated coefficients
    example_name=example_name,  # name of data set
    n_vars=n_vars,  # number of variables (including position variables)
    n_alternatives=n_alternatives,  # size of panels
    features_frame=features_frame,  # features by row id
    observations=observations,  # observations layout frame
    estimate_name="logistic model",  # display name of estimate
    position_quantiles=None,  # quantiles of estimated positions
    position_penalties=position_penalties,  # ideal position penalties
    score_compare_frame=score_compare_frame,  # score comparison frame (altered by call)
    rng=rng,  # pseudo random source
    show_plots=do_display,
)
collected_stats.append(stat_pull)

We can also try a related Stan model per-comparison, instead of per-panel.

In [None]:
# Stan program text for the per-comparison model: same data layout as the panel
# model, but with no per-panel noise parameter; each passed-over slot contributes
# one pairwise term comparing it against the picked item.
stan_model_comparison_src = (
    # data block header: sizes and the design matrix of the picked items
    """
data {
  int<lower=1> n_vars;                     // number of variables per alternative
  int<lower=1> m_examples;                 // number of examples
  matrix[m_examples, n_vars] x_picked;     // character of picked examples
"""
    # one design matrix declaration per passed-over slot
    + "".join(
        [
            f"""  matrix[m_examples, n_vars] x_passed_{i};   // character of passed examples
"""
            for i in range(1, n_alternatives)
        ]
    )
    # the coefficient vector is the only parameter
    + """}
parameters {
  vector[n_vars] beta;                      // model parameters
}
transformed parameters {
  vector[m_examples] expect_picked;
"""
    # declare the expected score vector of each passed-over slot
    + "".join(
        [
            f"""  vector[m_examples] expect_passed_{i};
"""
            for i in range(1, n_alternatives)
        ]
    )
    + """  expect_picked = x_picked * beta;          // modeled expected score of picked item
"""
    # expected score of each passed-over slot
    + "".join(
        [
            f"""  expect_passed_{i} = x_passed_{i} * beta;      // modeled expected score of passed item
"""
            for i in range(1, n_alternatives)
        ]
    )
    + """}
model {
    // basic priors
  beta ~ normal(0, 10);
    // log probability of observed ordering as a function of parameters
"""
    # one normal_lcdf term per passed-over slot on the difference (passed - picked)
    # of expected scores, with scale sqrt(2) * 10
    + "".join(
        [
            f"""  target += normal_lcdf( 0 | expect_passed_{i} - expect_picked, sqrt(2) * 10);
"""
            for i in range(1, n_alternatives)
        ]
    )
    + """}
"""
)

if do_display:
    print(stan_model_comparison_src)

In [None]:
fit_comp = run_stan_model(
    stan_model_src=stan_model_comparison_src,
    data_str=data_str,
)

In [None]:
# get implied sample weights from chain
wt_frame_c = fit_comp.draws_pd(vars=["lp__"])
if do_display:
    stddev_c = np.sqrt(np.var(wt_frame_c['lp__']))
    log_samples_c = np.log(wt_frame_c.shape[0])
    (
        ggplot(
            data=wt_frame_c,
            mapping=aes(x="lp__"),
        )
        + geom_density(fill="gray", alpha=0.7)
        + ggtitle(f"{example_name} Stan lp__ value on comparison draws\nstandard deviation: {stddev_c:.2f}, log samples = {log_samples_c:.2f}")
    ).show()

In [None]:
beta_draws_c = fit_comp.draws_pd(vars=["beta"])
# estimate: mean of the draws whose lp__ is at or above the 0.9 quantile of lp__
lp_cutoff_c = np.quantile(wt_frame_c["lp__"], 0.9)
in_top_lp_c = wt_frame_c["lp__"] >= lp_cutoff_c
estimated_beta_Stan_c = beta_draws_c.loc[in_top_lp_c, :].mean()
# alternative (mean over all draws): estimated_beta_Stan_c = beta_draws_c.mean()

In [None]:
# quartiles of the position coefficients (the columns after the feature columns),
# one row per position, with the quantile levels as string column names
first_position_col_c = features_frame.shape[1]
position_quantiles_c = (
    beta_draws_c.iloc[:, first_position_col_c:n_vars]
    .quantile((0.25, 0.5, 0.75))
    .T.reset_index(drop=True)
    .rename(columns=str)
)

In [None]:
stat_pull = plot_rank_performance(
    estimated_beta=estimated_beta_Stan_c,  # estimated coefficients
    example_name=example_name,  # name of data set
    n_vars=n_vars,  # number of variables (including position variables)
    n_alternatives=n_alternatives,  # size of panels
    features_frame=features_frame,  # features by row id
    observations=observations,  # observations layout frame
    estimate_name="Stan comparisons model",  # display name of estimate
    position_quantiles=position_quantiles_c,  # quantiles of estimated positions
    position_penalties=position_penalties,  # ideal position penalties
    score_compare_frame=score_compare_frame,  # score comparison frame (altered by call)
    rng=rng,  # pseudo random source
    show_plots=do_display,
)
collected_stats.append(stat_pull)

In [None]:
# combine the per-model statistics frames into a single table
# the guard keeps the cell safe to re-run: after the first run this name already
# holds the combined frame, and passing a DataFrame to pd.concat raises TypeError
if not isinstance(collected_stats, pd.DataFrame):
    collected_stats = pd.concat(collected_stats, ignore_index=True)

In [None]:
# write the collected statistics only when an output file name was supplied
# (None and the empty string both mean "do not save")
if result_fname:
    collected_stats.to_csv(result_fname, index=False)

In [None]:
if do_display:
    display(collected_stats)