# Finding the Optimal Policy with Linear Programming

In [1]:
"""
Solve the Dialogue/Recommendation MDP using Linear Programming with Pyomo.

MDP Setup:
- States: S0 (start), W (have weather), M (have mood), WM (have both), SUCCESS, ABANDON
- Actions: ASK_WEATHER, ASK_MOOD, ASK_BOTH, RECOMMEND (availability depends on state)
- Reward: +10 for reaching SUCCESS, 0 otherwise
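- Terminal states: SUCCESS, ABANDON (every episode ends, so discounting is not strictly needed; we still use γ < 1 for numerical stability, which makes the values slightly smaller than the undiscounted ones)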

LP Formulation:
    Minimize: Σ V(s)  (over non-terminal states)
    Subject to: V(s) ≥ Σ P(s'|s,a) * [R(s,a,s') + γ*V(s')]  for all valid (s,a) pairs
    
    Terminal states have V = 0 (no future rewards possible)
"""

import pyomo.environ as pyo
from enum import IntEnum


# ----- MDP Definition (from the user's code) -----
class State(IntEnum):
    """Dialogue states of the MDP, numbered 0..5 in declaration order.

    S0       start of the dialogue
    W        have weather
    M        have mood
    WM       have both
    SUCCESS  terminal state
    ABANDON  terminal state
    """

    S0, W, M, WM, SUCCESS, ABANDON = range(6)


class Action(IntEnum):
    """Actions of the MDP, numbered 0..3 in declaration order.

    Which of these can be taken depends on the current state.
    """

    ASK_WEATHER, ASK_MOOD, ASK_BOTH, RECOMMEND = range(4)


# States in which the dialogue has ended.
TERMINAL_STATES = {State.SUCCESS, State.ABANDON}
# Every other state, in enum declaration order.
NON_TERMINAL_STATES = [
    state for state in State if state not in TERMINAL_STATES
]

# Actions that may be taken in each state; terminal states offer none.
AVAILABLE_ACTIONS = {
    State.S0: [
        Action.ASK_WEATHER,
        Action.ASK_MOOD,
        Action.ASK_BOTH,
    ],
    State.W: [Action.ASK_MOOD],
    State.M: [Action.ASK_WEATHER],
    State.WM: [Action.RECOMMEND],
    State.SUCCESS: [],
    State.ABANDON: [],
}

# Transition model P(s' | s, a).
# Keys are (state, action) pairs, one for every action listed in
# AVAILABLE_ACTIONS; each value maps a next state to its probability.
# The probabilities in every inner dict sum to 1.
TRANSITIONS = {
    # Single questions from S0 can move to W or M, stay in S0, or end in ABANDON.
    (State.S0, Action.ASK_WEATHER): {
        State.W: 0.75,
        State.S0: 0.15,
        State.ABANDON: 0.10,
    },
    (State.S0, Action.ASK_MOOD): {
        State.M: 0.70,
        State.S0: 0.20,
        State.ABANDON: 0.10,
    },
    # ASK_BOTH has no S0 -> S0 entry: it can reach WM, W, M or ABANDON.
    (State.S0, Action.ASK_BOTH): {
        State.WM: 0.50,
        State.W: 0.20,
        State.M: 0.15,
        State.ABANDON: 0.15,
    },
    # From W or M the remaining question reaches WM, stays put, or ends in ABANDON.
    (State.W, Action.ASK_MOOD): {
        State.WM: 0.80,
        State.W: 0.10,
        State.ABANDON: 0.10,
    },
    (State.M, Action.ASK_WEATHER): {
        State.WM: 0.80,
        State.M: 0.10,
        State.ABANDON: 0.10,
    },
    # RECOMMEND always ends the dialogue: either SUCCESS or ABANDON.
    (State.WM, Action.RECOMMEND): {
        State.SUCCESS: 0.75,
        State.ABANDON: 0.25,
    },
}


def reward(state, action, next_state):
    """Return the immediate reward R(s, a, s') for one transition.

    Only the destination is inspected: a transition into State.SUCCESS
    pays 10.0 and every other transition pays 0.0. ``state`` and ``action``
    are part of the signature but are not used.
    """
    if next_state == State.SUCCESS:
        return 10.0
    return 0.0


# ----- Pyomo Model -----
# Discount factor, kept close to 1 because what matters is reaching SUCCESS.
gamma = 0.99

model = pyo.ConcreteModel("Dialogue_MDP_LP")

# One real-valued decision variable V(s) per state, terminal states included.
model.V = pyo.Var(list(State), domain=pyo.Reals)

# Terminal states are pinned to V = 0 through equality constraints.
model.terminal_success = pyo.Constraint(expr=model.V[State.SUCCESS] == 0)
model.terminal_abandon = pyo.Constraint(expr=model.V[State.ABANDON] == 0)

# Objective: the smallest total value over the non-terminal states.
model.obj = pyo.Objective(
    expr=sum(model.V[state] for state in NON_TERMINAL_STATES),
    sense=pyo.minimize,
)

# Bellman inequalities, one per valid (state, action) pair:
#     V(s) >= Σ_{s'} P(s'|s,a) * [R(s,a,s') + γ*V(s')]
model.bellman = pyo.ConstraintList()

for state in NON_TERMINAL_STATES:
    for action in AVAILABLE_ACTIONS[state]:
        outcomes = TRANSITIONS[(state, action)]

        # Expected immediate reward plus discounted value of the successor.
        backup = sum(
            p * (reward(state, action, successor) + gamma * model.V[successor])
            for successor, p in outcomes.items()
        )

        model.bellman.add(model.V[state] >= backup)

# ----- Solve -----
# NOTE(review): ipopt is a nonlinear interior-point solver. It can solve this
# LP, but a dedicated LP solver (e.g. glpk, cbc, highs) may be a better fit if
# one is installed — confirm what the target environment provides.
solver = pyo.SolverFactory('ipopt')
result = solver.solve(model, tee=False)

# Stop here if the solver did not report an optimal solution; otherwise the
# values printed below would be presented as optimal when they are not.
if not pyo.check_optimal_termination(result):
    raise RuntimeError(
        "Solver did not terminate optimally: "
        f"status={result.solver.status}, "
        f"termination_condition={result.solver.termination_condition}"
    )

# ----- Results -----
print("=" * 60)
print("Dialogue MDP Solution via Linear Programming")
print("=" * 60)

print("\nOptimal Value Function:")
for s in State:
    print(f"  V({s.name:8}) = {pyo.value(model.V[s]):8.4f}")

# Extract the optimal policy: for each non-terminal state, compute the
# one-step lookahead value Q(s, a) of every available action from the solved
# V and pick the action with the largest Q.
print("\nOptimal Policy:")
for s in NON_TERMINAL_STATES:
    action_values = []
    for a in AVAILABLE_ACTIONS[s]:
        trans = TRANSITIONS[(s, a)]
        q_value = sum(
            prob * (reward(s, a, s_next) + gamma * pyo.value(model.V[s_next]))
            for s_next, prob in trans.items()
        )
        action_values.append((a, q_value))

    # max() returns the first maximal item, so ties go to the action listed
    # first in AVAILABLE_ACTIONS.
    best_action, best_q = max(action_values, key=lambda item: item[1])

    # Show all Q-values for this state
    q_str = ", ".join(f"{a.name}={q:.3f}" for a, q in action_values)
    print(f"  π({s.name:8}) = {best_action.name:12}  [Q-values: {q_str}]")

print(f"\nSolver Status: {result.solver.status}")
print("=" * 60)

# ----- Verify: Expected reward from S0 -----
# V(S0) is the expected *discounted* reward. The only reward is +10 on
# reaching SUCCESS, so V(S0) / 10 approximates the success probability;
# because gamma < 1 it is slightly below the undiscounted probability.
print("\nInterpretation:")
print(f"  Starting from S0, the expected total reward is {pyo.value(model.V[State.S0]):.4f}")
print(f"  Since reward is +10 for SUCCESS, this means P(SUCCESS) ≈ {pyo.value(model.V[State.S0])/10:.2%}")

Dialogue MDP Solution via Linear Programming

Optimal Value Function:
  V(S0      ) =   5.9969
  V(W       ) =   6.5927
  V(M       ) =   6.5927
  V(WM      ) =   7.5000
  V(SUCCESS ) =   0.0000
  V(ABANDON ) =   0.0000

Optimal Policy:
  π(S0      ) = ASK_BOTH      [Q-values: ASK_WEATHER=5.786, ASK_MOOD=5.756, ASK_BOTH=5.997]
  π(W       ) = ASK_MOOD      [Q-values: ASK_MOOD=6.593]
  π(M       ) = ASK_WEATHER   [Q-values: ASK_WEATHER=6.593]
  π(WM      ) = RECOMMEND     [Q-values: RECOMMEND=7.500]

Solver Status: ok

Interpretation:
  Starting from S0, the expected total reward is 5.9969
  Since reward is +10 for SUCCESS, this means P(SUCCESS) ≈ 59.97%
