<a href="https://colab.research.google.com/github/Papa-Panda/Paper_reading/blob/main/MCP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Reference video (Bilibili): https://www.bilibili.com/video/BV11MEHzwEp6/?spm_id_from=333.337.search-card.all.click&vd_source=83baba81780fd95e96c22e9346057527

In [1]:
import random
from collections import defaultdict

# Environment: Simulated LLM response environment
class LLMEnvironment:
    """Toy environment where prompt categories are states and response styles are actions."""

    def __init__(self):
        # Prompt categories an episode can start in.
        self.states = ["greeting", "math_question", "story_prompt"]
        # Response styles a policy can choose between.
        self.actions = ["formal", "casual", "concise", "detailed"]

    def get_possible_states(self):
        """Return the environment's list of states."""
        return self.states

    def reset(self):
        """Return a start state drawn at random from the known states."""
        return random.choice(self.states)

    def step(self, state, action):
        """Score a response style for a prompt category.

        Returns 1.0 when ``action`` is the preferred style for ``state``
        and 0.2 for every other combination.
        """
        # Toy reward table: each prompt category has exactly one preferred style.
        preferred_pairs = (
            ("greeting", "casual"),
            ("math_question", "concise"),
            ("story_prompt", "detailed"),
        )
        if (state, action) in preferred_pairs:
            return 1.0
        return 0.2

# Monte Carlo Prediction
class MonteCarloPredictor:
    """First-visit Monte Carlo estimator of state values under a fixed policy.

    Args:
        env: Object providing ``reset()`` (returns a start state) and
            ``step(state, action)`` (returns a numeric reward).
        policy: Callable mapping a state to an action.
        episodes: Number of episodes sampled by each ``evaluate()`` call.
        gamma: Discount factor applied to later rewards. The default of 1.0
            leaves returns undiscounted.
    """

    def __init__(self, env, policy, episodes=1000, gamma=1.0):
        self.env = env
        self.policy = policy
        self.episodes = episodes
        self.gamma = gamma
        # Running totals per state; V[state] is always returns_sum / returns_count.
        self.returns_sum = defaultdict(float)
        self.returns_count = defaultdict(int)
        self.V = defaultdict(float)

    def generate_episode(self):
        """Sample one episode as a list of ``(state, reward)`` pairs in time order.

        The episode is a single step: reset the environment, ask the policy
        for an action and record the reward of that one interaction.
        """
        state = self.env.reset()
        action = self.policy(state)
        reward = self.env.step(state, action)
        return [(state, reward)]

    def evaluate(self):
        """Sample ``self.episodes`` episodes and update the value estimates.

        For each episode, every distinct state contributes the discounted
        return that follows its first occurrence. Totals are kept on the
        instance, so repeated calls keep refining the same estimates.

        Returns:
            The ``defaultdict`` mapping each seen state to its mean return.
        """
        for _ in range(self.episodes):
            episode = self.generate_episode()

            # Walk backwards accumulating the return. Overwriting the entry on
            # every occurrence means the value that survives belongs to the
            # earliest (first) visit of each state.
            first_visit_return = {}
            G = 0.0
            for state, reward in reversed(episode):
                G = self.gamma * G + reward
                first_visit_return[state] = G

            for state, state_return in first_visit_return.items():
                self.returns_sum[state] += state_return
                self.returns_count[state] += 1
                self.V[state] = self.returns_sum[state] / self.returns_count[state]
        return self.V

# Example policy
def simple_policy(state):
    """Return the response style this fixed policy picks for ``state``.

    States without a dedicated entry fall back to ``"formal"``.
    """
    style_by_state = {
        "greeting": "casual",
        "math_question": "concise",
        "story_prompt": "detailed",
    }
    if state in style_by_state:
        return style_by_state[state]
    return "formal"

# Run
# Evaluate the fixed policy on the toy environment over 500 sampled episodes.
env = LLMEnvironment()
mcp = MonteCarloPredictor(env=env, policy=simple_policy, episodes=500)
value_estimates = mcp.evaluate()

# Report the estimate for every state, in the environment's own order.
for state in env.get_possible_states():
    print(
        f"Estimated value for state '{state}': {value_estimates[state]:.2f}"
    )

Estimated value for state 'greeting': 1.00
Estimated value for state 'math_question': 1.00
Estimated value for state 'story_prompt': 1.00
