# 4 Model-free Method(无模型算法)

#### 通过前面的算法我们发现，必须要知道环境的状态转换概率（state-transition probability）才可以计算环境状态的 V-function 和 Q-function，这属于 model-based method；当我们不知道环境的状态是如何转变的时候，就属于 model-free problem。

定义本章所用环境

In [2]:
import gym
import pandas as pd
import random
from collections import defaultdict

# Blackjack environment shared by every cell in this chapter: the episode
# generators below call env.reset() / env.step(), and the epsilon-greedy
# policy reads env.action_space.
env = gym.make('Blackjack-v0')


## 4.1 Monte Carlo Methods(蒙特卡洛方法)

### Monte Carlo Prediction  
该算法目的是，求解当前policy下的v-function or q-function  
- V function

In [2]:
# Fixed policy whose value function is estimated by the prediction cells below.
def policy(state):
    """Return the action taken in `state`.

    The action is 0 once the first component of `state` exceeds 19 and 1
    otherwise (presumably 0 = stick and 1 = hit in Blackjack-v0 -- confirm
    against the environment documentation).
    """
    return 0 if state[0] > 19 else 1

In [3]:
# Episode generator: plays one complete game by following `policy`.
def generate_episode(policy, num_steps):  # num_steps caps the episode length so a game cannot run forever
    """Roll out a single episode in the module-level `env`.

    Args:
        policy: Callable mapping a state to an action.
        num_steps: Maximum number of actions taken before the episode is cut off.

    Returns:
        A list of (state, action, reward) tuples, one per step taken.
    """
    trajectory = []
    current = env.reset()  # start a new game from a fresh initial state

    for _ in range(num_steps):
        chosen = policy(current)
        following, reward, finished, _info = env.step(chosen)
        trajectory.append((current, chosen, reward))
        if finished:
            return trajectory
        current = following

    return trajectory

#print(generate_episode(policy, 100))

[((13, 6, False), 1, -1.0)]


In [21]:
# Monte Carlo prediction of the value function: accumulate, for every state,
# the sum of the returns observed from it and the number of times it was seen.
total_rewards = defaultdict(float)  # state -> sum of returns observed from that state
N = defaultdict(int)  # state -> number of times the state was visited

num_iter = 1000

for i in range(num_iter):
    episode = generate_episode(policy, 100)
    rewards = [step[2] for step in episode]  # per-step rewards of this episode
    for t, step in enumerate(episode):
        state = step[0]
        # Undiscounted return: every reward collected from step t to the end.
        N[state] += 1
        total_rewards[state] += sum(rewards[t:])

In [22]:
# Build the value table: V(s) = (sum of returns observed from s) / (visits to s).
# list(...) materialises the dict views before they are handed to pandas.
total_return = pd.DataFrame(list(total_rewards.items()), columns=['State', 'Total_rewards'])
# The counts frame gets its own name so the counter dict `N` is not overwritten;
# rebinding `N` to a DataFrame made this cell give a wrong table when re-run.
visit_counts = pd.DataFrame(list(N.items()), columns=['State', 'N'])
df = pd.merge(total_return, visit_counts, on="State")
df['value_table'] = df['Total_rewards'] / df['N']
df.head(10)

Unnamed: 0,State,Total_rewards,N,value_table
0,"(18, 9, True)",0.0,4,0.0
1,"(15, 9, False)",-9.0,13,-0.692308
2,"(20, 9, False)",11.0,12,0.916667
3,"(20, 2, False)",11.0,15,0.733333
4,"(12, 1, False)",-2.0,10,-0.2
5,"(15, 1, False)",-7.0,10,-0.7
6,"(21, 8, True)",4.0,4,1.0
7,"(19, 10, False)",-38.0,44,-0.863636
8,"(21, 9, True)",9.0,9,1.0
9,"(15, 2, True)",-2.0,2,-1.0


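我们也可以采用增量式均值（incremental mean）的方法求解 value function：每得到一个新的回报 $G$，先令 $N(s) \leftarrow N(s) + 1$，再令 $V(s) \leftarrow V(s) + \frac{1}{N(s)}\bigl(G - V(s)\bigr)$，这样无需保存累积回报之和。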

In [None]:
# Monte Carlo prediction of the value function using a running (incremental)
# mean, so no per-state sum of returns has to be stored.
v_function = defaultdict(float)  # state -> current value estimate
N = defaultdict(int)  # state -> number of times the state was visited

num_iter = 1000

for i in range(num_iter):
    episode = generate_episode(policy, 100)
    rewards = [step[2] for step in episode]  # per-step rewards of this episode
    for t, step in enumerate(episode):
        state = step[0]
        R = sum(rewards[t:])  # undiscounted return from step t onwards
        # incremental mean: move the estimate towards R by 1/N of the error
        N[state] += 1
        step_size = 1.0 / N[state]
        v_function[state] = v_function[state] + step_size * (R - v_function[state])

- Q function

In [6]:
# Monte Carlo prediction of the action-value function Q(s, a) under `policy`,
# using the same incremental-mean update as the value-function cell.
q_function = defaultdict(float)  # (state, action) -> current action-value estimate
N = defaultdict(int)  # (state, action) -> number of times the pair was visited

num_iter = 1000

for i in range(num_iter):
    episode = generate_episode(policy, 100)
    rewards = [r for s, a, r in episode]  # per-step rewards of this episode
    for t, (state, action, _) in enumerate(episode):
        R = sum(rewards[t:])  # undiscounted return from step t onwards
        # incremental mean
        N[(state, action)] += 1
        q_function[(state, action)] += 1.0 / N[(state, action)] * (R - q_function[(state, action)])

In [9]:
# Tabulate the estimated action values, one row per (state, action) pair,
# and show the first ten rows.
q = pd.DataFrame(list(q_function.items()), columns=['State-action', 'q-value'])
q.head(n=10)

Unnamed: 0,State-action,q-value
0,"((13, 5, False), 1)",-0.8
1,"((17, 5, False), 1)",-1.0
2,"((19, 5, False), 1)",-1.0
3,"((19, 10, True), 1)",-0.571429
4,"((17, 10, False), 1)",-0.755102
5,"((12, 10, False), 1)",-0.52381
6,"((9, 1, False), 1)",-1.0
7,"((17, 1, False), 1)",-0.777778
8,"((21, 3, True), 0)",1.0
9,"((14, 5, False), 1)",-0.5


### Monte Carlo Control  
计算 optimal policy

In [6]:
# Tables used by the Monte Carlo control cells.
Q = defaultdict(float)  # (state, action) -> estimated action value; missing entries read as 0.0
total_return = defaultdict(float)  # NOTE(review): not referenced by the control cells visible below -- confirm whether it is still needed
N = defaultdict(int)  # (state, action) -> number of returns averaged into Q

In [7]:
# Epsilon-greedy action selection over a Q-table.
def epsilon_greedy_policy(state, Q, epsilon=0.5):
    """Pick an action for `state`, exploring with probability `epsilon`.

    Args:
        state: Current environment state; used as the first half of the
            ``(state, action)`` key into `Q`.
        Q: Mapping indexed by ``(state, action)`` that yields the current
            action-value estimate.
        epsilon: Probability of sampling a random action from
            ``env.action_space`` instead of the greedy one. Defaults to 0.5,
            the value that was previously hard-coded.

    Returns:
        The sampled action when exploring, otherwise the action index in
        ``range(env.action_space.n)`` with the largest Q value (the lowest
        index wins ties).
    """
    if random.random() < epsilon:
        return env.action_space.sample()
    # NOTE: when Q is a defaultdict, this lookup inserts any missing
    # (state, action) key with the default value.
    return max(range(env.action_space.n), key=lambda action: Q[(state, action)])

In [8]:
# Episode generator for Monte Carlo control.
def generate_episode(policy, num_steps):  # num_steps caps the episode length so a game cannot run forever
    """Play one game in the module-level `env` and return its trajectory.

    Args:
        policy: Either a Q-table indexed by ``(state, action)`` -- actions are
            then chosen with ``epsilon_greedy_policy(state, policy)`` -- or a
            callable ``state -> action``, which is invoked directly. Accepting
            both keeps calls such as ``generate_episode(policy, 100)`` from the
            prediction cells working after this definition replaces the
            earlier function of the same name.
        num_steps: Maximum number of actions taken before the episode is cut off.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken.
    """
    if callable(policy):
        choose_action = policy
    else:
        def choose_action(state):
            return epsilon_greedy_policy(state, policy)

    episode = []
    state = env.reset()  # start a new game from a fresh initial state

    for _ in range(num_steps):
        action = choose_action(state)
        next_state, reward, done, _info = env.step(action)
        episode.append((state, action, reward))
        if done:
            break
        state = next_state

    return episode

In [10]:
# Monte Carlo control: learn the action-value table the epsilon-greedy policy
# acts on. The agent chooses its actions from Q, so improving the policy
# amounts to improving the Q estimates; the update is therefore the same
# incremental-mean rule used for Q-function prediction above.
num_iter = 1000

for i in range(num_iter):
    episode = generate_episode(Q, 100)
    rewards = [r for s, a, r in episode]  # per-step rewards of this episode
    for t, (state, action, _) in enumerate(episode):
        R = sum(rewards[t:])  # undiscounted return from step t onwards
        # incremental mean
        N[(state, action)] += 1
        Q[(state, action)] += 1.0 / N[(state, action)] * (R - Q[(state, action)])

In [11]:
# Tabulate the learned action values, one row per (state, action) pair,
# and show the first eleven rows.
df = pd.DataFrame(list(Q.items()), columns=['state-action', 'value'])
df.head(n=11)

Unnamed: 0,state-action,value
0,"((21, 10, True), 1)",-0.333333
1,"((16, 10, False), 0)",-0.538462
2,"((16, 10, False), 1)",-0.470588
3,"((12, 6, False), 1)",0.0
4,"((13, 6, False), 0)",-0.25
5,"((20, 5, False), 0)",0.875
6,"((20, 5, False), 1)",-1.0
7,"((9, 10, False), 0)",-0.818182
8,"((9, 10, False), 1)",-1.0
9,"((16, 1, True), 0)",-1.0
