# In [None]:
'This code might take a long time to run'
'Another sample code from https://github.com/kumarnikhil936/q_learning_mountain_car_openai might be easier to run'

import numpy as np
import pandas as pd
import time
import gym
import csv
import os
import pickle
from queue import Queue
import pickle

class QLearning:
    """Tabular Q-learning agent over a discretized (position, velocity) observation.

    The two observation components are bucketed into ``num_pos`` and
    ``num_vel`` equal-width bins; every (position bin, velocity bin) pair is
    one row of ``q_table`` and every column is one action.

    Note on ``epsilon``: in this class it is the probability of taking the
    *greedy* action (see ``choose_action``), so ``epsilon = 1`` means "never
    explore".
    """

    def __init__(self, actions_space, learning_rate=0.01, reward_decay=0.99, e_greedy=0.6):
        """Create an agent with a randomly initialised Q-table.

        Args:
            actions_space: Action space object; must expose ``n`` (number of
                discrete actions) and ``sample()`` (draw a random action).
            learning_rate: Step size applied to each temporal-difference update.
            reward_decay: Discount factor applied to the next state's value.
            e_greedy: Probability of choosing the greedy action.
        """
        self.actions = actions_space
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.num_pos = 20   # number of position bins
        self.num_vel = 14   # number of velocity bins

        # One row per discretized state, one column per action.
        self.q_table = np.random.uniform(
            low=-1, high=1, size=(self.num_pos * self.num_vel, self.actions.n)
        )
        # Bin edges; presumably the MountainCar observation bounds -- confirm
        # against the environment if it is ever swapped out.
        self.pos_bins = self.toBins(-1.2, 0.6, self.num_pos)
        self.vel_bins = self.toBins(-0.07, 0.07, self.num_vel)

    def choose_action(self, state):
        """Return the greedy action with probability ``epsilon``, else a random one.

        Args:
            state: Row index into ``q_table``.
        """
        if np.random.uniform() < self.epsilon:
            return np.argmax(self.q_table[state])
        return self.actions.sample()

    def toBins(self, clip_min, clip_max, num):
        """Return the ``num + 1`` evenly spaced edges of ``num`` bins on [clip_min, clip_max]."""
        return np.linspace(clip_min, clip_max, num + 1)

    def digit(self, x, bin):
        """Return the 1-based index of the bin that ``x`` falls into.

        Values outside the edges are clamped into the first or last bin.
        Without the clamp, ``np.digitize`` yields 0 below the first edge and
        ``len(bin)`` at or above the last one; that happens in practice when
        a float32 observation sits on a bound (e.g. ``np.float32(-1.2)`` is
        slightly below ``-1.2``) and would produce a wrapped-around or
        out-of-range Q-table row.

        Args:
            x: Scalar value to bucket.
            bin: Increasing array of bin edges as produced by ``toBins``.

        Returns:
            An integer in ``[1, len(bin) - 1]``.
        """
        n = np.digitize(x, bins=bin)
        return np.clip(n, 1, len(bin) - 1)

    def digitize_state(self, observation):
        """Map a ``(position, velocity)`` observation to its Q-table row index.

        Rows are laid out velocity-major: ``vel_index * num_pos + pos_index``
        with both indices 0-based.
        """
        position, velocity = observation
        pos_index = self.digit(position, self.pos_bins) - 1
        vel_index = self.digit(velocity, self.vel_bins) - 1
        return vel_index * self.num_pos + pos_index

    def learn(self, state, action, r, next_state):
        """Apply one Q-learning update for the transition (state, action, r, next_state).

        The target is ``r + gamma * max_a Q(next_state, a)``; the stored value
        moves towards it by a fraction ``lr``.
        """
        best_next_value = np.max(self.q_table[next_state])
        q_target = r + self.gamma * best_next_value
        q_predict = self.q_table[state, action]
        self.q_table[state, action] += self.lr * (q_target - q_predict)


def train(episodes=10000, max_steps=300):
    """Train a QLearning agent on MountainCar-v0 and pickle it to ./tmp/carmountain.model.

    Written against the classic Gym API in which ``env.reset()`` returns the
    observation and ``env.step()`` returns ``(observation, reward, done, info)``.

    Every step prints ``action, reward, done, state, next_state``; this is
    kept from the original and is a large part of why a full run is slow.

    Args:
        episodes: Number of training episodes to run.
        max_steps: Upper bound on the steps taken within one episode.
    """
    env = gym.make('MountainCar-v0')
    try:
        print(env.action_space)
        # To resume an earlier run instead, unpickle the saved agent here and
        # re-attach ``env.action_space`` to ``agent.actions``.
        agent = QLearning(env.action_space)

        for _ in range(episodes):
            state = agent.digitize_state(env.reset())
            for t in range(max_steps):
                action = agent.choose_action(state)
                observation, reward, done, _info = env.step(action)
                next_state = agent.digitize_state(observation)
                # Large bonus for a zero-reward step.
                # NOTE(review): the stock MountainCar-v0 appears to return -1
                # on every step, in which case this never fires -- confirm
                # against the installed environment.
                if reward == 0:
                    reward += 1000

                print(action, reward, done, state, next_state)
                agent.learn(state, action, reward, next_state)
                state = next_state
                if done:
                    print("Episode finished after {} timesteps".format(t+1))
                    break
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    print(agent.q_table)

    # Save the trained agent. Create the target directory first so a long
    # training run is not lost to a missing ./tmp folder.
    model_path = os.getcwd()+'/tmp/carmountain.model'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, 'wb') as f:
        pickle.dump(agent, f)

def test():
    """Load the pickled agent and run it greedily for one rendered episode.

    The agent is read from ./tmp/carmountain.model (as written by ``train``),
    switched to always-greedy action selection, and stepped for at most 500
    steps. It keeps updating its Q-table while it runs. The loop stops as soon
    as the environment reports ``done``, because stepping a finished episode
    without a reset is not supported by Gym.
    """
    env = gym.make('MountainCar-v0')
    try:
        print(env.action_space)
        # NOTE: unpickling executes code embedded in the file; only load
        # models produced by this script.
        with open(os.getcwd()+'/tmp/carmountain.model', 'rb') as f:
            agent = pickle.load(f)
        agent.actions = env.action_space
        # choose_action is greedy whenever uniform() < epsilon, so 1 disables
        # random exploration.
        agent.epsilon = 1
        state = agent.digitize_state(env.reset())

        for _ in range(500):
            action = agent.choose_action(state)
            observation, reward, done, _info = env.step(action)
            next_state = agent.digitize_state(observation)
            print(action, reward, done, state, next_state)
            agent.learn(state, action, reward, next_state)
            state = next_state
            env.render()
            if done:
                break
    finally:
        # Always release the environment / render window.
        env.close()

def run_test():
    """Drive MountainCar-v0 with uniformly random actions as a visual smoke test.

    Runs for at most 500 rendered steps, printing the action, reward, done
    flag and observation of each step, and pausing 20 ms between frames.
    The loop stops once the environment reports ``done``, because stepping a
    finished episode without a reset is not supported by Gym.
    """
    env = gym.make('MountainCar-v0')
    try:
        env.reset()

        for _ in range(500):
            action = np.random.choice([0, 1, 2])
            observation, reward, done, _info = env.step(action)
            print(action, reward, done)
            print(observation)
            env.render()
            time.sleep(0.02)
            if done:
                break
    finally:
        # Always release the environment / render window.
        env.close()

if __name__ == "__main__":
    # Script entry point: train and save an agent, then replay the saved one.
    for stage in (train, test):
        stage()

    # run_test()