# In [None]:
import gym
import numpy as np

class QLearningAgent:
    """Epsilon-greedy Q-learning agent with a linear action-value function.

    Q-values for a state are computed as ``np.dot(state, weights)``, where
    ``weights`` has shape ``(state_size, action_size)`` and starts at zero.
    The raw state vector is used as the feature vector; no bias term is added.
    """

    def __init__(self, state_size, action_size, alpha=0.01, gamma=0.99, epsilon=1.0,
                 *, epsilon_min=0.1, epsilon_decay=0.999):
        """Create an agent.

        Args:
            state_size: Number of entries in a state vector.
            action_size: Number of discrete actions.
            alpha: Learning rate applied to each TD update.
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon: Initial probability of taking a random action.
            epsilon_min: Floor below which ``epsilon`` is never decayed.
            epsilon_decay: Factor ``epsilon`` is multiplied by after each
                call to :meth:`learn`.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.alpha = alpha    # learning rate
        self.gamma = gamma    # discount factor
        self.epsilon = epsilon  # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.weights = np.zeros((self.state_size, self.action_size))

    def _q_values(self, state):
        """Return the vector of Q-values (one per action) for ``state``."""
        return np.dot(np.asarray(state, dtype=float), self.weights)

    def get_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the largest Q-value (first one on ties).
        """
        if np.random.rand() <= self.epsilon:
            # explore
            return int(np.random.choice(self.action_size))
        # exploit
        return int(np.argmax(self._q_values(state)))

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for a single transition.

        Args:
            state: State vector the action was taken in.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: State vector reached after the action.
            done: Truthy if ``next_state`` is terminal; the bootstrapped
                next-state value is then dropped from the target.
        """
        features = np.asarray(state, dtype=float)
        q_values = self._q_values(features)
        next_q_values = self._q_values(next_state)
        target = reward + (1 - done) * self.gamma * np.max(next_q_values)
        td_error = target - q_values[action]
        # Only the column of the taken action has a non-zero gradient, so
        # update it directly instead of building a full one-hot outer product.
        self.weights[:, action] += self.alpha * td_error * features
        # reduce exploration rate
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

# Create a MountainCar environment
env = gym.make('MountainCar-v0')

# Create a Q-learning agent
agent = QLearningAgent(state_size=env.observation_space.shape[0], action_size=env.action_space.n)


def _run_episode(env, agent, train):
    """Play one episode and return its total reward.

    Relies on the classic Gym call shapes used throughout this script:
    ``env.reset()`` returns the observation and ``env.step()`` returns the
    4-tuple ``(next_state, reward, done, info)``.

    Args:
        env: Environment to interact with.
        agent: Agent providing ``get_action`` and ``learn``.
        train: When true, ``agent.learn`` is called on every transition.
    """
    state = env.reset()
    done = False
    total_reward = 0
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        if train:
            # Gym's TimeLimit wrapper flags an episode that was cut off by the
            # step limit via info["TimeLimit.truncated"]. Such a state is not
            # truly terminal, so the agent should keep bootstrapping from it.
            # If the key is absent every `done` is treated as terminal.
            terminal = done and not info.get("TimeLimit.truncated", False)
            agent.learn(state, action, reward, next_state, terminal)
        state = next_state
        total_reward += reward
    return total_reward


try:
    # Train the agent
    num_episodes = 1000
    for episode in range(num_episodes):
        total_reward = _run_episode(env, agent, train=True)
        print("Episode {}/{}: Total reward = {}".format(episode+1, num_episodes, total_reward))

    # Test the agent
    # Evaluate the learned greedy policy: switch exploration off so the
    # reported rewards are not distorted by random actions.
    agent.epsilon = 0.0
    num_episodes = 10
    for episode in range(num_episodes):
        total_reward = _run_episode(env, agent, train=False)
        print("Episode {}/{}: Total reward = {}".format(episode+1, num_episodes, total_reward))
finally:
    # Close the environment, even if training or testing raised
    env.close()