In [1]:
import pygame
import random
from pygame.locals import *
import numpy as np
from collections import deque
import tensorflow as tf  # http://blog.topspeedsnail.com/archives/10116
import cv2               # http://blog.topspeedsnail.com/archives/4755

BLACK     = (0  ,0  ,0  )
WHITE     = (255,255,255)

SCREEN_SIZE = [320,400]
BAR_SIZE = [50, 5]
BALL_SIZE = [15, 15]

# 神经网络的输出
MOVE_STAY = [1, 0, 0]
MOVE_LEFT = [0, 1, 0]
MOVE_RIGHT = [0, 0, 1]

class Game(object):
	def __init__(self):
		pygame.init()
		self.clock = pygame.time.Clock()
		self.screen = pygame.display.set_mode(SCREEN_SIZE)
		pygame.display.set_caption('Simple Game')

		self.ball_pos_x = SCREEN_SIZE[0]//2 - BALL_SIZE[0]/2
		self.ball_pos_y = SCREEN_SIZE[1]//2 - BALL_SIZE[1]/2

		self.ball_dir_x = -1 # -1 = left 1 = right  
		self.ball_dir_y = -1 # -1 = up   1 = down
		self.ball_pos = pygame.Rect(self.ball_pos_x, self.ball_pos_y, BALL_SIZE[0], BALL_SIZE[1])

		self.bar_pos_x = SCREEN_SIZE[0]//2-BAR_SIZE[0]//2
		self.bar_pos = pygame.Rect(self.bar_pos_x, SCREEN_SIZE[1]-BAR_SIZE[1], BAR_SIZE[0], BAR_SIZE[1])

	# action是MOVE_STAY、MOVE_LEFT、MOVE_RIGHT
	# ai控制棒子左右移动；返回游戏界面像素数和对应的奖励。(像素->奖励->强化棒子往奖励高的方向移动)
	def step(self, action):

		if action == MOVE_LEFT:
			self.bar_pos_x = self.bar_pos_x - 2
		elif action == MOVE_RIGHT:
			self.bar_pos_x = self.bar_pos_x + 2
		else:
			pass
		if self.bar_pos_x < 0:
			self.bar_pos_x = 0
		if self.bar_pos_x > SCREEN_SIZE[0] - BAR_SIZE[0]:
			self.bar_pos_x = SCREEN_SIZE[0] - BAR_SIZE[0]
			
		self.screen.fill(BLACK)
		self.bar_pos.left = self.bar_pos_x
		pygame.draw.rect(self.screen, WHITE, self.bar_pos)

		self.ball_pos.left += self.ball_dir_x * 2
		self.ball_pos.bottom += self.ball_dir_y * 3
		pygame.draw.rect(self.screen, WHITE, self.ball_pos)

		if self.ball_pos.top <= 0 or self.ball_pos.bottom >= (SCREEN_SIZE[1] - BAR_SIZE[1]+1):
			self.ball_dir_y = self.ball_dir_y * -1
		if self.ball_pos.left <= 0 or self.ball_pos.right >= (SCREEN_SIZE[0]):
			self.ball_dir_x = self.ball_dir_x * -1

		reward = 0
		if self.bar_pos.top <= self.ball_pos.bottom and (self.bar_pos.left < self.ball_pos.right and self.bar_pos.right > self.ball_pos.left):
			reward = 1    # 击中奖励
		elif self.bar_pos.top <= self.ball_pos.bottom and (self.bar_pos.left > self.ball_pos.right or self.bar_pos.right < self.ball_pos.left):
			reward = -1   # 没击中惩罚

		# 获得游戏界面像素
		screen_image = pygame.surfarray.array3d(pygame.display.get_surface())
		pygame.display.update()
		# 返回游戏界面像素和对应的奖励
		return reward, screen_image

# learning_rate
LEARNING_RATE = 0.99
# 更新梯度
INITIAL_EPSILON = 1.0
FINAL_EPSILON = 0.05
# 测试观测次数
EXPLORE = 500000 
OBSERVE = 50000
# 存储过往经验大小
REPLAY_MEMORY = 500000

BATCH = 100

output = 3  # 输出层神经元数。代表3种操作-MOVE_STAY:[1, 0, 0]  MOVE_LEFT:[0, 1, 0]  MOVE_RIGHT:[0, 0, 1]
input_image = tf.placeholder("float", [None, 80, 100, 4])  # 游戏像素
action = tf.placeholder("float", [None, output])     # 操作

# 定义CNN-卷积神经网络 参考:http://blog.topspeedsnail.com/archives/10451
def convolutional_neural_network(input_image):
	weights = {'w_conv1':tf.Variable(tf.zeros([8, 8, 4, 32])),
               'w_conv2':tf.Variable(tf.zeros([4, 4, 32, 64])),
               'w_conv3':tf.Variable(tf.zeros([3, 3, 64, 64])),
               'w_fc4':tf.Variable(tf.zeros([3456, 784])),
               'w_out':tf.Variable(tf.zeros([784, output]))}

	biases = {'b_conv1':tf.Variable(tf.zeros([32])),
              'b_conv2':tf.Variable(tf.zeros([64])),
              'b_conv3':tf.Variable(tf.zeros([64])),
              'b_fc4':tf.Variable(tf.zeros([784])),
              'b_out':tf.Variable(tf.zeros([output]))}

	conv1 = tf.nn.relu(tf.nn.conv2d(input_image, weights['w_conv1'], strides = [1, 4, 4, 1], padding = "VALID") + biases['b_conv1'])
	conv2 = tf.nn.relu(tf.nn.conv2d(conv1, weights['w_conv2'], strides = [1, 2, 2, 1], padding = "VALID") + biases['b_conv2'])
	conv3 = tf.nn.relu(tf.nn.conv2d(conv2, weights['w_conv3'], strides = [1, 1, 1, 1], padding = "VALID") + biases['b_conv3'])
	conv3_flat = tf.reshape(conv3, [-1, 3456])
	fc4 = tf.nn.relu(tf.matmul(conv3_flat, weights['w_fc4']) + biases['b_fc4'])

	output_layer = tf.matmul(fc4, weights['w_out']) + biases['b_out']
	return output_layer

# 深度强化学习入门: https://www.nervanasys.com/demystifying-deep-reinforcement-learning/
# 训练神经网络
def train_neural_network(input_image):
	predict_action = convolutional_neural_network(input_image)

	argmax = tf.placeholder("float", [None, output])
	gt = tf.placeholder("float", [None])

	action = tf.reduce_sum(tf.matmul(predict_action, argmax), reduction_indices = 1)
	cost = tf.reduce_mean(tf.square(action - gt))
	optimizer = tf.train.AdamOptimizer(1e-6).minimize(cost)

	game = Game()
	D = deque()

	_, image = game.step(MOVE_STAY)
	# 转换为灰度值
	image = cv2.cvtColor(cv2.resize(image, (100, 80)), cv2.COLOR_BGR2GRAY)
	# 转换为二值
	ret, image = cv2.threshold(image, 1, 255, cv2.THRESH_BINARY)
	input_image_data = np.stack((image, image, image, image), axis = 2)
	
	with tf.Session() as sess:
		sess.run(tf.initialize_all_variables())
		
		saver = tf.train.Saver()
		
		n = 0
		epsilon = INITIAL_EPSILON
		while True:
			action_t = predict_action.eval(feed_dict = {input_image : [input_image_data]})[0]

			argmax_t = np.zeros([output], dtype=np.int)
			if(random.random() <= INITIAL_EPSILON):
				maxIndex = random.randrange(output)
			else:
				maxIndex = np.argmax(action_t)
			argmax_t[maxIndex] = 1
			if epsilon > FINAL_EPSILON:
				epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

			for event in pygame.event.get():  #macOS需要事件循环，否则白屏
				if event.type == QUIT:
					pygame.quit()
					sys.exit()
			reward, image = game.step(list(argmax_t))

			image = cv2.cvtColor(cv2.resize(image, (100, 80)), cv2.COLOR_BGR2GRAY)
			ret, image = cv2.threshold(image, 1, 255, cv2.THRESH_BINARY)
			image = np.reshape(image, (80, 100, 1))
			input_image_data1 = np.append(image, input_image_data[:, :, 0:3], axis = 2)

			D.append((input_image_data, argmax_t, reward, input_image_data1))

			if len(D) > REPLAY_MEMORY:
				D.popleft()

			if n > OBSERVE:
				minibatch = random.sample(D, BATCH)
				input_image_data_batch = [d[0] for d in minibatch]
				argmax_batch = [d[1] for d in minibatch]
				reward_batch = [d[2] for d in minibatch]
				input_image_data1_batch = [d[3] for d in minibatch]

				gt_batch = []

				out_batch = predict_action.eval(feed_dict = {input_image : input_image_data1_batch})

				for i in range(0, len(minibatch)):
					gt_batch.append(reward_batch[i] + LEARNING_RATE * np.max(out_batch[i]))

				optimizer.run(feed_dict = {gt : gt_batch, argmax : argmax_batch, input_image : input_image_data_batch})

			input_image_data = input_image_data1
			n = n+1

			if n % 10000 == 0:
				saver.save(sess, './game.cpk', global_step = n)  # 保存模型

			print(n, "epsilon:", epsilon, " " ,"action:", maxIndex, " " ,"reward:", reward)


train_neural_network(input_image)

pygame 2.0.0.dev1 (SDL 1.2.15, python 3.7.3)
Hello from the pygame community. https://www.pygame.org/contribute.html
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
1 epsilon: 0.9999981   action: 0   reward: 0
2 epsilon: 0.9999962   action: 1   reward: 0
3 epsilon: 0.9999943   action: 1   reward: 0
4 epsilon: 0.9999924   action: 0   reward: 0
5 epsilon: 0.9999905   action: 0   reward: 0
6 epsilon: 0.9999886   action: 1   reward: 0
7 epsilon: 0.9999867   action: 2   reward: 0
8 epsilon: 0.9999848   action: 0   reward: 0
9 epsilon: 0.9999829   action: 0   reward: 0
10 epsilon: 0.999981   action: 1   reward: 0
11 epsilon: 0.9999791   action: 0   reward: 0
12 epsilon: 0.9999772   action: 1   reward: 0
13 epsilon: 0.9999753   action: 0   reward: 0
14 epsilon: 0.9999734   action: 2   reward: 0
15 epsilon: 0.9999715   action: 1   reward: 0
16 epsilo

145 epsilon: 0.9997245000000001   action: 0   reward: 0
146 epsilon: 0.9997226000000001   action: 2   reward: 0
147 epsilon: 0.9997207000000001   action: 0   reward: 0
148 epsilon: 0.9997188000000001   action: 2   reward: 0
149 epsilon: 0.9997169000000001   action: 0   reward: 0
150 epsilon: 0.9997150000000001   action: 2   reward: 0
151 epsilon: 0.9997131000000001   action: 1   reward: 0
152 epsilon: 0.9997112000000001   action: 1   reward: 0
153 epsilon: 0.9997093000000001   action: 0   reward: 0
154 epsilon: 0.9997074000000001   action: 2   reward: 0
155 epsilon: 0.9997055000000001   action: 1   reward: 0
156 epsilon: 0.9997036000000001   action: 0   reward: 0
157 epsilon: 0.9997017000000001   action: 0   reward: 0
158 epsilon: 0.9996998000000001   action: 1   reward: 0
159 epsilon: 0.9996979000000001   action: 0   reward: 0
160 epsilon: 0.9996960000000001   action: 0   reward: 0
161 epsilon: 0.9996941000000001   action: 0   reward: 0
162 epsilon: 0.9996922000000001   action: 0   re

296 epsilon: 0.9994376000000003   action: 2   reward: 0
297 epsilon: 0.9994357000000003   action: 0   reward: 0
298 epsilon: 0.9994338000000003   action: 0   reward: 0
299 epsilon: 0.9994319000000003   action: 0   reward: 0
300 epsilon: 0.9994300000000003   action: 0   reward: 0
301 epsilon: 0.9994281000000003   action: 2   reward: 0
302 epsilon: 0.9994262000000003   action: 1   reward: 0
303 epsilon: 0.9994243000000003   action: 1   reward: 0
304 epsilon: 0.9994224000000003   action: 1   reward: 0
305 epsilon: 0.9994205000000003   action: 0   reward: 0
306 epsilon: 0.9994186000000003   action: 0   reward: 0
307 epsilon: 0.9994167000000003   action: 1   reward: 0
308 epsilon: 0.9994148000000003   action: 0   reward: 0
309 epsilon: 0.9994129000000003   action: 1   reward: 0
310 epsilon: 0.9994110000000003   action: 2   reward: 0
311 epsilon: 0.9994091000000003   action: 1   reward: 0
312 epsilon: 0.9994072000000003   action: 1   reward: 0
313 epsilon: 0.9994053000000003   action: 2   re

444 epsilon: 0.9991564000000004   action: 2   reward: 1
445 epsilon: 0.9991545000000004   action: 1   reward: 0
446 epsilon: 0.9991526000000004   action: 0   reward: 0
447 epsilon: 0.9991507000000004   action: 0   reward: 0
448 epsilon: 0.9991488000000004   action: 0   reward: 0
449 epsilon: 0.9991469000000004   action: 1   reward: 0
450 epsilon: 0.9991450000000004   action: 1   reward: 0
451 epsilon: 0.9991431000000004   action: 0   reward: 0
452 epsilon: 0.9991412000000004   action: 2   reward: 0
453 epsilon: 0.9991393000000004   action: 1   reward: 0
454 epsilon: 0.9991374000000004   action: 0   reward: 0
455 epsilon: 0.9991355000000004   action: 2   reward: 0
456 epsilon: 0.9991336000000004   action: 1   reward: 0
457 epsilon: 0.9991317000000004   action: 2   reward: 0
458 epsilon: 0.9991298000000004   action: 1   reward: 0
459 epsilon: 0.9991279000000004   action: 1   reward: 0
460 epsilon: 0.9991260000000004   action: 1   reward: 0
461 epsilon: 0.9991241000000004   action: 2   re

593 epsilon: 0.9988733000000005   action: 0   reward: 0
594 epsilon: 0.9988714000000005   action: 0   reward: 0
595 epsilon: 0.9988695000000005   action: 1   reward: 0
596 epsilon: 0.9988676000000005   action: 2   reward: 0
597 epsilon: 0.9988657000000005   action: 0   reward: 0
598 epsilon: 0.9988638000000005   action: 1   reward: 0
599 epsilon: 0.9988619000000005   action: 1   reward: 0
600 epsilon: 0.9988600000000005   action: 2   reward: 0
601 epsilon: 0.9988581000000005   action: 1   reward: 0
602 epsilon: 0.9988562000000005   action: 2   reward: 0
603 epsilon: 0.9988543000000005   action: 1   reward: 0
604 epsilon: 0.9988524000000005   action: 0   reward: 0
605 epsilon: 0.9988505000000005   action: 2   reward: 0
606 epsilon: 0.9988486000000005   action: 0   reward: 0
607 epsilon: 0.9988467000000005   action: 2   reward: 0
608 epsilon: 0.9988448000000005   action: 2   reward: 0
609 epsilon: 0.9988429000000005   action: 0   reward: 0
610 epsilon: 0.9988410000000005   action: 0   re

742 epsilon: 0.9985902000000006   action: 0   reward: 0
743 epsilon: 0.9985883000000007   action: 0   reward: 0
744 epsilon: 0.9985864000000007   action: 2   reward: 0
745 epsilon: 0.9985845000000007   action: 1   reward: 0
746 epsilon: 0.9985826000000007   action: 2   reward: 0
747 epsilon: 0.9985807000000007   action: 1   reward: 0
748 epsilon: 0.9985788000000007   action: 1   reward: 0
749 epsilon: 0.9985769000000007   action: 0   reward: 0
750 epsilon: 0.9985750000000007   action: 1   reward: 0
751 epsilon: 0.9985731000000007   action: 2   reward: 0
752 epsilon: 0.9985712000000007   action: 2   reward: 0
753 epsilon: 0.9985693000000007   action: 2   reward: 0
754 epsilon: 0.9985674000000007   action: 0   reward: 0
755 epsilon: 0.9985655000000007   action: 0   reward: 0
756 epsilon: 0.9985636000000007   action: 1   reward: 0
757 epsilon: 0.9985617000000007   action: 0   reward: 0
758 epsilon: 0.9985598000000007   action: 2   reward: 0
759 epsilon: 0.9985579000000007   action: 2   re

890 epsilon: 0.9983090000000008   action: 0   reward: 0
891 epsilon: 0.9983071000000008   action: 0   reward: 0
892 epsilon: 0.9983052000000008   action: 2   reward: 0
893 epsilon: 0.9983033000000008   action: 1   reward: 0
894 epsilon: 0.9983014000000008   action: 2   reward: 0
895 epsilon: 0.9982995000000008   action: 2   reward: 0
896 epsilon: 0.9982976000000008   action: 0   reward: 0
897 epsilon: 0.9982957000000008   action: 0   reward: 0
898 epsilon: 0.9982938000000008   action: 0   reward: 0
899 epsilon: 0.9982919000000008   action: 2   reward: 0
900 epsilon: 0.9982900000000008   action: 0   reward: 0
901 epsilon: 0.9982881000000008   action: 1   reward: 0
902 epsilon: 0.9982862000000008   action: 0   reward: 0
903 epsilon: 0.9982843000000008   action: 2   reward: 0
904 epsilon: 0.9982824000000008   action: 0   reward: 0
905 epsilon: 0.9982805000000008   action: 1   reward: 0
906 epsilon: 0.9982786000000008   action: 2   reward: 0
907 epsilon: 0.9982767000000008   action: 2   re

1041 epsilon: 0.9980221000000009   action: 2   reward: 0
1042 epsilon: 0.9980202000000009   action: 0   reward: 0
1043 epsilon: 0.9980183000000009   action: 2   reward: 0
1044 epsilon: 0.9980164000000009   action: 0   reward: 0
1045 epsilon: 0.9980145000000009   action: 0   reward: 0
1046 epsilon: 0.9980126000000009   action: 1   reward: 0
1047 epsilon: 0.9980107000000009   action: 1   reward: 0
1048 epsilon: 0.9980088000000009   action: 0   reward: 0
1049 epsilon: 0.9980069000000009   action: 1   reward: 0
1050 epsilon: 0.9980050000000009   action: 0   reward: 0
1051 epsilon: 0.9980031000000009   action: 1   reward: 0
1052 epsilon: 0.9980012000000009   action: 2   reward: 0
1053 epsilon: 0.9979993000000009   action: 2   reward: 0
1054 epsilon: 0.9979974000000009   action: 1   reward: 0
1055 epsilon: 0.9979955000000009   action: 0   reward: 0
1056 epsilon: 0.9979936000000009   action: 2   reward: 0
1057 epsilon: 0.9979917000000009   action: 1   reward: 0
1058 epsilon: 0.997989800000000

1188 epsilon: 0.997742800000001   action: 2   reward: 0
1189 epsilon: 0.997740900000001   action: 2   reward: 0
1190 epsilon: 0.997739000000001   action: 1   reward: 0
1191 epsilon: 0.997737100000001   action: 2   reward: 0
1192 epsilon: 0.997735200000001   action: 2   reward: 0
1193 epsilon: 0.997733300000001   action: 0   reward: 0
1194 epsilon: 0.997731400000001   action: 2   reward: 0
1195 epsilon: 0.997729500000001   action: 1   reward: 0
1196 epsilon: 0.997727600000001   action: 0   reward: 0
1197 epsilon: 0.997725700000001   action: 0   reward: 0
1198 epsilon: 0.997723800000001   action: 1   reward: 0
1199 epsilon: 0.997721900000001   action: 2   reward: 0
1200 epsilon: 0.997720000000001   action: 0   reward: 0
1201 epsilon: 0.997718100000001   action: 0   reward: 0
1202 epsilon: 0.997716200000001   action: 1   reward: 0
1203 epsilon: 0.997714300000001   action: 1   reward: 0
1204 epsilon: 0.997712400000001   action: 0   reward: 0
1205 epsilon: 0.997710500000001   action: 1   re

1333 epsilon: 0.9974673000000012   action: 0   reward: 0
1334 epsilon: 0.9974654000000012   action: 0   reward: 0
1335 epsilon: 0.9974635000000012   action: 1   reward: 0
1336 epsilon: 0.9974616000000012   action: 1   reward: 0
1337 epsilon: 0.9974597000000012   action: 0   reward: 0
1338 epsilon: 0.9974578000000012   action: 0   reward: 0
1339 epsilon: 0.9974559000000012   action: 0   reward: 0
1340 epsilon: 0.9974540000000012   action: 2   reward: 0
1341 epsilon: 0.9974521000000012   action: 2   reward: 0
1342 epsilon: 0.9974502000000012   action: 1   reward: 0
1343 epsilon: 0.9974483000000012   action: 1   reward: 0
1344 epsilon: 0.9974464000000012   action: 1   reward: 0
1345 epsilon: 0.9974445000000012   action: 2   reward: 0
1346 epsilon: 0.9974426000000012   action: 2   reward: 0
1347 epsilon: 0.9974407000000012   action: 1   reward: 0
1348 epsilon: 0.9974388000000012   action: 2   reward: 0
1349 epsilon: 0.9974369000000012   action: 0   reward: 0
1350 epsilon: 0.997435000000001

1480 epsilon: 0.9971880000000013   action: 2   reward: 0
1481 epsilon: 0.9971861000000013   action: 0   reward: 0
1482 epsilon: 0.9971842000000013   action: 0   reward: 0
1483 epsilon: 0.9971823000000013   action: 0   reward: 0
1484 epsilon: 0.9971804000000013   action: 2   reward: 0
1485 epsilon: 0.9971785000000013   action: 0   reward: 0
1486 epsilon: 0.9971766000000013   action: 0   reward: 0
1487 epsilon: 0.9971747000000013   action: 0   reward: 0
1488 epsilon: 0.9971728000000013   action: 1   reward: 0
1489 epsilon: 0.9971709000000013   action: 1   reward: 0
1490 epsilon: 0.9971690000000013   action: 2   reward: 0
1491 epsilon: 0.9971671000000013   action: 2   reward: 0
1492 epsilon: 0.9971652000000013   action: 0   reward: 0
1493 epsilon: 0.9971633000000013   action: 1   reward: 0
1494 epsilon: 0.9971614000000013   action: 1   reward: 0
1495 epsilon: 0.9971595000000013   action: 2   reward: 0
1496 epsilon: 0.9971576000000013   action: 1   reward: 0
1497 epsilon: 0.997155700000001

1627 epsilon: 0.9969087000000014   action: 2   reward: 0
1628 epsilon: 0.9969068000000014   action: 0   reward: 0
1629 epsilon: 0.9969049000000014   action: 2   reward: 0
1630 epsilon: 0.9969030000000014   action: 1   reward: 0
1631 epsilon: 0.9969011000000014   action: 0   reward: 0
1632 epsilon: 0.9968992000000014   action: 0   reward: 0
1633 epsilon: 0.9968973000000014   action: 0   reward: 0
1634 epsilon: 0.9968954000000014   action: 2   reward: 0
1635 epsilon: 0.9968935000000014   action: 1   reward: 0
1636 epsilon: 0.9968916000000014   action: 0   reward: 0
1637 epsilon: 0.9968897000000014   action: 2   reward: 0
1638 epsilon: 0.9968878000000014   action: 0   reward: 0
1639 epsilon: 0.9968859000000014   action: 2   reward: 0
1640 epsilon: 0.9968840000000014   action: 0   reward: 0
1641 epsilon: 0.9968821000000014   action: 1   reward: 0
1642 epsilon: 0.9968802000000014   action: 0   reward: 0
1643 epsilon: 0.9968783000000014   action: 0   reward: 0
1644 epsilon: 0.996876400000001

1773 epsilon: 0.9966313000000016   action: 2   reward: 0
1774 epsilon: 0.9966294000000016   action: 0   reward: 0
1775 epsilon: 0.9966275000000016   action: 2   reward: 0
1776 epsilon: 0.9966256000000016   action: 0   reward: 0
1777 epsilon: 0.9966237000000016   action: 0   reward: 0
1778 epsilon: 0.9966218000000016   action: 2   reward: 0
1779 epsilon: 0.9966199000000016   action: 2   reward: 0
1780 epsilon: 0.9966180000000016   action: 1   reward: 0
1781 epsilon: 0.9966161000000016   action: 1   reward: 0
1782 epsilon: 0.9966142000000016   action: 2   reward: 0
1783 epsilon: 0.9966123000000016   action: 2   reward: 0
1784 epsilon: 0.9966104000000016   action: 1   reward: 0
1785 epsilon: 0.9966085000000016   action: 0   reward: 0
1786 epsilon: 0.9966066000000016   action: 2   reward: 0
1787 epsilon: 0.9966047000000016   action: 0   reward: 0
1788 epsilon: 0.9966028000000016   action: 0   reward: 0
1789 epsilon: 0.9966009000000016   action: 0   reward: 0
1790 epsilon: 0.996599000000001

1918 epsilon: 0.9963558000000017   action: 1   reward: 0
1919 epsilon: 0.9963539000000017   action: 0   reward: 0
1920 epsilon: 0.9963520000000017   action: 1   reward: 0
1921 epsilon: 0.9963501000000017   action: 2   reward: 0
1922 epsilon: 0.9963482000000017   action: 0   reward: 0
1923 epsilon: 0.9963463000000017   action: 2   reward: 0
1924 epsilon: 0.9963444000000017   action: 1   reward: 0
1925 epsilon: 0.9963425000000017   action: 0   reward: 0
1926 epsilon: 0.9963406000000017   action: 1   reward: 0
1927 epsilon: 0.9963387000000017   action: 1   reward: 0
1928 epsilon: 0.9963368000000017   action: 2   reward: 0
1929 epsilon: 0.9963349000000017   action: 1   reward: 0
1930 epsilon: 0.9963330000000017   action: 0   reward: 0
1931 epsilon: 0.9963311000000017   action: 2   reward: 0
1932 epsilon: 0.9963292000000017   action: 1   reward: 0
1933 epsilon: 0.9963273000000017   action: 1   reward: 0
1934 epsilon: 0.9963254000000017   action: 2   reward: 0
1935 epsilon: 0.996323500000001

2067 epsilon: 0.9960727000000018   action: 1   reward: 0
2068 epsilon: 0.9960708000000018   action: 1   reward: 0
2069 epsilon: 0.9960689000000018   action: 0   reward: 0
2070 epsilon: 0.9960670000000018   action: 2   reward: 0
2071 epsilon: 0.9960651000000018   action: 2   reward: 0
2072 epsilon: 0.9960632000000018   action: 1   reward: 0
2073 epsilon: 0.9960613000000018   action: 2   reward: 0
2074 epsilon: 0.9960594000000018   action: 1   reward: 0
2075 epsilon: 0.9960575000000018   action: 1   reward: 0
2076 epsilon: 0.9960556000000018   action: 0   reward: 0
2077 epsilon: 0.9960537000000018   action: 0   reward: 0
2078 epsilon: 0.9960518000000018   action: 2   reward: 0
2079 epsilon: 0.9960499000000018   action: 1   reward: 0
2080 epsilon: 0.9960480000000018   action: 1   reward: 0
2081 epsilon: 0.9960461000000018   action: 2   reward: 0
2082 epsilon: 0.9960442000000018   action: 1   reward: 0
2083 epsilon: 0.9960423000000018   action: 1   reward: 0
2084 epsilon: 0.996040400000001

2212 epsilon: 0.9957972000000019   action: 2   reward: 0
2213 epsilon: 0.9957953000000019   action: 0   reward: 0
2214 epsilon: 0.9957934000000019   action: 0   reward: 0
2215 epsilon: 0.9957915000000019   action: 0   reward: 0
2216 epsilon: 0.9957896000000019   action: 0   reward: 0
2217 epsilon: 0.9957877000000019   action: 0   reward: 0
2218 epsilon: 0.9957858000000019   action: 0   reward: 0
2219 epsilon: 0.9957839000000019   action: 1   reward: 0
2220 epsilon: 0.9957820000000019   action: 2   reward: 0
2221 epsilon: 0.9957801000000019   action: 0   reward: 0
2222 epsilon: 0.995778200000002   action: 0   reward: -1
2223 epsilon: 0.995776300000002   action: 0   reward: 0
2224 epsilon: 0.995774400000002   action: 0   reward: 0
2225 epsilon: 0.995772500000002   action: 0   reward: 0
2226 epsilon: 0.995770600000002   action: 2   reward: 0
2227 epsilon: 0.995768700000002   action: 1   reward: 0
2228 epsilon: 0.995766800000002   action: 2   reward: 0
2229 epsilon: 0.995764900000002   act

2359 epsilon: 0.9955179000000021   action: 2   reward: 0
2360 epsilon: 0.9955160000000021   action: 2   reward: 0
2361 epsilon: 0.9955141000000021   action: 1   reward: 0
2362 epsilon: 0.9955122000000021   action: 2   reward: 0
2363 epsilon: 0.9955103000000021   action: 0   reward: 0
2364 epsilon: 0.9955084000000021   action: 1   reward: 0
2365 epsilon: 0.9955065000000021   action: 0   reward: 0
2366 epsilon: 0.9955046000000021   action: 2   reward: 0
2367 epsilon: 0.9955027000000021   action: 1   reward: 0
2368 epsilon: 0.9955008000000021   action: 2   reward: 0
2369 epsilon: 0.9954989000000021   action: 0   reward: 0
2370 epsilon: 0.9954970000000021   action: 2   reward: 0
2371 epsilon: 0.9954951000000021   action: 0   reward: 0
2372 epsilon: 0.9954932000000021   action: 0   reward: 0
2373 epsilon: 0.9954913000000021   action: 0   reward: 0
2374 epsilon: 0.9954894000000021   action: 1   reward: 0
2375 epsilon: 0.9954875000000021   action: 2   reward: 0
2376 epsilon: 0.995485600000002

2503 epsilon: 0.9952443000000022   action: 2   reward: 0
2504 epsilon: 0.9952424000000022   action: 2   reward: 0
2505 epsilon: 0.9952405000000022   action: 2   reward: 0
2506 epsilon: 0.9952386000000022   action: 0   reward: 0
2507 epsilon: 0.9952367000000022   action: 1   reward: 0
2508 epsilon: 0.9952348000000022   action: 1   reward: 0
2509 epsilon: 0.9952329000000022   action: 1   reward: 0
2510 epsilon: 0.9952310000000022   action: 1   reward: 0
2511 epsilon: 0.9952291000000022   action: 0   reward: 0
2512 epsilon: 0.9952272000000022   action: 1   reward: 0
2513 epsilon: 0.9952253000000022   action: 0   reward: 0
2514 epsilon: 0.9952234000000022   action: 2   reward: 0
2515 epsilon: 0.9952215000000022   action: 1   reward: 0
2516 epsilon: 0.9952196000000022   action: 1   reward: 0
2517 epsilon: 0.9952177000000022   action: 1   reward: 0
2518 epsilon: 0.9952158000000022   action: 1   reward: 0
2519 epsilon: 0.9952139000000022   action: 0   reward: 0
2520 epsilon: 0.995212000000002

2648 epsilon: 0.9949688000000023   action: 0   reward: 0
2649 epsilon: 0.9949669000000023   action: 0   reward: 0
2650 epsilon: 0.9949650000000023   action: 0   reward: 0
2651 epsilon: 0.9949631000000023   action: 1   reward: 0
2652 epsilon: 0.9949612000000023   action: 1   reward: 0
2653 epsilon: 0.9949593000000023   action: 2   reward: 0
2654 epsilon: 0.9949574000000023   action: 1   reward: 0
2655 epsilon: 0.9949555000000023   action: 0   reward: 0
2656 epsilon: 0.9949536000000023   action: 0   reward: 0
2657 epsilon: 0.9949517000000023   action: 2   reward: 0
2658 epsilon: 0.9949498000000023   action: 2   reward: 0
2659 epsilon: 0.9949479000000023   action: 0   reward: 0
2660 epsilon: 0.9949460000000023   action: 1   reward: 0
2661 epsilon: 0.9949441000000023   action: 0   reward: 0
2662 epsilon: 0.9949422000000023   action: 0   reward: 0
2663 epsilon: 0.9949403000000023   action: 2   reward: 0
2664 epsilon: 0.9949384000000023   action: 1   reward: 0
2665 epsilon: 0.994936500000002

2794 epsilon: 0.9946914000000024   action: 0   reward: 0
2795 epsilon: 0.9946895000000024   action: 0   reward: 0
2796 epsilon: 0.9946876000000024   action: 2   reward: 0
2797 epsilon: 0.9946857000000024   action: 2   reward: 0
2798 epsilon: 0.9946838000000024   action: 2   reward: 0
2799 epsilon: 0.9946819000000025   action: 2   reward: 0
2800 epsilon: 0.9946800000000025   action: 0   reward: 0
2801 epsilon: 0.9946781000000025   action: 0   reward: 0
2802 epsilon: 0.9946762000000025   action: 0   reward: 0
2803 epsilon: 0.9946743000000025   action: 0   reward: 0
2804 epsilon: 0.9946724000000025   action: 1   reward: 0
2805 epsilon: 0.9946705000000025   action: 2   reward: 0
2806 epsilon: 0.9946686000000025   action: 1   reward: 0
2807 epsilon: 0.9946667000000025   action: 1   reward: 0
2808 epsilon: 0.9946648000000025   action: 2   reward: 0
2809 epsilon: 0.9946629000000025   action: 2   reward: 0
2810 epsilon: 0.9946610000000025   action: 0   reward: 0
2811 epsilon: 0.994659100000002

2940 epsilon: 0.9944140000000026   action: 2   reward: 0
2941 epsilon: 0.9944121000000026   action: 0   reward: 0
2942 epsilon: 0.9944102000000026   action: 1   reward: 0
2943 epsilon: 0.9944083000000026   action: 1   reward: 0
2944 epsilon: 0.9944064000000026   action: 2   reward: 0
2945 epsilon: 0.9944045000000026   action: 1   reward: 0
2946 epsilon: 0.9944026000000026   action: 2   reward: 0
2947 epsilon: 0.9944007000000026   action: 2   reward: 0
2948 epsilon: 0.9943988000000026   action: 1   reward: 0
2949 epsilon: 0.9943969000000026   action: 1   reward: 0
2950 epsilon: 0.9943950000000026   action: 0   reward: 0
2951 epsilon: 0.9943931000000026   action: 2   reward: 0
2952 epsilon: 0.9943912000000026   action: 2   reward: 0
2953 epsilon: 0.9943893000000026   action: 1   reward: 0
2954 epsilon: 0.9943874000000026   action: 1   reward: 0
2955 epsilon: 0.9943855000000026   action: 2   reward: 0
2956 epsilon: 0.9943836000000026   action: 0   reward: 0
2957 epsilon: 0.994381700000002

NameError: name 'sys' is not defined