In [None]:
# imports
import numpy as np
import time
import gym

In [None]:
# config: everything a reader might want to tune lives in this cell

# id passed to gym.make to create the environment
env_name = "CarRacing-v1"

# shape of the weight matrix: one row per action component, one column per
# value of the flattened observation (presumably a 96x96 RGB frame -- confirm
# against the environment's observation space)
n = (3, 96 * 96 * 3)

# how often each candidate weight matrix is simulated; scores are averaged
runs = 3

# upper bound on the number of environment steps per simulation run
simulation_len = 100

# how many random weight matrices the search evaluates
search_len = 20

In [None]:
# Random search over linear policies: sample `search_len` weight matrices,
# score each one by its average return over `runs` simulations and keep the
# best-scoring matrix in `w_best` / `best_score`.

# setup environment (every run below resets it before the first step)
env = gym.make(env_name)

# setup best weights
w_best = np.zeros(n)
best_score = -np.inf

# Guard the whole search so the environment is closed even when a step
# raises or the kernel is interrupted mid-search.
try:

    # try many random weights
    for i in range(search_len):

        # log
        print(f'\rtry {i+1}/{search_len}', end='')

        # generate random neuron weights
        w = np.random.uniform(-1, 1, n)

        avg_score = 0

        # do multiple runs for the same weights to get a good average score
        for run in range(runs):

            # reset environment
            observation = env.reset()
            score = 0

            # do a single simulation run
            action = env.action_space.sample()  # random first step

            # `step` (not `i`) so the outer search index is not overwritten
            for step in range(simulation_len):

                # do the action and get new observations
                observation, reward, done, info = env.step(action)
                score += reward

                # determine action: every row of w is multiplied element-wise
                # with the flattened observation and averaged, which yields
                # one value per row of w
                action = (w * observation.flatten()).mean(axis=1)

                # stop when done
                if done:
                    break

            avg_score += score / runs

        # update best score and weights
        if avg_score > best_score:
            print(f'\rfound new best score: {avg_score}')
            best_score = avg_score
            w_best = w

finally:
    # clean up
    env.close()

# print results
print('\rbest score:', best_score)
print(w_best)

## Visualize

In [None]:
# Replay one rendered episode with the chosen weights and report its total
# reward. Runs for at most twice the step budget used during the search.
w = w_best # or alternatively random: np.random.uniform(size=n)

# setup environment
env = gym.make(env_name)
score = 0

# Guard the episode so the environment (and its render window) is closed
# even when stepping/rendering raises or the kernel is interrupted.
try:
    observation = env.reset()

    # do a simulation run with visualization for the best weights
    action = env.action_space.sample()  # random first step
    for i in range(simulation_len*2):

        # do the action and get new observations
        observation, reward, done, info = env.step(action)
        score += reward

        # determine action: one value per row of w (row-wise mean of the
        # weighted, flattened observation)
        action = (w * observation.flatten()).mean(axis=1)

        # update plot and log
        env.render(mode = "human")
        print(f'\rstep {i+1}, reward: {reward}, done: {done}, action: {action}', end=' '*20)
        time.sleep(0.02)  # slow the loop down so the rendering can be followed

        # stop when done
        if done:
            break

    # log result score
    print(f'\ntotal reward: {score}')

    # keep the last frame on screen for a moment before closing
    time.sleep(1)

finally:
    # clean up
    env.close()

## Results

Random search seems to work a tiny bit with a 3-neuron model, but a more complex model is probably needed.