In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import optuna

from DQN_parametrized import DQN_parametrized 

import gym
import matplotlib.pyplot as plt
from itertools import count

import torch
import torch.optim as optim
import torch.nn.functional as F

from memory import ReplayMemory
from DQN import DQN
from preprocessing import get_screen
from utils import select_action, plot_scores
from training import optimize_model



In [2]:
# Create the Gym 'LunarLander-v2' environment. This one instance is a notebook
# global and is reused by `objective` and by the standalone training cell below.
env = gym.make('LunarLander-v2')

In [3]:
# Size of the environment's discrete action space; passed to the network
# constructors and to `select_action` in later cells.
n_actions = env.action_space.n
# Grab one frame to learn the screen dimensions.
# NOTE(review): no env.reset() is visible before this call — confirm that
# get_screen works on a freshly created environment.
init_screen = get_screen(env)
# The shape is unpacked as exactly four axes; the last two are used as height and width
# (presumably (batch, channels, height, width) — confirm against preprocessing.get_screen).
_, _, screen_height, screen_width = init_screen.shape

In [4]:
# Switch matplotlib to interactive mode; the training cell at the end of the
# notebook switches it back off with plt.ioff().
plt.ion()

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# Networks and screen tensors in later cells are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
# Network captured from the best trial seen so far; written by `callback`.
best_booster = None
# Target network of the most recently started trial; rebound by `objective`
# through its `global target_net` declaration.
target_net = None


def callback(study, trial):
    """Optuna callback: remember the network of the best trial so far.

    Meant to be passed via ``study.optimize(..., callbacks=[callback])``. When
    the trial that just finished is the study's current best trial, the
    module-level ``target_net`` (set by ``objective`` during that trial) is
    stored in the module-level ``best_booster``.

    Args:
        study: the Optuna study being optimized.
        trial: the trial that has just finished.

    Returns:
        None. The only effect is the possible rebinding of ``best_booster``.
    """
    global best_booster
    try:
        best_number = study.best_trial.number
    except ValueError:
        # Optuna raises ValueError while no trial has completed yet (e.g. the
        # first trial was pruned or failed); there is nothing to record then.
        return
    # Trial numbers are unique within a study, so comparing them is enough and
    # avoids a field-by-field comparison of two trial objects.
    if best_number == trial.number:
        best_booster = target_net

In [50]:
def objective(trial):
    """Optuna objective: train a trial-configured DQN and score it.

    Builds a prediction network and a target network with
    ``DQN_parametrized`` (which receives ``trial``, presumably to sample the
    architecture hyperparameters — see DQN_parametrized.py), trains for 40
    episodes on the notebook-global ``env`` and prints each episode's total
    reward.

    Side effect: rebinds the module-level ``target_net`` so that ``callback``
    can capture the network of the trial that just ran.

    Args:
        trial: the Optuna trial for this evaluation.

    Returns:
        The largest total reward obtained in any single episode.
    """
    global target_net

    # Prediction net is the one being trained; the target net starts as an
    # exact copy and is only re-synced every TARGET_UPDATE steps.
    pred_net = DQN_parametrized(screen_height, screen_width, n_actions, trial).to(device)
    target_net = DQN_parametrized(screen_height, screen_width, n_actions, trial).to(device)
    target_net.load_state_dict(pred_net.state_dict())
    target_net.eval()

    REPLAY_MEMORY_SIZE = 1000
    TARGET_UPDATE = 1000  # number of environment steps between target-net syncs
    num_episodes = 40

    memory = ReplayMemory(REPLAY_MEMORY_SIZE)
    optimizer = optim.RMSprop(pred_net.parameters())

    episode_rewards = []
    steps_since_sync = 0
    for i_episode in range(num_episodes):
        # Fresh episode: the state is the difference of two consecutive frames.
        env.reset()
        previous_frame = get_screen(env).to(device)
        latest_frame = get_screen(env).to(device)
        state = latest_frame - previous_frame

        episode_return = 0
        done = False
        while not done:
            # Pick an action with the prediction net and apply it; the
            # observation from env.step is ignored because states are
            # built from screenshots instead.
            action = select_action(pred_net, state, n_actions).to(device)
            _, step_reward, done, _ = env.step(action.item())
            episode_return += step_reward
            reward = torch.tensor([step_reward], device=device)

            # Advance the frame pair; a finished episode has no next state.
            previous_frame, latest_frame = latest_frame, get_screen(env).to(device)
            next_state = None if done else latest_frame - previous_frame

            memory.push(state, action, next_state, reward)
            state = next_state

            # The optimizer holds only pred_net's parameters, so this step
            # trains the prediction net; target_net is passed alongside it.
            optimize_model(device, pred_net, target_net, optimizer, memory)

            steps_since_sync += 1
            if steps_since_sync == TARGET_UPDATE:
                steps_since_sync = 0
                target_net.load_state_dict(pred_net.state_dict())

        episode_rewards.append(episode_return)
        print(i_episode, 'reward:', episode_rewards[-1])

    return max(episode_rewards)

In [51]:
# New Optuna study. direction='maximize' because `objective` returns an episode
# reward, where larger is better.
study = optuna.create_study(direction='maximize')

[32m[I 2021-03-07 23:26:40,576][0m A new study created in memory with name: no-name-94298053-3875-4dc8-affd-048a11b079f6[0m


In [52]:
%%time
# Run 10 trials of `objective`; `callback` runs after each trial and keeps the
# best trial's network in `best_booster`. The recorded wall time for this cell
# was about 20 hours.
study.optimize(objective, n_trials=10, callbacks=[callback])

0 reward: -193.47168327341757
1 reward: -33.93075403572931
2 reward: -125.53032236734325
3 reward: -85.9964378127736
4 reward: -85.21963042188071
5 reward: -139.6636811559829
6 reward: -143.53591462379052
7 reward: -127.60085857257738
8 reward: -151.39647231179842
9 reward: -174.15802717839549
10 reward: -130.463864975585
11 reward: -138.8973573312558
12 reward: -115.6110347071672
13 reward: -93.9474833255259
14 reward: -113.10385365209497
15 reward: -82.63073380104468
16 reward: -109.76363388470351
17 reward: -191.9516804669805
18 reward: -157.4038667464405
19 reward: -196.9134134835227
20 reward: -140.31488816142468
21 reward: -117.85225956469046
22 reward: -120.00872413968408
23 reward: -230.35936216967588
24 reward: -135.9250429780692
25 reward: -139.11034691034814
26 reward: -167.43011955825966
27 reward: -137.21390554548702
28 reward: -96.54490678551667
29 reward: -172.96694417078925
30 reward: -114.86960174349309
31 reward: -148.03251359120003
32 reward: -136.28972239804898
33 r

[32m[I 2021-03-08 00:37:11,464][0m Trial 0 finished with value: 11.229728994714918 and parameters: {'n_layers': 2, 'kernel_size_0': 3, 'stride_0': 1, 'padding_0': 1, 'out_channel_0': 6, 'activation_0': 'hardswish', 'kernel_size_1': 2, 'stride_1': 1, 'padding_1': 0, 'out_channel_1': 7, 'activation_1': 'hardswish', 'n_layers_1': 1, 'out_features_1_0': 879, 'activation_1_0': 'selu', 'hidden_size': 788, 'n_layers_2': 0}. Best is trial 0 with value: 11.229728994714918.[0m


39 reward: -20.556073236276333
0 reward: -207.60870996539677
1 reward: -313.86895318533414
2 reward: -600.8460695213594
3 reward: -331.364940465624
4 reward: -182.34476676025372
5 reward: -156.9262924345816
6 reward: -112.50993007984273
7 reward: 20.944303567334757
8 reward: -73.91441064604464
9 reward: -337.7737754643307
10 reward: -78.38580072723423
11 reward: -217.04817884860358
12 reward: -96.4265099988959
13 reward: -55.277292480615316
14 reward: -96.19210808746936
15 reward: -403.4045545220032
16 reward: -172.3938272483763
17 reward: -96.62519671819787
18 reward: -190.38922296799993
19 reward: -86.38395201643011
20 reward: -100.42168243960153
21 reward: -163.06615101054797
22 reward: -211.87885803560437
23 reward: -385.43541081364657
24 reward: -51.18181182389756
25 reward: 89.55769618282758
26 reward: -141.47708538248176
27 reward: -338.9098641363929
28 reward: -218.3744051570914
29 reward: -402.7894415904489
30 reward: -575.5183184910869
31 reward: -46.01859005475627
32 reward:

[32m[I 2021-03-08 10:27:28,390][0m Trial 1 finished with value: 89.55769618282758 and parameters: {'n_layers': 1, 'kernel_size_0': 2, 'stride_0': 1, 'padding_0': 1, 'out_channel_0': 10, 'activation_0': 'relu', 'n_layers_1': 1, 'out_features_1_0': 674, 'activation_1_0': 'hardswish', 'hidden_size': 408, 'n_layers_2': 1, 'out_features_2_0': 61, 'activation_2_0': 'selu'}. Best is trial 1 with value: 89.55769618282758.[0m


39 reward: -210.53643778946778
0 reward: -392.5134232122845
1 reward: -466.2805943303826
2 reward: -111.66582676720854
3 reward: -145.88329243619103
4 reward: -143.65277519532282
5 reward: -123.51308321300189
6 reward: -108.50169274716374
7 reward: -174.108587575106
8 reward: -156.24981364245548
9 reward: -181.30605949573874
10 reward: -28.25240561809329
11 reward: -128.80620503648225
12 reward: -141.35245499065527
13 reward: -127.15753615211126
14 reward: -163.06724285631674
15 reward: -140.12047008982935
16 reward: -196.99131210315664
17 reward: 12.78052063942971
18 reward: -173.48740915192764
19 reward: -158.63812567433888
20 reward: -184.0713617485496
21 reward: -149.71688855131316
22 reward: -144.59591937947943
23 reward: -185.8713869050146
24 reward: -162.58136034840427
25 reward: -155.67974463169924
26 reward: -88.0432448146961
27 reward: -136.93985467922226
28 reward: -133.42409811934243
29 reward: -157.40336314127842
30 reward: -130.8345170525712
31 reward: -147.7306119386687


[32m[I 2021-03-08 10:34:24,315][0m Trial 2 finished with value: 12.78052063942971 and parameters: {'n_layers': 1, 'kernel_size_0': 3, 'stride_0': 3, 'padding_0': 0, 'out_channel_0': 8, 'activation_0': 'hardswish', 'n_layers_1': 2, 'out_features_1_0': 303, 'activation_1_0': 'selu', 'out_features_1_1': 914, 'activation_1_1': 'selu', 'hidden_size': 517, 'n_layers_2': 1, 'out_features_2_0': 501, 'activation_2_0': 'relu'}. Best is trial 1 with value: 89.55769618282758.[0m


39 reward: -111.25121187070027
0 reward: -293.3812676931407
1 reward: -329.46455026710714
2 reward: -123.57610698256867
3 reward: -121.43740493300001
4 reward: -13.61182391707446
5 reward: -36.48197448042211
6 reward: -148.07477265143137
7 reward: -160.6386668870947
8 reward: -131.85019524068167
9 reward: -153.05605285422155
10 reward: -148.85604866032216
11 reward: -147.15281337730426
12 reward: -169.0220883068876
13 reward: -116.99364252131564
14 reward: -149.27962230144485
15 reward: -93.269793253842
16 reward: -160.71500960167046
17 reward: -120.3361408894828
18 reward: -109.87127786857681
19 reward: -120.83635580417686
20 reward: -177.5805178021271
21 reward: -122.16948298865901
22 reward: -88.94245269819586
23 reward: -156.80879153429572
24 reward: -124.45599886855152
25 reward: -124.1618079578765
26 reward: -139.62274536829756
27 reward: -223.94561502175597
28 reward: 28.721462078307127
29 reward: -122.17172686405713
30 reward: -90.50154313340947
31 reward: -84.32007508732895
32

[32m[I 2021-03-08 10:44:25,823][0m Trial 3 finished with value: 28.721462078307127 and parameters: {'n_layers': 3, 'kernel_size_0': 4, 'stride_0': 3, 'padding_0': 1, 'out_channel_0': 10, 'activation_0': 'relu', 'kernel_size_1': 4, 'stride_1': 1, 'padding_1': 0, 'out_channel_1': 5, 'activation_1': 'selu', 'kernel_size_2': 4, 'stride_2': 3, 'padding_2': 0, 'out_channel_2': 3, 'activation_2': 'hardswish', 'n_layers_1': 2, 'out_features_1_0': 528, 'activation_1_0': 'relu', 'out_features_1_1': 683, 'activation_1_1': 'selu', 'hidden_size': 572, 'n_layers_2': 1, 'out_features_2_0': 280, 'activation_2_0': 'hardswish'}. Best is trial 1 with value: 89.55769618282758.[0m


39 reward: -140.16799407385034
0 reward: -123.9062995375327
1 reward: -313.1052601948969
2 reward: -322.86631989948836
3 reward: -424.277624852986
4 reward: -64.2278630154464
5 reward: -427.24165710764146
6 reward: -162.7531623099241
7 reward: -111.35586472283389
8 reward: -237.67947943697024
9 reward: -143.7896966711169
10 reward: -512.149708702523
11 reward: -84.98095793793456
12 reward: -288.2443214305831
13 reward: -458.39111825913415
14 reward: -158.17528048550497
15 reward: -154.4813151081833
16 reward: -333.8562958140792
17 reward: -280.7186792812609
18 reward: -94.80517996660447
19 reward: -15.123342670678227
20 reward: -248.35955875068356
21 reward: -411.85054453570524
22 reward: -287.7704435183197
23 reward: -90.82678953802665
24 reward: -254.72781796827772
25 reward: -68.26285349595396
26 reward: -83.7790241319629
27 reward: -97.49472470745549
28 reward: -103.03016442240985
29 reward: -321.1561263104527
30 reward: -168.20793197973882
31 reward: -90.85873154886276
32 reward: 

[32m[I 2021-03-08 11:03:50,301][0m Trial 4 finished with value: -15.123342670678227 and parameters: {'n_layers': 1, 'kernel_size_0': 3, 'stride_0': 2, 'padding_0': 1, 'out_channel_0': 5, 'activation_0': 'relu', 'n_layers_1': 3, 'out_features_1_0': 978, 'activation_1_0': 'selu', 'out_features_1_1': 403, 'activation_1_1': 'hardswish', 'out_features_1_2': 405, 'activation_1_2': 'selu', 'hidden_size': 321, 'n_layers_2': 0}. Best is trial 1 with value: 89.55769618282758.[0m


39 reward: -229.70584908445386
0 reward: -183.81488663125242
1 reward: -270.3625824364141
2 reward: -600.04259323113
3 reward: -182.5758248763142
4 reward: -244.23304305724193
5 reward: -14.634199887584401
6 reward: -122.84781004315025
7 reward: -188.42182495540652
8 reward: -270.35900341171316
9 reward: -449.451702645551
10 reward: -272.9419130570577
11 reward: -378.3427840689516
12 reward: -92.58680523515334
13 reward: -280.46068555933573
14 reward: -271.9853796642301
15 reward: -547.0698810743961
16 reward: -352.8948246046069
17 reward: -120.10497363050513
18 reward: -321.3961979791286
19 reward: -286.9502338095491
20 reward: -290.3321656829861
21 reward: -310.989557915134
22 reward: -291.55578812222353
23 reward: -343.054702370697
24 reward: -118.95735012076678
25 reward: -461.0157388435436
26 reward: -101.55958942162482
27 reward: -97.95547288502152
28 reward: -65.95828903598965
29 reward: -67.42320982696533
30 reward: -102.48954691960994
31 reward: -271.69894998523273
32 reward: 

[32m[I 2021-03-08 11:15:33,653][0m Trial 5 finished with value: -14.634199887584401 and parameters: {'n_layers': 3, 'kernel_size_0': 2, 'stride_0': 2, 'padding_0': 1, 'out_channel_0': 4, 'activation_0': 'relu', 'kernel_size_1': 2, 'stride_1': 3, 'padding_1': 0, 'out_channel_1': 8, 'activation_1': 'relu', 'kernel_size_2': 2, 'stride_2': 3, 'padding_2': 0, 'out_channel_2': 12, 'activation_2': 'hardswish', 'n_layers_1': 3, 'out_features_1_0': 780, 'activation_1_0': 'hardswish', 'out_features_1_1': 799, 'activation_1_1': 'selu', 'out_features_1_2': 532, 'activation_1_2': 'relu', 'hidden_size': 317, 'n_layers_2': 0}. Best is trial 1 with value: 89.55769618282758.[0m


39 reward: -426.5520917535929
0 reward: -104.44399432340964
1 reward: -269.20212511968737
2 reward: -559.0559703096801
3 reward: -47.26525302351152
4 reward: -279.7623087785264
5 reward: -288.5211394450386
6 reward: -321.2970505884781
7 reward: -265.55536509411013
8 reward: -203.7717901161034
9 reward: -80.06810033160112
10 reward: -204.50507424788606
11 reward: -331.1950144365364
12 reward: -187.48919439346193
13 reward: -114.19808160634346
14 reward: -239.36419684573679
15 reward: -300.2955966702044
16 reward: -99.28211521225516
17 reward: -221.11488444457882
18 reward: -210.04053898825381
19 reward: -625.7009341342994
20 reward: -365.5361747212397
21 reward: -249.34505052133855
22 reward: -403.0466867237794
23 reward: -217.18827988034906
24 reward: -369.9782061952778
25 reward: -80.31032130451068
26 reward: -112.9080398996971
27 reward: -125.53704100061364
28 reward: -372.59614158687754
29 reward: -584.536605988282
30 reward: -37.172513972460905
31 reward: -461.23393881950614
32 rew

[32m[I 2021-03-08 11:29:34,484][0m Trial 6 finished with value: -37.172513972460905 and parameters: {'n_layers': 3, 'kernel_size_0': 2, 'stride_0': 2, 'padding_0': 1, 'out_channel_0': 6, 'activation_0': 'hardswish', 'kernel_size_1': 4, 'stride_1': 2, 'padding_1': 1, 'out_channel_1': 8, 'activation_1': 'hardswish', 'kernel_size_2': 2, 'stride_2': 3, 'padding_2': 0, 'out_channel_2': 4, 'activation_2': 'selu', 'n_layers_1': 3, 'out_features_1_0': 384, 'activation_1_0': 'hardswish', 'out_features_1_1': 908, 'activation_1_1': 'hardswish', 'out_features_1_2': 812, 'activation_1_2': 'hardswish', 'hidden_size': 536, 'n_layers_2': 1, 'out_features_2_0': 175, 'activation_2_0': 'hardswish'}. Best is trial 1 with value: 89.55769618282758.[0m


39 reward: -494.92980497318854
0 reward: -331.4510203168837
1 reward: -219.6706553457633
2 reward: -115.64161855326806
3 reward: -187.88332890691632
4 reward: -184.62183729512898
5 reward: -174.40227361043327
6 reward: -161.00688789370105
7 reward: -139.04084499378365
8 reward: -177.13041684346314
9 reward: -150.28862991771166
10 reward: -109.35005252651
11 reward: -102.00510988529474
12 reward: -111.8863556264128
13 reward: -163.57565534156987
14 reward: -173.3338393199498
15 reward: -185.66505233485339
16 reward: 3.2884780951672212
17 reward: -130.3340110721311
18 reward: -136.24928479218468
19 reward: -187.44801202115363
20 reward: -174.09409707649883
21 reward: -139.84284754226763
22 reward: -135.82352871251098
23 reward: -109.1705676087405
24 reward: -185.74468118169327
25 reward: -142.8450319662485
26 reward: -137.85486817178824
27 reward: -152.79247766187163
28 reward: -111.39149481131695
29 reward: -87.25998206308782
30 reward: -151.21500283951445
31 reward: -186.10178482897308

[32m[I 2021-03-08 11:41:32,241][0m Trial 7 finished with value: 3.2884780951672212 and parameters: {'n_layers': 3, 'kernel_size_0': 4, 'stride_0': 2, 'padding_0': 0, 'out_channel_0': 11, 'activation_0': 'hardswish', 'kernel_size_1': 4, 'stride_1': 3, 'padding_1': 1, 'out_channel_1': 12, 'activation_1': 'selu', 'kernel_size_2': 3, 'stride_2': 3, 'padding_2': 0, 'out_channel_2': 9, 'activation_2': 'selu', 'n_layers_1': 1, 'out_features_1_0': 782, 'activation_1_0': 'selu', 'hidden_size': 683, 'n_layers_2': 1, 'out_features_2_0': 368, 'activation_2_0': 'hardswish'}. Best is trial 1 with value: 89.55769618282758.[0m


39 reward: -164.5566862071659
0 reward: -527.8058292250537
1 reward: -260.16985881888866
2 reward: -124.68699234264483
3 reward: -138.74741042378278
4 reward: -139.69563908272755
5 reward: -111.80002452241655
6 reward: -211.35763108391023
7 reward: -182.63377844598068
8 reward: -113.07572972513749
9 reward: -89.23190244369017
10 reward: -114.88766998575792
11 reward: -143.39696017863477
12 reward: -119.59327972666391
13 reward: -150.49656137555218
14 reward: -117.79295648861708
15 reward: -123.57519836076916
16 reward: -199.00480110474135
17 reward: -29.366305649374013
18 reward: -94.81819994812095
19 reward: -126.2974225643936
20 reward: -115.95087044689933
21 reward: -154.30538424555888
22 reward: -144.600638414635
23 reward: -130.93984502006617
24 reward: -172.2453900666236
25 reward: -142.68307582511875
26 reward: -190.62764625505582
27 reward: -142.26437101457117
28 reward: -101.94915339990133
29 reward: -114.70974020231365
30 reward: -110.77021728945081
31 reward: -77.94445805720

[32m[I 2021-03-08 12:06:13,270][0m Trial 8 finished with value: -29.366305649374013 and parameters: {'n_layers': 1, 'kernel_size_0': 3, 'stride_0': 2, 'padding_0': 1, 'out_channel_0': 10, 'activation_0': 'selu', 'n_layers_1': 1, 'out_features_1_0': 998, 'activation_1_0': 'selu', 'hidden_size': 562, 'n_layers_2': 2, 'out_features_2_0': 68, 'activation_2_0': 'hardswish', 'out_features_2_1': 261, 'activation_2_1': 'selu'}. Best is trial 1 with value: 89.55769618282758.[0m


39 reward: -173.71696489504387
0 reward: -102.56577583043267
1 reward: -69.31084706393995
2 reward: -333.60852052764074
3 reward: -601.2196437217099
4 reward: -465.15243153770217
5 reward: -519.7299100056946
6 reward: -208.57394093623378
7 reward: -78.82472829384193
8 reward: -224.62574900563118
9 reward: -297.73325703647964
10 reward: -334.01722873537597
11 reward: -469.03831958546743
12 reward: -146.43755409062712
13 reward: -105.22910498159027
14 reward: -5.985249337974366
15 reward: -457.64545188925666
16 reward: -282.90753736750514
17 reward: -372.8150189340222
18 reward: -457.0461085447802
19 reward: -95.3883250433248
20 reward: -67.94212103729771
21 reward: -69.11022300324169
22 reward: -454.54893011556845
23 reward: -32.854170772009255
24 reward: -463.77722987075634
25 reward: -122.52978291219415
26 reward: -96.35051950707523
27 reward: -459.04492343209574
28 reward: -79.81564194677128
29 reward: -437.5645744385903
30 reward: -68.23122869119197
31 reward: -279.91029829138864
32

[32m[I 2021-03-08 19:25:46,918][0m Trial 9 finished with value: -5.985249337974366 and parameters: {'n_layers': 2, 'kernel_size_0': 3, 'stride_0': 1, 'padding_0': 1, 'out_channel_0': 9, 'activation_0': 'selu', 'kernel_size_1': 4, 'stride_1': 2, 'padding_1': 1, 'out_channel_1': 12, 'activation_1': 'selu', 'n_layers_1': 3, 'out_features_1_0': 847, 'activation_1_0': 'hardswish', 'out_features_1_1': 344, 'activation_1_1': 'hardswish', 'out_features_1_2': 273, 'activation_1_2': 'selu', 'hidden_size': 181, 'n_layers_2': 1, 'out_features_2_0': 74, 'activation_2_0': 'selu'}. Best is trial 1 with value: 89.55769618282758.[0m


39 reward: -206.32245551767758
CPU times: user 6h 52min 30s, sys: 1h 12min 1s, total: 8h 4min 31s
Wall time: 19h 59min 5s


In [49]:
# Display the best trial of the study (value, parameters, distributions).
# NOTE(review): this cell's execution count (49) precedes the study cells (50-52),
# and its saved output does not match the best trial logged above — re-run it
# after the study finishes.
study.best_trial

FrozenTrial(number=0, values=[18.434346464375864], datetime_start=datetime.datetime(2021, 3, 7, 20, 3, 46, 201516), datetime_complete=datetime.datetime(2021, 3, 7, 20, 18, 7, 553231), params={'n_layers': 1, 'kernel_size_0': 4, 'stride_0': 2, 'padding_0': 0, 'out_channel_0': 11, 'activation_0': 'hardswish', 'n_layers_1': 2, 'out_features_1_0': 842, 'activation_1_0': 'selu', 'out_features_1_1': 327, 'activation_1_1': 'hardswish', 'hidden_size': 201, 'n_layers_2': 1, 'out_features_2_0': 361, 'activation_2_0': 'hardswish'}, distributions={'n_layers': IntUniformDistribution(high=3, low=1, step=1), 'kernel_size_0': IntUniformDistribution(high=4, low=2, step=1), 'stride_0': IntUniformDistribution(high=3, low=1, step=1), 'padding_0': IntUniformDistribution(high=1, low=0, step=1), 'out_channel_0': IntUniformDistribution(high=12, low=3, step=1), 'activation_0': CategoricalDistribution(choices=('relu', 'selu', 'hardswish')), 'n_layers_1': IntUniformDistribution(high=3, low=1, step=1), 'out_featur

In [None]:
# Standalone DQN training run with the fixed `DQN` architecture (no Optuna trial).
# Every name assigned here is a notebook global. In particular this cell rebinds
# `target_net`, the same global that `callback` reads.

# fast-learning prediction/policy net
pred_net = DQN(screen_height, screen_width, n_actions).to(device)
# slow-learning target network
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(pred_net.state_dict())
target_net.eval()  # evaluation mode only; this does not by itself freeze the weights


# training loop parameters

REPLAY_MEMORY_SIZE = 1000
memory = ReplayMemory(REPLAY_MEMORY_SIZE)

TARGET_UPDATE = 1000  # period of target network update
# The optimizer is built over pred_net's parameters only.
optimizer = optim.RMSprop(pred_net.parameters())

# NOTE: an earlier TODO here asked for the target weights to be updated after a
# certain step count; the `steps` / `TARGET_UPDATE` check in the loop below does that.

### TRAINING LOOP ###
num_episodes = 5
episode_rewards = []  # total reward of each episode, in order
steps = 0  # environment steps since the last target-net sync
for i_episode in range(num_episodes):
    # Initialize the environment and state
    env.reset()
    # Two screenshots are taken back to back; the initial state is their difference.
    last_screen = get_screen(env).to(device)
    current_screen = get_screen(env).to(device)
    state = current_screen - last_screen
    episode_rewards.append(0)
    done = False
    while not done:
        # Select and perform an action
        action = select_action(pred_net, state, n_actions).to(device)
        _, reward, done, _ = env.step(action.item())  # our states are screenshot differences
        episode_rewards[-1] += reward

        # Wrap the scalar reward in a 1-element tensor on `device` for the replay memory.
        reward = torch.tensor([reward], device=device)

        # Observe new state
        last_screen = current_screen
        current_screen = get_screen(env).to(device)
        if not done:
            next_state = current_screen - last_screen
        else:
            # A finished episode has no next state.
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one optimization step. The optimizer holds pred_net's parameters,
        # so this trains the prediction net; target_net is passed alongside it.
        optimize_model(device, pred_net, target_net, optimizer, memory)
        steps += 1

        if steps == TARGET_UPDATE:  # update the target net weights
            steps = 0
            target_net.load_state_dict(pred_net.state_dict())
        
    # Refresh the score plot after every episode.
    plot_scores(episode_rewards)

print('Done')
# NOTE(review): a single render call after training has finished — confirm this is intended.
env.render()
env.close()
# Leave interactive mode (enabled earlier with plt.ion()) and show the final figure.
plt.ioff()
plt.show()

In [23]:
%%time
# Scratch timing cell: squares of 0..99999, unrelated to the DQN experiment.
# A space now separates the literal `2` from `for`; newer Python versions warn
# about a numeric literal fused with a keyword.
a = [i ** 2 for i in range(100000)]

CPU times: user 41.7 ms, sys: 3.56 ms, total: 45.2 ms
Wall time: 43.8 ms
