In [43]:
from PPO import PPO
from time import time
from boardgame2 import ReversiEnv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pandas as pd
import plotly.graph_objects as go
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras

In [3]:
def plot_wins_entropy(df):
    """Display the ``wins`` and ``entropy`` series of a run against ``iteration``.

    Both series are added as marker-only scatter traces to a single figure
    (so they share one y-axis) and the figure is shown immediately.

    Args:
        df: Object exposing ``iteration``, ``wins`` and ``entropy`` as
            attributes — presumably the pandas DataFrame returned by
            ``PPO.run()``; confirm against the ``PPO`` module.

    Returns:
        None. The figure is rendered as a side effect of ``fig.show()``.
    """
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=df.iteration, y=df.wins, mode='markers', name='%Wins'))
    fig.add_trace(go.Scatter(x=df.iteration, y=df.entropy, mode='markers', name='Entropy'))

    fig.update_xaxes(title='Iteration')
    fig.update_yaxes(title='')

    # Only the title is set. The previous ``coloraxis_colorbar`` ("%CR")
    # settings were removed: no trace here is bound to a color axis, so that
    # colorbar was never drawn.
    fig.update_layout(title='%Wins vs Entropy')

    # ``show()`` renders the figure; nothing useful is returned to the caller.
    fig.show()


def plot_wins_comparison(df1, df2):
    """Display the ``wins`` series of two runs against ``iteration``.

    Each run is drawn as a marker-only scatter trace in the same figure,
    labelled 'First' and 'Second' in argument order, and the figure is shown
    immediately.

    Args:
        df1: Object exposing ``iteration`` and ``wins`` as attributes —
            presumably a pandas DataFrame returned by ``PPO.run()``; confirm
            against the ``PPO`` module.
        df2: Same structure as ``df1``, for the run to compare against.

    Returns:
        None. The figure is rendered as a side effect of ``fig.show()``.
    """
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=df1.iteration, y=df1.wins, mode='markers', name='First'))
    fig.add_trace(go.Scatter(x=df2.iteration, y=df2.wins, mode='markers', name='Second'))

    fig.update_xaxes(title='Iteration')
    fig.update_yaxes(title='Wins %')

    # Only the title is set. The previous ``coloraxis_colorbar`` ("%CR")
    # settings were removed: no trace here is bound to a color axis, so that
    # colorbar was never drawn.
    fig.update_layout(title='%Wins Comparison')

    # ``show()`` renders the figure; nothing useful is returned to the caller.
    fig.show()

# Training with reward 1 or -1, and always randomly

In [64]:
# Run with train_randomly=True and sum_board=False; the result of run() is
# kept as `random_simple_reward` for the save and plot cells that follow.
# NOTE(review): the recorded output of this cell reports a win rate every 25
# iterations although eval_period=50 — confirm how PPO interprets eval_period.
PPO_agent = PPO(
    board_shape=8,
    n_experience_episodes=70,
    iterations=4000,
    batch_size=64,
    epochs=2,
    entropy_loss=10e-4,  # 10e-4 == 1e-3 (0.001)
    eval_period=50,
    sum_board=False,
    train_randomly=True,
    lr=0.0001,
    algorithm='PPO',
)

random_simple_reward = PPO_agent.run()

  0%|          | 1/4000 [00:25<28:31:52, 25.68s/it]

Iteration 0: Win % 0.56
Best model at iteration 0: 0.56


  1%|          | 26/4000 [06:00<16:26:18, 14.89s/it]

Iteration 25: Win % 0.6
Best model at iteration 25: 0.6


  1%|▏         | 51/4000 [11:09<16:00:36, 14.60s/it]

Iteration 50: Win % 0.62
Best model at iteration 50: 0.62


  2%|▏         | 76/4000 [16:20<15:42:59, 14.42s/it]

Iteration 75: Win % 0.66
Best model at iteration 75: 0.66


  3%|▎         | 101/4000 [21:24<15:32:13, 14.35s/it]

Iteration 100: Win % 0.64


  3%|▎         | 126/4000 [26:24<15:19:12, 14.24s/it]

Iteration 125: Win % 0.68
Best model at iteration 125: 0.68


  4%|▍         | 151/4000 [31:25<15:15:50, 14.28s/it]

Iteration 150: Win % 0.66


  4%|▍         | 176/4000 [36:02<13:40:21, 12.87s/it]

Iteration 175: Win % 0.88
Best model at iteration 175: 0.88


  5%|▌         | 201/4000 [40:35<13:31:46, 12.82s/it]

Iteration 200: Win % 0.72


  6%|▌         | 226/4000 [45:08<13:31:07, 12.90s/it]

Iteration 225: Win % 0.62


  6%|▋         | 251/4000 [49:42<13:29:05, 12.95s/it]

Iteration 250: Win % 0.8


  7%|▋         | 276/4000 [54:17<13:23:20, 12.94s/it]

Iteration 275: Win % 0.74


  8%|▊         | 301/4000 [58:53<13:28:21, 13.11s/it]

Iteration 300: Win % 0.78


  8%|▊         | 326/4000 [1:03:29<13:20:54, 13.08s/it]

Iteration 325: Win % 0.68


  9%|▉         | 351/4000 [1:08:07<13:20:56, 13.17s/it]

Iteration 350: Win % 0.64


  9%|▉         | 376/4000 [1:12:43<13:06:20, 13.02s/it]

Iteration 375: Win % 0.78


 10%|█         | 401/4000 [1:17:21<13:14:13, 13.24s/it]

Iteration 400: Win % 0.76


 11%|█         | 426/4000 [1:22:00<13:04:59, 13.18s/it]

Iteration 425: Win % 0.66


 11%|█▏        | 451/4000 [1:26:41<12:58:18, 13.16s/it]

Iteration 450: Win % 0.74


 12%|█▏        | 476/4000 [1:31:23<13:07:47, 13.41s/it]

Iteration 475: Win % 0.7


 13%|█▎        | 501/4000 [1:36:05<12:56:42, 13.32s/it]

Iteration 500: Win % 0.7


 13%|█▎        | 526/4000 [1:40:47<12:45:16, 13.22s/it]

Iteration 525: Win % 0.7


 14%|█▍        | 551/4000 [1:45:25<12:36:04, 13.15s/it]

Iteration 550: Win % 0.66


 14%|█▍        | 576/4000 [1:50:06<12:35:06, 13.23s/it]

Iteration 575: Win % 0.78


 15%|█▌        | 601/4000 [1:54:46<12:30:46, 13.25s/it]

Iteration 600: Win % 0.8


 16%|█▌        | 626/4000 [1:59:26<12:21:56, 13.19s/it]

Iteration 625: Win % 0.82


 16%|█▋        | 651/4000 [2:04:06<12:18:47, 13.24s/it]

Iteration 650: Win % 0.78


 17%|█▋        | 676/4000 [2:08:46<12:17:45, 13.32s/it]

Iteration 675: Win % 0.78


 18%|█▊        | 701/4000 [2:13:27<12:14:01, 13.35s/it]

Iteration 700: Win % 0.74


 18%|█▊        | 726/4000 [2:19:11<14:52:36, 16.36s/it]

Iteration 725: Win % 0.6


 19%|█▉        | 751/4000 [2:24:37<13:53:13, 15.39s/it]

Iteration 750: Win % 0.8


 19%|█▉        | 776/4000 [2:30:04<13:49:58, 15.45s/it]

Iteration 775: Win % 0.7


 20%|██        | 801/4000 [2:35:32<13:46:29, 15.50s/it]

Iteration 800: Win % 0.68


 21%|██        | 826/4000 [2:41:16<15:05:27, 17.12s/it]

Iteration 825: Win % 0.76


 21%|██▏       | 851/4000 [2:46:47<13:43:18, 15.69s/it]

Iteration 850: Win % 0.72


 22%|██▏       | 876/4000 [2:52:17<13:26:05, 15.48s/it]

Iteration 875: Win % 0.66


 23%|██▎       | 901/4000 [2:57:49<13:16:55, 15.43s/it]

Iteration 900: Win % 0.76


 23%|██▎       | 926/4000 [3:03:15<13:07:48, 15.38s/it]

Iteration 925: Win % 0.74


 24%|██▍       | 951/4000 [3:08:44<13:23:09, 15.81s/it]

Iteration 950: Win % 0.7


 24%|██▍       | 976/4000 [3:14:11<13:02:01, 15.52s/it]

Iteration 975: Win % 0.64


 25%|██▌       | 1001/4000 [3:19:36<12:53:02, 15.47s/it]

Iteration 1000: Win % 0.68


 26%|██▌       | 1026/4000 [3:25:01<12:36:45, 15.27s/it]

Iteration 1025: Win % 0.72


 26%|██▋       | 1051/4000 [3:30:28<12:37:37, 15.41s/it]

Iteration 1050: Win % 0.74


 27%|██▋       | 1076/4000 [3:35:57<12:32:41, 15.44s/it]

Iteration 1075: Win % 0.76


 28%|██▊       | 1101/4000 [3:41:26<12:22:59, 15.38s/it]

Iteration 1100: Win % 0.72


 28%|██▊       | 1126/4000 [3:46:53<12:12:12, 15.29s/it]

Iteration 1125: Win % 0.68


 29%|██▉       | 1151/4000 [3:52:18<11:57:32, 15.11s/it]

Iteration 1150: Win % 0.62


 29%|██▉       | 1176/4000 [3:57:36<11:49:32, 15.08s/it]

Iteration 1175: Win % 0.62


 30%|███       | 1201/4000 [4:02:55<11:44:40, 15.11s/it]

Iteration 1200: Win % 0.7


 31%|███       | 1226/4000 [4:08:14<11:43:52, 15.22s/it]

Iteration 1225: Win % 0.7


 31%|███▏      | 1251/4000 [4:13:34<11:31:34, 15.09s/it]

Iteration 1250: Win % 0.74


 32%|███▏      | 1276/4000 [4:18:43<10:25:04, 13.77s/it]

Iteration 1275: Win % 0.68


 33%|███▎      | 1301/4000 [4:23:31<10:17:13, 13.72s/it]

Iteration 1300: Win % 0.76


 33%|███▎      | 1326/4000 [4:28:18<10:09:50, 13.68s/it]

Iteration 1325: Win % 0.82


 34%|███▍      | 1351/4000 [4:33:06<10:04:26, 13.69s/it]

Iteration 1350: Win % 0.84


 34%|███▍      | 1376/4000 [4:37:54<10:04:39, 13.83s/it]

Iteration 1375: Win % 0.72


 35%|███▌      | 1401/4000 [4:42:41<9:42:54, 13.46s/it]

Iteration 1400: Win % 0.72


 36%|███▌      | 1426/4000 [4:47:28<9:38:16, 13.48s/it]

Iteration 1425: Win % 0.72


 36%|███▋      | 1451/4000 [4:52:16<9:39:57, 13.65s/it]

Iteration 1450: Win % 0.78


 37%|███▋      | 1476/4000 [4:57:02<9:24:23, 13.42s/it]

Iteration 1475: Win % 0.72


 38%|███▊      | 1501/4000 [5:01:47<9:19:45, 13.44s/it]

Iteration 1500: Win % 0.76


 38%|███▊      | 1526/4000 [5:06:34<9:20:05, 13.58s/it]

Iteration 1525: Win % 0.74


 39%|███▉      | 1551/4000 [5:11:20<9:14:30, 13.59s/it]

Iteration 1550: Win % 0.66


 39%|███▉      | 1576/4000 [5:16:08<9:07:46, 13.56s/it]

Iteration 1575: Win % 0.7


 40%|████      | 1601/4000 [5:20:56<9:10:04, 13.76s/it]

Iteration 1600: Win % 0.58


 41%|████      | 1626/4000 [5:25:46<9:00:40, 13.66s/it]

Iteration 1625: Win % 0.7


 41%|████▏     | 1651/4000 [5:30:35<8:49:05, 13.51s/it]

Iteration 1650: Win % 0.66


 42%|████▏     | 1676/4000 [5:35:24<8:43:11, 13.51s/it]

Iteration 1675: Win % 0.74


 43%|████▎     | 1701/4000 [5:40:11<8:35:42, 13.46s/it]

Iteration 1700: Win % 0.76


 43%|████▎     | 1726/4000 [5:44:57<8:32:06, 13.51s/it]

Iteration 1725: Win % 0.7


 44%|████▍     | 1751/4000 [5:49:44<8:30:17, 13.61s/it]

Iteration 1750: Win % 0.64


 44%|████▍     | 1776/4000 [5:54:30<8:21:30, 13.53s/it]

Iteration 1775: Win % 0.62


 45%|████▌     | 1801/4000 [5:59:17<8:14:53, 13.50s/it]

Iteration 1800: Win % 0.74


 46%|████▌     | 1826/4000 [6:04:03<8:07:19, 13.45s/it]

Iteration 1825: Win % 0.8


 46%|████▋     | 1851/4000 [6:08:48<8:01:46, 13.45s/it]

Iteration 1850: Win % 0.74


 47%|████▋     | 1876/4000 [6:13:33<7:54:04, 13.39s/it]

Iteration 1875: Win % 0.72


 48%|████▊     | 1901/4000 [6:18:19<8:00:18, 13.73s/it]

Iteration 1900: Win % 0.74


 48%|████▊     | 1926/4000 [6:23:06<7:46:24, 13.49s/it]

Iteration 1925: Win % 0.62


 49%|████▉     | 1951/4000 [6:27:54<7:53:16, 13.86s/it]

Iteration 1950: Win % 0.68


 49%|████▉     | 1976/4000 [6:33:04<8:35:10, 15.27s/it]

Iteration 1975: Win % 0.76


 50%|█████     | 2001/4000 [6:38:24<8:33:19, 15.41s/it]

Iteration 2000: Win % 0.6


 51%|█████     | 2026/4000 [6:43:46<8:24:11, 15.33s/it]

Iteration 2025: Win % 0.54


 51%|█████▏    | 2051/4000 [6:49:06<8:10:35, 15.10s/it]

Iteration 2050: Win % 0.68


 52%|█████▏    | 2076/4000 [6:54:27<8:09:19, 15.26s/it]

Iteration 2075: Win % 0.6


 53%|█████▎    | 2101/4000 [6:59:57<8:16:11, 15.68s/it]

Iteration 2100: Win % 0.78


 53%|█████▎    | 2126/4000 [7:05:15<7:57:00, 15.27s/it]

Iteration 2125: Win % 0.72


 54%|█████▍    | 2151/4000 [7:10:36<7:46:27, 15.14s/it]

Iteration 2150: Win % 0.7


 54%|█████▍    | 2176/4000 [7:15:56<7:42:54, 15.23s/it]

Iteration 2175: Win % 0.74


 55%|█████▌    | 2201/4000 [7:20:42<6:49:38, 13.66s/it]

Iteration 2200: Win % 0.7


 56%|█████▌    | 2226/4000 [7:25:26<6:37:18, 13.44s/it]

Iteration 2225: Win % 0.58


 56%|█████▋    | 2251/4000 [7:30:11<6:35:38, 13.57s/it]

Iteration 2250: Win % 0.7


 57%|█████▋    | 2276/4000 [7:34:58<6:25:51, 13.43s/it]

Iteration 2275: Win % 0.76


 58%|█████▊    | 2301/4000 [7:39:45<6:20:20, 13.43s/it]

Iteration 2300: Win % 0.6


 58%|█████▊    | 2326/4000 [7:44:29<6:16:01, 13.48s/it]

Iteration 2325: Win % 0.76


 59%|█████▉    | 2351/4000 [7:49:14<6:11:04, 13.50s/it]

Iteration 2350: Win % 0.74


 59%|█████▉    | 2376/4000 [7:53:58<6:05:03, 13.49s/it]

Iteration 2375: Win % 0.8


 60%|██████    | 2401/4000 [7:58:41<5:58:31, 13.45s/it]

Iteration 2400: Win % 0.72


 61%|██████    | 2426/4000 [8:03:24<5:52:51, 13.45s/it]

Iteration 2425: Win % 0.72


 61%|██████▏   | 2451/4000 [8:08:08<5:50:47, 13.59s/it]

Iteration 2450: Win % 0.78


 62%|██████▏   | 2476/4000 [8:12:53<5:43:07, 13.51s/it]

Iteration 2475: Win % 0.76


 63%|██████▎   | 2501/4000 [8:17:35<5:32:37, 13.31s/it]

Iteration 2500: Win % 0.6


 63%|██████▎   | 2526/4000 [8:22:19<5:29:15, 13.40s/it]

Iteration 2525: Win % 0.74


 64%|██████▍   | 2551/4000 [8:27:07<5:30:30, 13.69s/it]

Iteration 2550: Win % 0.74


 64%|██████▍   | 2576/4000 [8:32:02<6:05:05, 15.38s/it]

Iteration 2575: Win % 0.78


 65%|██████▌   | 2601/4000 [8:36:47<5:12:40, 13.41s/it]

Iteration 2600: Win % 0.66


 66%|██████▌   | 2626/4000 [8:41:31<5:05:50, 13.36s/it]

Iteration 2625: Win % 0.64


 66%|██████▋   | 2651/4000 [8:46:14<5:01:31, 13.41s/it]

Iteration 2650: Win % 0.76


 67%|██████▋   | 2676/4000 [8:50:56<4:57:08, 13.47s/it]

Iteration 2675: Win % 0.74


 68%|██████▊   | 2701/4000 [8:55:38<4:48:48, 13.34s/it]

Iteration 2700: Win % 0.68


 68%|██████▊   | 2726/4000 [9:00:21<4:45:12, 13.43s/it]

Iteration 2725: Win % 0.78


 69%|██████▉   | 2751/4000 [9:05:04<4:38:36, 13.38s/it]

Iteration 2750: Win % 0.6


 69%|██████▉   | 2776/4000 [9:09:48<4:35:27, 13.50s/it]

Iteration 2775: Win % 0.6


 70%|███████   | 2801/4000 [9:14:30<4:25:37, 13.29s/it]

Iteration 2800: Win % 0.74


 71%|███████   | 2826/4000 [9:19:12<4:23:01, 13.44s/it]

Iteration 2825: Win % 0.78


 71%|███████▏  | 2851/4000 [9:23:54<4:15:26, 13.34s/it]

Iteration 2850: Win % 0.76


 72%|███████▏  | 2876/4000 [9:28:36<4:10:37, 13.38s/it]

Iteration 2875: Win % 0.74


 73%|███████▎  | 2901/4000 [9:33:21<4:05:16, 13.39s/it]

Iteration 2900: Win % 0.7


 73%|███████▎  | 2926/4000 [9:38:06<4:01:55, 13.52s/it]

Iteration 2925: Win % 0.78


 74%|███████▍  | 2951/4000 [9:42:49<3:54:51, 13.43s/it]

Iteration 2950: Win % 0.86


 74%|███████▍  | 2976/4000 [9:47:32<3:49:12, 13.43s/it]

Iteration 2975: Win % 0.8


 75%|███████▌  | 3001/4000 [9:52:15<3:42:23, 13.36s/it]

Iteration 3000: Win % 0.6


 76%|███████▌  | 3026/4000 [9:56:58<3:38:23, 13.45s/it]

Iteration 3025: Win % 0.72


 76%|███████▋  | 3051/4000 [10:01:40<3:31:35, 13.38s/it]

Iteration 3050: Win % 0.74


 77%|███████▋  | 3076/4000 [10:06:22<3:26:23, 13.40s/it]

Iteration 3075: Win % 0.76


 78%|███████▊  | 3101/4000 [10:11:03<3:18:58, 13.28s/it]

Iteration 3100: Win % 0.72


 78%|███████▊  | 3126/4000 [10:15:43<3:11:30, 13.15s/it]

Iteration 3125: Win % 0.76


 79%|███████▉  | 3151/4000 [10:20:23<3:06:50, 13.20s/it]

Iteration 3150: Win % 0.7


 79%|███████▉  | 3176/4000 [10:25:05<3:01:49, 13.24s/it]

Iteration 3175: Win % 0.7


 80%|████████  | 3201/4000 [10:29:47<2:55:59, 13.22s/it]

Iteration 3200: Win % 0.58


 81%|████████  | 3226/4000 [10:34:31<2:53:36, 13.46s/it]

Iteration 3225: Win % 0.6


 81%|████████▏ | 3251/4000 [10:39:16<2:47:16, 13.40s/it]

Iteration 3250: Win % 0.7


 82%|████████▏ | 3276/4000 [10:43:58<2:42:35, 13.48s/it]

Iteration 3275: Win % 0.68


 83%|████████▎ | 3301/4000 [10:48:40<2:35:14, 13.32s/it]

Iteration 3300: Win % 0.8


 83%|████████▎ | 3326/4000 [10:53:22<2:30:46, 13.42s/it]

Iteration 3325: Win % 0.6


 84%|████████▍ | 3351/4000 [10:58:03<2:23:58, 13.31s/it]

Iteration 3350: Win % 0.74


 84%|████████▍ | 3376/4000 [11:02:43<2:18:28, 13.31s/it]

Iteration 3375: Win % 0.78


 85%|████████▌ | 3401/4000 [11:07:24<2:11:56, 13.22s/it]

Iteration 3400: Win % 0.72


 86%|████████▌ | 3426/4000 [11:12:03<2:05:57, 13.17s/it]

Iteration 3425: Win % 0.7


 86%|████████▋ | 3451/4000 [11:16:43<2:01:23, 13.27s/it]

Iteration 3450: Win % 0.72


 87%|████████▋ | 3476/4000 [11:21:27<2:01:48, 13.95s/it]

Iteration 3475: Win % 0.74


 88%|████████▊ | 3501/4000 [11:26:19<1:50:38, 13.30s/it]

Iteration 3500: Win % 0.64


 88%|████████▊ | 3526/4000 [11:31:02<1:46:32, 13.49s/it]

Iteration 3525: Win % 0.7


 89%|████████▉ | 3551/4000 [11:35:44<1:39:15, 13.26s/it]

Iteration 3550: Win % 0.74


 89%|████████▉ | 3576/4000 [11:40:25<1:33:27, 13.22s/it]

Iteration 3575: Win % 0.76


 90%|█████████ | 3601/4000 [11:45:06<1:28:35, 13.32s/it]

Iteration 3600: Win % 0.82


 91%|█████████ | 3626/4000 [11:49:48<1:23:07, 13.33s/it]

Iteration 3625: Win % 0.7


 91%|█████████▏| 3651/4000 [11:54:28<1:16:30, 13.15s/it]

Iteration 3650: Win % 0.78


 92%|█████████▏| 3676/4000 [11:59:09<1:12:18, 13.39s/it]

Iteration 3675: Win % 0.84


 93%|█████████▎| 3701/4000 [12:03:51<1:06:22, 13.32s/it]

Iteration 3700: Win % 0.78


 93%|█████████▎| 3726/4000 [12:08:31<1:00:16, 13.20s/it]

Iteration 3725: Win % 0.72


 94%|█████████▍| 3751/4000 [12:13:12<55:11, 13.30s/it]

Iteration 3750: Win % 0.72


 94%|█████████▍| 3776/4000 [12:17:52<49:33, 13.28s/it]

Iteration 3775: Win % 0.74


 95%|█████████▌| 3801/4000 [12:22:32<43:43, 13.18s/it]

Iteration 3800: Win % 0.68


 96%|█████████▌| 3826/4000 [12:27:13<39:14, 13.53s/it]

Iteration 3825: Win % 0.64


 96%|█████████▋| 3851/4000 [12:31:54<32:56, 13.26s/it]

Iteration 3850: Win % 0.62


 97%|█████████▋| 3876/4000 [12:36:36<27:31, 13.32s/it]

Iteration 3875: Win % 0.72


 98%|█████████▊| 3901/4000 [12:41:17<21:57, 13.31s/it]

Iteration 3900: Win % 0.64


 98%|█████████▊| 3926/4000 [12:45:59<16:30, 13.38s/it]

Iteration 3925: Win % 0.66


 99%|█████████▉| 3951/4000 [12:50:40<10:55, 13.37s/it]

Iteration 3950: Win % 0.76


 99%|█████████▉| 3976/4000 [12:55:23<05:20, 13.34s/it]

Iteration 3975: Win % 0.62


100%|██████████| 4000/4000 [12:59:45<00:00, 11.70s/it]

MAX WINS: 0.88. In iteration: 175





In [69]:
# Save the result of the train_randomly=True run as CSV; the file name encodes
# the run's settings (sum_board=False, lr=0.0001, epochs=2, entropy_loss=10e-4,
# batch_size=64).
# NOTE(review): assumes `random_simple_reward` is a pandas DataFrame and that
# the `models/` directory already exists — confirm.
random_simple_reward.to_csv('models/random_sumboardFalse_0001lr_2epoch_10e4entropy_64bs.csv')

In [65]:
# Plot the `wins` and `entropy` series of the train_randomly=True run against
# `iteration`.
plot_wins_entropy(random_simple_reward)

# Training with reward 1 or -1: actions are sampled randomly from the model's predictions, and once evaluation reaches 85% wins the agent switches to argmax

In [66]:
# Same settings as the train_randomly=True run, but with train_randomly=False;
# the result of run() is kept as `not_random_simple_reward`.
# NOTE(review): `PPO_agent` is rebound here, so the previous agent object is
# no longer reachable under that name.
# NOTE(review): the recorded output of this cell reports a win rate every 25
# iterations although eval_period=50 — confirm how PPO interprets eval_period.
PPO_agent = PPO(
    board_shape=8,
    n_experience_episodes=70,
    iterations=4000,
    batch_size=64,
    epochs=2,
    entropy_loss=10e-4,  # 10e-4 == 1e-3 (0.001)
    eval_period=50,
    sum_board=False,
    train_randomly=False,
    lr=0.0001,
    algorithm='PPO',
)

not_random_simple_reward = PPO_agent.run()


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.

  0%|          | 1/4000 [00:22<24:50:29, 22.36s/it]

Iteration 0: Win % 0.42
Best model at iteration 0: 0.42


  1%|          | 26/4000 [06:28<18:48:12, 17.03s/it]

Iteration 25: Win % 0.68
Best model at iteration 25: 0.68


  1%|▏         | 51/4000 [12:37<18:40:16, 17.02s/it]

Iteration 50: Win % 0.66


  2%|▏         | 76/4000 [18:25<16:40:10, 15.29s/it]

Iteration 75: Win % 0.74
Best model at iteration 75: 0.74


  3%|▎         | 101/4000 [24:03<17:18:49, 15.99s/it]

Iteration 100: Win % 0.8
Best model at iteration 100: 0.8


  3%|▎         | 126/4000 [29:36<16:35:27, 15.42s/it]

Iteration 125: Win % 0.72


  4%|▍         | 151/4000 [35:08<16:27:45, 15.40s/it]

Iteration 150: Win % 0.86
Best model at iteration 150: 0.86


  4%|▍         | 176/4000 [40:44<16:25:46, 15.47s/it]

Iteration 175: Win % 0.84


  5%|▌         | 201/4000 [46:20<16:16:22, 15.42s/it]

Iteration 200: Win % 0.82


  6%|▌         | 226/4000 [52:40<17:23:18, 16.59s/it]

Iteration 225: Win % 0.78


  6%|▋         | 251/4000 [59:09<18:49:59, 18.08s/it]

Iteration 250: Win % 0.84


  7%|▋         | 276/4000 [1:06:11<21:36:37, 20.89s/it]

Iteration 275: Win % 0.86


  8%|▊         | 301/4000 [1:13:14<16:56:45, 16.49s/it]

Iteration 300: Win % 0.92
Best model at iteration 300: 0.92


  8%|▊         | 326/4000 [1:19:13<16:57:32, 16.62s/it]

Iteration 325: Win % 0.84


  9%|▉         | 351/4000 [1:25:12<16:33:46, 16.34s/it]

Iteration 350: Win % 0.82


  9%|▉         | 376/4000 [1:31:18<16:22:03, 16.26s/it]

Iteration 375: Win % 0.84


 10%|█         | 401/4000 [1:37:04<15:48:25, 15.81s/it]

Iteration 400: Win % 0.82


 11%|█         | 426/4000 [1:42:46<15:38:13, 15.75s/it]

Iteration 425: Win % 0.86


 11%|█▏        | 451/4000 [1:48:28<15:25:37, 15.65s/it]

Iteration 450: Win % 0.86


 12%|█▏        | 476/4000 [1:54:06<15:11:00, 15.51s/it]

Iteration 475: Win % 0.86


 13%|█▎        | 501/4000 [1:59:56<15:27:15, 15.90s/it]

Iteration 500: Win % 0.92


 13%|█▎        | 526/4000 [2:05:37<14:58:09, 15.51s/it]

Iteration 525: Win % 0.88


 14%|█▍        | 551/4000 [2:11:21<15:07:47, 15.79s/it]

Iteration 550: Win % 0.8


 14%|█▍        | 576/4000 [2:17:01<14:51:28, 15.62s/it]

Iteration 575: Win % 0.86


 15%|█▌        | 601/4000 [2:22:50<14:54:22, 15.79s/it]

Iteration 600: Win % 0.86


 16%|█▌        | 626/4000 [2:28:33<14:35:26, 15.57s/it]

Iteration 625: Win % 0.86


 16%|█▋        | 651/4000 [2:34:14<14:24:46, 15.49s/it]

Iteration 650: Win % 0.94
Best model at iteration 650: 0.94


 17%|█▋        | 676/4000 [2:39:55<14:18:27, 15.50s/it]

Iteration 675: Win % 0.8


 18%|█▊        | 701/4000 [2:45:32<14:09:45, 15.45s/it]

Iteration 700: Win % 0.8


 18%|█▊        | 726/4000 [2:51:12<14:06:59, 15.52s/it]

Iteration 725: Win % 0.8


 19%|█▉        | 751/4000 [2:56:52<14:02:28, 15.56s/it]

Iteration 750: Win % 0.82


 19%|█▉        | 776/4000 [3:02:32<13:59:57, 15.63s/it]

Iteration 775: Win % 0.86


 20%|██        | 801/4000 [3:08:11<13:46:23, 15.50s/it]

Iteration 800: Win % 0.84


 21%|██        | 826/4000 [3:14:01<14:58:10, 16.98s/it]

Iteration 825: Win % 0.94


 21%|██▏       | 851/4000 [3:20:04<14:29:59, 16.58s/it]

Iteration 850: Win % 0.9


 22%|██▏       | 876/4000 [3:26:09<14:15:27, 16.43s/it]

Iteration 875: Win % 0.82


 23%|██▎       | 901/4000 [3:32:11<14:12:31, 16.51s/it]

Iteration 900: Win % 0.8


 23%|██▎       | 926/4000 [3:38:11<14:03:03, 16.46s/it]

Iteration 925: Win % 0.96
Best model at iteration 925: 0.96


 24%|██▍       | 951/4000 [3:44:13<14:02:42, 16.58s/it]

Iteration 950: Win % 0.86


 24%|██▍       | 976/4000 [3:50:14<13:48:00, 16.43s/it]

Iteration 975: Win % 0.88


 25%|██▌       | 1001/4000 [3:56:16<13:52:31, 16.66s/it]

Iteration 1000: Win % 0.86


 26%|██▌       | 1026/4000 [4:02:02<12:53:46, 15.61s/it]

Iteration 1025: Win % 0.82


 26%|██▋       | 1051/4000 [4:07:44<12:47:21, 15.61s/it]

Iteration 1050: Win % 0.92


 27%|██▋       | 1076/4000 [4:13:26<12:39:18, 15.58s/it]

Iteration 1075: Win % 0.76


 28%|██▊       | 1101/4000 [4:19:04<12:33:39, 15.60s/it]

Iteration 1100: Win % 0.8


 28%|██▊       | 1126/4000 [4:24:44<12:29:01, 15.64s/it]

Iteration 1125: Win % 0.82


 29%|██▉       | 1151/4000 [4:30:22<12:14:40, 15.47s/it]

Iteration 1150: Win % 0.84


 29%|██▉       | 1176/4000 [4:36:03<12:09:32, 15.50s/it]

Iteration 1175: Win % 0.78


 30%|███       | 1201/4000 [4:41:40<11:58:50, 15.41s/it]

Iteration 1200: Win % 0.94


 31%|███       | 1226/4000 [4:47:32<13:15:14, 17.20s/it]

Iteration 1225: Win % 0.82


 31%|███▏      | 1251/4000 [4:53:40<12:37:56, 16.54s/it]

Iteration 1250: Win % 0.92


 32%|███▏      | 1276/4000 [4:59:44<12:44:52, 16.85s/it]

Iteration 1275: Win % 0.88


 33%|███▎      | 1301/4000 [5:05:44<12:18:38, 16.42s/it]

Iteration 1300: Win % 0.86


 33%|███▎      | 1326/4000 [5:11:43<12:14:02, 16.47s/it]

Iteration 1325: Win % 0.94


 34%|███▍      | 1351/4000 [5:17:43<12:08:40, 16.50s/it]

Iteration 1350: Win % 0.82


 34%|███▍      | 1376/4000 [5:23:50<12:32:51, 17.21s/it]

Iteration 1375: Win % 0.82


 35%|███▌      | 1401/4000 [5:30:00<12:03:55, 16.71s/it]

Iteration 1400: Win % 0.88


 36%|███▌      | 1426/4000 [5:36:05<11:41:31, 16.35s/it]

Iteration 1425: Win % 0.92


 36%|███▋      | 1451/4000 [5:42:32<12:55:47, 18.26s/it]

Iteration 1450: Win % 0.98
Best model at iteration 1450: 0.98


 37%|███▋      | 1476/4000 [5:49:44<13:36:42, 19.41s/it]

Iteration 1475: Win % 0.9


 38%|███▊      | 1501/4000 [5:56:46<13:23:16, 19.29s/it]

Iteration 1500: Win % 0.84


 38%|███▊      | 1526/4000 [6:03:53<13:13:52, 19.25s/it]

Iteration 1525: Win % 0.76


 39%|███▉      | 1551/4000 [6:10:36<12:05:55, 17.78s/it]

Iteration 1550: Win % 0.84


 39%|███▉      | 1576/4000 [6:17:01<11:47:04, 17.50s/it]

Iteration 1575: Win % 0.84


 40%|████      | 1601/4000 [6:23:25<10:58:43, 16.47s/it]

Iteration 1600: Win % 0.8


 41%|████      | 1626/4000 [6:29:16<10:26:10, 15.83s/it]

Iteration 1625: Win % 0.9


 41%|████▏     | 1651/4000 [6:35:05<10:23:23, 15.92s/it]

Iteration 1650: Win % 0.96


 42%|████▏     | 1676/4000 [6:40:46<10:02:01, 15.54s/it]

Iteration 1675: Win % 0.88


 43%|████▎     | 1701/4000 [6:46:26<10:17:49, 16.12s/it]

Iteration 1700: Win % 0.84


 43%|████▎     | 1726/4000 [6:52:49<11:25:38, 18.09s/it]

Iteration 1725: Win % 0.88


 44%|████▍     | 1751/4000 [6:59:15<12:02:52, 19.29s/it]

Iteration 1750: Win % 0.86


 44%|████▍     | 1776/4000 [7:06:06<11:21:33, 18.39s/it]

Iteration 1775: Win % 0.86


 45%|████▌     | 1801/4000 [7:12:50<11:01:23, 18.05s/it]

Iteration 1800: Win % 0.86


 46%|████▌     | 1826/4000 [7:19:33<10:40:40, 17.68s/it]

Iteration 1825: Win % 0.92


 46%|████▋     | 1851/4000 [7:26:00<11:02:52, 18.51s/it]

Iteration 1850: Win % 0.78


 47%|████▋     | 1876/4000 [7:32:43<10:36:52, 17.99s/it]

Iteration 1875: Win % 0.92


 48%|████▊     | 1901/4000 [7:38:56<10:13:18, 17.53s/it]

Iteration 1900: Win % 0.84


 48%|████▊     | 1926/4000 [7:45:41<10:41:45, 18.57s/it]

Iteration 1925: Win % 0.8


 49%|████▉     | 1951/4000 [7:52:01<10:00:17, 17.58s/it]

Iteration 1950: Win % 0.9


 49%|████▉     | 1976/4000 [7:58:25<9:06:25, 16.20s/it]

Iteration 1975: Win % 0.86


 50%|█████     | 2001/4000 [8:04:16<8:50:57, 15.94s/it]

Iteration 2000: Win % 0.86


 51%|█████     | 2026/4000 [8:10:06<8:40:08, 15.81s/it]

Iteration 2025: Win % 0.94


 51%|█████▏    | 2051/4000 [8:15:54<8:33:59, 15.82s/it]

Iteration 2050: Win % 0.88


 52%|█████▏    | 2076/4000 [8:21:41<8:32:19, 15.98s/it]

Iteration 2075: Win % 0.78


 53%|█████▎    | 2101/4000 [8:27:45<9:07:11, 17.29s/it]

Iteration 2100: Win % 0.86


 53%|█████▎    | 2126/4000 [8:34:06<9:12:19, 17.68s/it]

Iteration 2125: Win % 0.86


 54%|█████▍    | 2151/4000 [8:40:14<8:12:16, 15.97s/it]

Iteration 2150: Win % 0.82


 54%|█████▍    | 2176/4000 [8:46:09<8:35:10, 16.95s/it]

Iteration 2175: Win % 0.94


 55%|█████▌    | 2201/4000 [8:52:24<8:15:15, 16.52s/it]

Iteration 2200: Win % 0.9


 56%|█████▌    | 2226/4000 [8:58:29<8:23:37, 17.03s/it]

Iteration 2225: Win % 0.76


 56%|█████▋    | 2251/4000 [9:04:54<8:51:25, 18.23s/it]

Iteration 2250: Win % 0.86


 57%|█████▋    | 2276/4000 [9:11:23<8:53:37, 18.57s/it]

Iteration 2275: Win % 0.96


 58%|█████▊    | 2301/4000 [9:17:37<7:59:49, 16.94s/it]

Iteration 2300: Win % 0.84


 58%|█████▊    | 2326/4000 [9:23:54<7:49:56, 16.84s/it]

Iteration 2325: Win % 0.9


 59%|█████▉    | 2351/4000 [9:30:08<7:50:45, 17.13s/it]

Iteration 2350: Win % 0.88


 59%|█████▉    | 2376/4000 [9:36:02<7:16:02, 16.11s/it]

Iteration 2375: Win % 0.88


 60%|██████    | 2401/4000 [9:41:54<7:07:06, 16.03s/it]

Iteration 2400: Win % 0.92


 61%|██████    | 2426/4000 [9:47:43<6:50:49, 15.66s/it]

Iteration 2425: Win % 0.88


 61%|██████▏   | 2451/4000 [9:53:21<6:36:32, 15.36s/it]

Iteration 2450: Win % 0.86


 62%|██████▏   | 2476/4000 [9:58:58<6:32:03, 15.44s/it]

Iteration 2475: Win % 0.96


 63%|██████▎   | 2501/4000 [10:04:36<6:29:09, 15.58s/it]

Iteration 2500: Win % 0.86


 63%|██████▎   | 2526/4000 [10:10:15<6:20:33, 15.49s/it]

Iteration 2525: Win % 0.84


 64%|██████▍   | 2551/4000 [10:15:53<6:12:26, 15.42s/it]

Iteration 2550: Win % 0.86


 64%|██████▍   | 2576/4000 [10:21:31<6:03:03, 15.30s/it]

Iteration 2575: Win % 0.82


 65%|██████▌   | 2601/4000 [10:27:09<6:00:44, 15.47s/it]

Iteration 2600: Win % 0.94


 66%|██████▌   | 2626/4000 [10:32:48<5:54:08, 15.47s/it]

Iteration 2625: Win % 0.78


 66%|██████▋   | 2651/4000 [10:38:24<5:46:15, 15.40s/it]

Iteration 2650: Win % 0.84


 67%|██████▋   | 2676/4000 [10:44:02<5:40:58, 15.45s/it]

Iteration 2675: Win % 0.86


 68%|██████▊   | 2701/4000 [10:49:38<5:34:17, 15.44s/it]

Iteration 2700: Win % 0.92


 68%|██████▊   | 2726/4000 [10:55:15<5:29:04, 15.50s/it]

Iteration 2725: Win % 0.9


 69%|██████▉   | 2751/4000 [11:00:57<5:41:45, 16.42s/it]

Iteration 2750: Win % 0.9


 69%|██████▉   | 2776/4000 [11:07:10<5:44:22, 16.88s/it]

Iteration 2775: Win % 0.94


 70%|███████   | 2801/4000 [11:13:54<6:07:10, 18.37s/it]

Iteration 2800: Win % 0.86


 71%|███████   | 2826/4000 [11:19:58<5:17:26, 16.22s/it]

Iteration 2825: Win % 0.86


 71%|███████▏  | 2851/4000 [11:25:56<5:16:29, 16.53s/it]

Iteration 2850: Win % 0.88


 72%|███████▏  | 2876/4000 [11:31:55<5:05:36, 16.31s/it]

Iteration 2875: Win % 0.9


 73%|███████▎  | 2901/4000 [11:37:53<4:59:32, 16.35s/it]

Iteration 2900: Win % 0.92


 73%|███████▎  | 2926/4000 [11:43:42<4:49:46, 16.19s/it]

Iteration 2925: Win % 0.94


 74%|███████▍  | 2951/4000 [11:49:36<4:33:54, 15.67s/it]

Iteration 2950: Win % 0.86


 74%|███████▍  | 2976/4000 [11:55:22<4:39:19, 16.37s/it]

Iteration 2975: Win % 0.88


 75%|███████▌  | 3001/4000 [12:01:21<4:31:12, 16.29s/it]

Iteration 3000: Win % 0.84


 76%|███████▌  | 3026/4000 [12:07:01<4:10:22, 15.42s/it]

Iteration 3025: Win % 0.82


 76%|███████▋  | 3051/4000 [12:12:39<4:08:08, 15.69s/it]

Iteration 3050: Win % 0.9


 77%|███████▋  | 3076/4000 [12:18:14<3:57:53, 15.45s/it]

Iteration 3075: Win % 0.92


 78%|███████▊  | 3101/4000 [12:23:52<3:50:29, 15.38s/it]

Iteration 3100: Win % 0.88


 78%|███████▊  | 3126/4000 [12:29:51<4:00:20, 16.50s/it]

Iteration 3125: Win % 0.88


 79%|███████▉  | 3151/4000 [12:35:50<3:52:04, 16.40s/it]

Iteration 3150: Win % 0.84


 79%|███████▉  | 3176/4000 [12:41:47<3:44:56, 16.38s/it]

Iteration 3175: Win % 0.9


 80%|████████  | 3201/4000 [12:48:57<3:28:07, 15.63s/it]

Iteration 3200: Win % 0.82


 81%|████████  | 3226/4000 [12:54:52<3:33:04, 16.52s/it]

Iteration 3225: Win % 0.76


 81%|████████▏ | 3251/4000 [13:00:51<3:25:38, 16.47s/it]

Iteration 3250: Win % 0.84


 82%|████████▏ | 3276/4000 [13:06:46<3:16:01, 16.25s/it]

Iteration 3275: Win % 0.94


 83%|████████▎ | 3301/4000 [13:13:11<3:25:14, 17.62s/it]

Iteration 3300: Win % 0.88


 83%|████████▎ | 3326/4000 [13:18:55<3:11:29, 17.05s/it]

Iteration 3325: Win % 0.86


 84%|████████▍ | 3351/4000 [13:24:47<2:55:48, 16.25s/it]

Iteration 3350: Win % 0.86


 84%|████████▍ | 3376/4000 [13:30:35<2:49:07, 16.26s/it]

Iteration 3375: Win % 0.86


 85%|████████▌ | 3401/4000 [13:36:29<2:42:40, 16.30s/it]

Iteration 3400: Win % 0.82


 86%|████████▌ | 3426/4000 [13:42:23<2:38:53, 16.61s/it]

Iteration 3425: Win % 0.8


 86%|████████▋ | 3451/4000 [13:48:22<2:27:23, 16.11s/it]

Iteration 3450: Win % 0.86


 87%|████████▋ | 3476/4000 [13:54:22<2:24:18, 16.52s/it]

Iteration 3475: Win % 0.82


 88%|████████▊ | 3501/4000 [14:00:24<2:19:12, 16.74s/it]

Iteration 3500: Win % 0.88


 88%|████████▊ | 3526/4000 [14:06:32<2:25:54, 18.47s/it]

Iteration 3525: Win % 0.82


 89%|████████▉ | 3551/4000 [14:12:45<2:04:59, 16.70s/it]

Iteration 3550: Win % 0.94


 89%|████████▉ | 3576/4000 [14:18:45<1:57:37, 16.65s/it]

Iteration 3575: Win % 0.88


 90%|█████████ | 3601/4000 [14:24:46<1:51:02, 16.70s/it]

Iteration 3600: Win % 0.82


 91%|█████████ | 3626/4000 [14:31:13<1:47:37, 17.27s/it]

Iteration 3625: Win % 0.82


 91%|█████████▏| 3651/4000 [14:37:05<1:37:47, 16.81s/it]

Iteration 3650: Win % 0.84


 92%|█████████▏| 3676/4000 [14:43:23<1:33:24, 17.30s/it]

Iteration 3675: Win % 0.84


 93%|█████████▎| 3701/4000 [14:49:57<1:27:11, 17.50s/it]

Iteration 3700: Win % 0.82


 93%|█████████▎| 3726/4000 [14:56:17<1:19:00, 17.30s/it]

Iteration 3725: Win % 0.88


 94%|█████████▍| 3751/4000 [15:02:09<1:03:39, 15.34s/it]

Iteration 3750: Win % 0.92


 94%|█████████▍| 3776/4000 [15:07:43<56:53, 15.24s/it]

Iteration 3775: Win % 0.82


 95%|█████████▌| 3801/4000 [15:13:17<50:30, 15.23s/it]

Iteration 3800: Win % 0.86


 96%|█████████▌| 3826/4000 [15:18:50<45:08, 15.57s/it]

Iteration 3825: Win % 0.92


 96%|█████████▋| 3851/4000 [15:24:23<37:43, 15.19s/it]

Iteration 3850: Win % 0.9


 97%|█████████▋| 3876/4000 [15:29:56<31:48, 15.39s/it]

Iteration 3875: Win % 0.94


 98%|█████████▊| 3901/4000 [15:35:29<25:07, 15.23s/it]

Iteration 3900: Win % 0.86


 98%|█████████▊| 3926/4000 [15:41:00<18:39, 15.13s/it]

Iteration 3925: Win % 0.84


 99%|█████████▉| 3951/4000 [15:46:33<12:32, 15.35s/it]

Iteration 3950: Win % 0.88


 99%|█████████▉| 3976/4000 [15:52:10<06:08, 15.34s/it]

Iteration 3975: Win % 0.8


100%|██████████| 4000/4000 [15:57:22<00:00, 14.36s/it]

MAX WINS: 0.98. In iteration: 1450





In [68]:
# Save the result of the train_randomly=False, sum_board=False run as CSV; the
# file name encodes the run's settings (lr=0.0001, epochs=2,
# entropy_loss=10e-4, batch_size=64).
# NOTE(review): assumes `not_random_simple_reward` is a pandas DataFrame and
# that the `models/` directory already exists — confirm.
not_random_simple_reward.to_csv('models/not_random_sumboardFalse_0001lr_2epoch_10e4entropy_64bs.csv')

In [67]:
# Plot the `wins` and `entropy` series of the train_randomly=False,
# sum_board=False run against `iteration`.
plot_wins_entropy(not_random_simple_reward)

# Training with reward 1 or -1; once the evaluation win rate reaches 85%, it switches to predicting with the model itself

In [63]:
# Run with sum_board=True and train_randomly=False; the result of run() is
# kept as `not_random_board_reward`.
# NOTE(review): the heading above this cell describes a "reward 1 or -1"
# setup, but this is the only run with sum_board=True — confirm which reward
# this run actually uses.
# NOTE(review): the recorded output of this cell shows a 6000-iteration
# progress bar while the code requests iterations=4000, so the saved output
# appears to come from a different revision of this cell — re-run to confirm.
PPO_agent = PPO(
    board_shape=8,
    n_experience_episodes=70,
    iterations=4000,
    batch_size=64,
    epochs=2,
    entropy_loss=10e-4,  # 10e-4 == 1e-3 (0.001)
    eval_period=50,
    sum_board=True,
    train_randomly=False,
    lr=0.0001,
    algorithm='PPO',
)

not_random_board_reward = PPO_agent.run()


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.

  0%|          | 1/6000 [00:28<47:05:20, 28.26s/it]

Iteration 0: Win % 0.6
Best model at iteration 0: 0.6


  0%|          | 26/6000 [06:17<26:36:07, 16.03s/it]

Iteration 25: Win % 0.66
Best model at iteration 25: 0.66


  1%|          | 51/6000 [12:05<26:40:18, 16.14s/it]

Iteration 50: Win % 0.66


  1%|▏         | 76/6000 [17:39<24:11:27, 14.70s/it]

Iteration 75: Win % 0.72
Best model at iteration 75: 0.72


  2%|▏         | 101/6000 [22:57<24:03:57, 14.69s/it]

Iteration 100: Win % 0.74
Best model at iteration 100: 0.74


  2%|▏         | 126/6000 [28:16<24:00:26, 14.71s/it]

Iteration 125: Win % 0.84
Best model at iteration 125: 0.84


  3%|▎         | 151/6000 [33:40<24:02:29, 14.80s/it]

Iteration 150: Win % 0.82


  3%|▎         | 176/6000 [39:02<24:13:02, 14.97s/it]

Iteration 175: Win % 0.82


  3%|▎         | 201/6000 [44:28<24:37:36, 15.29s/it]

Iteration 200: Win % 0.7


  4%|▍         | 226/6000 [49:54<24:25:41, 15.23s/it]

Iteration 225: Win % 0.72


  4%|▍         | 251/6000 [55:25<24:37:18, 15.42s/it]

Iteration 250: Win % 0.8


  5%|▍         | 276/6000 [1:00:59<24:31:09, 15.42s/it]

Iteration 275: Win % 0.78


  5%|▌         | 301/6000 [1:06:31<24:22:44, 15.40s/it]

Iteration 300: Win % 0.8


  5%|▌         | 326/6000 [1:12:06<24:23:02, 15.47s/it]

Iteration 325: Win % 0.8


  6%|▌         | 351/6000 [1:17:41<24:18:17, 15.49s/it]

Iteration 350: Win % 0.84


  6%|▋         | 376/6000 [1:23:16<23:58:24, 15.35s/it]

Iteration 375: Win % 0.86
Best model at iteration 375: 0.86


  7%|▋         | 401/6000 [1:28:52<24:11:02, 15.55s/it]

Iteration 400: Win % 0.8


  7%|▋         | 426/6000 [1:34:30<24:12:48, 15.64s/it]

Iteration 425: Win % 0.82


  8%|▊         | 451/6000 [1:40:07<23:46:57, 15.43s/it]

Iteration 450: Win % 0.72


  8%|▊         | 476/6000 [1:45:45<23:54:09, 15.58s/it]

Iteration 475: Win % 0.84


  8%|▊         | 501/6000 [1:51:22<23:39:05, 15.48s/it]

Iteration 500: Win % 0.9
Best model at iteration 500: 0.9


  9%|▉         | 526/6000 [1:57:01<23:40:26, 15.57s/it]

Iteration 525: Win % 0.8


  9%|▉         | 551/6000 [2:02:44<23:48:50, 15.73s/it]

Iteration 550: Win % 0.94
Best model at iteration 550: 0.94


 10%|▉         | 576/6000 [2:08:24<23:44:06, 15.75s/it]

Iteration 575: Win % 0.9


 10%|█         | 601/6000 [2:14:05<23:25:38, 15.62s/it]

Iteration 600: Win % 0.9


 10%|█         | 626/6000 [2:19:46<23:25:56, 15.70s/it]

Iteration 625: Win % 0.9


 11%|█         | 651/6000 [2:25:28<23:18:55, 15.69s/it]

Iteration 650: Win % 0.92


 11%|█▏        | 676/6000 [2:31:08<23:14:23, 15.71s/it]

Iteration 675: Win % 0.86


 12%|█▏        | 701/6000 [2:36:50<23:09:04, 15.73s/it]

Iteration 700: Win % 0.84


 12%|█▏        | 726/6000 [2:42:31<23:00:58, 15.71s/it]

Iteration 725: Win % 0.76


 13%|█▎        | 751/6000 [2:48:16<23:37:35, 16.20s/it]

Iteration 750: Win % 0.88


 13%|█▎        | 776/6000 [2:54:33<25:16:15, 17.41s/it]

Iteration 775: Win % 0.94


 13%|█▎        | 801/6000 [3:00:50<24:57:49, 17.29s/it]

Iteration 800: Win % 0.88


 14%|█▍        | 826/6000 [3:06:46<22:32:02, 15.68s/it]

Iteration 825: Win % 0.9


 14%|█▍        | 851/6000 [3:12:33<22:47:19, 15.93s/it]

Iteration 850: Win % 0.92


 15%|█▍        | 876/6000 [3:18:16<22:25:25, 15.75s/it]

Iteration 875: Win % 0.76


 15%|█▌        | 901/6000 [3:24:01<22:33:13, 15.92s/it]

Iteration 900: Win % 0.78


 15%|█▌        | 926/6000 [3:29:43<22:21:35, 15.86s/it]

Iteration 925: Win % 0.8


 16%|█▌        | 951/6000 [3:35:26<22:13:03, 15.84s/it]

Iteration 950: Win % 0.82


 16%|█▋        | 976/6000 [3:41:35<25:20:04, 18.15s/it]

Iteration 975: Win % 0.88


 17%|█▋        | 1001/6000 [3:47:42<23:12:06, 16.71s/it]

Iteration 1000: Win % 0.94


 17%|█▋        | 1026/6000 [3:53:45<23:11:21, 16.78s/it]

Iteration 1025: Win % 0.96
Best model at iteration 1025: 0.96


 18%|█▊        | 1051/6000 [3:59:48<23:03:46, 16.78s/it]

Iteration 1050: Win % 0.84


 18%|█▊        | 1076/6000 [4:05:49<22:46:27, 16.65s/it]

Iteration 1075: Win % 0.9


 18%|█▊        | 1101/6000 [4:11:50<22:40:59, 16.67s/it]

Iteration 1100: Win % 0.84


 19%|█▉        | 1126/6000 [4:17:51<22:25:50, 16.57s/it]

Iteration 1125: Win % 0.86


 19%|█▉        | 1151/6000 [4:23:51<22:20:17, 16.58s/it]

Iteration 1150: Win % 0.94


 20%|█▉        | 1176/6000 [4:29:51<22:21:20, 16.68s/it]

Iteration 1175: Win % 0.9


 20%|██        | 1201/6000 [4:35:51<22:07:31, 16.60s/it]

Iteration 1200: Win % 0.9


 20%|██        | 1226/6000 [4:41:51<22:00:10, 16.59s/it]

Iteration 1225: Win % 0.94


 21%|██        | 1251/6000 [4:47:53<21:56:46, 16.64s/it]

Iteration 1250: Win % 0.9


 21%|██▏       | 1276/6000 [4:53:54<21:21:19, 16.27s/it]

Iteration 1275: Win % 0.94


 22%|██▏       | 1301/6000 [4:59:53<21:58:17, 16.83s/it]

Iteration 1300: Win % 0.88


 22%|██▏       | 1326/6000 [5:05:55<21:35:23, 16.63s/it]

Iteration 1325: Win % 0.94


 23%|██▎       | 1351/6000 [5:11:56<21:27:13, 16.61s/it]

Iteration 1350: Win % 0.96


 23%|██▎       | 1376/6000 [5:17:56<21:16:03, 16.56s/it]

Iteration 1375: Win % 0.94


 23%|██▎       | 1401/6000 [5:23:56<21:13:15, 16.61s/it]

Iteration 1400: Win % 0.9


 24%|██▍       | 1426/6000 [5:29:49<19:56:43, 15.70s/it]

Iteration 1425: Win % 0.94


 24%|██▍       | 1451/6000 [5:35:30<19:48:14, 15.67s/it]

Iteration 1450: Win % 0.86


 25%|██▍       | 1476/6000 [5:41:19<20:55:32, 16.65s/it]

Iteration 1475: Win % 0.92


 25%|██▌       | 1501/6000 [5:47:20<20:46:20, 16.62s/it]

Iteration 1500: Win % 0.88


 25%|██▌       | 1526/6000 [5:53:19<22:18:46, 17.95s/it]

Iteration 1525: Win % 0.86


 26%|██▌       | 1551/6000 [5:59:05<19:35:38, 15.85s/it]

Iteration 1550: Win % 0.9


 26%|██▋       | 1576/6000 [6:04:48<19:37:49, 15.97s/it]

Iteration 1575: Win % 0.88


 27%|██▋       | 1601/6000 [6:10:31<19:26:35, 15.91s/it]

Iteration 1600: Win % 0.9


 27%|██▋       | 1626/6000 [6:16:15<19:14:17, 15.83s/it]

Iteration 1625: Win % 0.88


 28%|██▊       | 1651/6000 [6:21:58<19:06:02, 15.81s/it]

Iteration 1650: Win % 0.9


 28%|██▊       | 1676/6000 [6:27:40<18:56:08, 15.77s/it]

Iteration 1675: Win % 0.9


 28%|██▊       | 1701/6000 [6:33:25<18:54:51, 15.84s/it]

Iteration 1700: Win % 0.86


 29%|██▉       | 1726/6000 [6:39:08<18:43:46, 15.78s/it]

Iteration 1725: Win % 0.92


 29%|██▉       | 1751/6000 [6:44:53<19:01:21, 16.12s/it]

Iteration 1750: Win % 0.84


 30%|██▉       | 1776/6000 [6:50:38<18:36:26, 15.86s/it]

Iteration 1775: Win % 0.86


 30%|███       | 1801/6000 [6:56:23<18:36:30, 15.95s/it]

Iteration 1800: Win % 0.8


 30%|███       | 1826/6000 [7:02:08<18:16:25, 15.76s/it]

Iteration 1825: Win % 0.88


 31%|███       | 1851/6000 [7:07:52<18:17:27, 15.87s/it]

Iteration 1850: Win % 0.98
Best model at iteration 1850: 0.98


 31%|███▏      | 1876/6000 [7:13:35<18:08:47, 15.84s/it]

Iteration 1875: Win % 0.92


 32%|███▏      | 1901/6000 [7:19:18<18:01:49, 15.84s/it]

Iteration 1900: Win % 0.92


 32%|███▏      | 1926/6000 [7:25:02<17:56:48, 15.86s/it]

Iteration 1925: Win % 0.9


 33%|███▎      | 1951/6000 [7:30:45<17:39:06, 15.69s/it]

Iteration 1950: Win % 0.9


 33%|███▎      | 1976/6000 [7:36:28<17:36:38, 15.76s/it]

Iteration 1975: Win % 0.86


 33%|███▎      | 2001/6000 [7:42:10<17:26:31, 15.70s/it]

Iteration 2000: Win % 0.96


 34%|███▍      | 2026/6000 [13:52:49<28:13:28, 25.57s/it]

Iteration 2025: Win % 0.84


 34%|███▍      | 2051/6000 [13:58:34<17:26:03, 15.89s/it]

Iteration 2050: Win % 0.9


 35%|███▍      | 2076/6000 [14:04:15<17:10:04, 15.75s/it]

Iteration 2075: Win % 0.96


 35%|███▌      | 2101/6000 [14:09:56<17:02:29, 15.73s/it]

Iteration 2100: Win % 0.88


 35%|███▌      | 2126/6000 [14:15:38<16:57:31, 15.76s/it]

Iteration 2125: Win % 0.82


 36%|███▌      | 2151/6000 [14:21:19<16:47:55, 15.71s/it]

Iteration 2150: Win % 0.92


 36%|███▋      | 2176/6000 [14:27:02<16:43:19, 15.74s/it]

Iteration 2175: Win % 0.88


 37%|███▋      | 2201/6000 [14:32:43<16:35:21, 15.72s/it]

Iteration 2200: Win % 0.9


 37%|███▋      | 2226/6000 [14:38:24<16:25:35, 15.67s/it]

Iteration 2225: Win % 0.96


 38%|███▊      | 2251/6000 [14:44:05<16:21:22, 15.71s/it]

Iteration 2250: Win % 0.88


 38%|███▊      | 2276/6000 [14:49:50<16:30:18, 15.96s/it]

Iteration 2275: Win % 0.92


 38%|███▊      | 2301/6000 [14:55:33<16:14:06, 15.80s/it]

Iteration 2300: Win % 1.0
Best model at iteration 2300: 1.0


 39%|███▉      | 2326/6000 [15:01:56<19:00:16, 18.62s/it]

Iteration 2325: Win % 0.86


 39%|███▉      | 2351/6000 [15:08:28<18:28:10, 18.22s/it]

Iteration 2350: Win % 0.8


 40%|███▉      | 2376/6000 [15:17:03<24:45:05, 24.59s/it]

Iteration 2375: Win % 0.86


 40%|████      | 2401/6000 [15:25:51<25:48:12, 25.81s/it]

Iteration 2400: Win % 0.9


 40%|████      | 2426/6000 [15:35:16<26:48:44, 27.01s/it]

Iteration 2425: Win % 0.88


 41%|████      | 2451/6000 [15:45:00<29:14:06, 29.66s/it]

Iteration 2450: Win % 0.9


 41%|████▏     | 2476/6000 [15:53:38<23:29:16, 23.99s/it]

Iteration 2475: Win % 0.92


 42%|████▏     | 2501/6000 [16:02:13<18:54:10, 19.45s/it]

Iteration 2500: Win % 0.94


 42%|████▏     | 2526/6000 [16:09:01<18:48:41, 19.49s/it]

Iteration 2525: Win % 0.92


 43%|████▎     | 2551/6000 [16:16:21<19:09:19, 19.99s/it]

Iteration 2550: Win % 0.88


 43%|████▎     | 2576/6000 [16:22:49<16:36:58, 17.47s/it]

Iteration 2575: Win % 0.84


 43%|████▎     | 2601/6000 [16:29:18<16:51:25, 17.85s/it]

Iteration 2600: Win % 0.98


 44%|████▍     | 2626/6000 [16:35:08<14:56:40, 15.95s/it]

Iteration 2625: Win % 0.94


 44%|████▍     | 2651/6000 [16:40:55<14:57:18, 16.08s/it]

Iteration 2650: Win % 0.9


 45%|████▍     | 2676/6000 [16:46:42<14:32:13, 15.74s/it]

Iteration 2675: Win % 0.88


 45%|████▌     | 2701/6000 [16:52:25<14:37:53, 15.97s/it]

Iteration 2700: Win % 0.78


 45%|████▌     | 2726/6000 [16:58:36<15:39:39, 17.22s/it]

Iteration 2725: Win % 0.82


 46%|████▌     | 2751/6000 [17:04:32<14:37:16, 16.20s/it]

Iteration 2750: Win % 0.94


 46%|████▋     | 2776/6000 [17:10:27<15:10:09, 16.94s/it]

Iteration 2775: Win % 0.88


 47%|████▋     | 2801/6000 [17:16:25<14:11:33, 15.97s/it]

Iteration 2800: Win % 0.92


 47%|████▋     | 2826/6000 [17:22:09<13:53:59, 15.77s/it]

Iteration 2825: Win % 0.96


 48%|████▊     | 2851/6000 [17:27:55<13:48:15, 15.78s/it]

Iteration 2850: Win % 0.88


 48%|████▊     | 2876/6000 [17:33:37<13:41:07, 15.77s/it]

Iteration 2875: Win % 0.94


 48%|████▊     | 2901/6000 [17:39:18<13:31:24, 15.71s/it]

Iteration 2900: Win % 0.82


 49%|████▉     | 2926/6000 [17:44:58<13:23:31, 15.68s/it]

Iteration 2925: Win % 0.8


 49%|████▉     | 2951/6000 [17:50:38<13:18:27, 15.71s/it]

Iteration 2950: Win % 0.9


 50%|████▉     | 2976/6000 [17:56:21<13:12:51, 15.73s/it]

Iteration 2975: Win % 0.88


 50%|█████     | 3001/6000 [18:02:01<13:03:14, 15.67s/it]

Iteration 3000: Win % 0.94


 50%|█████     | 3026/6000 [18:07:57<13:23:13, 16.20s/it]

Iteration 3025: Win % 0.88


 51%|█████     | 3051/6000 [18:13:43<13:04:34, 15.96s/it]

Iteration 3050: Win % 0.82


 51%|█████▏    | 3076/6000 [18:19:27<12:53:36, 15.87s/it]

Iteration 3075: Win % 0.92


 52%|█████▏    | 3101/6000 [18:25:10<12:40:03, 15.73s/it]

Iteration 3100: Win % 0.96


 52%|█████▏    | 3126/6000 [18:30:50<12:30:24, 15.67s/it]

Iteration 3125: Win % 0.94


 53%|█████▎    | 3151/6000 [18:36:30<12:23:41, 15.66s/it]

Iteration 3150: Win % 0.9


 53%|█████▎    | 3176/6000 [18:42:10<12:12:16, 15.56s/it]

Iteration 3175: Win % 0.94


 53%|█████▎    | 3201/6000 [18:47:48<12:10:34, 15.66s/it]

Iteration 3200: Win % 0.88


 54%|█████▍    | 3226/6000 [18:53:31<12:08:23, 15.75s/it]

Iteration 3225: Win % 0.88


 54%|█████▍    | 3251/6000 [18:59:13<12:04:46, 15.82s/it]

Iteration 3250: Win % 0.84


 55%|█████▍    | 3276/6000 [19:04:53<11:54:37, 15.74s/it]

Iteration 3275: Win % 0.96


 55%|█████▌    | 3301/6000 [19:10:33<11:44:38, 15.66s/it]

Iteration 3300: Win % 0.9


 55%|█████▌    | 3326/6000 [19:16:13<11:37:46, 15.66s/it]

Iteration 3325: Win % 0.94


 56%|█████▌    | 3351/6000 [19:21:53<11:32:25, 15.68s/it]

Iteration 3350: Win % 0.96


 56%|█████▋    | 3376/6000 [19:27:50<12:37:33, 17.32s/it]

Iteration 3375: Win % 0.92


 57%|█████▋    | 3401/6000 [19:34:18<12:42:02, 17.59s/it]

Iteration 3400: Win % 0.82


 57%|█████▋    | 3426/6000 [19:40:41<12:32:25, 17.54s/it]

Iteration 3425: Win % 0.86


 58%|█████▊    | 3451/6000 [19:47:05<12:28:33, 17.62s/it]

Iteration 3450: Win % 0.94


 58%|█████▊    | 3476/6000 [19:53:28<12:22:09, 17.64s/it]

Iteration 3475: Win % 0.9


 58%|█████▊    | 3501/6000 [19:59:43<11:44:42, 16.92s/it]

Iteration 3500: Win % 0.92


 59%|█████▉    | 3526/6000 [20:05:50<11:35:19, 16.86s/it]

Iteration 3525: Win % 0.9


 59%|█████▉    | 3551/6000 [20:12:03<11:23:24, 16.74s/it]

Iteration 3550: Win % 0.88


 60%|█████▉    | 3576/6000 [20:18:01<11:10:55, 16.61s/it]

Iteration 3575: Win % 0.84


 60%|██████    | 3601/6000 [20:23:48<10:32:57, 15.83s/it]

Iteration 3600: Win % 0.84


 60%|██████    | 3626/6000 [20:29:55<11:39:18, 17.67s/it]

Iteration 3625: Win % 0.92


 61%|██████    | 3651/6000 [20:36:03<11:21:06, 17.40s/it]

Iteration 3650: Win % 0.9


 61%|██████▏   | 3676/6000 [20:42:10<10:49:59, 16.78s/it]

Iteration 3675: Win % 0.86


 62%|██████▏   | 3701/6000 [20:48:35<11:36:03, 18.17s/it]

Iteration 3700: Win % 0.9


 62%|██████▏   | 3726/6000 [20:55:13<11:10:10, 17.68s/it]

Iteration 3725: Win % 0.88


 63%|██████▎   | 3751/6000 [21:01:27<10:43:05, 17.16s/it]

Iteration 3750: Win % 0.8


 63%|██████▎   | 3776/6000 [21:07:31<10:20:32, 16.74s/it]

Iteration 3775: Win % 0.88


 63%|██████▎   | 3801/6000 [21:13:43<10:11:26, 16.68s/it]

Iteration 3800: Win % 0.76


 64%|██████▍   | 3826/6000 [21:19:44<10:01:05, 16.59s/it]

Iteration 3825: Win % 0.88


 64%|██████▍   | 3832/6000 [21:21:18<12:04:55, 20.06s/it]


KeyboardInterrupt: 

In [49]:
# Reload the logged metrics of a previous run from disk.
model2 = pd.read_csv(
    'models/not_random_sumboardFalse_0001lr_1epoch_10e4entropy_64bs.csv'
)

In [50]:
# Scatter the `wins` and `entropy` columns of model2 against `iteration`.
plot_wins_entropy(model2)

## Comparison between fully random training and switching to argmax once evaluation reaches 85%

In [51]:
# Overlay the per-iteration win rates of model1 (legend 'First') and model2 (legend 'Second').
plot_wins_comparison(model1, model2)

# SumBoard reward, 1 epoch, entropy loss 10e-4

In [62]:
# Reload the logged metrics for this configuration and plot wins vs entropy.
model3 = pd.read_csv(
    'models/not_random_sumboardTrue_0001lr_1epoch_10e4entropy_64bs.csv'
)
plot_wins_entropy(model3)

# SumBoard reward, 2 epochs, entropy loss 1e-4

In [58]:
# Reload the logged metrics for this configuration and plot wins vs entropy.
model4 = pd.read_csv(
    'models/not_random_sumboardTrue_0001lr_2epoch_1e4entropy_64bs.csv'
)
plot_wins_entropy(model4)

# SumBoard reward, 2 epochs, entropy loss 1e-2

In [60]:
# Reload the logged metrics for this configuration and plot wins vs entropy.
model5 = pd.read_csv(
    'models/not_random_sumboardTrue_0001lr_2epoch_1e2entropy_64bs.csv'
)
plot_wins_entropy(model5)

# Comparison between entropy loss 1e-4 and 1e-2

In [61]:
# Overlay the per-iteration win rates of model4 (legend 'First') and model5 (legend 'Second').
plot_wins_comparison(model4, model5)

In [27]:
# Export the current actor and critic networks held by PPO_agent.
# NOTE(review): these paths have no `.h5` suffix and the log below shows an
# `assets` directory being written, while the following cell loads
# 'models/PPO_actor_h5.h5' — confirm both refer to the intended artifacts.
PPO_agent.actor_model.save('PPO_actor_h5')
PPO_agent.critic_model.save('PPO_critic_h5')

INFO:tensorflow:Assets written to: PPO_actor_h5\assets
INFO:tensorflow:Assets written to: PPO_critic_h5\assets


In [28]:
# Load a saved actor network; compile=False means the model is not compiled after loading.
# NOTE(review): the evaluation loop further down calls `model.predict`, not
# `model_secundario.predict` — confirm which model is meant to be evaluated.
model_secundario = tf.keras.models.load_model('models/PPO_actor_h5.h5', compile=False)

In [29]:
# Zero-filled placeholder arrays passed alongside the board when calling predict.
DUMMY_VAL = np.zeros((1, 1))
DUMMY_ACT = np.zeros((1, 64))

In [13]:
def encode_action(action, board_size=8):
    """Convert a flat action index into ``[row, col]`` board coordinates.

    Parameters
    ----------
    action : int
        Index into the flattened board.
    board_size : int, optional
        Number of columns per board row. Defaults to 8, the value that was
        previously hard-coded, so existing single-argument calls are unchanged.

    Returns
    -------
    list
        ``[action // board_size, action % board_size]``.
    """
    row, col = divmod(action, board_size)
    return [row, col]

def get_actions_mask(board):
    """Return the flattened valid-move array for player 1 on ``board``.

    Queries the module-level ``env`` via ``env.get_valid((board, 1))`` and
    flattens the result to one dimension.
    """
    # The mask is always computed from player 1's point of view.
    return env.get_valid((board, 1)).reshape(-1)

# Evaluate the greedy policy (moves taken from `model`, using the player-1
# valid-move mask) against an opponent that picks uniformly among its valid
# moves (player -1): N_ROUNDS rounds of EPISODES games each.
# NOTE(review): `model` is not defined in the nearby cells (the network loaded
# above is named `model_secundario`) — confirm it exists on a fresh kernel.
board_shape = 8
env = ReversiEnv(board_shape=board_shape)
EPISODES = 100
N_ROUNDS = 20
total_wins_game = []   # one list of final rewards per round
total_wins_count = []  # final rewards of every game, flattened across rounds
for _round in tqdm(range(N_ROUNDS)):
    episode = 0
    (board, player) = env.reset()
    wins = []
    while episode < EPISODES:

        # Agent move: the legal action with the highest predicted probability.
        p = model.predict([board.reshape(1, 64), DUMMY_VAL, DUMMY_ACT])
        valid_actions = get_actions_mask(board.reshape(board_shape, board_shape))
        mask_prob = p.reshape(valid_actions.shape) * valid_actions
        if np.sum(mask_prob) > 0:
            # Normalising by a positive constant would not change the argmax,
            # so the masked probabilities are used directly.
            action = np.argmax(mask_prob)
        else:
            # Every legal move received probability 0. Dividing by the zero
            # sum would produce NaNs and argmax would return index 0
            # regardless of legality, so fall back to the first legal move.
            action = np.argmax(valid_actions)
        action = encode_action(action)
        (board, player), reward, done, _ = env.step(action)

        # Opponent moves (possibly several in a row) until it is the agent's
        # turn again or the game ends.
        while not done and (player == -1):
            valid_actions = np.argwhere(env.get_valid((board, player)) == 1)
            action = valid_actions[np.random.randint(len(valid_actions))]
            (board, player), reward, done, _ = env.step(action)

        if done:
            episode += 1
            # The final reward is recorded as-is (a reward of 0 stays 0).
            wins.append(reward)
            total_wins_count.append(reward)
            (board, player) = env.reset()

    total_wins_game.append(wins)

100%|██████████| 20/20 [05:57<00:00, 17.89s/it]


In [14]:
# Count, for each evaluation round, the games whose final reward was 1.
conter_win = [Counter(wins)[1] for wins in total_wins_game]

In [15]:
# Display the per-round counts of games that ended with reward 1.
conter_win

[88,
 86,
 87,
 90,
 94,
 85,
 95,
 94,
 92,
 87,
 89,
 87,
 92,
 93,
 88,
 92,
 90,
 88,
 87,
 85]