Copyright **`(c)`** 2022 Giovanni Squillero `<squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  


# Lab 3: Policy Search

## Task

Write agents able to play [*Nim*](https://en.wikipedia.org/wiki/Nim), with an arbitrary number of rows and an upper bound $k$ on the number of objects that can be removed in a turn (a.k.a., *subtraction game*).

The player **taking the last object wins**.

* Task3.1: An agent using fixed rules based on *nim-sum* (i.e., an *expert system*)
* Task3.2: An agent using evolved rules
* Task3.3: An agent using minmax
* Task3.4: An agent using reinforcement learning

## Instructions

* Create the directory `lab3` inside the course repo 
* Put a `README.md` and your solution (all the files, code and auxiliary data if needed)


In [817]:
import logging
from collections import namedtuple
import random
from typing import Callable
from copy import deepcopy
from itertools import accumulate
from operator import xor

## The *Nim* and *Nimply* classes

In [818]:
Nimply = namedtuple("Nimply", "row, num_objects")

In [819]:
class Nim:
    def __init__(self, num_rows: int, k: int = None) -> None:
        self._rows = [i * 2 + 1 for i in range(num_rows)]
        self._k = k

    def __bool__(self):
        return sum(self._rows) > 0

    def __str__(self):
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    @property
    def rows(self) -> tuple:
        return tuple(self._rows)

    @property
    def k(self) -> int:
        return self._k

    def nimming(self, ply: Nimply) -> None:
        row, num_objects = ply
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k
        self._rows[row] -= num_objects
    
    def state(self): 
        return

    


## Sample (and silly) startegies [HARDCODED]

In [820]:
def nim_sum(state: Nim) -> int:
    *_, result = accumulate(state.rows, xor)
    return result

def cook_status(state: Nim) -> dict:
    cooked = dict()
    cooked["possible_moves"] = [
        (r, o) for r, c in enumerate(state.rows) for o in range(1, c + 1) if state.k is None or o <= state.k
    ]
    cooked["active_rows_number"] = sum(o > 0 for o in state.rows)
    cooked["shortest_row"] = min((x for x in enumerate(state.rows) if x[1] > 0), key=lambda y: y[1])[0]
    cooked["longest_row"] = max((x for x in enumerate(state.rows)), key=lambda y: y[1])[0]
    cooked["nim_sum"] = nim_sum(state)

    brute_force = list()
    for m in cooked["possible_moves"]:
        tmp = deepcopy(state)
        tmp.nimming(m)
        brute_force.append((m, nim_sum(tmp)))
    cooked["brute_force"] = brute_force

    return cooked

def get_state(state: Nim):
    pass 

def possible_moves(state: Nim):
    try:
        data = cook_status(state=state)["possible_moves"]
        state_dict = {} #   Row: -> [nobj_1, nobj_2, ...]

        for move in data:
            row = move[0]
            nobj = move[1]
            if row in state_dict:
                state_dict[row].append(nobj)
            else: 
                state_dict[row] = [nobj]
        return state_dict

    except:
        print("Game Ended")
        return [-1]

In [821]:
#Global variable for semplicity
turn = 0

def fast(state: Nim) -> Nimply:
    """
    Take the max elements if its turn is its even one otherwise it will take one random element
    """

    data = cook_status(state)

    if (turn // 2)%2 == 0:

        #Take max from the row with most elements
        row = data["longest_row"]

        if state.k is None:
            num_objects = state.rows[row]
        else:
            if state.rows[row] > state.k:
                num_objects = state.k
            else: 
                num_objects = state.rows[row]
    else:
        #Take 1 single element from a random row
        row = random.choice([r for r, c in enumerate(state.rows) if c > 0])
        num_objects = 1
        
    return Nimply(row, num_objects)

In [822]:
def optimal_startegy(state: Nim) -> Nimply:
    data = cook_status(state)
    return next((bf for bf in data["brute_force"] if bf[1] == 0), random.choice(data["brute_force"]))[0]

In [823]:
NUM_MATCHES = 10
NIM_SIZE = 10

def evaluate(strategy: Callable) -> float:
    opponent = (strategy, fast)
    won = 0

    for m in range(NUM_MATCHES):
        nim = Nim(NIM_SIZE)
        player = 0
        while nim:
            ply = opponent[player](nim)
            nim.nimming(ply)
            player = 1 - player
        if player == 1:
            won += 1
    return won / NUM_MATCHES


# RL Agent def


In [837]:
import numpy as np

class Agent(object):
    def __init__(self, dim_nim, alpha=0.15, random_factor=0.2):  # 80% explore, 20% exploit

        self.state_history = [(Nim(dim_nim).rows, (0,0), 0)]  # state, action, reward
        self.alpha = alpha
        self.random_factor = random_factor
        self.G = {}
        init_state = Nim(dim_nim)
        self.init_reward(init_state)

    def init_reward(self, init_state):
        """for i, row in enumerate(states):
            for j, col in enumerate(row):
                self.G[(j, i)] = np.random.uniform(low=1.0, high=0.1)
        """
        
        allowed_moves = cook_status(init_state)["possible_moves"]
        action_and_val = {}

        for move in allowed_moves:
            g = np.random.uniform(low=7.0, high=0.5)
            action_and_val[move] = g
        
        GAME_ENDED = tuple( 0 for i in range(len(init_state.rows)))
        self.G[init_state.rows] = action_and_val
        self.G[GAME_ENDED] = -1
                
        
    def choose_action(self, state, allowedMoves) -> Nimply:
        maxG = -10e15
        next_move = None

        if state.rows not in self.G:
        
            #print("Never seen this before... Random?")
            self.init_reward(state)
            next_move = random.choice(allowedMoves)

        else:
            #print("Already seen,")
            if random.random() < self.random_factor:
                # if random number below random factor, choose random action
                #print("lets try random:")
                next_move = random.choice(allowedMoves)
            else:
                # if exploiting, gather all possible actions and choose one with the highest G (reward)
                if state.rows in self.G:
                    #print("I KNOW WHAT TO DO!:")
                    actions_and_val = self.G[state.rows]

                    for action in actions_and_val:
                        g = actions_and_val[action]
                        if g >= maxG:
                            next_move = action
                            maxG = g
          
        #print(next_move)
        return Nimply(next_move[0], next_move[1])


    def update_state_history(self, state, action_taken, reward):
        self.state_history.append((state.rows, action_taken, reward))

    def learn(self):
        target = 0

        for a in reversed(self.state_history):
            #print(a)
            try:
                st, act, rew = a
                # print("s",st, "a", act, "r", rew)
                possible_moves_values = self.G[st]

                if possible_moves_values == -1:
                    continue

                g = possible_moves_values[act]
                g = g + self.alpha * (target -g) #   + o - ?
                target += rew
            except:
                #print("Something happened...(?) Nevermind")
                break
                
        self.state_history = []

        self.random_factor -= 10e-5  # decrease random factor each episode of play


## Oversimplified match

In [825]:
logging.getLogger().setLevel(logging.DEBUG)

strategy = (fast, optimal_startegy)
nim = Nim(11)

#print(possible_moves(nim))
start_setup = possible_moves(nim)

robot = Agent(nim)

"""logging.debug(f"status: Initial board  -> {nim}")
player = 0

turn = 0
while nim:
    ply = strategy[player](nim)
    nim.nimming(ply)
    #logging.debug(f"[{turn}] - status: After player {player} -> {nim}")
    player = 1 - player
    turn += 1
winner = 1 - player
logging.info(f"status: Player {winner} won!")
"""

'logging.debug(f"status: Initial board  -> {nim}")\nplayer = 0\n\nturn = 0\nwhile nim:\n    ply = strategy[player](nim)\n    nim.nimming(ply)\n    #logging.debug(f"[{turn}] - status: After player {player} -> {nim}")\n    player = 1 - player\n    turn += 1\nwinner = 1 - player\nlogging.info(f"status: Player {winner} won!")\n'

In [864]:
DIM_NIM = 11
nim = Nim(DIM_NIM)
winning_records = {0:0, 1:0}

robot = Agent(DIM_NIM, alpha=0.1)
robot_number = 0

EPISODES = 10000
for i in range(EPISODES):
    player = random.choice([0,1])
    while nim:
       
        if player == 0:
            state = nim # get the current state
            # choose an action (explore or exploit)
            allowed_moves = cook_status(nim)["possible_moves"]
            action = robot.choose_action(state, allowed_moves)
            # get the new state and reward
            reward = 0  
            # update the robot memory with state and reward
            robot.update_state_history(state, action, reward)
            nim.nimming(ply=action) # update the nim according to the action

        else:
            ply = fast(nim)
            nim.nimming(ply)
        player = 1 - player
        
    winner = 1 - player

    if winner == robot_number:
        reward = 1
    else:
        reward = -1

    robot.update_state_history(state, action, reward)
    robot.learn()  # robot should learn after every episode
    nim = Nim(11) # reinitialize the Nim
    winning_records[winner] += 1
    #print(f" [{i}] -> Winner is: ", winner)

In [869]:
print(winning_records)

{0: 9524, 1: 476}


In [870]:
def play(agent: Agent, eps, nim: Nim, opp_strategy):
    wr = {0:0, 1:0}
    EPISODES = eps
    for i in range(EPISODES):
        player = random.choice([0,1])
        while nim:
        
            if player == 0:
                state = nim # get the current state
                # choose an action (explore or exploit)
                allowed_moves = cook_status(nim)["possible_moves"]
                action = agent.choose_action(state, allowed_moves)
                nim.nimming(ply=action) # update the nim according to the action

            else:
                ply = optimal_startegy(nim)
                nim.nimming(ply)
            player = 1 - player
            
        winner = 1 - player
        nim = Nim(11) # reinitialize the Nim
        wr[winner] += 1
    return wr

In [871]:
nim= Nim(11)
wr = play(agent=robot, eps=100, nim=nim, opp_strategy=fast)
print(wr)

{0: 0, 1: 100}
