Copyright **`(c)`** 2022 Giovanni Squillero `<squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  


# Lab 3: Policy Search

## Task

Write agents able to play [*Nim*](https://en.wikipedia.org/wiki/Nim), with an arbitrary number of rows and an upper bound $k$ on the number of objects that can be removed in a turn (a.k.a., *subtraction game*).

The player **taking the last object wins**.

* Task3.1: An agent using fixed rules based on *nim-sum* (i.e., an *expert system*)
* Task3.2: An agent using evolved rules
* Task3.3: An agent using minmax
* Task3.4: An agent using reinforcement learning

## Instructions

* Create the directory `lab3` inside the course repo 
* Put a `README.md` and your solution (all the files, code and auxiliary data if needed)

## Notes

* Working in group is not only allowed, but recommended (see: [Ubuntu](https://en.wikipedia.org/wiki/Ubuntu_philosophy) and [Cooperative Learning](https://files.eric.ed.gov/fulltext/EJ1096789.pdf)). Collaborations must be explicitly declared in the `README.md`.
* [Yanking](https://www.emacswiki.org/emacs/KillingAndYanking) from the internet is allowed, but sources must be explicitly declared in the `README.md`.

## Deadlines ([AoE](https://en.wikipedia.org/wiki/Anywhere_on_Earth))

* Sunday, December 4th for Task3.1 and Task3.2
* Sunday, December 11th for Task3.3 and Task3.4
* Sunday, December 18th for all reviews

In [2]:
import logging
from collections import namedtuple
import random
from typing import Callable
from copy import deepcopy
from itertools import accumulate
from operator import *

logging.getLogger().setLevel(logging.INFO)

## The *Nim* and *Nimply* classes

In [3]:
Nimply = namedtuple("Nimply", "row, num_objects")

In [4]:
class Nim:
    def __init__(self, num_rows: int, k: int = None) -> None:
        self._rows = [i * 2 + 1 for i in range(num_rows)]
        self._k = k

    def __bool__(self):
        return sum(self._rows) > 0

    def __str__(self):
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    @property
    def rows(self) -> tuple:
        return tuple(self._rows)

    @property
    def k(self) -> int:
        return self._k

    def nimming(self, ply: Nimply) -> None:
        row, num_objects = ply
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k
        self._rows[row] -= num_objects

In [5]:
def evaluate(strategyA: Callable, strategyB: Callable, num_matches = 1, nim_size = 3, k = None) -> float:
    players = (strategyA, strategyB)
    won = 0

    for _ in range(num_matches):
        nim = Nim(nim_size, k)
        player = 1
        while nim:
            ply = players[player](nim)
            nim.nimming(ply)
            player = 1 - player
        if player == 1:
            won += 1
    return won / num_matches

## Optimal strategy

In [6]:
def nim_sum(state: Nim) -> int:
    *_, result = accumulate(state.rows, xor)
    return result


def cook_status(state: Nim) -> dict:
    cooked = dict()
    cooked["possible_moves"] = [
        (r, o) for r, c in enumerate(state.rows) for o in range(1, c + 1) if state.k is None or o <= state.k
    ]
    cooked["active_rows_number"] = sum(o > 0 for o in state.rows)
    cooked["shortest_row"] = min((x for x in enumerate(state.rows) if x[1] > 0), key=lambda y: y[1])[0]
    cooked["longest_row"] = max((x for x in enumerate(state.rows)), key=lambda y: y[1])[0]
    cooked["nim_sum"] = nim_sum(state)

    brute_force = list()
    for m in cooked["possible_moves"]:
        tmp = deepcopy(state)
        tmp.nimming(m)
        brute_force.append((m, nim_sum(tmp)))
    cooked["brute_force"] = brute_force

    return cooked

In [7]:
def optimal_strategy(state: Nim) -> Nimply:
    data = cook_status(state)
    return next((bf for bf in data["brute_force"] if bf[1] == 0), random.choice(data["brute_force"]))[0]

## Random strategy

In [8]:
# Choose a random non empty row and remove a random number of objects smaller than min(k, row_objects)

def random_strategy(state: Nim):
    r = random.choice([idx for idx, r in enumerate(state.rows) if r > 0])
    num_objects = random.randint(1, min(state.rows[r], state.k) if state.k != None else state.rows[r])

    return (r, num_objects)

## Task 3.1: Fixed-Rule Strategy

In [9]:
# Among all possible moves, simply do this:
# - if there is a winning move, choose it
# - if there is not a winning move but the move puts the opponent in a winning situation, discard it
# - otherwise choose the first move possible, even if not optimal (obliged to do a move)

def fixed_strategy(state: Nim):
    possible_moves = ((r, o) for r, c in enumerate(state.rows) for o in range(1, c + 1) if state.k is None or o <= state.k)
    move = None
    firstMove = None

    for m in possible_moves:
        if firstMove == None:
            firstMove = Nimply(m[0], m[1])
        tmp = deepcopy(state)
        tmp.nimming(m)
        if not tmp:
            return Nimply(m[0], m[1])
        else:
            active_rows = len([r for r in state.rows if r > 0])
            eliminable_rows = len([r for r in state.rows if r > 0 and (state.k == None or r < state.k)])
            if active_rows == eliminable_rows and eliminable_rows == 1:
                continue
            elif move == None:
                move = Nimply(m[0], m[1])

    return move if move != None else firstMove


## Task 3.2: Evolved Strategy

In [25]:
from functools import *
from statistics import *

GENOME_LENGTH = 11 # NB: GENOME_LENGTH must be of the form 4*n + 3 with n > 0, if n > 1 it may converge to the xor solution
GAMES = 5
GAME_PARS = list((nim_size, k) for nim_size, k in ((random.randint(3, 10), random.choice([None, random.randint(1, 10)])) for _ in range(GAMES)))

def decode_genome(genome):
    assert (GENOME_LENGTH - 3) % 4 == 0 and GENOME_LENGTH > 0, 'GENOME_LENGTH must be of the form 4*n + 3 with n > 0'
    out = ""

    for op_start in range(0, GENOME_LENGTH, 4):
        tmpA = 'a' if genome[op_start] < 0.5 else '!a'
        tmpB = 'b' if genome[op_start + 2] < 0.5 else '!b'
        internal_op = '&' if genome[op_start + 1] < 0.5 else '|'
        op = ('&' if genome[op_start + 3] < 0.5 else '|') if op_start + 3 < GENOME_LENGTH else ''
        out += f'({tmpA} {internal_op} {tmpB}) {op} '

    return out[:-2]


def evolvable_strategy(genome):
    assert (GENOME_LENGTH - 3) % 4 == 0 and GENOME_LENGTH > 0, 'GENOME_LENGTH must be of the form 4*n + 3 with n > 0'

    def genetic_op(a, b):
        result = 0
        op = lambda _, b: b

        for op_start in range(0, GENOME_LENGTH, 4):
            tmpA = a if genome[op_start] < 0.5 else ~a
            tmpB = b if genome[op_start + 2] < 0.5 else ~b
            internal_op = and_ if genome[op_start + 1] < 0.5 else or_
            result = op(result, internal_op(tmpA, tmpB))
            op = (and_ if genome[op_start + 3] < 0.5 else or_) if op_start + 3 < GENOME_LENGTH else None

        return result

    def strategy(state: Nim):
            possible_moves = [(r, o) for r, c in enumerate(state.rows) for o in range(1, c + 1) if state.k is None or o <= state.k]
            best = None

            for m in possible_moves:
                tmp = deepcopy(state)
                tmp.nimming(m)
                
                val = reduce(genetic_op, tmp.rows)

                if best == None or best[1] > val:
                    best = (m, val)
                             
            return best[0]
    
    return strategy

def mutation(genome):
    point = random.randint(0, len(genome) - 1)
    return genome[:point] + [1 - genome[point]] + genome[point + 1:]

def crossover(genomeA, genomeB):
    p = random.random()
    return [x if p < 0.5 else y for x, y in zip(genomeA, genomeB)]

def tournament(population, tournament_size):
    return max(random.choices(population, k=tournament_size), key=lambda i: i.fitness)

def fitness(genome):
    win_optimal = 0.0
    win_random = 0.0

    for nim_size, k in GAME_PARS:
        home = evaluate(evolvable_strategy(genome), random_strategy, nim_size=nim_size, k=k) 
        away = 1 - evaluate(random_strategy, evolvable_strategy(genome), nim_size=nim_size, k=k)
        win_random += home + away
        home = evaluate(evolvable_strategy(genome), optimal_strategy, nim_size=nim_size, k=k) 
        away = 1 - evaluate(optimal_strategy, evolvable_strategy(genome), nim_size=nim_size, k=k)
        win_optimal += home + away

    return (win_optimal/(2*GAMES), win_random/(2*GAMES))
        
def genetic_algorithm():
    Individual = namedtuple('Individual', ('genome', 'fitness'))

    NUM_GENS = 100    
    POPULATION_SIZE = 10
    OFFSPRING_SIZE = 20
    TOURNAMENT_SIZE = 2
    USELESS_GENS = 0
    STEADY_STATE_LIMIT = 10

    population = [Individual(i, fitness(i)) for i in ([round(random.random(), 2) for _ in range(GENOME_LENGTH)] for _ in range(POPULATION_SIZE))]
    best = None
    
    for g in range(NUM_GENS):
        offspring = list()
        for i in range(OFFSPRING_SIZE):
            if random.random() < 0.3:
                p = tournament(population, tournament_size=TOURNAMENT_SIZE)
                o = mutation(p.genome)
            else:
                p1 = tournament(population, tournament_size=TOURNAMENT_SIZE)
                p2 = tournament(population, tournament_size=TOURNAMENT_SIZE)
                o = crossover(p1.genome, p2.genome)
            f = fitness(o)
            offspring.append(Individual(o, f))
        population += offspring
        population = sorted(population, key=lambda i: i.fitness, reverse=True)[:POPULATION_SIZE]
        newBest = max(population, key=lambda i: i.fitness)

        if best != None and newBest <= best:
            logging.info(f'Gen {g+1} skipped because useless')
            USELESS_GENS += 1
        else:
            logging.info(f'Gen {g+1}, found new best individual: {decode_genome(newBest.genome)} with fitness = {newBest.fitness}')
            best = newBest
            USELESS_GENS = 0
        
        if USELESS_GENS == STEADY_STATE_LIMIT:
            logging.info(f'Gen {g+1}, no improvements after {USELESS_GENS} gens, terminating...')
            break

    logging.info(f'Best individual: {decode_genome(best.genome)} with genome {best.genome} fitness = {best.fitness}')

    return evolvable_strategy(best.genome)

In [17]:
evolved_strategy = genetic_algorithm()

INFO:root:Gen 1, found new best individual: (a & b) & (!a | b) | (!a & !b) with fitness = (0.4, 0.9)
INFO:root:Gen 2, found new best individual: (a & b) & (!a | b) | (!a & !b) with fitness = (0.4, 1.0)
INFO:root:Gen 3 skipped because useless
INFO:root:Gen 4 skipped because useless
INFO:root:Gen 5 skipped because useless
INFO:root:Gen 6 skipped because useless
INFO:root:Gen 7 skipped because useless
INFO:root:Gen 8 skipped because useless
INFO:root:Gen 9 skipped because useless
INFO:root:Gen 10 skipped because useless
INFO:root:Gen 11 skipped because useless
INFO:root:Gen 12 skipped because useless
INFO:root:Gen 12, no improvements after 10 gens, terminating...
INFO:root:Best individual: (a & b) & (!a | b) | (!a & !b) with genome [0.49, 0.09, 0.38, 0.10999999999999999, 0.88, 0.64, 0.47, 0.79, 0.63, 0.06, 0.68] fitness = (0.4, 1.0)


## Oversimplified match

In [19]:
logging.getLogger().setLevel(logging.DEBUG)
xor_strategy = evolvable_strategy([0, 0, 1, 1, 1, 0, 0])
strategy = (evolved_strategy, optimal_strategy)

nim = Nim(3, None)
logging.debug(f"status: Initial board  -> {nim}")
player = 0
while nim:
    ply = strategy[player](nim) 
    nim.nimming(ply)
    logging.debug(f"status: After player {player} -> {nim}")
    player = 1 - player
winner = 1 - player
logging.info(f"status: Player {winner} won!")

DEBUG:root:status: Initial board  -> <1 3 5>
DEBUG:root:status: After player 0 -> <1 3 2>
DEBUG:root:status: After player 1 -> <0 3 2>
DEBUG:root:status: After player 0 -> <0 2 2>
DEBUG:root:status: After player 1 -> <0 2 1>
DEBUG:root:status: After player 0 -> <0 1 1>
DEBUG:root:status: After player 1 -> <0 0 1>
DEBUG:root:status: After player 0 -> <0 0 0>
INFO:root:status: Player 0 won!


## Benchmarks

In [23]:
games = 100
wins_fixed_strategy = 0.0
wins_evolved_strategy = 0.0
wins_fixed_strategy_rand = 0.0
wins_evolved_strategy_rand = 0.0

logging.getLogger().setLevel(logging.INFO)

for idx, pars in enumerate(((nim_size, k) for nim_size, k in ((random.randint(3, 10), random.choice([None, random.randint(1, 10)])) for _ in range(games)))):
    nim_size, k = pars
    
    logging.debug(f'Game {idx}: Nim({nim_size}, {k})')
    
    home = evaluate(fixed_strategy, optimal_strategy, nim_size=nim_size, k=k) 
    away = 1 - evaluate(optimal_strategy, fixed_strategy, nim_size=nim_size, k=k)
    wins_fixed_strategy += home + away

    home = evaluate(evolved_strategy, optimal_strategy, nim_size=nim_size, k=k) 
    away = 1 - evaluate(optimal_strategy, evolved_strategy, nim_size=nim_size, k=k)
    wins_evolved_strategy += home + away

    home = evaluate(fixed_strategy, random_strategy, nim_size=nim_size, k=k) 
    away = 1 - evaluate(random_strategy, fixed_strategy, nim_size=nim_size, k=k)
    wins_fixed_strategy_rand += home + away

    home = evaluate(evolved_strategy, random_strategy, nim_size=nim_size, k=k) 
    away = 1 - evaluate(random_strategy, evolved_strategy, nim_size=nim_size, k=k)
    wins_evolved_strategy_rand += home + away

logging.info(f'Fixed strategy win rate against random strategy was {wins_fixed_strategy_rand * 100 / (2 * games)} % ({wins_fixed_strategy_rand}/{2 * games})')
logging.info(f'Evolved strategy win rate against random strategy was {wins_evolved_strategy_rand * 100 / (2 * games)} % ({wins_evolved_strategy_rand}/{2 * games})')
logging.info(f'Fixed strategy win rate against optimal strategy was {wins_fixed_strategy * 100 / (2 * games)} % ({wins_fixed_strategy}/{2 * games})')
logging.info(f'Evolved strategy win rate against optimal strategy was {wins_evolved_strategy * 100 / (2 * games)} % ({wins_evolved_strategy}/{2 * games})')

INFO:root:Fixed strategy win rate against random strategy was 82.5 % (165.0/200)
INFO:root:Evolved strategy win rate against random strategy was 65.0 % (130.0/200)
INFO:root:Fixed strategy win rate against optimal strategy was 4.5 % (9.0/200)
INFO:root:Evolved strategy win rate against optimal strategy was 23.0 % (46.0/200)
