Copyright **`(c)`** 2022 Giovanni Squillero `<squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  


# Lab 3: Policy Search

## Task

Write agents able to play [*Nim*](https://en.wikipedia.org/wiki/Nim), with an arbitrary number of rows and an upper bound $k$ on the number of objects that can be removed in a turn (a.k.a., *subtraction game*).

The player **taking the last object wins**.

* Task3.1: An agent using fixed rules based on *nim-sum* (i.e., an *expert system*)
* Task3.2: An agent using evolved rules
* Task3.3: An agent using minmax
* Task3.4: An agent using reinforcement learning

## Instructions

* Create the directory `lab3` inside the course repo 
* Put a `README.md` and your solution (all the files, code and auxiliary data if needed)

## Notes

* Working in group is not only allowed, but recommended (see: [Ubuntu](https://en.wikipedia.org/wiki/Ubuntu_philosophy) and [Cooperative Learning](https://files.eric.ed.gov/fulltext/EJ1096789.pdf)). Collaborations must be explicitly declared in the `README.md`.
* [Yanking](https://www.emacswiki.org/emacs/KillingAndYanking) from the internet is allowed, but sources must be explicitly declared in the `README.md`.

## Deadlines ([AoE](https://en.wikipedia.org/wiki/Anywhere_on_Earth))

* Sunday, December 4th for Task3.1 and Task3.2
* Sunday, December 11th for Task3.3 and Task3.4
* Sunday, December 18th for all reviews

In [19]:
import logging
from collections import namedtuple
import random
from typing import Callable
from copy import deepcopy
from itertools import accumulate
from operator import xor

logging.getLogger().setLevel(logging.INFO)

## The *Nim* and *Nimply* classes

In [2]:
Nimply = namedtuple("Nimply", "row, num_objects")

In [3]:
class Nim:
    def __init__(self, num_rows: int, k: int = None) -> None:
        self._rows = [i * 2 + 1 for i in range(num_rows)]
        self._k = k

    def __bool__(self):
        return sum(self._rows) > 0

    def __str__(self):
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    @property
    def rows(self) -> tuple:
        return tuple(self._rows)

    @property
    def k(self) -> int:
        return self._k

    def nimming(self, ply: Nimply) -> None:
        row, num_objects = ply
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k
        self._rows[row] -= num_objects

## Sample (and silly) startegies 

In [4]:
def nim_sum(state: Nim) -> int:
    *_, result = accumulate(state.rows, xor)
    return result


def cook_status(state: Nim) -> dict:
    cooked = dict()
    cooked["possible_moves"] = [
        (r, o) for r, c in enumerate(state.rows) for o in range(1, c + 1) if state.k is None or o <= state.k
    ]
    cooked["active_rows_number"] = sum(o > 0 for o in state.rows)
    cooked["shortest_row"] = min((x for x in enumerate(state.rows) if x[1] > 0), key=lambda y: y[1])[0]
    cooked["longest_row"] = max((x for x in enumerate(state.rows)), key=lambda y: y[1])[0]
    cooked["nim_sum"] = nim_sum(state)

    brute_force = list()
    for m in cooked["possible_moves"]:
        tmp = deepcopy(state)
        tmp.nimming(m)
        brute_force.append((m, nim_sum(tmp)))
    cooked["brute_force"] = brute_force

    return cooked

In [5]:
def optimal_strategy(state: Nim) -> Nimply:
    data = cook_status(state)
    return next((bf for bf in data["brute_force"] if bf[1] == 0), random.choice(data["brute_force"]))[0]

In [13]:
NUM_MATCHES = 10
NIM_SIZE = 6
K = None

def evaluate(strategyA: Callable, strategyB: Callable) -> float:
    opponent = (strategyA, strategyB)
    won = 0

    for m in range(NUM_MATCHES):
        nim = Nim(NIM_SIZE, K)
        player = 1
        while nim:
            ply = opponent[player](nim)
            nim.nimming(ply)
            player = 1 - player
        if player == 1:
            won += 1
    return won / NUM_MATCHES

In [7]:
# Among all possible moves, simply do this:
# - if there is a winning move, choose it
# - if there is not a winning move but the move puts the opponent in a winning situation, discard it
# - otherwise choose the first move possible, even if not optimal (obliged to do a move)

def fixed_strategy(state: Nim):
    possible_moves = ((r, o) for r, c in enumerate(state.rows) for o in range(1, c + 1) if state.k is None or o <= state.k)
    move = None
    firstMove = None

    for m in possible_moves:
        if firstMove == None:
            firstMove = Nimply(m[0], m[1])
        tmp = deepcopy(state)
        tmp.nimming(m)
        if not tmp:
            return Nimply(m[0], m[1])
        elif len([r for r in state.rows if r > 0 and (state.k == None or r < state.k)]) == 1:
            continue
        elif move == None:
            move = Nimply(m[0], m[1])

    return move if move != None else firstMove


In [22]:
from statistics import *

RULES = [sum, min, max, mean, stdev]

def evolvable_strategy(genome):
    def strategy(state: Nim):
            possible_moves = [(r, o) for r, c in enumerate(state.rows) for o in range(1, c + 1) if state.k is None or o <= state.k]
            best_play = None
            for m in possible_moves:
                tmp = deepcopy(state)
                tmp.nimming(m)
                
                play = (m, sum(w * op(state.rows) for w, op in zip(genome, RULES)))

                if play[1] == 0:
                    return play[0]                

                if best_play == None or best_play[1] > play[1]:
                    best_play = play
                                
            return best_play[0]
    
    return strategy

def mutation(genome):
    point = random.randint(0, len(genome))
    return genome[:point] + [random.random()] + genome[point + 1:]

def crossover(genomeA, genomeB):
    p = random.random()
    return [p * x + (1 - p) * y for x, y in zip(genomeA, genomeB)]

def tournament(population, tournament_size):
    return max(random.choices(population, k=tournament_size), key=lambda i: i.fitness)

def fitness(genome, population):
    games = 5
    f = 0.0

    for opponent in random.choices(population, k=games):
        f += evaluate(evolvable_strategy(genome), evolvable_strategy(opponent.genome)) + evaluate(evolvable_strategy(opponent.genome), evolvable_strategy(genome))

    return (f/10, evaluate(evolvable_strategy(genome), optimal_strategy))
        
def genetic_algorithm():
    Individual = namedtuple('Individual', ('genome', 'fitness'))

    NUM_GENS = 10    
    POPULATION_SIZE = 10
    OFFSPRING_SIZE = 10
    TOURNAMENT_SIZE = 2

    population = [Individual(i, 0) for i in ([random.random() for _ in range(len(RULES))] for _ in range(POPULATION_SIZE))]

    for idx, i in enumerate(population):
        population[idx] = Individual(i.genome, fitness(i.genome, population))

    logging.info(population)

    for g in range(NUM_GENS):
        logging.info(f'Generation {g}')
        offspring = list()
        for i in range(OFFSPRING_SIZE):
            if random.random() < 0.3:
                p = tournament(population, tournament_size=TOURNAMENT_SIZE)
                o = mutation(p.genome)
            else:
                p1 = tournament(population, tournament_size=TOURNAMENT_SIZE)
                p2 = tournament(population, tournament_size=TOURNAMENT_SIZE)
                o = crossover(p1.genome, p2.genome)
            f = fitness(o, population)
            offspring.append(Individual(o, f))
        population += offspring
        population = sorted(population, key=lambda i: i.fitness, reverse=True)[:POPULATION_SIZE]
    
    best = max(population, key=lambda i: i.fitness)
    logging.info(f'Best individual: {best}')

    return evolvable_strategy(best.genome)

genetic_algorithm()

INFO:root:[Individual(genome=[0.1348724790452498, 0.994924756926436, 0.37244151399233283, 0.09372452989510183, 0.04845926033229975], fitness=(1.0, 0.11)), Individual(genome=[0.2150090643810777, 0.01151942036530762, 0.02188270846746554, 0.1361667059290339, 0.9766475188062447], fitness=(1.0, 0.06)), Individual(genome=[0.8302306140125435, 0.6577773297714204, 0.1892682005108115, 0.9791417211608511, 0.5319228711076128], fitness=(1.0, 0.07)), Individual(genome=[0.6893559393605387, 0.7495943919023228, 0.9663414076793236, 0.9512788391768748, 0.02348272572469501], fitness=(1.0, 0.1)), Individual(genome=[0.13227155201007001, 0.5218982672110298, 0.8540263597279302, 0.19940064638387411, 0.7976968423948293], fitness=(1.0, 0.14)), Individual(genome=[0.18458035951103302, 0.10982205333541861, 0.42313370714545306, 0.5065330912181025, 0.7186786243093095], fitness=(1.0, 0.08)), Individual(genome=[0.3828595654612942, 0.5962915162341055, 0.3838001843109642, 0.45981303335045054, 0.038913111945559575], fitne

<function __main__.evolvable_strategy.<locals>.strategy(state: __main__.Nim)>

In [18]:
NUM_MATCHES = 100
NIM_SIZE = 4
K = None

logging.info(evaluate(optimal_strategy, fixed_strategy))
logging.info(evaluate(fixed_strategy, optimal_strategy))

K = NIM_SIZE // 2
logging.info(evaluate(optimal_strategy, fixed_strategy))
logging.info(evaluate(fixed_strategy, optimal_strategy))

## Oversimplified match

In [287]:
logging.getLogger().setLevel(logging.DEBUG)

strategy = (optimal_strategy, fixed_strategy)

nim = Nim(4, 2)
logging.debug(f"status: Initial board  -> {nim}")
player = 0
while nim:
    ply = strategy[player](nim) 
    nim.nimming(ply)
    logging.debug(f"status: After player {player} -> {nim}")
    player = 1 - player
winner = 1 - player
logging.info(f"status: Player {winner} won!")

DEBUG:root:status: Initial board  -> <1 3 5 7>
DEBUG:root:status: After player 0 -> <1 1 5 7>
DEBUG:root:status: After player 1 -> <0 1 5 7>
DEBUG:root:status: After player 0 -> <0 1 4 7>
DEBUG:root:status: After player 1 -> <0 0 4 7>
DEBUG:root:status: After player 0 -> <0 0 3 7>
DEBUG:root:status: After player 1 -> <0 0 2 7>
DEBUG:root:status: After player 0 -> <0 0 1 7>
DEBUG:root:status: After player 1 -> <0 0 0 7>
DEBUG:root:status: After player 0 -> <0 0 0 5>
DEBUG:root:status: After player 1 -> <0 0 0 4>
DEBUG:root:status: After player 0 -> <0 0 0 2>
DEBUG:root:status: After player 1 -> <0 0 0 0>
INFO:root:status: Player 1 won!
