# Q-Learning

---


## Introduction

The history of an **agent** is a sequence of ***experiences***, where each ***experience = (State, Action, Reward, New state)***.

We define **Q(s, a)** as the expected value (cumulative discounted reward) of doing action *a* in state *s* and then following the optimal policy.



## References

- http://artint.info/html/ArtInt_262.html
- http://artint.info/html/ArtInt_265.html

In [5]:
def concat(s1, s2):
    """Glue two multi-line strings together side by side.

    Both strings are split on newlines and the n-th row of ``s1`` is placed
    directly in front of the n-th row of ``s2``. Rows are paired with
    ``zip``, so surplus rows of the longer string are dropped.

    Returns the combined rows joined by newlines, without a trailing newline.
    """
    paired_rows = zip(s1.split("\n"), s2.split("\n"))
    return "\n".join(left + right for left, right in paired_rows)

# Demo: print two copies of BASE_CELL joined side by side.
# NOTE: BASE_CELL is defined in a separate cell further down (In [2]).
print(concat(BASE_CELL, BASE_CELL))

+ U ++ U +
L C RL C R
+ D ++ D +


In [2]:
# Text template for a single board cell: "+" corner markers, the four side
# labels (U, L, R, D) and the centre label (C).
BASE_CELL = (
    "+ U +\n"
    "L C R\n"
    "+ D +"
)


class Board(object):
    """Rectangular grid of ``width`` x ``high`` cells that renders as text.

    Attributes:
        width: number of cells in each board row.
        high: number of board rows.
        cols: list of column indices, ``0 .. width - 1``.
        rows: list of row indices, ``0 .. high - 1``.
    """

    def __init__(self, width, high):
        self.width = width
        self.high = high
        self.cols = list(range(width))
        self.rows = list(range(high))

    def __str__(self):
        """Return the board as text, drawing one ``BASE_CELL`` tile per cell.

        Tiles of the same board row are placed side by side and board rows
        are stacked vertically, separated by newlines. Borders of adjacent
        tiles are not merged. An empty board renders as an empty string.
        """
        tile_lines = BASE_CELL.split("\n")
        # One board row: each text line of the tile repeated once per column.
        board_row = "\n".join(
            "".join(line for _ in self.cols) for line in tile_lines
        )
        return "\n".join(board_row for _ in self.rows)

def board_str(width, high):
    """Placeholder for rendering a ``width`` x ``high`` board as a string.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    return None
    

## Tic Tac Toe

In [111]:
from collections import defaultdict


def are_same(l):
    """Tell whether every element of ``l`` compares equal to its first one.

    Each element is tested with ``!=`` against ``l[0]``. An empty sequence
    raises ``IndexError`` because it has no first element.
    """
    first = l[0]
    return not any(item != first for item in l)


class InvalidPlay(Exception):
    """Raised when a move cannot be played (cell taken or game finished)."""


class Position(object):
    """Coordinates ``(i, j)`` of a cell on the 3x3 board.

    A position can be built in three ways:

    * ``Position(i, j)`` -- explicit coordinates;
    * ``Position(n)`` -- flat cell index ``n``, decoded as ``i = n % 3``
      and ``j = n // 3``;
    * ``Position(other)`` -- copy of another ``Position``.
    """

    def __init__(self, i, j=None):
        if isinstance(i, Position):
            # Read both coordinates before rebinding ``i``; otherwise the
            # second attribute lookup would be made on an int.
            i, j = i.i, i.j
        elif j is None:
            # Decode both coordinates from the original index in one step so
            # that ``j`` is not derived from the already reduced ``i``.
            j, i = divmod(i, 3)
        self.i = i
        self.j = j

    def __str__(self):
        return "Position(%s, %s)" % (self.i, self.j)

    def __repr__(self):
        return self.__str__()

    
class Player(object):
    """A player, identified by the symbol drawn on the board.

    Players hash and compare by their symbol, so two ``Player`` objects
    created with the same symbol behave as the same dictionary key.
    """

    def __init__(self, symbol):
        self.symbol = symbol

    def __eq__(self, other):
        # Keep equality in line with __hash__, which is based on the symbol.
        if not isinstance(other, Player):
            return NotImplemented
        return self.symbol == other.symbol

    def __hash__(self):
        # So we can use player as dictionary key
        return self.symbol.__hash__()

    def __str__(self):
        return "Player%s" % self.symbol

    def __repr__(self):
        return self.__str__()


class Cell(object):
    """One square of the board: its position and the player occupying it."""

    def __init__(self, i, j):
        self.content = None  # no player has played here yet
        self.position = Position(i, j)

    @property
    def symbol(self):
        """Symbol of the occupying player, or a single space when empty."""
        return " " if self.content is None else self.content.symbol

    def is_empty(self):
        """Return True while no player is stored in this cell."""
        return self.content is None

    def play(self, player):
        """Store ``player`` in this cell.

        Raises:
            InvalidPlay: if the cell already holds a truthy content.
        """
        if not self.content:
            self.content = player
            return
        message = "Cell at %s already played by %s" % (
            self.position,
            self.content,
        )
        raise InvalidPlay(message)

    def __eq__(self, other):
        """Cells compare equal when their contents compare equal."""
        return self.content == other.content

    def __str__(self):
        """A cell prints as its symbol."""
        return self.symbol

    def __repr__(self):
        return self.__str__()


class TTT(object):
    """Tic-tac-toe game on a 3x3 board.

    The nine ``Cell`` objects live in the flat list ``board``. A flat index
    ``n`` corresponds to the coordinates ``i = n % 3`` (column) and
    ``j = n // 3`` (row), which is also the order in which ``string`` lays
    the cells out.

    Attributes:
        board: the nine cells, row by row.
        history: maps each player to the list of positions they played.
        winner: the winning player, or ``None`` while there is none.
        ended: ``True`` once the game is won or the board is full.
    """

    def __init__(self):
        self.board = []
        for row in range(3):
            for col in range(3):
                # Positions are (i=column, j=row), matching ``cell`` below.
                self.board.append(Cell(col, row))
        self.history = defaultdict(list)
        self.winner = None
        self.ended = False

    def cell(self, *args):
        """Return the cell designated by ``Position(*args)``."""
        pos = Position(*args)
        return self.board[pos.i + pos.j * 3]

    @property
    def cols(self):
        """The three board columns, each a top-to-bottom list of cells."""
        return [
            [self.cell(col, row) for row in range(3)]
            for col in range(3)
        ]

    @property
    def rows(self):
        """The three board rows, each a left-to-right list of cells."""
        return [
            [self.cell(col, row) for col in range(3)]
            for row in range(3)
        ]

    def check_end(self):
        """Update ``ended`` and ``winner`` from the current board.

        The game ends when a row, column or diagonal is filled by a single
        player (who becomes the winner), or when no empty cell is left
        (a draw; ``winner`` is left untouched).
        """
        winner_cell = None

        for line in self.rows + self.cols:
            if not line[0].is_empty() and are_same(line):
                winner_cell = line[0]

        # Both diagonals go through the middle cell.
        mid = self.cell(1, 1)
        if (
            not mid.is_empty()
            and (
                self.cell(0, 0) == mid == self.cell(2, 2)
                or self.cell(0, 2) == mid == self.cell(2, 0)
            )
        ):
            winner_cell = mid

        if winner_cell is not None:
            self.ended = True
            self.winner = winner_cell.content
        else:
            # Draw: every cell is taken and nobody completed a line.
            self.ended = not any(c.is_empty() for c in self.board)

    def play(self, cell_n, player):
        """Play ``player`` on the cell with flat index ``cell_n``.

        Records the move in ``history``, then re-evaluates the end of the
        game and announces it on stdout when the game is over.

        Raises:
            InvalidPlay: if the game is already over, or (from
                ``Cell.play``) if the cell is already taken.
        """
        if self.ended:
            raise InvalidPlay("Game is Over")
        self.board[cell_n].play(player)
        self.history[player].append(Position(cell_n))
        self.check_end()

        if self.ended:
            print("Game Over")
            if self.winner:
                print("%s wins !!!" % self.winner)

    def display(self):
        """Print the board."""
        print(self.string())

    def display_history(self):
        """Pretty-print the moves recorded for each player."""
        from pprint import pprint
        pprint(dict(self.history))

    def string(self):
        """Return the board as a three-row text grid."""
        board_frmt = (
            " {} | {} | {} \n"
            "---+---+---\n"
            " {} | {} | {} \n"
            "---+---+---\n"
            " {} | {} | {} "
        )
        return board_frmt.format(*self.board)

In [115]:
# Demo game: X and O alternate on cells 1 to 6 (X moves first), then the
# resulting board is printed.
ttt = TTT()
p1, p2 = Player("X"), Player("O")
for cell_n in range(1, 7):
    # Odd cells go to p1, even cells to p2.
    ttt.play(cell_n, p1 if cell_n % 2 else p2)
ttt.display()

   |   |   
---+---+---
   |   |   
---+---+---
   |   |   
Game Over
PlayerO wins !!!
   | X | O 
---+---+---
 X | O | X 
---+---+---
 O |   |   


In [116]:
# Demo game: X alone plays cells 0, 1 and 2 of a fresh board.
ttt = TTT()
p1, p2 = Player("X"), Player("O")
for cell_n in (0, 1, 2):
    ttt.play(cell_n, p1)

Game Over
PlayerX wins !!!


In [117]:
# Pretty-print the moves recorded for each player in the game above.
ttt.display_history()

{PlayerX: [Position(0, 0), Position(1, 0), Position(2, 0)]}


---

## Deep Q-Learning Agent

In [None]:
# Deep Q-learning Agent
class DQNAgent:
    """Agent that learns action values with a small neural network.

    Experiences are kept in a bounded replay memory; the network is trained
    on random samples drawn from it, and actions are picked epsilon-greedily.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        # Replay memory: only the 2000 most recent experiences are kept.
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the network: state in, one value per action out."""
        network = Sequential()
        layers = (
            Dense(24, input_dim=self.state_size, activation='relu'),
            Dense(24, activation='relu'),
            Dense(self.action_size, activation='linear'),
        )
        for layer in layers:
            network.add(layer)
        network.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return network

    def remember(self, state, action, reward, next_state, done):
        """Append one experience tuple to the replay memory."""
        experience = (state, action, reward, next_state, done)
        self.memory.append(experience)

    def act(self, state):
        """Choose an action for ``state``.

        With probability ``epsilon`` a random action index is returned;
        otherwise the index of the highest predicted value is returned.
        """
        explore = np.random.rand() <= self.epsilon
        if explore:
            return random.randrange(self.action_size)
        # assumes predict() returns one row of action values per input state
        predicted = self.model.predict(state)
        return np.argmax(predicted[0])

    def replay(self, batch_size):
        """Train the model on ``batch_size`` randomly sampled experiences.

        For each sample the value of the action taken is replaced by the
        reward (plus the discounted best value of the next state when the
        episode is not done) and the model is fitted for one epoch. After
        the batch, ``epsilon`` is decayed while it is above ``epsilon_min``.
        """
        for state, action, reward, next_state, done in random.sample(
            self.memory, batch_size
        ):
            if done:
                target = reward
            else:
                best_next = np.amax(self.model.predict(next_state)[0])
                target = reward + self.gamma * best_next
            # Only the entry of the action actually taken is changed.
            q_values = self.model.predict(state)
            q_values[0][action] = target
            self.model.fit(state, q_values, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay