# Stochastic Reward Game

In [1]:
import numpy as np
import math
import random
from enum import Enum

In [2]:
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()

In [3]:
ITERATIONS = 20

## Actions (normal distribution)

In [4]:
class Normal:
    """ Normal distribution klass"""
    def __init__(self, mean, std):
        self._mean = mean
        self._std = std

    def mean(self):
        return self._mean

    def std(self):
        return self._std

    def reward(self):
        return np.random.normal(self._mean, self._std)

## Actions : Stochastic Climbing Game

In [5]:
def softmax_distribution(tau, rewardsList):
    distribution = []
    for a in range(nbrActions):
        qta = rewardsList[a]
        distribution.append(np.power(math.e, qta / tau))
    return list(np.divide(distribution, sum(distribution))) 

In [6]:
class StochasticClimbingGameActions:
    """StochasticClimbingGame"""
    def __init__(self, tau, std, std0 = None, std1 = None):
        self._tau = tau
        std = std * std
        std0 = std0*std0 if std0 else std
        std1 = std1*std1 if std1 else std
        self._table = [[Normal(11,std0),Normal(-30,std), Normal(0,std)],
                       [Normal(-30,std),Normal(7,std1), Normal(6,std)],
                       [Normal(0,std),Normal(0,std), Normal(5,std)]
                      ]
    def a(self, action, b_rew):
        actionrewards = [ac[action].reward() for ac in self._table]
        actionproba = softmax_distribution(self._tau, b_rew)
        return sum([actionrewards[i]*actionproba[i] for i in range(nbrActions)])
    
    def b(self, action, a_rew):
        actionrewards = [ac.reward() for ac in self._table[action]]
        actionproba = softmax_distribution(self._tau, a_rew)
        prin
        return sum([actionrewards[i]*actionproba[i] for i in range(nbrActions)])
    
    def reward(self, a, b):
        return self._table[b][a].reward()

## Algos

In [7]:
def random_action():
    return random.choice([i for i in range(nbrActionsPlayer)])

In [8]:
def softmax_action(tau, history, count, player):
    qtamatrix = history[-1]['Qta'] if len(history) > 0 else [0 for i in range(nbrActions)]
    qtamatrix = [[ qtamatrix[i + (j*nbrActionsPlayer)] for i in range(nbrActionsPlayer)] for j in range(nbrActionsPlayer)]
    if np.sum(count) == 0:
        count = [1 for i in range(len(count))]
    countproba = np.divide( np.array(count) , np.sum(count) )
    if player == "A":
        qtal = list(np.transpose(np.dot(countproba, qtamatrix)) )
    elif player == "B":
        qtal = list(np.dot(qtamatrix, np.transpose(countproba)) ) 
    else:
        raise ValueError("Unknown player")
    distribution = []
    for a in range(nbrActionsPlayer):
        qta = qtal[a]
        distribution.append(np.power(math.e, qta / tau))
    distribution = list(np.divide(distribution, sum(distribution))) 
    probability = np.random.uniform()
    action = 0
    while probability > 0:
        probability -= distribution[action]
        if probability > 0:
            action += 1
    return action

In [9]:
def heuristic_action(tau, history, count, player):
    qtamatrix = history[-1]['Qta'] if len(history) > 0 else [0 for i in range(nbrActions)]
    qtamatrix = [[ qtamatrix[i + (j*nbrActionsPlayer)] for i in range(nbrActionsPlayer)] for j in range(nbrActionsPlayer)]
    if np.sum(count) == 0:
        count = [1 for i in range(len(count))]
    countproba = np.divide( np.array(count) , np.sum(count) )
    if player == "A":
        qtal = list(np.transpose(np.dot(countproba, qtamatrix)) )
    elif player == "B":
        qtal = list(np.dot(qtamatrix, np.transpose(countproba)) ) 
    else:
        raise ValueError("Unknown player")
    distribution1 = []
    for a in range(nbrActionsPlayer):
        qta = qtal[a]
        distribution1.append(np.power(math.e, qta / tau))
    distribution1 = np.divide(distribution1, sum(distribution1)) 
    
    qtamatrix = np.amax([history[i]['Qta'] for i in range(len(history))], axis=0)  if len(history) > 0 else [0 for i in range(nbrActions)]
    qtamatrix = [[ qtamatrix[i + (j*nbrActionsPlayer)] for i in range(nbrActionsPlayer)] for j in range(nbrActionsPlayer)]
    if np.sum(count) == 0:
        count = [1 for i in range(len(count))]
    countproba = np.divide( np.array(count) , np.sum(count) )
    if player == "A":
        qtal = list(np.transpose(np.dot(countproba, qtamatrix)) )
    elif player == "B":
        qtal = list(np.dot(qtamatrix, np.transpose(countproba)) ) 
    else:
        raise ValueError("Unknown player")
    distribution2 = []
    for a in range(nbrActionsPlayer):
        qta = qtal[a]
        distribution2.append(np.power(math.e, qta / tau))
    distribution2 = np.divide(distribution2, sum(distribution1))
    rho = 0.9
    distribution = rho*distribution2 + (1-rho)*distribution1
    probability = np.random.uniform()
    action = 0
    if sum(distribution) != 1:
        pass
        #print(distribution, distribution1, distribution2)
        #raise ValueError("Distribution sum not equal to 1")
    while probability > 0:
        probability -= distribution[action]
        if probability > 0:
            action += 1
    return action

## Stochastic Climbing Game

In [10]:
class StochasticClimbingGame(object):
    """ Q learning with custom algo for StochasticClimbingGame"""
    class Algo(Enum): # PYTHON 3.4 FTW
        random = 1
        softmax = 2
        heuristic = 3

    def __init__(self, algo, std, std0 = None, std1 = None, tau=0.1):
        self._algo = algo
        # history for plotting purpose and action selection
        self._tau = tau
        self._historyA = []
        self._historyB = []
        # init count C
        self._countA = [0 for i in range(nbrActionsPlayer)]
        self._countB = [0 for i in range(nbrActionsPlayer)]
        # init Q
        self._currentQa = []
        self._currentQb = []
        for a in range(nbrActions):
            self._currentQa.append(0)
            self._currentQb.append(0)
        # reward helper 
        self._scga = StochasticClimbingGameActions(tau, std, std0, std1)

    def select_action(self, epoch, player):
        if self._algo is StochasticClimbingGame.Algo.random:
            result = random_action()
        elif self._algo == StochasticClimbingGame.Algo.softmax:
            if self._tau == None:
                raise ValueError("Tau not defined for softmax")
            if player == "A":
                result = softmax_action(self._tau, self._historyA, self._countB, player)
            elif player == "B":
                result = softmax_action(self._tau, self._historyB, self._countA, player)
            else:
                raise ValueError("Player invalid")
        elif self._algo == StochasticClimbingGame.Algo.heuristic:
            if self._tau == None:
                raise ValueError("Tau not defined for softmax")
            if player == "A":
                result = heuristic_action(self._tau, self._historyA, self._countB, player)
            elif player == "B":
                result = heuristic_action(self._tau, self._historyB, self._countA, player)
            else:
                raise ValueError("Player invalid")
        else:
            raise ValueError("Algo specified not found")
        return result
    
    def play(self, plays=5000):
        for epoch in range(plays):
            actionA = self.select_action(epoch, "A")
            actionB = self.select_action(epoch, "B")
            jointaction = (actionA, actionB)
            
            rewardA = self._scga.reward(actionA, actionB)
            rewardB = rewardA
            
            self._countA[actionA] += 1
            self._countB[actionB] +=1
            
            QA = self.update_q_a(epoch, jointaction, rewardA)
            QB = self.update_q_b(epoch, jointaction, rewardB)
            qtaA = [self._currentQa[a] for a in range(len(actions))]
            qtaB = [self._currentQb[a] for a in range(len(actions))]
            self._historyA.append({'epoch':epoch, 'action': jointaction , 'reward': rewardA, 'Qta': qtaA})
            self._historyB.append({'epoch':epoch, 'action': jointaction , 'reward': rewardB, 'Qta': qtaB})

    def update_q_a(self, step, jointaction, reward):
        lena = sum([1 if self._historyA[i]['action'] == jointaction else 0 for i in range(step)])
        action = jointaction[0] + (jointaction[1] * nbrActionsPlayer)
        self._currentQa[action] = (self._currentQa[action]*lena + reward) / (lena+1)
        return self._currentQa[action]
    
    def update_q_b(self, step, jointaction, reward):
        lena = sum([1 if self._historyB[i]['action'] == jointaction else 0 for i in range(step)])
        action = jointaction[0] + (jointaction[1] * nbrActionsPlayer)
        self._currentQb[action] = (self._currentQb[action]*lena + reward) / (lena+1)
        return self._currentQb[action]
    
    def get_avg_reward_a(self):
        res = [self._historyA[0]['reward']]
        for epoch in range(1,len(self._historyA)):
            res.append(self._historyA[epoch]['reward'])
        return res

    def get_avg_reward_b(self):
        res = [self._historyB[0]['reward']]
        for epoch in range(1,len(self._historyB)):
            res.append(self._historyB[epoch]['reward'])
        return res

    def get_qta_history_a(self, action):
        return [h['Qta'][action] for h in self._historyA]

    def get_action_count_a(self):
        return [[h['action'] for h in self._historyA].count(a) for a in actions]


## Results

In [11]:
def get_result(algo, std, std0 = None, std1 = None, tau=1):
    def npavg(l):
        return list(np.average(l, axis=0))
    avgrewardlists = []
    Qactions = []
    actionshistory = []
    for i in range(ITERATIONS):
        multiarmedbandit = StochasticClimbingGame(algo=algo, std=std, std0=std0, std1=std1, tau=tau)
        multiarmedbandit.play(5000)
        avgrewardlists.append(multiarmedbandit.get_avg_reward_a())
        for i, action in enumerate(actions):
            if len(Qactions) == len(actions):
                Qactions[i].append(multiarmedbandit.get_qta_history_a(i))
            else:
                Qactions.append([multiarmedbandit.get_qta_history_a(i)])
        actionshistory.append(multiarmedbandit.get_action_count_a())
    avgrewardlists = npavg(avgrewardlists)
    actionshistory = npavg(actionshistory)
    for i in range(len(Qactions)):
        Qactions[i] = npavg(Qactions[i])
    return avgrewardlists, Qactions, actionshistory

# Exo 1

In [12]:
nbrActionsPlayer = 3
actions = [(i,j) for i in range(nbrActionsPlayer) for j in range(nbrActionsPlayer)]
nbrActions = len(actions)

In [13]:
actions

[(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1), (2, 2)]

In [14]:
%time softmax01 = get_result(StochasticClimbingGame.Algo.softmax, 0.2, tau=0.1, std0=0.2, std1=0.2)

CPU times: user 1min 34s, sys: 120 ms, total: 1min 34s
Wall time: 1min 35s


In [15]:
%time heuristic = get_result(StochasticClimbingGame.Algo.heuristic, 0.2, tau=1, std0=0.2, std1=0.2)

CPU times: user 9min 29s, sys: 156 ms, total: 9min 29s
Wall time: 9min 33s


In [16]:

g3 = go.Scatter(
    x=[i for i in range(len(softmax01[0]))],
    y=softmax01[0],
    name='Softmax 0.1'
)

g4 = go.Scatter(
    x=[i for i in range(len(heuristic[0]))],
    y=heuristic[0],
    name='Heuristic'
)

layout = dict(title = 'Collected reward per episode versus the episode number',
              xaxis = dict(title = 'Plays'),
              yaxis = dict(title = 'Rewards'),
)

data = [g3,g4]
fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
%time softmax012 = get_result(StochasticClimbingGame.Algo.softmax, 0.1, tau=0.1, std0=4, std1=0.1)

CPU times: user 16min 19s, sys: 336 ms, total: 16min 20s
Wall time: 16min 26s


In [None]:
%time heuristic2 = get_result(StochasticClimbingGame.Algo.heuristic, 0.1, tau=1, std0=4, std1=0.1)

In [38]:

g3 = go.Scatter(
    x=[i for i in range(len(softmax012[0]))],
    y=softmax012[0],
    name='Softmax 0.1'
)

g4 = go.Scatter(
    x=[i for i in range(len(heuristic2[0]))],
    y=heuristic2[0],
    name='Heuristic'
)

layout = dict(title = 'Collected reward per episode versus the episode number',
              xaxis = dict(title = 'Plays'),
              yaxis = dict(title = 'Rewards'),
)

data = [g3,g4]
fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig)

In [29]:
%time softmax013 = get_result(StochasticClimbingGame.Algo.softmax, 0.1, tau=0.1, std0=0.1, std1=4)

CPU times: user 16min 2s, sys: 376 ms, total: 16min 2s
Wall time: 16min 8s


In [27]:
%time heuristic3 = get_result(StochasticClimbingGame.Algo.heuristic, 0.1, tau=1, std0=0.1, std1=4)

CPU times: user 9min 34s, sys: 128 ms, total: 9min 34s
Wall time: 9min 37s


In [30]:
g3 = go.Scatter(
    x=[i for i in range(len(softmax013[0]))],
    y=softmax013[0],
    name='Softmax 0.1'
)

g4 = go.Scatter(
    x=[i for i in range(len(heuristic3[0]))],
    y=heuristic3[0],
    name='Heuristic'
)

layout = dict(title = 'Collected reward per episode versus the episode number',
              xaxis = dict(title = 'Plays'),
              yaxis = dict(title = 'Rewards'),
)

data = [g3,g4]
fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig)