# Investment Game
The rules of the investment game are outlined in the `README.md`. Here we are implementing a simple environment in order to play the game.

In [8]:
import numpy as np
import gym
import numpy as np
import pandas as pd
import itertools
import operator
import ujson

## Game environment

In [9]:
class InvestmentGameSimple(gym.Env):
    def __init__(self):
        # add transaction costs
        file_name = 'resources/data/asset.csv'
        self.asset = pd.read_csv(file_name)
        self.num_rounds = len(self.asset)
        self.initial_balance = 1
        self.action_space = ['buy', 'sell', 'hold'] # depending on state
        self.price_levels = np.array([4,7,10])
        self.state_space = list(itertools.product(self.price_levels, [-1, 1], [True, False]))
        
    def is_terminal(self):
        if self.round >= self.num_rounds:
            return True
        else:
            return False
        
    def transform_price_to_price_level(self, price):
        return self.price_levels[np.argmin(np.abs(self.price_levels - price))]
        
    def reset(self):
        self.round = 0
        self.invested = True
        self.balance = self.initial_balance
        self.price = self.get_price_for_round(self.round)
        self.price_level = self.transform_price_to_price_level(self.price)
        self.signal_sma = self.get_signal_sma(self.round)
        return (self.price_level, self.signal_sma, self.invested)
    
    def get_next_state_and_reward(self, state, action):
        # business logic
        price_level, signal_sma, invested = state
        result = dict()
        if action == 'buy':
            result['invested_in_next_step'] = True
        elif action == 'sell':
            result['invested_in_next_step'] = False
        elif action == 'hold':
            result['invested_in_next_step'] = invested
        result['next_round'] = self.round + 1
        
        if invested:
            result['absolute_change_of_balance'] = self.get_pct_change_for_round(
                self.round) * self.balance
        else:
            result['absolute_change_of_balance'] = 0
        
        result['next_price'] = self.get_price_for_round(self.round)
        result['next_price_level'] =  self.transform_price_to_price_level(result['next_price'])
        result['signal_sma'] = self.get_signal_sma(self.round)
        result['next_balance'] = self.balance + result['absolute_change_of_balance']
        result['reward'] = result['absolute_change_of_balance']
        return result

    
    def step(self, action):
        result = self.get_next_state_and_reward(
            state=(self.price_level, self.signal_sma, self.invested),
            action=action
            )
        self.round = result['next_round']
        self.balance = result['next_balance']
        self.invested = result['invested_in_next_step']
        self.price = result['next_price']
        self.price_level = result['next_price_level']
        self.signal_sma = result['signal_sma']
        state = (self.price_level, self.signal_sma, self.invested)
        reward = result['reward']
        done = self.is_terminal()
        info = {'round': self.round, 'price': self.price, 'balance': self.balance}
        return state, reward, done, info
        
    def get_price_for_round(self, round):
        return self.asset.set_index('round').loc[round, 'price']
    
    def get_pct_change_for_round(self, round):
        return self.asset.set_index('round').loc[round, 'pct_change']
    
    def get_signal_sma(self, round):
        # +1: buy signal
        # -1: sell signal
        short_long_diff = self.asset.set_index('round').loc[round, 'short_long_diff']
        if np.isnan(short_long_diff):
            return np.nan
        else:
            return np.sign(short_long_diff)


## Base policy

In [10]:
def base_policy(states):
    """
    Defining a base policy. States are mapped to actions.

    Parameters
    ----------
    states : tuple
        Environment states

    Returns
    -------
    dict
        The resulting policy. Keys are states while the values are itself dictionaries composed 
        of action probability pairs.
    """
    policy = {}
    for state in states:
        action_probability_pairs = dict()
        # business logic
        # signal sma could be positive, negative or nan
        round, balance, price, signal_sma, invested = state
        if np.isnan(signal_sma):
            policy[state] = action_probability_pairs['hold'] = 1.0
        else:
            if invested & (signal_sma >= 0):
                action_probability_pairs['sell'] = 0.2
                action_probability_pairs['hold'] = 0.8
            elif invested & (signal_sma < 0):
                action_probability_pairs['sell'] = 0.8
                action_probability_pairs['hold'] = 0.2
            if (not invested) & (signal_sma >= 0):
                action_probability_pairs['buy'] = 0.8
                action_probability_pairs['hold'] = 0.2
            elif (not invested) & (signal_sma < 0):
                action_probability_pairs['buy'] = 0.2
                action_probability_pairs['hold'] = 0.8
        policy[state] = action_probability_pairs
    return policy


In [11]:

def choose_action(state, policy):
    """
    Return an action based on the current policy and state.

    Parameters
    ----------
    state : tuple
        Environment states 
    policy : dict
        The resulting policy. Keys are states while the values are itself dictionaries composed 
        of action probability pairs.

    Returns
    -------
    string
        One of the investment options: buy, sell, hold
    """
    action_probability_pairs = policy[state]
    action = np.random.choice(
        a=list(action_probability_pairs.keys()),
        p=list(action_probability_pairs.values()),
    )
    return action

In [12]:
def simulate_policy(policy, num_episodes):
    investmentGame = InvestmentGameSimple()
    rewards = []
    balances = []
    actions = []
    for i in range(num_episodes):
        state = investmentGame.reset()
        done = False
        episode_reward = 0
        episode_actions = []
        while not done:
            action = choose_action(state, policy)
            state, reward, done, info = investmentGame.step(action)
            episode_reward += reward
            episode_actions.append(action)
        rewards.append(episode_reward)
        balances.append(info['balance'])
        actions.append(episode_actions)
    print(f"Average balance: {np.mean(balances)}")
    return rewards, balances, actions


In [13]:
# rewards, balances, actions = simulate_policy(policy, 100)

In [14]:
# df_simulation_results = pd.DataFrame(actions, columns=[
#                                      f"action_in_round_{round}" for round in range(InvestmentGameSimple().num_rounds)])

# df_simulation_results['final_balance'] = balances
# df_simulation_results[df_simulation_results['final_balance'] == df_simulation_results['final_balance'].max()]


## Policy Optimization

In [15]:
def get_valid_actions(state):
    invested = state[2]
    if invested:
        return ['sell', 'hold']
    else:
        return ['buy', 'hold']


In [16]:
def get_random_policy(states):
    policy = dict()
    for state in states:
        valid_actions = get_valid_actions(state)
        num_actions = len(valid_actions)
        probability = 1 / num_actions
        policy[state] = {action: probability for action in valid_actions}
    return policy

In [17]:
def get_trajectory(env, policy):
    """
    Return the trajectory for one episode under the given policy.
    The trajectory is a list of state-action-reward pairs captured in every step of an episode. 

    Parameters
    ----------
    env : Environment class 
        Simulation environment
    policy : dict
        Mapping from state to action-probability dictionary

    Returns
    -------
    list
        List of state-action-reward tuples
    """
    trajectory = []
    state = env.reset()
    state_action_reward = [state] # initialized
    done = False
    while not done:
        action = choose_action(state, policy)
        state, reward, done, info = env.step(action)
        state_action_reward.append(action)
        state_action_reward.append(reward)
        trajectory.append(state_action_reward)
        state_action_reward = [state]
    return trajectory

In [18]:
def get_eps_greedy(actions, eps, best_action):
    action_probability_pairs = dict()
    num_actions = len(actions)
    for action in actions:
        if action == best_action:
            action_probability_pairs[action] = 1 - eps + eps / num_actions
        else:
            action_probability_pairs[action] = eps / num_actions
    return action_probability_pairs
    

In [19]:
def on_policy_first_visit_mc(env, num_trajectories, eps, gamma):
    # initialize a policy
    states = env.state_space
    policy = get_random_policy(states)
    Q = {state: {action: 0 for action in get_valid_actions(state)} for state in states}
    Q_n = {state: {action: 0 for action in get_valid_actions(state)} for state in states}
    for iteration in range(num_trajectories):
        # simulate trajectory
        trajectory = get_trajectory(env, policy)
        G = 0
        T = len(trajectory) - 1
        for t, state_action_reward in enumerate(reversed(trajectory)):
            state, action, reward = state_action_reward
            G = reward + gamma * G
            first_visit = True
            for j in range(T - t):
                state_j = trajectory[j][0]
                action_j = trajectory[j][1]
                if (state, action) == (state_j, action_j):
                    first_visit = False
                if first_visit:
                    Q[state][action] = Q[state][action] * Q_n[state][action] + G
                    Q_n[state][action] += 1
                    Q[state][action] /= Q_n[state][action]
                    best_action = max(Q[state].items(), key=operator.itemgetter(1))[0]
                    policy[state] = get_eps_greedy(get_valid_actions(state), eps, best_action)
    return policy, Q, Q_n
    
    

In [69]:
# investmentGame = InvestmentGameSimple()
# policy, Q, Q_n = on_policy_first_visit_mc(investmentGame, 100, 0.05, 1)

In [22]:
# file_name = 'resources/data/policy.json'
# with open(file_name, 'w') as fp:
    # fp.write(ujson.dumps(policy))

## One-step Temporal-difference learning 

Temporal difference methods can update a policy after a single or multiple state transitions. We are starting by defining the state-value function of a policy $\pi$ for a one-step reward and the next state:
$$
v_{\pi}(s) = E_{\pi}[R_{t+1} + \gamma v_{\pi}(S_{t+1})|S_t = s]
$$

*The idea in TD learning is that we use this observation to update the existing estimate by moving it in the direction of this new estimate.*
$$

## Off-policy control with Q-Learning

$$ 
\hat{q}_{\pi}(s,a) := \hat{q}_{\pi}(s,a) + \alpha[r + \gamma \max \hat{q}_{\pi}(s', u) - \hat{q}_{\pi}(s,a)]
$$

* the action the agent uses to update the action-value, u, is not necessarily the action that it will use for the next step. It is the action that maximizes the action-value. 

In [56]:
def q_learning(env, gamma, eps, alpha, num_iter):
    states = env.state_space
    # initialize Q
    Q = {state: {action: 0 for action in get_valid_actions(state)} for state in states}
    # initialize a policy
    policy = get_random_policy(states)
    state = env.reset()
    for iter in range(num_iter):
        if (iter % 1000) == 0:
            print(f"Iteration: {iter}")
        best_action = max(Q[state].items(), key=operator.itemgetter(1))[0]
        policy[state] = get_eps_greedy(get_valid_actions(state), eps, best_action)
        action = choose_action(state, policy)
        state_next, reward, done, info = env.step(action)
        max_q = max(Q[state_next].values())
        Q[state][action] += alpha * (reward + gamma * max_q - Q[state][action])
        if done:
            state = env.reset()
        else:
            state = state_next
        # strip policy
        policy = {state: {max(policy[state].items(), key=operator.itemgetter(1))[
            0]: 1} for state in states}
    return policy, Q

In [57]:
investmentGame = InvestmentGameSimple()

In [62]:
policy, Q = q_learning(investmentGame, 1, 0.1, 0.01, 10000)

Iteration: 0
Iteration: 1000
Iteration: 2000
Iteration: 3000
Iteration: 4000
Iteration: 5000
Iteration: 6000
Iteration: 7000
Iteration: 8000
Iteration: 9000


In [68]:
Q

{(4, -1, True): {'sell': 0.03801341865769722, 'hold': -0.002402239865334292},
 (4, -1, False): {'buy': 0.01762268124650116, 'hold': 0.0807708657198195},
 (4, 1, True): {'sell': 0.187980066256735, 'hold': 0.06003252787364823},
 (4, 1, False): {'buy': 0.1573246329495929, 'hold': 0.02924141074785779},
 (7, -1, True): {'sell': -0.023377641052261196, 'hold': -0.02755673315875193},
 (7, -1, False): {'buy': -0.005624337580907651, 'hold': 0.045709185081051845},
 (7, 1, True): {'sell': 0.07373333371973284, 'hold': 0.02705669075543443},
 (7, 1, False): {'buy': 0.06467519838296341, 'hold': 0.022967445493144993},
 (10, -1, True): {'sell': 0, 'hold': 0},
 (10, -1, False): {'buy': 0, 'hold': 0},
 (10, 1, True): {'sell': -0.0003300288361633123,
  'hold': -0.002482873243326878},
 (10, 1, False): {'buy': 0.00257067298334964, 'hold': 0.021615746062951923}}