<img src='http://hilpisch.com/taim_logo.png' width="350px" align="right">

# Artificial Intelligence in Finance

## Reinforcement Learning

&copy; Dr Yves J Hilpisch | The Python Quants GmbH

http://aimachine.io | http://twitter.com/dyjh

## Imports

In [4]:
import os
import math
import random
import numpy as np
import pandas as pd
from pylab import plt, mpl
plt.style.use('seaborn')
mpl.rcParams['savefig.dpi'] = 300
mpl.rcParams['font.family'] = 'serif'
np.set_printoptions(precision=4, suppress=True)
os.environ['PYTHONHASHSEED'] = '0'

  plt.style.use('seaborn')


## `CartPole` Environment 

In [9]:
import gym

In [10]:
env = gym.make('CartPole-v0')

In [16]:

env.action_space.seed(100)

[100]

In [20]:
#状态空间，分别为 推车位置、推车速度、极角、极角速度
env.observation_space

Box([-4.8000e+00 -3.4028e+38 -4.1888e-01 -3.4028e+38], [4.8000e+00 3.4028e+38 4.1888e-01 3.4028e+38], (4,), float32)

In [25]:
env.observation_space.low.astype(np.float16)

array([-4.8000e+00, -3.4028e+38, -4.1888e-01, -3.4028e+38], dtype=float32)

In [26]:
env.observation_space.high.astype(np.float16)

  env.observation_space.high.astype(np.float16)


array([4.8  ,   inf, 0.419,   inf], dtype=float16)

In [27]:
state = env.reset(seed=100)

In [28]:
state

(array([ 0.0335,  0.0097, -0.0211, -0.0457], dtype=float32), {})

In [29]:
env.action_space

Discrete(2)

In [30]:
env.action_space.n

2

In [31]:
env.action_space.sample()

1

In [35]:
env.action_space.sample() 

0

In [38]:
a = env.action_space.sample()
a

0

In [42]:
state, reward, done,truncated, info = env.step(a)
state, reward, done,truncated, info

(array([ 0.0438, -0.3814,  0.0367,  0.6447], dtype=float32),
 1.0,
 False,
 False,
 {})

In [56]:
state = env.reset(seed=100)
for e in range(1, 200):
    a = env.action_space.sample()
    state, reward, done,truncated, info = env.step(a)
    print(f'step={e:2d} | state={state} | action={a} | reward={reward}')
    if done and (e + 1) < 200:
        print('*** FAILED ***')
        break

step= 1 | state=[ 0.0337  0.2051 -0.022  -0.345 ] | action=1 | reward=1.0
step= 2 | state=[ 0.0378  0.0103 -0.0289 -0.0593] | action=0 | reward=1.0
step= 3 | state=[ 0.038  -0.1844 -0.0301  0.2241] | action=0 | reward=1.0
step= 4 | state=[ 0.0343  0.0111 -0.0256 -0.0779] | action=1 | reward=1.0
step= 5 | state=[ 0.0345  0.2066 -0.0272 -0.3786] | action=1 | reward=1.0
step= 6 | state=[ 0.0387  0.4021 -0.0348 -0.6797] | action=1 | reward=1.0
step= 7 | state=[ 0.0467  0.2075 -0.0484 -0.3982] | action=0 | reward=1.0
step= 8 | state=[ 0.0509  0.4032 -0.0563 -0.7057] | action=1 | reward=1.0
step= 9 | state=[ 0.0589  0.5991 -0.0704 -1.0156] | action=1 | reward=1.0
step=10 | state=[ 0.0709  0.7951 -0.0907 -1.3295] | action=1 | reward=1.0
step=11 | state=[ 0.0868  0.6012 -0.1173 -1.0666] | action=0 | reward=1.0
step=12 | state=[ 0.0988  0.7977 -0.1387 -1.3936] | action=1 | reward=1.0
step=13 | state=[ 0.1148  0.9942 -0.1665 -1.7263] | action=1 | reward=1.0
step=14 | state=[ 0.1347  1.1908 -0.20

In [44]:
done

True

## Dimensionality Reduction

See http://kvfrans.com/simple-algoritms-for-solving-cartpole/.

In [57]:
np.random.seed(100)

In [58]:
weights = np.random.random(4) * 2 - 1

In [60]:
weights

array([ 0.0868, -0.4433, -0.151 ,  0.6896])

In [65]:
state = env.reset()

In [67]:
state

(array([0.0129, 0.0081, 0.01  , 0.0035], dtype=float32), {})

In [66]:
s = np.dot(state, weights)
s

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

## Action Rule

In [None]:
if s < 0:
    a = 0
else:
    a = 1

In [None]:
a

## Total Reward per Episode

In [None]:
def run_episode(env, weights):  
    state = env.reset()
    treward = 0
    for _ in range(200):
        s = np.dot(state, weights)
        a = 0 if s < 0 else 1
        state, reward, done, info = env.step(a)
        treward += reward
        if done:
            break
    return treward

In [None]:
run_episode(env, weights)

## Simple Learning 

In [None]:
def set_seeds(seed=100):
    random.seed(seed)
    np.random.seed(seed)
    env.seed(seed)

In [None]:
set_seeds()
num_episodes = 1000

In [None]:
besttreward = 0
for e in range(1, num_episodes + 1):
    weights = np.random.rand(4) * 2 - 1
    treward = run_episode(env, weights)
    if treward > besttreward:
        besttreward = treward
        bestweights = weights
        if treward == 200:
            print(f'SUCCESS | episode={e}')
            break
        print(f'UPDATE  | episode={e}')

In [None]:
weights

## Testing the Results

In [None]:
res = []
for _ in range(100):
    treward = run_episode(env, weights)
    res.append(treward)
res[:10]

In [None]:
sum(res) / len(res)

## DNN Learning

In [None]:
import logging
import tensorflow as tf
tf.get_logger().setLevel(logging.ERROR)

In [None]:
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

In [None]:
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam, RMSprop
from sklearn.metrics import accuracy_score

In [None]:
def set_seeds(seed=100):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    env.seed(seed)
    env.action_space.seed(seed)

In [None]:
class NNAgent:
    def __init__(self):
        self.max = 0
        self.scores = list()
        self.memory = list()
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=4,
                        activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer=RMSprop(lr=0.001))
        return model
        
    def act(self, state):
        if random.random() <= 0.5:
            return env.action_space.sample()
        action = np.where(self.model.predict(
            state, batch_size=None)[0, 0] > 0.5, 1, 0)
        return action
                    
    def train_model(self, state, action):
        self.model.fit(state, np.array([action,]),
                       epochs=1, verbose=False)
    
    def learn(self, episodes):
        for e in range(1, episodes + 1):
            state = env.reset()
            for _ in range(201):
                state = np.reshape(state, [1, 4])
                action = self.act(state)
                next_state, reward, done, info = env.step(action)
                if done:
                    score = _ + 1
                    self.scores.append(score)
                    self.max = max(score, self.max)
                    print('episode: {:4d}/{} | score: {:3d} | max: {:3d}'
                          .format(e, episodes, score, self.max), end='\r')
                    break
                self.memory.append((state, action))
                self.train_model(state, action)
                state = next_state

In [None]:
set_seeds(100)
agent = NNAgent()

In [None]:
episodes = 1000

In [None]:
agent.learn(episodes)

In [None]:
sum(agent.scores) / len(agent.scores)

In [None]:
f = np.array([m[0][0] for m in agent.memory])
f

In [None]:
l = np.array([m[1] for m in agent.memory])
l

In [None]:
accuracy_score(np.where(agent.model.predict(f) > 0.5, 1, 0), l)

## Q Learning

See https://keon.io/deep-q-learning/

In [None]:
from collections import deque
from keras.optimizers import Adam, RMSprop

In [None]:
class DQLAgent:
    def __init__(self, gamma=0.95, hu=24, opt=Adam,
           lr=0.001, finish=False):
        self.finish = finish
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.gamma = gamma
        self.batch_size = 32
        self.max_treward = 0
        self.averages = list()
        self.memory = deque(maxlen=2000)
        self.osn = env.observation_space.shape[0]
        self.model = self._build_model(hu, opt, lr)
        
    def _build_model(self, hu, opt, lr):
        model = Sequential()
        model.add(Dense(hu, input_dim=self.osn,
                        activation='relu'))
        model.add(Dense(hu, activation='relu'))
        model.add(Dense(env.action_space.n, activation='linear'))
        model.compile(loss='mse', optimizer=opt(lr=lr))
        return model
        
    def act(self, state):
        if random.random() <= self.epsilon:
            return env.action_space.sample()
        action = self.model.predict(state)[0]
        return np.argmax(action)
    
    def replay(self):
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            if not done:
                reward += self.gamma * np.amax(
                    self.model.predict(next_state)[0])
            target = self.model.predict(state)
            target[0, action] = reward
            self.model.fit(state, target, epochs=1,
                           verbose=False)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    
    def learn(self, episodes):
        trewards = []
        for e in range(1, episodes + 1):
            state = env.reset()
            state = np.reshape(state, [1, self.osn])
            for _ in range(5000):
                action = self.act(state)
                next_state, reward, done, info = env.step(action)
                next_state = np.reshape(next_state,
                                        [1, self.osn])
                self.memory.append([state, action, reward,
                                     next_state, done])
                state = next_state
                if done:
                    treward = _ + 1
                    trewards.append(treward)
                    av = sum(trewards[-25:]) / 25
                    self.averages.append(av)
                    self.max_treward = max(self.max_treward, treward)
                    templ = 'episode: {:4d}/{} | treward: {:4d} | '
                    templ += 'av: {:6.1f} | max: {:4d}'
                    print(templ.format(e, episodes, treward, av,
                                       self.max_treward), end='\r')
                    break
            if av > 195 and self.finish:
                print()
                break
            if len(self.memory) > self.batch_size:
                self.replay()
    def test(self, episodes):
        trewards = []
        for e in range(1, episodes + 1):
            state = env.reset()
            for _ in range(5001):
                state = np.reshape(state, [1, self.osn])
                action = np.argmax(self.model.predict(state)[0])
                next_state, reward, done, info = env.step(action)
                state = next_state
                if done:
                    treward = _ + 1
                    trewards.append(treward)
                    print('episode: {:4d}/{} | treward: {:4d}'
                          .format(e, episodes, treward), end='\r')
                    break
        return trewards

In [None]:
episodes = 1000

In [None]:
set_seeds(100)
agent = DQLAgent(finish=True)

In [None]:
%time agent.learn(episodes)

In [None]:
plt.figure(figsize=(10, 6))
x = range(len(agent.averages))
y = np.polyval(np.polyfit(x, agent.averages, deg=3), x)
plt.plot(agent.averages, label='moving average')
plt.plot(x, y, 'r--', label='trend')
plt.xlabel('episodes')
plt.ylabel('total reward')
plt.legend();

In [None]:
trewards = agent.test(100)

In [None]:
sum(trewards) / len(trewards)

## Finance Environment

In [None]:
class observation_space:
    def __init__(self, n):
        self.shape = (n,)

In [None]:
class action_space:
    def __init__(self, n):
        self.n = n
    def seed(self, seed):
        pass
    def sample(self):
        return random.randint(0, self.n - 1)

In [None]:
class Finance:
    url = 'http://hilpisch.com/aiif_eikon_eod_data.csv'
    def __init__(self, symbol, features):
        self.symbol = symbol
        self.features = features
        self.observation_space = observation_space(4)
        self.osn = self.observation_space.shape[0]
        self.action_space = action_space(2)
        self.min_accuracy = 0.475
        self._get_data()
        self._prepare_data()
    def _get_data(self):
        self.raw = pd.read_csv(self.url, index_col=0,
                               parse_dates=True).dropna()
    def _prepare_data(self):
        self.data = pd.DataFrame(self.raw[self.symbol])
        self.data['r'] = np.log(self.data / self.data.shift(1))
        self.data.dropna(inplace=True)
        self.data = (self.data - self.data.mean()) / self.data.std()
        self.data['d'] = np.where(self.data['r'] > 0, 1, 0)
    def _get_state(self):
        return self.data[self.features].iloc[
            self.bar - self.osn:self.bar].values
    def seed(self, seed=None):
        pass
    def reset(self):
        self.treward = 0
        self.accuracy = 0
        self.bar = self.osn
        state = self.data[self.features].iloc[
            self.bar - self.osn:self.bar]
        return state.values
    def step(self, action):
        correct = action == self.data['d'].iloc[self.bar]
        reward = 1 if correct else 0
        self.treward += reward
        self.bar += 1
        self.accuracy = self.treward / (self.bar - self.osn)
        if self.bar >= len(self.data):
            done = True
        elif reward == 1:
            done = False
        elif (self.accuracy < self.min_accuracy and
              self.bar > self.osn + 10):
            done = True
        else:
            done = False
        state = self._get_state()
        info = {}
        return state, reward, done, info

In [None]:
env = Finance('EUR=', 'EUR=')

In [None]:
env.reset()

In [None]:
a = env.action_space.sample()
a

In [None]:
env.step(a)

In [None]:
set_seeds(100)
agent = DQLAgent(gamma=0.5, opt=RMSprop)

In [None]:
episodes = 1000

In [None]:
%time agent.learn(episodes)

In [None]:
agent.test(3)

In [None]:
plt.figure(figsize=(10, 6))
x = range(len(agent.averages))
y = np.polyval(np.polyfit(x, agent.averages, deg=3), x)
plt.plot(agent.averages, label='moving average')
plt.plot(x, y, 'r--', label='regression')
plt.xlabel('episodes')
plt.ylabel('total reward')
plt.legend();

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:training@tpq.io">training@tpq.io</a>