# Puyopuyo RL

## Outlook

- action
    - 操作 0: Left, 1: Right, 2: A, 3: B, 4: Bottom
- state
    - puyoの配列
    - 0: none, 1: red, 2: blue, 3: yellow, 4: green
- NN
    - CNN
    - Linear
    

## Library

In [2]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
from gym import wrappers
from datetime import *
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from enum import Enum
from typing import NamedTuple, List
import pandas as pd
import os
import uuid
from sklearn.model_selection import train_test_split

# Select the CUDA device when torch reports one is available; otherwise use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



## Types

In [None]:
class PuyoType(Enum):
    """Kinds of puyo used by the environment.

    ``NONE`` is the zero value; the remaining members are the four colours.
    """
    NONE = 0
    RED = 1
    BLUE = 2
    YELLOW = 3
    GREEN = 4

## ReplayMemory

In [4]:
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory(object):
    """Bounded store of ``Transition`` records.

    Once ``capacity`` entries are held, each new push overwrites the entry at
    ``position``, which cycles through the slots in order.
    """

    def __init__(self, capacity):
        """Create an empty memory that holds at most ``capacity`` transitions."""
        self.capacity = capacity
        self.memory = []
        self.position = 0  # slot written by the next push

    def push(self, state, action, next_state, reward):
        """Store one transition in the current slot and advance the slot index."""
        slot = self.position
        buffer = self.memory
        if len(buffer) < self.capacity:
            # Still growing: reserve the slot that is filled just below.
            buffer.append(None)
        buffer[slot] = Transition(state=state, action=action,
                                  next_state=next_state, reward=reward)
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions chosen by ``random.sample``."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return how many transitions are currently stored."""
        return len(self.memory)

## Environment

In [10]:
### Environment

class EnvParameter(NamedTuple):
    """Immutable settings passed to ``Environment`` as ``env_param``.

    ``Environment`` reads ``window_size`` for its initial step counter and as
    the reported observation size.
    NOTE(review): ``max_lot`` and ``spread`` are not read by the Environment
    in this notebook -- confirm whether they are still needed.
    """
    max_lot: int    # maximum quantity
    spread: int     # spread
    window_size: int
    
        
class Environment():
    """Skeleton Puyo Puyo environment with a gym-like ``reset``/``step`` API.

    NOTE(review): the game rules are not implemented yet. ``step`` does not
    apply ``action`` to the board and ``check_chain`` is a stub that never
    ends the episode. The training/evaluation loops also call
    ``is_finish()``, which this class does not define.
    """

    # Number of controls listed in the notebook overview:
    # 0: Left, 1: Right, 2: A, 3: B, 4: Bottom.
    NUM_ACTIONS = 5

    def __init__(self, data, env_param, MONITOR, reward_func):
        """Store the configuration and create an empty 13x6 board.

        Args:
            data: opaque data set, kept on ``self.forex_data``; not read by
                this class.
            env_param: object exposing ``window_size`` (e.g. ``EnvParameter``).
            MONITOR: monitoring flag, stored only.
            reward_func: reward callable, stored only (``get_reward`` is what
                ``step`` actually uses).
        """
        self.MONITOR = MONITOR
        self.reward_func = reward_func
        self.param = env_param
        self.forex_data = data
        self.history = []
        # Build every row separately: ``[[0] * 6] * 13`` would repeat one
        # shared row object, so writing a cell would change all 13 rows.
        self.observ = [[0] * 6 for _ in range(13)]
        self.steps = self.param.window_size
        # Define these up front so step()/get_reward() never read a missing
        # attribute.
        self.done = False
        self.transaction = None

    def reset(self):
        """Start a new episode and return the first observation."""
        self.done = False
        self.history = []
        return self.get_observe()

    def step(self, action, lot=1):
        """Advance one step.

        Args:
            action: index of the chosen control. NOTE(review): not applied to
                the board yet.
            lot: ignored; accepted only for callers that still pass a lot
                size as a second argument.

        Returns:
            Tuple ``(next_observe, reward, done, transaction)``.
            ``transaction`` is whatever ``self.transaction`` holds (``None``
            unless set externally).
        """
        # The previous body referenced forex helpers (ActionType,
        # PositionType, Transaction, is_build, is_release, calc_reward, ...)
        # that are not defined in this notebook, so it could not run.
        self.done, chain_num = self.check_chain()

        # Reward calculation
        reward = self.get_reward(chain_num)

        # Build the observation for the next step
        self.steps += 1
        next_observe = self.get_observe()

        return next_observe, reward, self.done, self.transaction

    def get_action_num(self):
        """Return the number of available actions."""
        return self.NUM_ACTIONS

    def get_observ_num(self):
        """Return the observation size, i.e. ``window_size``."""
        return self.param.window_size

    def get_reward(self, chain_num):
        """Return ``chain_num / 10`` once the episode is done, else ``0.1``."""
        if self.done:
            return chain_num / 10
        else:
            return 0.1

    def get_observe(self):
        """Draw a random puyo pair, append it to the history and return the history."""
        # Presumably a dealt pair never contains PuyoType.NONE, so it is
        # excluded from the draw -- TODO confirm.
        colours = [p for p in PuyoType if p is not PuyoType.NONE]
        puyo1 = random.choice(colours)
        puyo2 = random.choice(colours)
        self.history.append([puyo1, puyo2])
        return self.history

    def check_chain(self):
        """Return ``(done, chain_num)`` for the current board.

        Stub: always ``(False, 0)``. The tuple shape is what ``step``
        unpacks; the previous bare ``0`` raised ``TypeError`` there.
        """
        return False, 0

    def show(self):
        """Rendering placeholder; returns 0."""
        return 0

    def close(self):
        """Shutdown placeholder; returns 0."""
        return 0


## Agent

In [None]:
class Agent:
    """Facade that forwards every request to the wrapped ``brain`` object."""

    def __init__(self, brain):
        """Keep the brain the agent uses to decide and learn."""
        self.brain = brain

    def learn(self):
        """Run one optimisation step on the brain and return its loss."""
        return self.brain.optimize()

    def modify_goal(self):
        """Ask the brain to update its target network."""
        self.brain.update_target_model()

    def select_action(self, state):
        """Return the action the brain decides on for ``state``."""
        return self.brain.decide_action(state)

    def memorize(self, state, action, next_state, reward):
        """Push ``(state, action, next_state, reward)`` into the brain's memory."""
        memory = self.brain.memory
        memory.push(state, action, next_state, reward)

    def predict_action(self, state):
        """Return the action the brain predicts for ``state``."""
        return self.brain.predict(state)

    def record(self, name):
        """Save the brain's model under ``name``."""
        self.brain.save_model(name)

    def remember(self, name):
        """Load the brain's model from ``name``."""
        self.brain.read_model(name)

## Trainer

In [None]:
class Trainer():
    """Drives the training loop between an environment and an agent.

    NOTE(review): the loop relies on ``env.is_finish()`` and, at the end of
    an episode, on a ``transaction`` object exposing ``start_date``,
    ``end_date``, ``position`` and ``profit``. The environment must provide
    both for training to run to completion.
    """

    def __init__(self, env, agent):
        """Keep the collaborators and initialise the per-episode statistics."""
        self.env = env
        self.agent = agent
        self.profit_durations = [0]
        self.total_profit_durations = [0]
        self.loss_durations = []
        self.TARGET_UPDATE = 10  # sync the target network every N episodes
        self.episode = 0

    def train(self, save_name):
        """Train until ``env.is_finish()`` is true, then save the model.

        Args:
            save_name: path passed to ``agent.record`` after training.
        """
        while not self.env.is_finish():
            print("episode: ", self.episode)
            state = self.env.reset()
            for _ in count():
                # Select and perform an action.
                action = self.agent.select_action(state)
                print("action", action)

                # Environment.step takes the action only; the former extra
                # ``lot`` argument did not match its signature.
                next_state, reward, done, transaction = self.env.step(action)

                # A terminal transition is stored with next_state None.
                if done:
                    next_state_value = None
                else:
                    next_state_value = torch.tensor([next_state], device=device, dtype=torch.float32)

                # Store the transition in the agent's memory. The reward is
                # pinned to float32 so an integer reward cannot produce an
                # integer tensor in the batch.
                self.agent.memorize(
                    torch.tensor([state], device=device, dtype=torch.float32),
                    torch.tensor([[action]], device=device),
                    next_state_value,
                    torch.tensor([reward], device=device, dtype=torch.float32)
                )

                # Move to the next state.
                state = next_state

                # One optimisation step of the Q network.
                loss = self.agent.learn()
                print("loss: ", loss)
                if loss is not None:
                    # Keep a plain float: a loss tensor would retain its
                    # autograd graph and cannot be converted for plotting.
                    self.loss_durations.append(float(loss))

                if done:
                    # Record and plot the episode result.
                    print("sdate: {0}, edate: {1}, position: {2}, profit: {3}".format(transaction.start_date, transaction.end_date, transaction.position, transaction.profit))
                    total_profit = transaction.profit + self.total_profit_durations[-1]
                    self.total_profit_durations.append(total_profit)
                    self.profit_durations.append(transaction.profit)
                    self.plot_durations()
                    self.episode += 1
                    break
            # Update the target network, copying all weights and biases in DQN.
            if self.episode % self.TARGET_UPDATE == 0:
                self.agent.modify_goal()

        # Save the trained model.
        self.agent.record(save_name)
        print('Complete')

    def plot_durations(self):
        """Plot per-episode profit, cumulative profit and loss in one figure."""
        fig = plt.figure()

        # add_subplot(rows, columns, index)
        ax1 = fig.add_subplot(2, 2, 1)
        ax2 = fig.add_subplot(2, 2, 2)
        ax3 = fig.add_subplot(2, 2, 3)

        series = (
            (ax1, self.profit_durations, "blue", "profit"),
            (ax2, self.total_profit_durations, "green", "total"),
            (ax3, self.loss_durations, "red", "loss"),
        )
        for axis, values, colour, label in series:
            axis.plot(list(range(len(values))), values, color=colour, label=label)
            axis.legend(loc='upper right')

        fig.tight_layout()
        plt.show()
        # A figure is created on every call; close it so they do not pile up.
        plt.close(fig)


## Examiner

In [None]:
class Examiner():
    """Runs a trained agent against an environment and reports the results.

    NOTE(review): ``Statistics`` is not defined in this notebook, and the
    loop relies on ``env.is_finish()`` plus a ``transaction`` object exposing
    ``start_date``, ``end_date``, ``position`` and ``profit``.
    """

    def __init__(self, env, agent):
        """Keep the collaborators and initialise the result containers."""
        self.env = env
        self.agent = agent
        self.profit_durations = [0]
        self.total_profit_durations = [0]
        self.episode = 0
        self.stats = Statistics()
        self.transactions = []

    def evaluate(self, file_name):
        """Load the model from ``file_name`` and run episodes until ``env.is_finish()``.

        After the loop the collected transactions are passed to
        ``self.stats.run`` and the statistics are shown.
        """
        self.agent.remember(file_name)

        while not self.env.is_finish():
            print("episode: ", self.episode)
            state = self.env.reset()
            for t in count():
                # Decide the action with the trained model.
                action = self.agent.predict_action(state)
                print("action: ", action)

                # Environment.step takes the action only; the former extra
                # ``lot`` argument did not match its signature.
                next_state, reward, done, transaction = self.env.step(action)

                # Move to the next state.
                state = next_state

                if done:
                    # Record and plot the episode result.
                    print("step: ", t)
                    print("sdate: {0}, edate: {1}, position: {2}, profit: {3}".format(transaction.start_date, transaction.end_date, transaction.position, transaction.profit))
                    total_profit = transaction.profit + self.total_profit_durations[-1]
                    self.total_profit_durations.append(total_profit)
                    self.profit_durations.append(transaction.profit)
                    self.plot_durations()
                    self.transactions.append(transaction)
                    self.episode += 1
                    break

        # Compute and display the statistics.
        self.stats.run(self.transactions)
        self.stats.show()
        print('Complete')

    def plot_durations(self):
        """Plot per-episode profit and cumulative profit side by side."""
        fig = plt.figure()

        # add_subplot(rows, columns, index)
        ax1 = fig.add_subplot(1, 2, 1)
        ax2 = fig.add_subplot(1, 2, 2)

        series = (
            (ax1, self.profit_durations, "blue", "profit"),
            (ax2, self.total_profit_durations, "green", "total"),
        )
        for axis, values, colour, label in series:
            axis.plot(list(range(len(values))), values, color=colour, label=label)
            axis.legend(loc='upper right')

        fig.tight_layout()
        plt.show()
        # A figure is created on every call; close it so they do not pile up.
        plt.close(fig)

## DQN

In [5]:
class DQN(nn.Module):
    """Fully connected Q-network: three ReLU hidden layers and a linear output.

    Args:
        inputs_num: size of the input feature vector.
        hidden_size: width of each hidden layer.
        outputs_num: number of outputs (one Q-value per action).
    """

    def __init__(self, inputs_num, hidden_size, outputs_num):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(inputs_num, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, outputs_num)

    def forward(self, x):
        """Return the Q-values for ``x``; the last dimension has ``outputs_num`` entries."""
        h = F.relu(self.fc1(x))
        h = F.relu(self.fc2(h))
        h = F.relu(self.fc3(h))
        # No activation on the output: a ReLU here would clamp every Q-value
        # to >= 0, so negative expected returns could not be represented.
        return self.fc4(h)


## Brain

In [6]:
class BrainParameter(NamedTuple):
    """Hyper-parameters consumed by ``Brain.__init__``."""
    batch_size: int     # number of transitions sampled per optimisation step
    gamma : float       # discount multiplied with the next-state value
    eps_start : float   # epsilon at step 0 of the exploration schedule
    eps_end: float      # value the epsilon schedule decays towards
    eps_decay: int      # decay constant of the exponential epsilon schedule
    capacity: int       # capacity of the replay memory
    hidden_size: int    # hidden-layer width of the DQN

class Brain:
    """DQN learner: policy/target networks, replay memory and epsilon-greedy policy.

    Relies on the module-level ``device``, ``ReplayMemory``, ``Transition``
    and ``DQN``.
    """

    def __init__(self, param, num_observ, num_actions):
        """Build the networks, the replay memory and the optimiser.

        Args:
            param: hyper-parameters (see ``BrainParameter``).
            num_observ: input size of the networks.
            num_actions: number of actions (output size of the networks).
        """
        self.steps_done = 0

        # Brain Parameter
        self.BATCH_SIZE = param.batch_size
        self.GAMMA = param.gamma
        self.EPS_START = param.eps_start
        self.EPS_END = param.eps_end
        self.EPS_DECAY = param.eps_decay
        self.CAPACITY = param.capacity
        self.HIDDEN_SIZE = param.hidden_size

        # Replay memory holding the experienced transitions.
        self.memory = ReplayMemory(self.CAPACITY)

        self.num_observ = num_observ
        self.num_actions = num_actions
        self.policy_net = DQN(self.num_observ, self.HIDDEN_SIZE, self.num_actions).to(device)
        self.target_net = DQN(self.num_observ, self.HIDDEN_SIZE, self.num_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        # Optimiser for the policy network.
        self.optimizer = optim.RMSprop(self.policy_net.parameters())

    def optimize(self):
        """Run one optimisation step on a sampled batch.

        Returns:
            The Huber loss tensor, or ``None`` while the memory holds fewer
            than ``BATCH_SIZE`` transitions.
        """
        if len(self.memory) < self.BATCH_SIZE:
            return None

        # Training mode
        self.policy_net.train()

        # Turn the list of transitions into one Transition of batches.
        transitions = self.memory.sample(self.BATCH_SIZE)
        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Q(s, a) of the action that was actually taken.
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Target = max_a' Q_target(s', a') * gamma + reward, where the
        # next-state value of a terminal transition (next_state None) is 0.
        non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                      device=device, dtype=torch.bool)
        non_final_next_states = [s for s in batch.next_state if s is not None]

        next_state_values = torch.zeros(self.BATCH_SIZE, device=device)
        # torch.cat raises on an empty list, which happens when every
        # sampled transition is terminal; the values then stay 0.
        if non_final_next_states:
            with torch.no_grad():
                next_state_values[non_final_mask] = self.target_net(
                    torch.cat(non_final_next_states)).max(1)[0]

        expected_state_action_values = ((next_state_values * self.GAMMA) + reward_batch).unsqueeze(1)

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

        # Optimize the model, clamping every gradient element to [-1, 1].
        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_value_(self.policy_net.parameters(), 1)
        self.optimizer.step()

        return loss

    def update_target_model(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def _greedy_action(self, state):
        """Return the index of the largest policy-network output for ``state`` as an int."""
        with torch.no_grad():
            q_values = self.policy_net(state)
        return int(np.argmax(q_values.tolist()))

    def decide_action(self, state):
        """Pick an action epsilon-greedily; every call advances the epsilon schedule."""
        state = torch.tensor(state, device=device).float()
        sample = random.random()
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
            math.exp(-1. * self.steps_done / self.EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            return self._greedy_action(state)
        return random.randrange(self.num_actions)

    def save_model(self, name):
        """Save the policy network's ``state_dict`` to ``name``."""
        torch.save(self.policy_net.state_dict(), name)

    def read_model(self, name):
        """Load a ``state_dict`` saved by ``save_model`` into both networks."""
        # map_location lets a checkpoint written on a GPU host load on a
        # CPU-only host (and vice versa).
        param = torch.load(name, map_location=device)
        self.policy_net.load_state_dict(param)
        # Keep the target network consistent with the restored weights.
        self.target_net.load_state_dict(param)

    def predict(self, state):
        """Return the greedy action for ``state`` with the network in eval mode."""
        state = torch.tensor(state, device=device).float()
        self.policy_net.eval()  # switch the network to inference mode
        return self._greedy_action(state)



## RewardFunc

In [7]:
def reward_func(transaction, close, is_none, is_done):
    """Return the reward for one step of a trade.

    - ``is_none`` true: ``-0.1``.
    - otherwise ``is_done`` true: ``1.0`` when ``transaction.profit >= 0``,
      else ``-1.0``.
    - otherwise: ``0.1`` when ``close >= transaction.entry_rate``, else
      ``-0.1``.
    """
    if is_none:
        return -0.1

    if is_done:
        # Finished trade: only the sign of the profit matters.
        if transaction.profit >= 0:
            return 1.0
        return -1.0

    # Open trade: compare the current value with the entry rate.
    if close >= transaction.entry_rate:
        return 0.1
    return -0.1

## Train

In [8]:
if __name__ == "__main__":

    # Build the environment.
    MONITOR = False
    env_param = EnvParameter(max_lot=1, spread=1, window_size=30)
    env = Environment(
        data=train_data,
        env_param=env_param,
        MONITOR=MONITOR,
        reward_func=reward_func,
    )

    # Build the agent; the network sizes come from the environment.
    num_actions = env.get_action_num()
    num_observ = env.get_observ_num()
    brain_param = BrainParameter(
        batch_size=32,
        gamma=0.99,
        eps_start=0.9,
        eps_end=0.05,
        eps_decay=200,
        capacity=10000,
        hidden_size=100,
    )
    brain = Brain(brain_param, num_observ, num_actions)
    agent = Agent(brain)

    # Train and save the model.
    cartpole_trainer = Trainer(env, agent)
    cartpole_trainer.train('test.pth')
    

NameError: name 'EnvParameter' is not defined

## Eval

In [None]:
if __name__ == "__main__":

    # Build the environment. Environment.__init__ takes
    # (data, env_param, MONITOR, reward_func); the previous call passed None
    # as the data and shifted every other argument one position to the left.
    MONITOR = False
    env_param = EnvParameter(max_lot=1, spread=1, window_size=30)
    env = Environment(valid_data, env_param, MONITOR, reward_func)

    # Build the agent; the network sizes come from the environment.
    num_actions = env.get_action_num()
    num_observ = env.get_observ_num()
    brain_param = BrainParameter(batch_size=32, gamma=0.99, eps_start=0.9, eps_end=0.05, eps_decay=200, capacity=10000, hidden_size=100)
    brain = Brain(brain_param, num_observ, num_actions)
    agent = Agent(brain)

    # NOTE(review): the training cell saves to 'test.pth' while this cell
    # loads 'weight.pth' -- confirm which file is intended.
    cartpole_examiner = Examiner(env, agent)
    cartpole_examiner.evaluate('weight.pth')
    