## 1. 라이브러리 불러오기

In [389]:
import gymnasium as gym
from gymnasium import Env 
import numpy as np
from gymnasium.spaces import Box, Discrete

import random
random.seed(42)

## 2. 환경 만들기

In [390]:
def All_Cooperate(state, game_step): # 항상 협력
    return 1

def All_Cheat(state, game_step): # 항상 배신
    return 0

def Copycat(state, game_step): # 처음에는 협력 이후, 상대방의 전 전략을 따라함
    if game_step == 0:
        return 1
    else:
        return state[game_step-2]
    
def Grudger(state, game_step): # 항상 협력, 하지만 상대방이 한번이라도 배신하면 끝까지 배신
    for i in range(0, game_step-2+1, 2):        
        if state[i] == 0:            
            return 0
    return 1

def Detective(state, game_step): # 1-0-1-1으로 시작하고, 이 때 상대가 한번이라도 배신하면 Copycat 방식, 그렇지 않으면 All_Cheat 방식을 따른다.
    start = [1, -1, 0, -1, 1, -1, 1]
    if game_step <= 6:
        return start[game_step]
    
    for i in range(0, 7, 2):
        if state[i] == 0:
            return Copycat(state, game_step)
    return All_Cheat(state, game_step)

def Copykitten(state, game_step): # 첫번 째는 협력으로 시작한다. 상대가 두번연속 배신하면 배신한다.
    if game_step == 0 or game_step == 2:
        return 1
    elif state[game_step-2] == 0 and state[game_step-4] == 0:
        return 0
    return 1

def Simpleton(state, game_step): # 첫번 째는 협력으로 시작한다. 상대가 협력하면 내 마지막 수와 같은 수를, 상대가 배신하면 내 마지막 수와 다른 수를 둔다.    
    if game_step == 0:
        return 1
    elif state[game_step-2] == 1:
        return state[game_step-1]
    elif state[game_step-2] == 0:
        if state[game_step-1] == 1:
            return 0
        elif state[game_step-1] == 0:
            return 1
        
def Random_Game(state, game_step):
    return random.choice([0, 1])


In [391]:
class PrisonersGame(Env):
    def __init__(self):
        super().__init__()

        self.action_space = Discrete(2)
        self.observation_space = Box(low=-1, high=1, shape=(30,), dtype=np.int32)

        self.state = np.array([-1 for i in range(30)]).astype(np.int32)
        self.game_step = 0

        self.Opponent_Game_type = ["All_Cooperate", "All_Cheat", "Copycat", "Grudger", "Detective", "Copykitten", "Simpleton", "Random_Game"]        
    
    def get_observation(self):  
        obs = np.reshape(self.state, (30,))   
        return obs
    
    def get_done(self):
        done=False
        if self.state[29] != -1:
            done = True
        return done

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.state = np.array([-1 for i in range(30)]).astype(np.int32)
        self.game_step = 0        

        obs = self.get_observation()
        info = {}
        return (obs, info)
    
    def step(self, action):

        self.state[self.game_step] = action      
        rand = random.randint(0, 7)
        self.state[self.game_step+1] = eval(self.Opponent_Game_type[rand])(self.state, self.game_step)
        
        reward = 0
        if self.state[self.game_step] == 1: # 나는 협력을 했는데
            if self.state[self.game_step+1] == 1: # 상대도 협력
                reward = +2
            elif self.state[self.game_step+1] == 1: # 상대는 배신
                reward = -1
        if self.state[self.game_step] == 0: # 나는 배신을 했는데
            if self.state[self.game_step+1] == 1: # 상대는 협력
                reward = +3
            elif self.state[self.game_step+1] == 0: # 상대도 배신
                reward = 0
        
        self.game_step += 2
        
        observation = self.get_observation()
        done = self.get_done() 
        truncated = False
        info = {}
        return observation, reward, truncated, done, info

    def render(self):
        pass

In [392]:
env = PrisonersGame()

In [393]:
obs=env.get_observation()
print(obs)

[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1]


In [394]:
for episode in range(10):
    obs = env.reset()
    done = False  
    total_reward   = 0
    while not done: 
        obs, reward, truncated, done, info =  env.step(env.action_space.sample())
        total_reward  += reward
    print('Total Reward for episode {} is {}'.format(episode, total_reward))


Total Reward for episode 0 is 15
Total Reward for episode 1 is 16
Total Reward for episode 2 is 19
Total Reward for episode 3 is 20
Total Reward for episode 4 is 21
Total Reward for episode 5 is 12
Total Reward for episode 6 is 11
Total Reward for episode 7 is 22
Total Reward for episode 8 is 13
Total Reward for episode 9 is 14


## 3. 모델 만들기

In [395]:
from gymnasium.utils.env_checker import check_env

env = PrisonersGame()
check_env(env , warn = True , skip_render_check = True )

In [397]:
import gymnasium as gym

from stable_baselines3 import DQN

vec_env = PrisonersGame()

model = DQN("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10, log_interval=4)
model.save("dqn_PrisonersGame")

del model # remove to demonstrate saving and loading

model = DQN.load("dqn_PrisonersGame")

obs, info = env.reset()
while True:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        print(obs)
        obs, info = env.reset()        

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
[1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1]
