In [1]:
## From https://github.com/egeromin/mastermind/

import config
import numpy as np
import itertools
import random
from collections import Counter


class Environment:
    """Mastermind game environment built around one secret code.

    Codes are 4-character strings over the digits 0-5. The environment can
    score a guess against the secret, hand out a reward for a guess, and
    produce an episode of uniformly random guesses.
    """

    def __init__(self, secret):
        """Store the secret; an integer is first converted to its code string."""
        self.secret = (
            self._number_from_index(secret) if isinstance(secret, int) else secret
        )

    @staticmethod
    def _index_from_number(number):
        """
        Interpret a digit string (at most 4 characters, digits 0-5) as a
        base-6 integer.
        """
        allowed_digits = {str(digit) for digit in range(6)}
        assert len(number) <= 4
        assert set(number) <= allowed_digits
        return int(number, base=6)

    @staticmethod
    def _number_from_index(index):
        """
        Render an index in [0, config.max_guesses) as a base-6 string,
        left-padded with zeros to 4 characters.
        """
        assert 0 <= index < config.max_guesses
        remaining = index
        low_to_high = []
        while remaining > 0:
            # Peel off the least significant base-6 digit each round.
            remaining, digit = divmod(remaining, 6)
            low_to_high.append(str(digit))
        return "".join(low_to_high[::-1]).zfill(4)

    @staticmethod
    def score(p, q):
        """
        Compare two codes and return (hits, misses).

        hits counts positions where both codes agree; misses counts the
        remaining symbols the two codes share regardless of position.
        """
        hits = 0
        for left, right in zip(p, q):
            if left == right:
                hits += 1
        # Multiset intersection: every symbol common to both codes, including
        # the ones already counted as hits.
        shared = Counter(p) & Counter(q)
        return hits, sum(shared.values()) - hits

    def generate_random_episodes(self):
        """
        Make up to config.max_episode_length random guesses and return them
        as a list of (guess, score) pairs, stopping right after a guess that
        equals the secret.
        """
        episode = []
        for _ in range(config.max_episode_length):
            guess = self._number_from_index(random.randint(0, 6**4 - 1))
            episode.append((guess, self.score(self.secret, guess)))
            if guess == self.secret:
                break
        return episode

    def reward(self, guess):
        """Return 1 when the guess equals the secret, otherwise -1."""
        return 1 if guess == self.secret else -1
         
    

Testing the `Environment` class

# Q-Learning

In [17]:
class Agent:
    """Epsilon-greedy Mastermind player backed by a table of state values.

    Every code is both a state and an action (the guess itself). The agent
    keeps a value per code in ``self.V`` and a shrinking list of codes that
    are still consistent with the feedback received so far.
    """

    def __init__(self, epsilon=0.1, alpha=1.0):
        """
        epsilon: probability of picking a random candidate instead of the
                 greedy one in learn_select_move.
        alpha:   step size used in the value update of learn_from_move.
        """
        self.initialize_V()
        self.epsilon = epsilon
        self.alpha = alpha
        self.reset_possible_states()

    def initialize_V(self):
        '''Create the value table with a zero entry for each of the 6**4 codes.'''
        self.V = {Environment._number_from_index(index): 0 for index in range(6**4)}

    def reset_possible_states(self):
        """Make every code in the value table a candidate again."""
        self.possible_states = [*self.V]

    def restrict_possible_states(self, guess, feedback):
        """Keep only candidates whose score against `guess` equals `feedback`."""
        survivors = []
        for candidate in self.possible_states:
            if Environment.score(guess, candidate) == feedback:
                survivors.append(candidate)
        self.possible_states = survivors

    def learn_select_move(self):
        """
        Return (greedy_move, selected_move). The selected move is the greedy
        one unless a random draw falls below epsilon, in which case it is a
        random candidate.
        """
        greedy = self.get_best_action()
        if random.random() < self.epsilon:
            return (greedy, self.random_action())
        return (greedy, greedy)

    def get_best_action(self):
        "Pick a highest-valued candidate, breaking ties uniformly at random."
        candidates = self.possible_states
        top_value = max(self.V[state] for state in candidates)
        tied = [state for state in candidates if self.V[state] == top_value]
        return random.choice(tied)

    def random_action(self):
        """Return one of the remaining candidates, chosen uniformly at random."""
        return random.choice(self.possible_states)

    def make_move(self, action, feedback):
        """Apply the feedback for `action` by narrowing the candidate list."""
        self.restrict_possible_states(action, feedback)

    def learn_from_move(self, action, feedback, reward):
        "Update the value of the guessed code from the reward and the best follow-up."
        # A reward of 1 means the guess matched the secret: report it and stop,
        # leaving the value table and the candidates untouched.
        if reward == 1:
            print("guess = secret")
            return

        # Narrow the candidates first so the follow-up move is chosen among
        # codes that are still consistent with the feedback.
        self.make_move(action, feedback)

        # Only the greedy follow-up feeds the update; the exploratory pick is
        # drawn but not used here.
        best_next_move, _ = self.learn_select_move()

        old_value = self.V[action]
        next_value = self.V[best_next_move]
        # Move the stored value towards (reward + value of best next state).
        self.V[action] = old_value + self.alpha * (reward + next_value - old_value)
        
        
        
    
        

In [18]:
# Module-level agent used as a global by guess_the_code(); its value table starts at all zeros.
agent = Agent()

In [19]:
def guess_the_code(secret='1234'):
    """Let the global `agent` guess `secret` interactively, one guess per Enter.

    Each guess is printed and the function then waits on input() before
    continuing. After every wrong guess the agent's candidate list is
    narrowed to the codes consistent with the feedback.

    Parameters
    ----------
    secret : str or int
        The code to find. A string is used as given; an integer is converted
        by Environment into its 4-character code string.

    Raises
    ------
    ValueError
        If the feedback eliminates every candidate, which means the secret is
        not one of the codes the agent knows about.
    """
    agent.reset_possible_states()
    guess = agent.get_best_action()
    env = Environment(secret)
    # Environment normalises an integer secret to its string form. Compare and
    # score against that form: the raw integer can neither equal a string
    # guess nor be iterated by score().
    target = env.secret
    print(f"initial guess = {guess}")
    _ = input()
    while guess != target:
        feedback = env.score(target, guess)
        agent.restrict_possible_states(guess, feedback)
        if not agent.possible_states:
            # Without this check the failure surfaces as an opaque error from
            # max() on an empty sequence inside get_best_action().
            raise ValueError(
                f"no candidate code is consistent with the feedback for secret {target!r}"
            )
        guess = agent.get_best_action()
        print(f"next guess = {guess}")
        _ = input()
        

In [20]:
# Play one game against the default secret '1234'; every printed guess waits for Enter.
guess_the_code()

initial guess = 5123


 


next guess = 3231


 


next guess = 2031


 


next guess = 1232


 


next guess = 1234


 


In [132]:
# Play again with a secret that contains a repeated digit.
guess_the_code('1223')

initial guess = 4433


 


next guess = 2030


 


next guess = 0421


 


next guess = 1223


 
