In [47]:
## From https://github.com/egeromin/mastermind/

import config
import numpy as np
import itertools
import random
from collections import Counter


class Environment:
    '''Mastermind game environment wrapping a single secret code.

    Codes are strings made of the digits 0-5; the helpers below convert
    between such strings and their base-6 integer index.
    '''

    def __init__(self, secret):
        '''Store the secret; an integer secret is first expanded to its code string.'''
        self.secret = (
            self._number_from_index(secret) if isinstance(secret, int) else secret
        )

    @staticmethod
    def _index_from_number(number):
        '''Interpret a code string (at most 4 digits, each in 0-5) as a base-6 integer.'''
        allowed_digits = {str(digit) for digit in range(6)}
        assert len(number) <= 4
        assert set(number) <= allowed_digits
        return int(number, base=6)

    @staticmethod
    def _number_from_index(index):
        '''Inverse of ``_index_from_number``: base-6 digits of ``index``, zero-padded to 4.

        ``index`` must satisfy ``0 <= index < config.max_guesses``.
        '''
        assert 0 <= index < config.max_guesses
        low_to_high = []
        remaining = index
        while remaining > 0:
            low_to_high.append(str(remaining % 6))
            remaining = remaining // 6
        # Digits were collected least-significant first, so flip before joining.
        return "".join(low_to_high[::-1]).zfill(4)

    @staticmethod
    def score(p, q):
        '''Mastermind feedback for two codes as a ``(hits, misses)`` pair.

        ``hits`` counts positions where both codes agree; ``misses`` counts the
        remaining symbols the two codes share when position is ignored.
        '''
        hits = sum(p_i == q_i for p_i, q_i in zip(p, q))
        # Multiset intersection = symbols in common regardless of position.
        shared = Counter(p) & Counter(q)
        return hits, sum(shared.values()) - hits

    def get_feedback(self, action):
        '''Score the guess ``action`` against the stored secret.'''
        return self.score(self.secret, action)

    def reward(self, guess):
        '''Return 1 when ``guess`` equals the secret, otherwise -1.'''
        return 1 if guess == self.secret else -1
         
    

Testing Episode class

# Q-Learning

In [65]:
class Agent:
    '''Tabular temporal-difference agent for Mastermind.

    The agent keeps a value table ``V`` keyed by guess string and a list of
    ``possible_states``: the codes still consistent with every feedback
    received in the current game.

    Attributes:
        V: dict mapping each 4-digit code string to its learned value.
        epsilon: probability of picking a random candidate instead of the
            greedy one in ``learn_select_move``.
        alpha: learning rate used in ``learn_from_move``.
        possible_states: list of code strings still consistent with feedback.
    '''

    def __init__(self, epsilon=0.1, alpha=1.0):
        '''Create an agent with a zero-initialised value table.

        Args:
            epsilon: exploration probability for ``learn_select_move``.
            alpha: learning rate for the value update.
        '''
        self.initialize_V()
        self.epsilon = epsilon
        self.alpha = alpha
        self.reset_possible_states()

    def initialize_V(self):
        '''Initialise the value table with 0 for each of the 6**4 codes.'''
        self.V = {}
        for idx in range(0, 6**4):
            self.V[Environment._number_from_index(idx)] = 0

    def reset_possible_states(self):
        '''Make every code in ``V`` a candidate again (start of a new game).'''
        self.possible_states = list(self.V.keys())

    def restrict_possible_states(self, guess, feedback):
        '''Keep only candidates that would have produced ``feedback`` for ``guess``.

        Args:
            guess: the code string that was played.
            feedback: the ``(hits, misses)`` pair received for ``guess``.
        '''
        self.possible_states = [
            state
            for state in self.possible_states
            if Environment.score(guess, state) == feedback
        ]

    def learn_select_move(self):
        '''Pick the next guess epsilon-greedily.

        Returns:
            tuple: ``(best_move, selected_move)`` where ``best_move`` is the
            greedy candidate and ``selected_move`` is replaced by a random
            candidate with probability ``epsilon``.
        '''
        best_move = self.get_best_action()

        selected_move = best_move
        if random.random() < self.epsilon:
            selected_move = self.random_action()

        return (best_move, selected_move)

    def get_best_action(self):
        '''Return a candidate with the highest value, ties broken at random.

        Raises:
            ValueError: if no candidate states remain (for example after
                feedback that no code in ``V`` can satisfy).
        '''
        if not self.possible_states:
            # max() on an empty list would raise ValueError anyway; raise the
            # same exception type with a message that explains the cause.
            raise ValueError("no possible states left to choose from")
        max_V = max(self.V[state] for state in self.possible_states)
        return random.choice(
            [state for state in self.possible_states if self.V[state] == max_V]
        )

    def random_action(self):
        '''Return a candidate chosen uniformly at random.'''
        return random.choice(self.possible_states)

    def make_move(self, action, feedback):
        '''Apply the feedback for ``action`` by pruning the candidate list.'''
        self.restrict_possible_states(action, feedback)

    def learn_from_move(self, action, feedback, reward):
        '''Update ``V[action]`` from one guess and its outcome.

        The update is ``V[s] <- V[s] + alpha * (reward + V_next - V[s])``,
        where ``V_next`` is the best value among the candidates left after the
        feedback has been applied. When the guess solved the game (every
        position is a hit) there is no next state, so ``V_next`` is 0. Without
        that special case the only remaining candidate is the guess itself and
        its value would grow by ``alpha * reward`` on every solved game.

        Args:
            action: the code string that was guessed (also the table key).
            feedback: the ``(hits, misses)`` pair received for ``action``.
            reward: the reward received for ``action``.

        Returns:
            The epsilon-greedy next guess from ``learn_select_move``, or
            ``None`` when ``action`` solved the game.
        '''
        current_state = action  # the guess doubles as the state key

        # Prune first so the bootstrap value only looks at consistent codes.
        self.make_move(action, feedback)

        # All positions matched -> the episode is over, nothing to bootstrap from.
        solved = feedback[0] == len(action)

        selected_next_move = None
        next_state_value = 0
        if not solved:
            best_next_move, selected_next_move = self.learn_select_move()
            next_state_value = self.V[best_next_move]

        current_state_value = self.V[current_state]
        td_target = reward + next_state_value
        self.V[current_state] = current_state_value + self.alpha * (
            td_target - current_state_value
        )
        return selected_next_move
        

## Baseline 

For the baseline, we use the worst-case number of guesses needed by the untrained agent.

## Training Agent using Q-learning

In [145]:
def train(agent, n_episodes):
    '''Run ``n_episodes`` training games against randomly drawn secrets.

    Each episode draws a secret, resets the agent's candidate list and keeps
    guessing random candidates, calling ``agent.learn_from_move`` after every
    guess, until the secret is guessed. An episode whose very first guess is
    already correct is skipped without any learning step.
    '''
    for _episode in range(n_episodes):
        target = Environment._number_from_index(random.randint(0, 6**4 - 1))
        env = Environment(target)
        agent.reset_possible_states()

        guess = agent.random_action()  # opening guess
        if guess == target:
            # Solved on the first try: nothing to learn from, skip the episode.
            continue

        while True:
            outcome = env.get_feedback(guess)
            payoff = env.reward(guess)
            agent.learn_from_move(guess, outcome, payoff)

            if guess == target:
                break  # correct guess ends the episode
            guess = agent.random_action()  # otherwise try another candidate
            

In [139]:
def num_guesses(agent, secret='1234'):
    '''Count the guesses the agent makes, playing greedily, until it hits ``secret``.

    The agent's candidate list is reset first; after every wrong guess it is
    pruned with the feedback for that guess. The first guess counts as 1.
    '''
    agent.reset_possible_states()
    candidate = agent.get_best_action()
    env = Environment(secret)

    attempts = 1
    while candidate != secret:
        # Narrow the candidates with the feedback for the wrong guess.
        agent.restrict_possible_states(candidate, env.score(secret, candidate))
        candidate = agent.get_best_action()
        attempts += 1
    return attempts

In [140]:
def avg_num_guesses_needed(agent):
    '''Mean number of guesses the agent needs, taken over all 6**4 secrets.'''
    guess_counts = [
        num_guesses(agent, Environment._number_from_index(idx))
        for idx in range(6**4)
    ]
    return sum(guess_counts) / len(guess_counts)
    

In [159]:
def worst_case_length(agent):
    '''Return the largest number of guesses the agent needs for any secret.

    Plays one game with ``num_guesses`` for each of the 6**4 possible secret
    codes and returns the maximum guess count observed.

    Args:
        agent: the agent to evaluate.

    Returns:
        int: the maximum number of guesses needed over all secrets.
    '''
    return max(
        num_guesses(agent, Environment._number_from_index(idx))
        for idx in range(6**4)
    )
    

In [141]:
def interactive_play(agent, secret='1234'):
    '''Step through the agent's greedy guesses for ``secret``, one per Enter press.

    Prints the opening guess, then after each wrong guess prunes the agent's
    candidates with the feedback and prints the next guess. Every printed
    guess waits for a line of input before continuing.
    '''
    agent.reset_possible_states()
    guess = agent.get_best_action()
    env = Environment(secret)

    print(f"initial guess = {guess}")
    input()  # wait for Enter

    while guess != secret:
        agent.restrict_possible_states(guess, env.score(secret, guess))
        guess = agent.get_best_action()
        print(f"next guess = {guess}")
        input()  # wait for Enter
        

In [161]:
# Fresh agent with the default epsilon and alpha; its value table starts at all zeros.
agent = Agent()

In [162]:
# Worst-case number of guesses over all 6**4 secrets for the current `agent`.
worst_case_length(agent)

7

In [158]:
# Train in fixed-size rounds on the same agent so that the episode count
# printed after each round is the cumulative number of episodes it has seen.
# (Calling train(agent, 2000 * i) would add 2000, 4000, 6000, ... episodes on
# top of each other while still reporting only 2000 * i.)
EPISODES_PER_ROUND = 2000
for i in range(1, 5):
    train(agent, EPISODES_PER_ROUND)
    print(f"after {EPISODES_PER_ROUND * i} episodes - average case = {avg_num_guesses_needed(agent)}")

after 2000 episodes - average case = 4.677469135802469
after 4000 episodes - average case = 4.68287037037037


KeyboardInterrupt: 

In [160]:
# Worst-case number of guesses for `agent` in its current state; compare with the earlier value.
worst_case_length(agent)

7