In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import collections
from collections import defaultdict

In [2]:
# Make my dictionary
num_letters = 5
words = pd.read_csv("dict.csv", header=None)
words = words.rename(columns={0:'word'})

for i in range(1,21):
    new_name = "letter"+str(i)
    words[new_name] = words['word'].str[i-1] # -1 because they are indexed from 0, so letter1 is words['word'].str[0]
words = words[words['letter1'].str.lower() == words['letter1']]

masks = dict()
words_subsets = dict()

for i in range(1,21):
    words_subset_name = "words"+str(i)
    words_subsets[words_subset_name] = words.loc[(words['word'].str.len() == i)]

words = words_subsets['words'+str(num_letters)][['word']]

# Checking if a given word is a possibility given an outcome

Let's start with a basic function to check if a given word is possible after we guessed something and got an outcome string.

In [3]:
def is_word_possible(guess, outcome, target):
    '''
    Checks if target word is possible given the outcome of the guess
    Parameters
    ----------
    guess : str
        word you entered
    
    outcome : str
        str of strict type like BBYYG
        where 
        B is black (letter not in word)
        Y is yellow (right letter wrong spot)
        G is green (right letter right spot)
    
    target : str
        word to check if it is possible
    
    Returns
    --------
    TRUE is possible
    FALSE if impossible
    
    Examples
    --------
    >>> is_word_possible('whaup', 'GGGBB', 'whack')
    '''

    res = True
    outcome = outcome.lower()
    
    yellows = dict()
    blacks = set()
    
    # Build my lookup objects
    for i in range(len(guess)):
        if outcome[i] == 'b':
            blacks.add(guess[i])
        elif outcome[i] == 'y':
            yellows[guess[i]]=i
        
    # print('yellows:')
    # print(yellows)
    for i in range(len(target)):
        # if we whiff on green, exit loop and return False
        if outcome[i] == 'g':
            if guess[i] != target[i]:
                #print("failed greens")
                res = False
                break

        
        # if we have a black letter in target, exit loop and return False
        elif target[i] in blacks:
           # print(f"failed blacks at letter#{i+1} for letter {guess_arr[i]}")
            res = False
            break
        
        elif target[i] in yellows:
            if i == yellows[target[i]]: #if it's same spot it woulda been green
                res = False
                break
            yellows.pop(target[i], None)
            
    if(len(yellows)) > 0:
        res = False
    return res

Let's test it on a couple of guesses:

In [4]:
is_word_possible('puree', 'BBGYG', 'serle') #should be True

True

In [5]:
is_word_possible('puree', 'BBGYG', 'siren') #should be False

False

# Getting outcome string from guess and target

Now, let's look at the reverse functionality, where we get an outcome string from a given guess and target

In [6]:
def get_outcome_str(guess, target):
    '''
    Return outcome str given guess and target
    
    Parameters
    ----------
    guess : str
        word you entered
    
    target : str
        correct word

    Returns
    --------
    str of strict type like BBYYG
        where 
        B is black (letter not in word)
        Y is yellow (right letter wrong spot)
        G is green (right letter right spot)
    
    Examples
    --------
    >>> get_outcome_str('whaup', 'whack')
    '''

    missing = collections.Counter(t for g, t in zip(guess, target) if g != t)
    
    res = []
    
    for target_c, guess_c in zip(target, guess):
        
        if target_c == guess_c:
            res.append('G')
            
        elif guess_c in target and missing[guess_c] > 0:
            res.append('Y')
            missing[guess_c] -= 1
            
        else:
            res.append('B')

    #return score
    res = ''.join(res)
    return res

Quick little test:

In [7]:
get_outcome_str('puree', 'eriee')

'BBYGG'

In [8]:
get_outcome_str('whata', 'whack')

'GGGBB'

In [9]:
words['word']

4         aalii
20        abaca
27        aback
35        abaff
36        abaft
          ...  
235714    zudda
235749    zygal
235796    zygon
235840    zymic
235841    zymin
Name: word, Length: 8497, dtype: object

In [10]:
def filter_dict(guess, outcome, words):
    
    '''
    Filters possible remaining words based on the one you provided
    Parameters
    ----------
    guess : str
        word you entered
    
    outcome : str
        str of strict type like BBYYG
        where 
        B is black (letter not in word)
        Y is yellow (right letter wrong spot)
        G is green (right letter right spot)
    
    words : dataframe
        all possible words
    
    Returns
    --------
    a dataframe with all possible words given the info provided by word
    
    Examples
    --------
    >>> filter_dict('whaup', 'BYBGG', dict)
    '''

    res_set = set(words['word'])
    for target in words['word']:
        if(not is_word_possible(guess, outcome, target)):
            res_set.remove(target)
    
    res = pd.DataFrame({'word': data} for data in res_set)    
    return(res)

In [12]:
filter_dict('whale', 'GGGBB', words)

Unnamed: 0,word
0,wharf
1,whand
2,whamp
3,whack
4,whaup
5,whank
6,whata
7,whauk
8,whats
9,whart


# Finding optimal guess from the set of possible guesses and possible targets

So this function is second top-level function in the solver.

You choose your **targets**. Could be, for instance, filtered by the outcome and word from before, via filter_dict.

You then choose your **guesses**. Usually, you will either want it to be:
- All remaining possible words. This will give you a chance to win next turn
- All legal words. This may result in better outcomes if, for instance, it's optimal to try all new letters.

In [13]:
def optimal_guess(guesses, targets):
    '''
    Spits out the optimal word to try (both among possible targets, and among all words)
    
    Parameters
    ----------
    guesses : dataframe
        dataframe of all possible remaining words
    
    targets : dataframe
        dataframe of words to try
    
    words : dataframe
        dictionary of remaining possible words
    
    Returns
    --------
    List of two values: best word and average remaining words. EG
    ('whata', 10.4)
    
    Examples
    --------
    >>> optimal_guess(possible_words, possible_words)
    '''
    
    averages = defaultdict(float)
    
    # loop all guesses
    #guesses = pd.DataFrame(data={'word': ['abaca', 'knelt', 'spare', 'whank']})
    #print(guesses['word'].values)

    for guess in guesses['word'].values:
        
        ave_guess = 0.0
        rem_words_count = 0
        rem_words_sum = 0
        
        # loop all possible possible target words
        for target in targets['word'].values:
            if target != guess:
                # given this guess + target, calc outcome

                outcome = get_outcome_str(guess, target)
                # how many words are still possible with this outcome?
                
                rem_poss_guesses = len(filter_dict(guess, outcome, targets))
                # print(f"target: {target}, outcome: {outcome}, poss guesses: {rem_poss_guesses}")
                rem_words_sum = rem_words_sum + rem_poss_guesses
                rem_words_count = rem_words_count + 1
        
        # get the average
        ave_guess = rem_words_sum / rem_words_count if rem_words_sum != 0 else 0
        averages[guess] = round(ave_guess,2)
        
    res = pd.Series(averages, name='ave')
    res.index.name = 'guess'
    res = res.reset_index()
    res = pd.DataFrame(res).sort_values('ave')
    
    return res

# Wordle Turn 2 Solver

I'm calling it Turn 2 solver because you can easily give it your guess, outcome and dictionary and it will find optimal Turn 2. However, you can simply call filter_dict function for the latter turns and use new guess and outcome. Eventually, I may improve this function by allowing it to take in a dictionary of guesses and outcomes and pre-filter like so.

In [15]:
def wordle_solver(guess, outcome, words):
    '''
    Returns word with least remaining possibilities on average, picked from remaining words (if you want a chance to win next turn),
    and within all dictionary
    Why ever check for all and not just the possible answers? 
    Because maybe you are optimizing overall, and don't care about winning next turn
    
    Parameters
    ----------
    guess : str
        word you entered
    
    outcome : str
        str of strict type like BBYYG
        where 
        B is black (letter not in word)
        Y is yellow (right letter wrong spot)
        G is green (right letter right spot)
    
    words : dataframe
        dictionary of remaining possible words
    
    Returns
    --------
    String with embedded values to help you decide your next step
    eg
    "There are 78 possible words remaining
    Of them, the best guess for narrowing words down is TUBSY for average remaining number of 15
    Of all possible words, the best guess for narrowing words down is COOKY for average remaining number of 3."
    
    Examples
    --------
    >>> wordle_solver('whaup', 'GGGBB', words)
    '''
    
    possible_words = filter_dict(guess, outcome, words)
    
    ave_yolo_opt = optimal_guess(possible_words, possible_words)   
    ave_opt = optimal_guess(words, possible_words)
       
    return ave_yolo_opt, ave_opt

# Filtered after Turn 1
Create filtered list of possible targets that remain possible after guessing SIREN and outcome of BBGYB

In [16]:
afterT1 = filter_dict('siren', 'BBGYB', words)
afterT1

Unnamed: 0,word
0,egret
1,corke
2,cerer
3,aurae
4,beret
...,...
112,verre
113,ceral
114,kerat
115,rerow


# Optional: Filter repeated letters out

For early turns, you may want to filter out the repeated letters 

In [17]:
def filter_repeats(x):
    if len(set(x)) == len(x):
        return True
    else:
        return False

In [18]:
mask = afterT1.word.apply(filter_repeats)
afterT1 = afterT1[mask]
afterT1

Unnamed: 0,word
1,corke
6,hertz
7,large
9,gerah
10,herma
...,...
106,derby
108,beray
110,deray
113,ceral


# Try it out for Optimal Guess after Turn 1

Here we look for words with lowest average number of words given all possible targets. So, this is "prefiltered" set.

When we run the actual solver, we create the filtered set for the guesses based on the outcome of the guess with "filter_dict"

In [19]:
t2solve = optimal_guess(afterT1, afterT1)

In [20]:
t2solve

Unnamed: 0,guess,ave
22,targe,6.65
26,carte,7.11
59,yerga,7.40
3,gerah,7.77
44,garce,7.96
...,...,...
61,perdu,19.59
64,curve,19.80
20,jerky,20.23
8,burke,20.33


# Run the solver.

**WARNING**: On my mac, it takes like an hour to run.

In [None]:
solver = wordle_solver('siren', 'BBGYB', words)

In [None]:
solver[0]

In [None]:
solver[1]