<a href="https://colab.research.google.com/github/TranPhu1999/Wordle_solver/blob/main/wordle_solver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# References
- 3blue1brown video: https://www.youtube.com/watch?v=v68zYyaEmEA&t=848s
- 3blue1brown repo: https://github.com/3b1b/videos/tree/870a6cbf30938793f93a2c9235c82bdeed31c7c6/_2022/wordle

# Install requirements

In [None]:
!pip install tqdm english_words



# Dev_v1

In [None]:
# import
import requests
from tqdm import tqdm as ProgressDisplay
from itertools import product
from math import log
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from english_words import get_english_words_set
import numpy as np

# Disable InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

In [None]:
# Global variable
# Word set obtained from the `english_words` package using the 'web2' source,
# requested in lower case (lower=True) with alpha=False.
web2lowerset = get_english_words_set(['web2'], lower=True, alpha =False) # gcide / web2
# Converted to a list; later cells iterate over it to pick words of a given length.
web2lowerset = list(web2lowerset)
# Letter string whose first `text_size` characters are used as the opening guess
# in wordle_solver. Presumably ordered from most to least frequent letter
# (inferred from the name -- TODO confirm).
common_letter_rank = "etaonrishdlfcmugypwbvkjxzq"


# Integer codes for the per-letter feedback: "correct" -> 2, "present" -> 1, "absent" -> 0.
CORRECT, PRESENT, ABSENT = 2, 1, 0

In [None]:
def check_answer(guess_word: str, seed: int, text_size: int) -> list:
  """
  Submit a guess to the remote Wordle API and return the per-letter feedback.

  Args:
    guess_word: the word to submit.
    seed: seed forwarded to the API as the `seed` query parameter.
    text_size: word length forwarded to the API as the `size` query parameter.

  Returns:
    A list with the "result" field of every item in the JSON response
    (the callers in this notebook expect "absent" / "present" / "correct").

  Raises:
    requests.exceptions.HTTPError: if the server answers with an error status.
    requests.exceptions.Timeout: if the server does not answer in time.
  """
  # NOTE(review): verify=False disables TLS certificate validation. It is kept
  # because the notebook deliberately silences the matching warning, but it
  # should be removed once the endpoint presents a trusted certificate.
  r = requests.get(
      "https://wordle.votee.dev:8000/random",
      # Let requests build (and URL-encode) the query string.
      params={"guess": guess_word, "seed": seed, "size": text_size},
      verify=False,
      timeout=30,  # without a timeout a stalled server would hang the solver forever
  )
  # Fail loudly on 4xx/5xx instead of trying to parse an error body as feedback.
  r.raise_for_status()
  return [check["result"] for check in r.json()]

In [None]:
# check_answer("thefo",1234,5)

In [None]:
# Group the dictionary words by their length: {word_length: [words of that length]}.
text_basket_with_size = {}
for text in web2lowerset:
  # setdefault creates the bucket the first time a length is seen.
  text_basket_with_size.setdefault(len(text), []).append(text)

In [None]:
def get_the_remain_words_list(guess_word: str, pattern: list, allowed_words_list: list) -> list:
  """
  Keep only the words that are compatible with the feedback `pattern`
  obtained for `guess_word`.

  Args:
    guess_word: the word that was guessed.
    pattern: one code per letter of the guess (ABSENT, PRESENT or CORRECT).
    allowed_words_list: candidate words to filter.

  Returns:
    A new list with the candidates that could still be the answer, in their
    original order.
  """
  def is_compatible(word: str) -> bool:
    for i, possible in enumerate(pattern): # "absent": 0, "present": 1, "correct": 2,
      letter = guess_word[i]
      if possible == ABSENT:
        # The letter must not occur anywhere in the candidate.
        if letter in word:
          return False
      elif possible == PRESENT:
        # The letter must occur, but NOT at this position: had it been at
        # this position the feedback would have been CORRECT instead.
        if letter not in word or word[i] == letter:
          return False
      elif possible == CORRECT:
        if word[i] != letter:
          return False
    return True

  return [word for word in allowed_words_list if is_compatible(word)]

In [None]:
def calculate_entropy(word, all_pattern: list, allowed_words_list: list)-> float:
  """
  Entropy (in bits) of guessing `word` against `allowed_words_list`.

  Every feedback pattern in `all_pattern` is tried: the share of candidates
  that survive the pattern is taken as its probability px, and the patterns
  contribute px * log2(1/px) each (E = sum(px*log(1/px))).
  Background: https://youtu.be/v68zYyaEmEA
  """
  candidate_count = len(allowed_words_list)
  bits = 0
  for candidate_pattern in all_pattern:
    bucket_size = len(get_the_remain_words_list(word, candidate_pattern, allowed_words_list))
    if bucket_size == 0:
      # A pattern that no candidate can produce carries no probability mass.
      continue
    px = bucket_size/candidate_count
    bits += px * log(1/px,2)
  return bits

In [None]:
def get_best_guess(allowed_words_list: list, all_pattern: list)-> int:
  """
  Find the position of the word with the highest entropy.

  A higher entropy means the guess is expected to split the remaining
  candidates into smaller groups.

  Args:
    allowed_words_list: candidate words; each one is scored as a guess.
    all_pattern: every feedback pattern to consider when scoring.

  Returns:
    The index (into `allowed_words_list`) of the first word with the highest
    entropy, or 0 when the list is empty or no word scores above 0.
  """
  highest_entropy = 0
  result_index = 0
  for i, word in enumerate(ProgressDisplay(allowed_words_list)):
    word_entropy = calculate_entropy(word, all_pattern, allowed_words_list)
    # Strict comparison keeps the earliest word on ties.
    if word_entropy > highest_entropy:
      highest_entropy = word_entropy
      result_index = i

  return result_index

In [None]:
def recursive_guess(inital_guess: str, allowed_words_list: list, all_pattern: list, seed: int) -> str:
  """
  Submit `inital_guess`, narrow the candidates with the feedback and recurse
  with the best next guess until the API reports every letter as correct.

  Args:
    inital_guess: the word to submit in this round.
    allowed_words_list: candidates that are still possible (the current guess
      has already been removed from it by the previous round).
    all_pattern: every feedback pattern, used to score the next guess.
    seed: seed forwarded to the API.

  Returns:
    The word confirmed by the API, or "Can not find word!" when no candidate
    is compatible with the feedback received.
  """
  # The current guess is always checked first. Returning the last remaining
  # candidate before checking would skip `inital_guess`, which may itself be
  # the answer when two candidates were left in the previous round.
  guess_pattern = check_answer(inital_guess, seed, len(inital_guess))
  print(f"\ninitial guess: {inital_guess}")
  print(f"guess_pattern: {guess_pattern}")
  if set(guess_pattern) == set(["correct"]):
    return inital_guess

  # Not solved yet: filter the candidates with the feedback just received.
  pattern = {"absent": ABSENT, "present": PRESENT, "correct": CORRECT}
  guess_pattern = [pattern[x] for x in guess_pattern]
  remain_words = get_the_remain_words_list(inital_guess, guess_pattern, allowed_words_list)
  if not remain_words:
    # Nothing in the dictionary matches the feedback; popping from the empty
    # list below would raise IndexError.
    return "Can not find word!"

  best_guess_index = get_best_guess(remain_words, all_pattern)
  best_guess = remain_words.pop(best_guess_index)
  return recursive_guess(best_guess, remain_words, all_pattern, seed)

In [None]:
def wordle_solver(seed: int, text_size: int) -> str:
  """
  Main entry point: solve the Wordle identified by `seed` for words of
  length `text_size`.

  Args:
    seed: seed forwarded to the API.
    text_size: number of letters of the hidden word.

  Returns:
    Whatever recursive_guess returns for the opening guess.
  """
  # Every possible feedback pattern, one row per pattern.
  all_pattern = np.array(list(product([ABSENT ,PRESENT ,CORRECT], repeat=text_size)),dtype=int) # "absent": 0, "present": 1, "correct": 2,

  # Words of the requested length, de-duplicated in first-seen order.
  # dict.fromkeys does this in linear time; a `not in list` test per word is quadratic.
  allowed_text = list(dict.fromkeys(text for text in web2lowerset if len(text) == text_size))

  # NOTE: common_letter_rank holds 26 letters, so for text_size > 26 the
  # opening guess is shorter than text_size.
  answer = recursive_guess(inital_guess = common_letter_rank[:text_size],\
                           allowed_words_list = allowed_text, all_pattern=all_pattern, seed = seed)

  return answer

In [None]:
%%time
wordle_solver(1234, 5)


initial guess: etaon
guess_pattern: ['present', 'present', 'absent', 'absent', 'absent']


100%|██████████| 405/405 [01:14<00:00,  5.42it/s]



initial guess: slite
guess_pattern: ['absent', 'absent', 'absent', 'present', 'present']


100%|██████████| 135/135 [00:07<00:00, 18.38it/s]



initial guess: berth
guess_pattern: ['absent', 'present', 'absent', 'present', 'present']


100%|██████████| 26/26 [00:00<00:00, 91.88it/s]



initial guess: wheft
guess_pattern: ['absent', 'correct', 'correct', 'correct', 'correct']


100%|██████████| 1/1 [00:00<00:00, 1007.28it/s]



initial guess: theft
guess_pattern: ['correct', 'correct', 'correct', 'correct', 'correct']
CPU times: user 1min 21s, sys: 385 ms, total: 1min 21s
Wall time: 1min 28s


'theft'

# Dev_v2

In [None]:
# import
import requests
from tqdm import tqdm as ProgressDisplay
from itertools import product
from math import log
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from english_words import get_english_words_set
import numpy as np
import os
from scipy.stats import entropy
import json

# Disable InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

In [None]:
# Global variable
# Word set obtained from the `english_words` package using the 'web2' source,
# requested in lower case (lower=True) with alpha=False.
web2lowerset = get_english_words_set(['web2'], lower=True, alpha =False) # gcide / web2
# Converted to a list; the text-basket cell below iterates over it.
web2lowerset = list(web2lowerset)

# Cache locations. These are absolute paths under /content/drive, so they
# presumably require Google Drive to be mounted in Colab -- TODO confirm.
# Directory holding one pattern_matrix_size_<length>.npy file per word length.
PATTERN_MATRIX_PATH = "/content/drive/MyDrive/Projects/Wordle_solver/pattern_matrix"
# Directory holding initial_guess.json (the stored opening guess per word length).
INITIAL_GUESS_PATH = "/content/drive/MyDrive/Projects/Wordle_solver/"
# Directory holding text_basket.json (the words grouped by length).
TEXT_BASKET_PATH = "/content/drive/MyDrive/Projects/Wordle_solver/"
# Feedback codes as uint8 scalars: "correct" -> 2, "present" -> 1, "absent" -> 0.
CORRECT, PRESENT, ABSENT = np.uint8(2), np.uint8(1), np.uint8(0)


In [None]:
def words_to_int_arrays(words):
    """
    Encode words as a 2-D uint8 array: one row per word, one column per
    character, each cell holding the character's code point (`ord`).
    """
    code_point_rows = [list(map(ord, word)) for word in words]
    return np.array(code_point_rows, dtype=np.uint8)

In [None]:
# https://github.com/3b1b/videos/blob/870a6cbf30938793f93a2c9235c82bdeed31c7c6/_2022/wordle/simulations.py#L104
def create_pattern_matrix(allowed_words_list_1, allowed_words_list_2):
  """
  Compute the pairwise Wordle feedback between two lists of words.

  The feedback for a (guess, answer) pair is one colour per guess letter
  (grey -> 0, yellow -> 1, green -> 2). It is stored as a single integer
  between 0 and 3**text_size - 1 whose ternary digits are the colours, the
  first letter being the least significant digit.

  Colour rules implemented here:
    * green  (CORRECT): the answer has the same letter at the same position;
    * yellow (PRESENT): otherwise, the letter occurs anywhere in the answer
      (no bookkeeping of how many times the letter occurs);
    * grey   (0): the letter does not occur in the answer.

  Args:
    allowed_words_list_1: guess words; all words must have the same length.
    allowed_words_list_2: answer words, of the same length as the guesses.

  Returns:
    An array of shape (len(allowed_words_list_1), len(allowed_words_list_2))
    where entry [a, b] is the encoded feedback for guess a against answer b.
    The dtype is the smallest one able to hold 3**text_size - 1 (uint8 for
    words of up to 5 letters), so longer words no longer overflow.

  Raises:
    IndexError: if allowed_words_list_1 is empty (the word length is unknown).
  """

  text_size = len(allowed_words_list_1[0])
  matrix_dimension_1 = len(allowed_words_list_1)
  matrix_dimension_2_big = len(allowed_words_list_2)

  # uint8 only holds codes up to 3**5 - 1 = 242; pick a wider type when needed.
  pattern_dtype = np.min_scalar_type(3**text_size - 1)

  if matrix_dimension_2_big == 0:
    return np.zeros((matrix_dimension_1, 0), dtype=pattern_dtype)

  # Handle word lists that are too big for memory: the answers are processed
  # in slices so that one slice never spans more than 10.000^2 (guess, answer)
  # pairs. At least one answer is always taken per slice.
  limit_matrix_dimension_2 = matrix_dimension_2_big
  if matrix_dimension_1 * matrix_dimension_2_big > 10000**2:
    limit_matrix_dimension_2 = max(1, 10000**2 // matrix_dimension_1)

  allowed_words_arr_1 = words_to_int_arrays(allowed_words_list_1)

  pattern_chunks = []
  for start in range(0, matrix_dimension_2_big, limit_matrix_dimension_2):
    allowed_words_arr_2_small = words_to_int_arrays(
        allowed_words_list_2[start:start + limit_matrix_dimension_2])
    matrix_dimension_2 = len(allowed_words_arr_2_small)

    pattern_matrix = np.zeros((matrix_dimension_1, matrix_dimension_2), dtype=pattern_dtype)

    # One guess position at a time, so only 2-D temporaries are ever alive
    # (instead of a text_size x text_size grid for every pair of words).
    for i in range(text_size):
      guess_letters = allowed_words_arr_1[:, i]
      colors = np.zeros((matrix_dimension_1, matrix_dimension_2), dtype=np.uint8)

      # Yellow pass: guess letter i equals answer letter j, for any j.
      for j in range(text_size):
        colors[np.equal.outer(guess_letters, allowed_words_arr_2_small[:, j])] = PRESENT

      # Green pass (overrides yellow): same letter at the same position.
      colors[np.equal.outer(guess_letters, allowed_words_arr_2_small[:, i])] = CORRECT

      # Position i is the ternary digit of weight 3**i.
      pattern_matrix += colors.astype(pattern_dtype) * pattern_dtype.type(3**i)

    pattern_chunks.append(pattern_matrix)

  if len(pattern_chunks) == 1:
    return pattern_chunks[0]
  # Join all slices once instead of re-copying the result after every slice.
  return np.concatenate(pattern_chunks, axis=1)

In [None]:
def generate_full_pattern_matrix(save_file_name, allowed_text):
    """
    Build the pattern matrix of `allowed_text` against itself and save it.

    Args:
        save_file_name: destination handed to `np.save`.
        allowed_text: list of words; used both as guesses and as answers.

    Returns:
        The pattern matrix that was written to `save_file_name`.
    """
    # Make sure the target directory exists before the (expensive)
    # computation, so the result is not lost to a missing folder at save time.
    if isinstance(save_file_name, (str, os.PathLike)):
        target_dir = os.path.dirname(save_file_name)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    pattern_matrix = create_pattern_matrix(allowed_text, allowed_text)
    # Save to file
    np.save(save_file_name, pattern_matrix)
    return pattern_matrix

In [None]:
# Words grouped by length and cached as JSON: {"<length>": [words of that length]}.
# The pattern matrices address words by their position in these lists, so the
# same cached lists have to be reused from one session to the next.
text_basket_with_size = dict()
text_basket_file = os.path.join(TEXT_BASKET_PATH,"text_basket.json")

if not os.path.exists(text_basket_file):
  for text in web2lowerset:
    # Keys are strings on purpose: JSON object keys are always strings, so a
    # freshly built basket must look exactly like one reloaded from the file
    # (the solver looks words up with str(text_size)).
    text_basket_with_size.setdefault(str(len(text)), []).append(text)

  with open(text_basket_file, "w") as json_file:
    json.dump(text_basket_with_size, json_file)
else:
  with open(text_basket_file, "r") as json_file:
    text_basket_with_size = json.load(json_file)

In [None]:
# for length, list_word in ProgressDisplay(text_basket_with_size.items()):
#   import os
#   if length != 4 and length != 5 and length != 6:
#     pattern_file = os.path.join(PATTERN_MATRIX_PATH,f"pattern_matrix_size_{length}.npy")
#     if not os.path.exists(pattern_file):
#       generate_full_pattern_matrix(pattern_file ,list_word)
#     print(length)
#   print(f"{length}:{len(list_word)}")

In [None]:
def get_pattern_matrix(length, list_word):
  """
  Return the pattern matrix for the words of a given length, using the file
  cache under PATTERN_MATRIX_PATH when possible.

  Args:
    length: word length, used to name the cache file.
    list_word: the words of that length.

  Returns:
    The pattern matrix for `list_word` against itself. It is loaded from
    pattern_matrix_size_<length>.npy when that file exists and has the
    expected shape; otherwise it is (re)generated and saved.
  """
  pattern_file = os.path.join(PATTERN_MATRIX_PATH,f"pattern_matrix_size_{length}.npy")
  if os.path.exists(pattern_file):
    pattern_matrix = np.load(pattern_file)
    # A cached matrix built from a different word list cannot be indexed with
    # the positions of `list_word`; only reuse it when the shape matches.
    if pattern_matrix.shape == (len(list_word), len(list_word)):
      return pattern_matrix

  return generate_full_pattern_matrix(pattern_file ,list_word)

In [None]:
def check_answer(guess_word: str, seed: int, text_size: int) -> list:
  """
  Submit a guess to the remote Wordle API and return the per-letter feedback.

  Args:
    guess_word: the word to submit.
    seed: seed forwarded to the API as the `seed` query parameter.
    text_size: word length forwarded to the API as the `size` query parameter.

  Returns:
    A list with the "result" field of every item in the JSON response
    (the callers in this notebook expect "absent" / "present" / "correct").

  Raises:
    requests.exceptions.HTTPError: if the server answers with an error status.
    requests.exceptions.Timeout: if the server does not answer in time.
  """
  # NOTE(review): verify=False disables TLS certificate validation. It is kept
  # because the notebook deliberately silences the matching warning, but it
  # should be removed once the endpoint presents a trusted certificate.
  r = requests.get(
      "https://wordle.votee.dev:8000/random",
      # Let requests build (and URL-encode) the query string.
      params={"guess": guess_word, "seed": seed, "size": text_size},
      verify=False,
      timeout=30,  # without a timeout a stalled server would hang the solver forever
  )
  # Fail loudly on 4xx/5xx instead of trying to parse an error body as feedback.
  r.raise_for_status()
  return [check["result"] for check in r.json()]

In [None]:
# check_answer("thefo",1234,5)

In [None]:
def calculate_entropy(word_index, text_size: int, pattern_matrix: np.ndarray)-> float:
  """
  Entropy (in bits) of the feedback distribution for one guess.

  Row `word_index` of `pattern_matrix` holds the encoded feedback of that
  guess against every candidate answer. The number of answers per distinct
  feedback value gives the distribution, and E = sum(px*log(1/px)) is its
  entropy. Background: https://youtu.be/v68zYyaEmEA

  Args:
    word_index: row of the guess in `pattern_matrix`.
    text_size: word length; kept for interface compatibility, the patterns
      that actually occur are read from the row itself.
    pattern_matrix: encoded feedback for every (guess, answer) pair.

  Returns:
    The entropy in bits.
  """
  # Count each feedback value in a single pass over the row, rather than
  # scanning the row once for each of the 3**text_size possible patterns.
  _, bucket_sizes = np.unique(pattern_matrix[word_index], return_counts=True)
  # scipy normalises the counts to probabilities; patterns that never occur
  # contribute nothing to the sum, so leaving them out does not change it.
  return entropy(bucket_sizes, base=2)

In [None]:
def get_best_guess(allowed_words_list: list, text_size: int, pattern_matrix: np.ndarray)-> int:
  """
  Find the position of the word with the highest entropy.

  A higher entropy means the guess is expected to split the remaining
  candidates into smaller groups.

  Args:
    allowed_words_list: candidate words; word i corresponds to row i of
      `pattern_matrix`.
    text_size: word length, forwarded to calculate_entropy.
    pattern_matrix: encoded feedback for every (guess, answer) pair.

  Returns:
    The index of the first word with the highest entropy, or 0 when the list
    is empty or no word scores above 0.
  """
  highest_entropy = 0
  result_index = 0
  # Only the position is needed; iterating the list keeps the progress bar length.
  for i, _ in enumerate(ProgressDisplay(allowed_words_list)):
    # Named word_entropy so it does not shadow scipy's `entropy` function.
    word_entropy = calculate_entropy(i, text_size, pattern_matrix)
    # Strict comparison keeps the earliest word on ties.
    if word_entropy > highest_entropy:
      highest_entropy = word_entropy
      result_index = i

  return result_index

In [None]:
def wordle_solver(seed: int, text_size: int) -> str:
  """
  Main entry point: solve the Wordle identified by `seed` for words of
  length `text_size`, using the pattern matrix for that length.

  The opening guess is read from initial_guess.json when it is stored there
  for this length; otherwise it is computed and written to that file.

  Args:
    seed: seed forwarded to the API.
    text_size: number of letters of the hidden word.

  Returns:
    The word the API confirmed as fully correct, or "Can not find word!"
    when no candidate is compatible with the feedback received.
  """
  # Encoded feedback of an all-correct guess (every ternary digit is 2).
  correct_pattern_int = 3**text_size - 1
  pattern_respone_dict = {"absent": ABSENT, "present": PRESENT, "correct": CORRECT}

  # The basket has string keys when loaded from JSON; also accept integer keys.
  if str(text_size) in text_basket_with_size:
    allowed_words_list = text_basket_with_size[str(text_size)]
  else:
    allowed_words_list = text_basket_with_size[text_size]
  if not allowed_words_list:
    return "Can not find word!"
  pattern_matrix = get_pattern_matrix(text_size ,allowed_words_list)

  # Load the stored opening guesses; the file is closed again before any rewrite.
  inital_guess_dict = dict()
  inital_guess_file = os.path.join(INITIAL_GUESS_PATH,"initial_guess.json")
  if os.path.exists(inital_guess_file):
    with open(inital_guess_file, "r") as json_file:
      inital_guess_dict = json.load(json_file)

  try:
    inital_guess_index = allowed_words_list.index(inital_guess_dict[str(text_size)])
  except (KeyError, ValueError):
    # KeyError: nothing stored for this length.
    # ValueError: the stored word is not in the current word list.
    inital_guess_index = get_best_guess(allowed_words_list, text_size, pattern_matrix)
    inital_guess_dict[str(text_size)] = allowed_words_list[inital_guess_index]
    with open(inital_guess_file, "w") as json_file:
      json.dump(inital_guess_dict, json_file)

  while True:
    guess_word = allowed_words_list[inital_guess_index]
    guess_pattern = check_answer(guess_word, seed, text_size)

    print(f"initial guess: {guess_word}")
    print(f"guess_pattern: {guess_pattern}")
    # Encode the feedback with plain Python integers: uint8 arithmetic would
    # wrap around for words longer than 5 letters.
    pattern_int = sum(
        int(pattern_respone_dict[check]) * 3**position
        for position, check in enumerate(guess_pattern)
    )
    if pattern_int == correct_pattern_int:
      return guess_word
    if len(allowed_words_list) == 1:
      return "Can not find word!"

    print(f"pattern_int:{pattern_int}")
    # Keep only the candidates that would have produced this exact feedback.
    remain_words_indexes = np.where(pattern_matrix[inital_guess_index]==pattern_int)[0].tolist()
    print(remain_words_indexes)
    if not remain_words_indexes:
      # No word is compatible with the feedback; indexing the empty list in
      # the next round would raise IndexError.
      return "Can not find word!"

    # Shrink the matrix and the word list together so positions stay aligned.
    pattern_matrix = pattern_matrix[remain_words_indexes,:][:,remain_words_indexes]
    allowed_words_list = [allowed_words_list[index] for index in remain_words_indexes]
    inital_guess_index = get_best_guess(allowed_words_list, text_size, pattern_matrix)

In [None]:
%%time
wordle_solver(1234, 5)

initial guess: tarie
guess_pattern: ['correct', 'absent', 'absent', 'absent', 'present']
pattern_int:1286
[18, 63, 224, 225, 274, 283, 295, 419, 646, 836, 923, 1220, 1249, 1556, 2023, 3097, 3196, 4082, 4145, 4380, 4596, 4742, 4811, 5351, 5509, 5674, 5753, 5789, 6084, 6209, 6292, 6585, 6722, 6896, 6931, 6938, 7215, 7226, 7620, 7756, 8845, 8939, 9017, 9050, 9117, 9134, 9204, 9516, 9644]


100%|██████████| 49/49 [00:00<00:00, 416.49it/s]


initial guess: tween
guess_pattern: ['correct', 'absent', 'correct', 'present', 'absent']
pattern_int:46
[2, 25, 32, 35, 43]


100%|██████████| 5/5 [00:00<00:00, 274.04it/s]


initial guess: teeth
guess_pattern: ['correct', 'present', 'correct', 'present', 'present']
pattern_int:0
[2]


100%|██████████| 1/1 [00:00<00:00, 374.89it/s]


initial guess: theft
guess_pattern: ['correct', 'correct', 'correct', 'correct', 'correct']
CPU times: user 230 ms, sys: 119 ms, total: 349 ms
Wall time: 4.27 s


'theft'

In [None]:
!python3 /content/drive/MyDrive/Projects/Wordle_solver/main_v2.py

text_basket_with_size:['banky', 'askar', 'quale', 'peavy', 'sleek', 'whish', 'sapin', 'chuck', 'urase', 'sulfa']
initial guess: tarie
guess_pattern: ['correct', 'absent', 'absent', 'absent', 'present']
100% 49/49 [00:00<00:00, 607.86it/s]
initial guess: tween
guess_pattern: ['correct', 'absent', 'correct', 'present', 'absent']
100% 5/5 [00:00<00:00, 481.61it/s]
initial guess: teeth
guess_pattern: ['correct', 'present', 'correct', 'present', 'present']
100% 1/1 [00:00<00:00, 610.17it/s]
initial guess: theft
guess_pattern: ['correct', 'correct', 'correct', 'correct', 'correct']
         102731 function calls (101471 primitive calls) in 5.407 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        8    1.945    0.243    1.945    0.243 {method 'read' of '_ssl._SSLSocket' objects}
        1    1.429    1.429    1.429    1.429 {built-in method numpy.fromfile}
        4    0.919    0.230    0.919    0.230 {method 'do_handshake' of