# Wordle Analysis

This notebook uses the word list scraped from the Wordle game to identify the optimal opening words.

## Loading word lists

In [1]:
# Load all the valid answers (one word per line).
# strip() is used instead of slicing off the last character so the final word
# is not truncated when the file has no trailing newline; blank lines are
# skipped so they never become empty "words".
with open('wordle-answers-alphabetical.txt', 'r') as f:
    answers = [word for word in (line.strip() for line in f) if word]

# Display the first 10 answers
answers[:10]

['aback',
 'abase',
 'abate',
 'abbey',
 'abbot',
 'abhor',
 'abide',
 'abled',
 'abode',
 'abort']

In [2]:
# Load all the valid guesses (one word per line).
# strip() is used instead of slicing off the last character so the final word
# is not truncated when the file has no trailing newline; blank lines are
# skipped so they never become empty "words".
with open('wordle-allowed-guesses.txt', 'r') as f:
    guesses = [word for word in (line.strip() for line in f) if word]

# Display the first 10 guesses
guesses[:10]

['aahed',
 'aalii',
 'aargh',
 'aarti',
 'abaca',
 'abaci',
 'abacs',
 'abaft',
 'abaka',
 'abamp']

## Finding yellow letters

In [3]:
# Start every lowercase letter at zero so that letters which never occur in
# the answers still show up in the table.
letter_frequencies = dict.fromkeys(map(chr, range(ord("a"), ord("z") + 1)), 0)

# Tally every occurrence of every letter across all answer words
# (a letter repeated inside one word is counted each time it appears).
for letter in "".join(answers):
    letter_frequencies[letter] += 1

# Display letters in order of descending frequency
sorted(letter_frequencies.items(), key=lambda pair: pair[1], reverse=True)

[('e', 1233),
 ('a', 979),
 ('r', 899),
 ('o', 754),
 ('t', 729),
 ('l', 718),
 ('i', 671),
 ('s', 669),
 ('n', 575),
 ('c', 477),
 ('u', 467),
 ('y', 425),
 ('d', 393),
 ('h', 389),
 ('p', 367),
 ('m', 316),
 ('g', 311),
 ('b', 281),
 ('f', 230),
 ('k', 210),
 ('w', 195),
 ('v', 153),
 ('z', 40),
 ('x', 37),
 ('q', 29),
 ('j', 27)]

In [4]:
from itertools import chain


def score_word(word):
    """Return the summed `letter_frequencies` count of the word's distinct letters.

    Each letter contributes once regardless of how many times it is repeated
    in `word`, because the letters are de-duplicated through a set first.
    """
    return sum(letter_frequencies[letter] for letter in set(word))


# Score every word (answers first, then the additional allowed guesses) and
# order the (word, score) pairs from highest score to lowest.
word_scores = sorted(
    ((candidate, score_word(candidate)) for candidate in chain(answers, guesses)),
    key=lambda pair: pair[1],
    reverse=True,
)
word_scores[:10]

[('oater', 4594),
 ('orate', 4594),
 ('roate', 4594),
 ('realo', 4583),
 ('alert', 4558),
 ('alter', 4558),
 ('later', 4558),
 ('artel', 4558),
 ('ratel', 4558),
 ('taler', 4558)]

In [5]:
def get_word_rank(target_word, word_scores):
    """Print where `target_word` falls within a scored word list.

    Parameters
    ----------
    target_word : str
        The word to look up.
    word_scores : list of (word, score) tuples
        The ranking to search. The rank is only meaningful when this list is
        ordered from highest score to lowest.

    Prints
    ------
    "<word> is ranked <rank> / <total>" where <rank> is the 0-based index of
    the first entry whose score is less than or equal to the target's score
    (so tied words share the rank of the first of them), and <total> is the
    length of `word_scores`. Prints "Invalid word" when `target_word` is not
    present in `word_scores`.
    """
    # First pass: find the score attached to the requested word.
    target_score = next(
        (score for word, score in word_scores if word == target_word),
        None,
    )
    if target_score is None:
        print("Invalid word")
        return

    # Second pass: the rank is the position of the first entry that does not
    # beat the target's score.
    rank = next(
        (i for i, (_, score) in enumerate(word_scores) if score <= target_score),
        None,
    )
    if rank is not None:
        # The total comes from the list that was passed in, not from notebook
        # globals, so the function works for any scored list.
        print(f"{target_word} is ranked {rank} / {len(word_scores)}")

# Example lookup: where does "stark" fall in the letter-frequency ranking?
get_word_rank(target_word="stark", word_scores=word_scores)

stark is ranked 2801 / 12972


## Finding green letters

In [6]:
# Zero-initialise a counter for every (letter, position) pair: 26 lowercase
# letters times the 5 positions of a word.
position_frequencies = {
    (chr(code), pose): 0
    for code in range(ord("a"), ord("z") + 1)
    for pose in range(5)
}

# Count how often each letter occupies each position across the answers.
for word in answers:
    for pose, letter in enumerate(word):
        position_frequencies[(letter, pose)] += 1

# Display (letter, position) pairs from most to least frequent
sorted(position_frequencies.items(), key=lambda entry: entry[1], reverse=True)

[(('e', 4), 424),
 (('s', 0), 366),
 (('y', 4), 364),
 (('e', 3), 318),
 (('a', 2), 307),
 (('a', 1), 304),
 (('o', 1), 279),
 (('r', 1), 267),
 (('i', 2), 266),
 (('t', 4), 253),
 (('o', 2), 244),
 (('e', 1), 242),
 (('r', 4), 212),
 (('i', 1), 202),
 (('l', 1), 201),
 (('c', 0), 198),
 (('u', 1), 186),
 (('n', 3), 182),
 (('e', 2), 177),
 (('b', 0), 173),
 (('s', 3), 171),
 (('u', 2), 165),
 (('a', 3), 163),
 (('r', 2), 163),
 (('l', 3), 162),
 (('i', 3), 158),
 (('l', 4), 155),
 (('c', 3), 152),
 (('r', 3), 152),
 (('t', 0), 149),
 (('h', 1), 144),
 (('p', 0), 142),
 (('a', 0), 141),
 (('h', 4), 139),
 (('n', 2), 139),
 (('t', 3), 139),
 (('f', 0), 136),
 (('o', 3), 132),
 (('n', 4), 130),
 (('d', 4), 118),
 (('g', 0), 115),
 (('k', 4), 113),
 (('l', 2), 112),
 (('d', 0), 111),
 (('t', 2), 111),
 (('m', 0), 107),
 (('r', 0), 105),
 (('l', 0), 88),
 (('n', 1), 87),
 (('w', 0), 83),
 (('u', 3), 82),
 (('s', 2), 80),
 (('t', 1), 77),
 (('g', 3), 76),
 (('d', 2), 75),
 (('e', 0), 72),
 

In [7]:
def score_word(word):
    """Return the summed `position_frequencies` count of each letter at its position.

    Every character of `word` contributes the count stored for that exact
    (letter, position) pair, so repeated letters are counted once per position.

    Note: this reuses the name of the earlier letter-frequency `score_word`,
    so cells executed after this one get the positional scoring.
    """
    return sum(
        position_frequencies[(letter, pose)] for pose, letter in enumerate(word)
    )


# Score every word (answers first, then the additional allowed guesses) by
# letter position and order the (word, score) pairs from highest to lowest.
frequency_word_scores = sorted(
    ((candidate, score_word(candidate)) for candidate in chain(answers, guesses)),
    key=lambda pair: pair[1],
    reverse=True,
)
frequency_word_scores[:10]

[('saree', 1575),
 ('sooey', 1571),
 ('soree', 1550),
 ('saine', 1542),
 ('soare', 1528),
 ('saice', 1512),
 ('sease', 1510),
 ('seare', 1491),
 ('seine', 1480),
 ('slane', 1480)]

In [8]:
# The three highest-scoring words in `word_scores` share the same score, so
# look each of them up in the positional ranking to separate them.
tied_leaders = word_scores[:3]
for word, score in tied_leaders:
    get_word_rank(word, frequency_word_scores)

oater is ranked 2667 / 12972
orate is ranked 576 / 12972
roate is ranked 241 / 12972


## Conclusion

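Of the three words tied for the highest yellow-letter score, "roate" ranks best on green-letter (positional) frequency: 241, versus 576 for "orate" and 2667 for "oater". That makes "roate" the best opening guess for getting the most yellows AND greens on the first guess.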