In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# How many letters?

Classic Wordle is 5 letters but you can change it here for flexibility!

In [2]:
# Word length to analyse; 5 matches classic Wordle.
# The cells below build per-length subsets for lengths 1-20 only, so keep this in that range.
num_letters = 5

# Pick your dictionary

I'm using all the words in the Mac dictionary, but you can substitute your own file.

Or even get the ones that we know are included in Wordle explicitly (https://gist.github.com/cfreshman/a03ef2cba789d8cf00c08f767e0fad7b)

In [3]:
# Load the dictionary (one word per row) into a single column labelled 0.
# keep_default_na=False stops pandas from converting real words such as
# "nan", "null" or "NA" into missing values; dtype=str keeps every entry as text.
words = pd.read_csv("dict.csv", header=None, dtype=str, keep_default_na=False)

In [4]:
# Alternative loader for a fixed-width text dictionary (disabled):
#words = pd.read_fwf('dict3.txt', header=None)
# Preview the first ten rows of the loaded dictionary.
words.head(10)

Unnamed: 0,0
0,A
1,a
2,aa
3,aal
4,aalii
5,aam
6,Aani
7,aardvark
8,aardwolf
9,Aaron


## You can uncomment below to add words in
Right now, I'm using the Mac words dictionary, so there are no plurals.   
**Why?** Two reasons: I want to keep the small chance of winning on turn 1, and keeping plurals would severely overvalue S.

In [5]:
# Name the single column so later cells can refer to it as words['word'].
words = words.rename(columns={0: 'word'})

# Extra words to add on top of the dictionary file; extend this list as needed.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# additions are built as a small frame and concatenated instead.
extra_words = pd.DataFrame({'word': ['donut']})
# Skip anything already present so a word is never counted twice in the frequencies.
extra_words = extra_words[~extra_words['word'].isin(words['word'])]
words = pd.concat([words, extra_words], ignore_index=True)

In [6]:
# Confirm the column is now named 'word'.
words.head()

Unnamed: 0,word
0,A
1,a
2,aa
3,aal
4,aalii


## Split the words by letters for positional analysis

In [7]:
# Add one column per character position: letter1 holds the first character,
# letter2 the second, and so on up to letter20 (NaN where the word is shorter).
for position in range(20):
    words[f"letter{position + 1}"] = words['word'].str[position]

# Keep only the words whose first character is already lowercase.
starts_lowercase = words['letter1'] == words['letter1'].str.lower()
words = words[starts_lowercase]

In [8]:
masks = {}

# Map "words<N>" -> the rows of `words` whose word is exactly N characters long, for N = 1..20.
words_subsets = {
    f"words{length}": words.loc[words['word'].str.len() == length]
    for length in range(1, 21)
}

In [9]:
# From here on, work only with the words of the configured length.
words = words_subsets[f"words{num_letters}"]

In [10]:
# Map "total<N>" -> number of N-letter words, for N = 1..20.
totals = {
    f"total{length}": len(words_subsets[f"words{length}"])
    for length in range(1, 21)
}

# Word count for the configured length; used as the denominator for the percentages.
total = totals[f"total{num_letters}"]
total

8498

## Total words by number of letters

In [11]:
# Report how many words exist for each length from 1 to 20.
for length in range(1, 21):
    count = totals[f"total{length}"]
    print('there are', count, length, 'letter words')

there are 26 1 letter words
there are 121 2 letter words
there are 1134 3 letter words
there are 4346 4 letter words
there are 8498 5 letter words
there are 15066 6 letter words
there are 20552 7 letter words
there are 26434 8 letter words
there are 28833 9 letter words
there are 27924 10 letter words
there are 23773 11 letter words
there are 18837 12 letter words
there are 13877 13 letter words
there are 9151 14 letter words
there are 5585 15 letter words
there are 3223 16 letter words
there are 1738 17 letter words
there are 815 18 letter words
there are 417 19 letter words
there are 194 20 letter words


In [12]:
# Peek at the subset for the configured word length.
words_subsets[f"words{num_letters}"].head()

Unnamed: 0,word,letter1,letter2,letter3,letter4,letter5,letter6,letter7,letter8,letter9,...,letter11,letter12,letter13,letter14,letter15,letter16,letter17,letter18,letter19,letter20
4,aalii,a,a,l,i,i,,,,,...,,,,,,,,,,
20,abaca,a,b,a,c,a,,,,,...,,,,,,,,,,
27,aback,a,b,a,c,k,,,,,...,,,,,,,,,,
35,abaff,a,b,a,f,f,,,,,...,,,,,,,,,,
36,abaft,a,b,a,f,t,,,,,...,,,,,,,,,,


In [13]:
# The 26 lowercase letters 'a'..'z', in alphabetical order.
all_letters = [chr(code) for code in range(ord('a'), ord('z') + 1)]

In [14]:
# Empty placeholder frame.
# NOTE(review): not referenced by any other visible cell — confirm before removing.
letter_percents = pd.DataFrame()

## Frequencies calculator

TODO: DRY this

In [15]:
# Positions to analyse: 1..num_letters, so the table follows the configured
# word length instead of being fixed at five positions.
positions = range(1, num_letters + 1)

# Number of words that have each letter at each position
# (rows = letters, columns = count1..countN). A letter that never occurs
# at a given position gets a count of 0.
letter_counts_df = pd.concat(
    [words[f"letter{pos}"].value_counts() for pos in positions],
    keys=[f"count{pos}" for pos in positions],
    axis=1,
).fillna(0)

# Positional frequency: percentage of words with that letter at that position.
letter_percents_df = (100 * letter_counts_df / total).round(2)
letter_percents_df.columns = [f"letter{pos}" for pos in positions]

# Overall frequency: the letter's share of all letter slots (total words * positions).
letter_percents_df['overall'] = (
    100 * letter_counts_df.sum(axis=1) / (total * num_letters)
).round(2)

# Positional (and overall) Frequencies by Letter!

In [16]:
# Rank letters from most to least common overall.
letter_percents_df.sort_values(by='overall', ascending=False)

Unnamed: 0,letter1,letter2,letter3,letter4,letter5,overall
a,7.45,16.55,10.26,10.32,7.99,10.51
e,2.29,12.11,6.73,14.14,14.79,10.01
r,4.65,8.05,9.48,5.92,7.71,7.16
o,2.05,13.46,8.28,5.68,3.49,6.59
i,1.51,9.36,8.28,8.79,2.44,6.07
s,13.31,0.98,4.02,5.53,4.2,5.61
t,6.87,2.62,5.0,5.54,7.99,5.61
l,3.78,6.02,5.97,5.91,6.19,5.57
n,2.44,4.12,7.08,6.04,6.39,5.21
u,2.89,8.57,5.57,4.39,0.73,4.43


# Word Scorer!!!

The `overall_bonus` argument rewards the chance of getting a yellow, using each letter's overall frequency rather than its positional frequency (and it penalizes something like a Y at the end). Play with it depending on how much you think getting a yellow letter helps.

**TODO:** maybe give a separate `overall_bonus` to vowels?

In [17]:
def word_score(word, df=None, overall_bonus=2):
    """Score a word from letter-frequency percentages.

    For every letter the score adds two terms:

    * base score: the letter's frequency at that position minus the mean
      frequency of that position's column;
    * yellow bonus: the letter's overall frequency minus the mean overall
      frequency, multiplied by ``overall_bonus``.

    Parameters
    ----------
    word : str
        Word to score. Its letters must be row labels of ``df`` and it must
        not be longer than the number of ``letter<N>`` columns in ``df``.
    df : pandas.DataFrame, optional
        Frequency table indexed by letter with columns ``letter1``,
        ``letter2``, ... and ``overall``. When omitted, the notebook-level
        ``letter_percents_df`` is looked up at call time, so rebuilding that
        table never leaves a stale default behind.
    overall_bonus : float, default 2
        Weight applied to the overall-frequency (yellow) term.

    Returns
    -------
    float
        Sum of the base scores and weighted yellow bonuses; 0 for an empty word.

    Raises
    ------
    KeyError
        If a letter or a ``letter<N>`` column is missing from ``df``.
    """
    if df is None:
        df = letter_percents_df

    # Both averages come from the table actually being scored against,
    # not from whatever global table happens to exist.
    overall_mean = df['overall'].mean()

    score = 0
    for position, letter in enumerate(word, start=1):
        column = 'letter' + str(position)
        # Each column mean is about 100/26 = 3.846 when all 26 letters are
        # present, so an "average" letter contributes roughly zero.
        position_mean = df[column].mean()

        base_score = df.loc[letter, column] - position_mean
        yellow_bonus = (df.loc[letter, 'overall'] - overall_mean) * overall_bonus

        score = score + base_score + yellow_bonus

    return score
# Example: score a sample word against the frequency table.
word_score('heart', letter_percents_df)

54.49076923076923

In [18]:
def filter_repeats(x):
    """Return True when no element of ``x`` occurs more than once, else False.

    Used as a keep-mask: words without repeated letters map to True.
    """
    distinct_count = len(set(x))
    return distinct_count == len(x)

In [19]:
# Score every candidate word. .assign() returns a new frame, so `words` is not
# mutated and pandas has no slice to warn about; mapping over the 'word'
# column avoids building a full row object for each word.
words_scored = words.assign(score=words['word'].map(word_score))
words_scored.head()

Unnamed: 0,word,letter1,letter2,letter3,letter4,letter5,letter6,letter7,letter8,letter9,...,letter12,letter13,letter14,letter15,letter16,letter17,letter18,letter19,letter20,score
4,aalii,a,a,l,i,i,,,,,...,,,,,,,,,,60.980769
20,abaca,a,b,a,c,a,,,,,...,,,,,,,,,,48.870769
27,aback,a,b,a,c,k,,,,,...,,,,,,,,,,27.290769
35,abaff,a,b,a,f,f,,,,,...,,,,,,,,,,16.610769
36,abaft,a,b,a,f,t,,,,,...,,,,,,,,,,31.820769


In [20]:
# True for words made of distinct letters only; drop everything else.
mask = words_scored['word'].map(filter_repeats)
words_scored = words_scored.loc[mask]

words_scored.head()

Unnamed: 0,word,letter1,letter2,letter3,letter4,letter5,letter6,letter7,letter8,letter9,...,letter12,letter13,letter14,letter15,letter16,letter17,letter18,letter19,letter20,score
211,abhor,a,b,h,o,r,,,,,...,,,,,,,,,,25.160769
222,abide,a,b,i,d,e,,,,,...,,,,,,,,,,41.390769
246,abilo,a,b,i,l,o,,,,,...,,,,,,,,,,31.040769
318,abler,a,b,l,e,r,,,,,...,,,,,,,,,,50.200769
323,ablow,a,b,l,o,w,,,,,...,,,,,,,,,,16.780769


In [21]:
# Order the words from lowest to highest score, then turn each word's rank
# (its 0-based row position) into a percentile of the number of words.
# TODO Add raw and bonus scores

words_scored = (
    words_scored[['word', 'score']]
    .sort_values('score')
    .reset_index(drop=True)
    .assign(percentile=lambda ranked: ranked.index.to_numpy() / len(ranked) * 100)
    .loc[:, ['word', 'percentile']]
)

In [22]:
# Lift pandas' row and column display limits so tables are shown in full.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# The last 20 rows, i.e. the words with the highest percentiles.
words_scored.tail(20)

Unnamed: 0,word,percentile
5475,serta,99.636033
5476,laney,99.654231
5477,serai,99.672429
5478,saber,99.690628
5479,sayer,99.708826
5480,taise,99.727025
5481,carse,99.745223
5482,carte,99.763421
5483,slare,99.78162
5484,saute,99.799818


# Test your word here!

In [23]:
# Word to look up in the ranking.
target = 'donut'

# Rows (if any) whose word matches the target exactly.
words_scored_res = words_scored.loc[words_scored['word'].eq(target)]
words_scored_res

Unnamed: 0,word,percentile
1997,donut,36.342129
