## Data Processing

In [1]:
# Dependencies
import pandas as pd
import json

In [2]:
# Read json file from https://github.com/dwyl/english-words/blob/master/words_dictionary.json
# The context manager closes the file handle as soon as parsing is done, so the
# kernel does not keep the descriptor open for the rest of the session.
with open("Data/words_dictionary.json", encoding="utf-8") as json_file:
    all_words = json.load(json_file)

In [3]:
# Keep only the dictionary keys that are exactly five characters long
five_letter_words = [entry for entry in all_words.keys() if len(entry) == 5]

%store five_letter_words

Stored 'five_letter_words' (list)


## Letter Scores

In [4]:
# For every letter a-z, count the five-letter words that contain it at least once
# (a word with a repeated letter still adds only 1 to that letter's tally)
letter_appearences = dict.fromkeys("abcdefghijklmnopqrstuvwxyz", 0)

for letter in letter_appearences:
    # Each membership test yields True/False, so the sum is the number of matching words
    letter_appearences[letter] = sum(letter in word for word in five_letter_words)

letter_appearences

{'a': 7247,
 'b': 1936,
 'c': 2588,
 'd': 2639,
 'e': 6728,
 'f': 1115,
 'g': 1867,
 'h': 2223,
 'i': 4767,
 'j': 372,
 'k': 1663,
 'l': 3923,
 'm': 2361,
 'n': 3773,
 'o': 4613,
 'p': 2148,
 'q': 139,
 'r': 4864,
 's': 5871,
 't': 3866,
 'u': 3241,
 'v': 853,
 'w': 1160,
 'x': 357,
 'y': 2476,
 'z': 435}

In [5]:
# For each of the five positions, count how many five-letter words have each letter there
position_names = ("First", "Second", "Third", "Fourth", "Fifth")
letter_positions = {
    position: dict.fromkeys("abcdefghijklmnopqrstuvwxyz", 0)  # fresh a-z tally per position
    for position in position_names
}

for word in five_letter_words:
    # A position's index in position_names is also its index into the word
    for i, letter_position in enumerate(position_names):
        letter_positions[letter_position][word[i]] += 1

# Optional tweak (disabled): zero out the ending "s" to make plurals less valuable
# letter_positions["Fifth"]["s"] = 0

letter_positions

{'First': {'a': 1173,
  'b': 1141,
  'c': 1196,
  'd': 801,
  'e': 421,
  'f': 684,
  'g': 737,
  'h': 571,
  'i': 301,
  'j': 260,
  'k': 473,
  'l': 679,
  'm': 849,
  'n': 405,
  'o': 334,
  'p': 944,
  'q': 85,
  'r': 681,
  's': 1813,
  't': 981,
  'u': 328,
  'v': 287,
  'w': 468,
  'x': 27,
  'y': 167,
  'z': 112},
 'Second': {'a': 2871,
  'b': 108,
  'c': 254,
  'd': 136,
  'e': 1970,
  'f': 40,
  'g': 102,
  'h': 720,
  'i': 1669,
  'j': 19,
  'k': 101,
  'l': 866,
  'm': 233,
  'n': 557,
  'o': 2281,
  'p': 283,
  'q': 21,
  'r': 1151,
  's': 173,
  't': 316,
  'u': 1403,
  'v': 81,
  'w': 174,
  'x': 74,
  'y': 279,
  'z': 36},
 'Third': {'a': 1481,
  'b': 446,
  'c': 531,
  'd': 514,
  'e': 1027,
  'f': 198,
  'g': 461,
  'h': 208,
  'i': 1267,
  'j': 57,
  'k': 309,
  'l': 1060,
  'm': 649,
  'n': 1238,
  'o': 1154,
  'p': 434,
  'q': 27,
  'r': 1544,
  's': 682,
  't': 783,
  'u': 787,
  'v': 287,
  'w': 276,
  'x': 126,
  'y': 229,
  'z': 143},
 'Fourth': {'a': 1585,
  '

## Word Scores

In [6]:
# Map every five-letter word to its word score: the sum of letter_appearences
# for each of its letters. A letter that occurs more than once in the word
# contributes nothing, so only letters appearing exactly once are added.
word_scores = {
    word: sum(
        letter_appearences[letter]
        for letter in word
        if word.count(letter) == 1
    )
    for word in five_letter_words
}

%store word_scores

Stored 'word_scores' (dict)


In [7]:
# Map every five-letter word to its positional score: for each of the five slots,
# add the letter_positions tally of the letter found in that slot. Letters that
# occur more than once in the word are skipped entirely.
position_names = ("First", "Second", "Third", "Fourth", "Fifth")
positional_word_scores = {}

for word in five_letter_words:
    positional_word_scores[word] = sum(
        letter_positions[letter_position][word[i]]
        for i, letter_position in enumerate(position_names)
        if word.count(word[i]) == 1
    )

%store positional_word_scores

Stored 'positional_word_scores' (dict)


In [8]:
# Build one table with both scores per word.
# The positional score is looked up by word rather than taken from
# positional_word_scores.values(), so each row stays correctly paired even if the
# two dictionaries were built (or restored) in different orders. A word missing
# from positional_word_scores raises KeyError instead of silently misaligning rows.
word_scores_df = pd.DataFrame({
    "Word": list(word_scores),
    "Word Score": list(word_scores.values()),
    "Positional Score": [positional_word_scores[word] for word in word_scores],
})

# Rank by overall letter score first; the positional score breaks ties
word_scores_df = word_scores_df.sort_values(by=["Word Score", "Positional Score"], ascending=False)
word_scores_df.head()

Unnamed: 0,Word,Word Score,Positional Score
866,aries,29477,9248
12345,serai,29477,7421
11268,raise,29477,7413
873,arise,29477,6185
229,aesir,29477,6041
