## Creating better clue datasets

To create clues that either have a similar meaning to a codeword or are words commonly triggered by it, we can use the [Datamuse API](https://www.datamuse.com/api/).

In [2]:
# load word2vec model
import word2vec_loader as wv_loader

# Cap on the number of keys; passed as the second argument to the loader below.
limit = 200_000
print(f"Loading {limit} keys")
# NOTE(review): the recorded output of this cell is an AttributeError stating
# that `word2vec_loader` has no attribute `google_news_path_name` -- confirm
# which path attribute the loader module actually exposes before relying on
# this cell. `google_news_wv` is used later for `word in google_news_wv` checks.
google_news_wv = wv_loader.load_word2vec_keyedvectors(wv_loader.google_news_path_name, limit)

Loading 200000 keys


AttributeError: module 'word2vec_loader' has no attribute 'google_news_path_name'

To create clues that either have a similar meaning to a codeword or are words commonly triggered by it, we can use the [Datamuse API](https://www.datamuse.com/api/).

In [None]:

import asyncio
import aiohttp
import decryptogame as dg
import json
import os

def datamuse_url(endpoint: str, words: list[str]): # can add stuff for prefix/suffix support later
    """Build a Datamuse API request URL for the given words.

    Args:
        endpoint: Path plus query-parameter name, without the trailing ``=``
            (for example ``"words?ml"``).
        words: Words that make up the query value; they are joined with ``+``.

    Returns:
        The full request URL as a string.
    """
    # Local import so the block stays self-contained.
    from urllib.parse import quote_plus

    # Percent-encode each word so characters such as spaces, '&' or '#' cannot
    # break or truncate the query string. Plain alphanumeric words are unchanged.
    query_str = '+'.join(quote_plus(word) for word in words)
    return f"https://api.datamuse.com/{endpoint}={query_str}"

async def fetch_text_response(session, url, return_id=None):
    """GET ``url`` through ``session`` and return ``(return_id, body_text)``.

    ``return_id`` is handed back untouched so the caller can tell which
    request a response belongs to, even though the requests are awaited
    asynchronously (for example, the word the URL was built from).
    """
    async with session.get(url) as reply:
        body = await reply.text()
    return return_id, body

async def fetch_text_responses(urls, return_ids):
    """Fetch every URL through one shared session and collect the results.

    ``urls`` and ``return_ids`` are paired positionally. Each result is the
    ``(return_id, text)`` tuple from ``fetch_text_response``; results are
    gathered in the order ``asyncio.as_completed`` yields them, which need
    not match the input order.
    """
    async with aiohttp.ClientSession() as session:
        pending = [
            fetch_text_response(session, url, return_id)
            for url, return_id in zip(urls, return_ids)
        ]
        results = []
        for finished in asyncio.as_completed(pending):
            results.append(await finished)
        return results

# process responses for local storage

def create_dataset_dict(responses):
    """Turn ``(word, json_text)`` pairs into a ``{word: decoded_json}`` dict.

    If a word appears more than once, the last decoded response wins.
    """
    return {word: json.loads(raw_text) for word, raw_text in responses}

async def load_dataset_from_path(path_str, endpoint: str, words):
    """Return the dataset stored at ``path_str``, building it first if absent.

    When the file already exists, its JSON content is returned and no request
    is made. Otherwise one Datamuse URL is built per word for ``endpoint``,
    all responses are fetched, the resulting ``{word: response}`` dict is
    written to ``path_str`` as JSON, and that dict is returned.
    """
    # Cached copy on disk: reuse it instead of hitting the API again.
    if os.path.exists(path_str):
        with open(path_str) as cached:
            return json.load(cached)

    request_urls = [datamuse_url(endpoint, [word]) for word in words]
    fetched = await fetch_text_responses(request_urls, words)
    dataset = create_dataset_dict(fetched)

    with open(path_str, 'w') as out:
        json.dump(dataset, out)
    return dataset


meaning_dataset_path = "meaning.json"
triggerword_dataset_path = "trigger_word.json"

official_words = list(map(wv_loader.official_keyword_to_word, dg.official_words.english.words))

print("Loading meaning dataset")
meaning_dataset = await load_dataset_from_path(meaning_dataset_path, "words?ml", official_words)

print("Loading triggerword dataset")
triggerword_dataset = await load_dataset_from_path(triggerword_dataset_path, "words?rel_trg", official_words)


print("Done!")

Loading meaning dataset
Loading triggerword dataset
Done!


Let's see if we can use the similar meaning and trigger word datasets to come up with reasonable clues that would be more of a challenge for our Guesser. That is, let's see if we can make clues that follow the rules and that I might be able to guess myself.

In [None]:
import random
import numpy as np

def filter_illegal_cluewords(legal_clue_func, datamuse_dataset):
    """Return a copy of the dataset keeping only entries judged legal.

    ``legal_clue_func(keyword, candidate_word)`` is called for every entry's
    ``"word"``; entries for which it returns a falsy value are dropped. Every
    keyword is kept, even when its filtered list ends up empty.
    """
    return {
        keyword: [
            entry for entry in entries
            if legal_clue_func(keyword, entry["word"])
        ]
        for keyword, entries in datamuse_dataset.items()
    }

def clueword_from_dataset(datamuse_dataset, code_word, seed=400):
    """Pick one clue word for ``code_word``, weighted by its Datamuse score.

    Returns the placeholder string ``"garbage"`` when the dataset has no
    entries for ``code_word``. Otherwise a fresh ``random.Random(seed)`` draws
    a single word with probability proportional to each entry's ``"score"``,
    so the same dataset and seed always yield the same clue.
    """
    entries = datamuse_dataset[code_word]
    if not entries:
        return "garbage"

    candidate_words = [entry["word"] for entry in entries]
    weights = np.asarray([entry["score"] for entry in entries])
    # Normalise the scores so they sum to one.
    probabilities = weights / np.sum(weights)

    rng = random.Random(seed)
    return rng.choices(candidate_words, probabilities)[0]

def clue_from_codewords(datamuse_dataset, codewords, seed=200):
    """Build a clue tuple with one clue word per code word.

    Args:
        datamuse_dataset: Mapping passed through to ``clueword_from_dataset``.
        codewords: Iterable of code words to produce clue words for. Inside
            this function the name refers to this argument, not to the
            module-level ``codewords`` helper.
        seed: Seed forwarded unchanged to every ``clueword_from_dataset`` call.

    Returns:
        A tuple of clue words in the same order as ``codewords``.
    """
    # Iterate over the ``codewords`` parameter; ``code_words`` is not defined
    # anywhere and would raise NameError on every call.
    return tuple(clueword_from_dataset(datamuse_dataset, word, seed=seed) for word in codewords)

def legal(keyword, word):
    """Return whether ``word`` may be used as a clue for ``keyword``.

    A clue is rejected when it occurs inside the keyword (substring test);
    otherwise it is legal only if it is present in ``google_news_wv``.
    """
    if word in keyword:
        return False
    return word in google_news_wv

def codewords(keyword_card, code):
    """Look up the keywords that ``code`` points at, converted to words.

    Each element of ``code`` is used as an index into ``keyword_card``; the
    selected keyword is passed through ``wv_loader.official_keyword_to_word``.
    The result is a list in the same order as ``code``.
    """
    selected = []
    for position in code:
        selected.append(wv_loader.official_keyword_to_word(keyword_card[position]))
    return selected


# Rebind both datasets to filtered versions that keep only the clue words
# accepted by `legal`.
meaning_dataset = filter_illegal_cluewords(legal, meaning_dataset)
triggerword_dataset = filter_illegal_cluewords(legal, triggerword_dataset)


# Take the first batch from each generator; the single-element unpacking
# requires each batch to contain exactly one item.
# NOTE(review): presumably the fixed seeds make the card and code reproducible
# -- confirm against the decryptogame generators.
[test_keyword_card] = next(dg.generators.RandomKeywordCards(card_lengths=[4], seed=200))
[test_code] = next(dg.generators.RandomCodes([test_keyword_card], seed=200))

print(test_keyword_card)
print(test_code)

# Build one clue tuple from each dataset for the same card and code.
meaning_clue = clue_from_codewords(meaning_dataset, codewords(test_keyword_card, test_code))
triggerword_clue = clue_from_codewords(triggerword_dataset, codewords(test_keyword_card, test_code))

print(meaning_clue)
print(triggerword_clue)

NameError: name 'meaning_dataset' is not defined

Those are some reasonable clues!