## Creating better clue datasets

I don't have a labelled dataset because none of my friends want to come up with thousands of clues with me :(
    
To create clues based on words that have a similar meaning to a codeword, or on words that a codeword tends to trigger, we can use the [Datamuse API](https://www.datamuse.com/api/).

In [1]:
import decryptoai.word2vec_loader.loader as wv_loader

# Value passed as the loader's `limit`; also echoed in the progress message.
limit = 200_000
print(f"Loading {limit} keys")
# Load the word2vec keyed vectors through the project loader with that limit.
google_news_wv = wv_loader.load_word2vec_keyedvectors(limit=limit, debug=True)

Loading 200000 keys


In [2]:

import asyncio
import aiohttp
import decryptogame as dg
import decryptoai.config as cfg
import json
import pathlib

def datamuse_url(endpoint: str, words: list[str]) -> str:
    """Build the Datamuse request URL for ``endpoint`` and ``words``.

    Args:
        endpoint: Path plus query-parameter name, e.g. ``"words?ml"``. The
            joined words are appended after an ``=``.
        words: Query terms; they are percent-encoded and joined with ``+``.

    Returns:
        The complete request URL as a string.
    """
    # Imported locally so the function does not depend on another cell's imports.
    from urllib.parse import quote_plus

    # Encode each term so characters such as "&", "#" or "=" cannot terminate
    # or split the query value. Plain alphabetic words are left unchanged.
    # NOTE: prefix/suffix support can be added here later.
    query_str = '+'.join(quote_plus(word) for word in words)
    return f"https://api.datamuse.com/{endpoint}={query_str}"

async def fetch_text_response(session, url, return_id=None):
    """GET ``url`` through ``session`` and return ``(return_id, body_text)``.

    ``return_id`` is handed back untouched alongside the response text. It
    lets the caller associate each result with a parameter (for example the
    word that was queried) even though the calls are made asynchronously.
    """
    async with session.get(url) as resp:
        body = await resp.text()
    return return_id, body

async def fetch_text_responses(urls, return_ids):
    """Fetch every URL within one client session.

    ``urls`` and ``return_ids`` are paired positionally. The returned list
    holds one ``(return_id, text)`` result per pair, collected in the order
    the requests complete rather than the order they were given.
    """
    async with aiohttp.ClientSession() as session:
        pending = [
            fetch_text_response(session, url, return_id)
            for url, return_id in zip(urls, return_ids)
        ]
        results = []
        for finished in asyncio.as_completed(pending):
            results.append(await finished)
        return results

def create_dataset_dict(responses):
    """Turn ``(word, json_text)`` pairs into a ``{word: parsed_json}`` dict.

    Pairs whose decoded JSON is falsy (for example an empty list) are left
    out of the result.
    """
    return {
        word: parsed
        for word, raw_text in responses
        if (parsed := json.loads(raw_text))
    }


# process responses for local storage

async def load_dataset_from_path(path: pathlib.Path, endpoint: str, words):
    """Return the Datamuse dataset cached at ``path``, fetching it if absent.

    If ``path`` already exists, its JSON content is loaded and returned as
    is. Otherwise one request per word is sent to ``endpoint``, the
    responses are turned into a dict keyed by word, and that dict is written
    to ``path`` as JSON before being returned.

    Args:
        path: Location of the JSON cache file.
        endpoint: Datamuse endpoint string passed to ``datamuse_url``.
        words: Words to query, one request each.

    Returns:
        The dataset dict, either read from the cache or freshly fetched.
    """
    if path.exists():
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    # Create the cache directory before hitting the network so an unwritable
    # location fails fast. parents=True builds missing ancestors and
    # exist_ok=True tolerates the directory appearing in the meantime.
    path.parent.mkdir(parents=True, exist_ok=True)

    urls = [datamuse_url(endpoint, [word]) for word in words]
    responses = await fetch_text_responses(urls, words)
    dataset = create_dataset_dict(responses)

    with open(path, 'w', encoding="utf-8") as f:
        json.dump(dataset, f)
    return dataset


meaning_dataset_path = cfg.MEANING_JSON_PATH
triggerword_dataset_path = cfg.TRIGGERWORD_JSON_PATH

official_words = list(map(wv_loader.official_keyword_to_word, dg.official_words.english.words))

print("Loading meaning dataset")
meaning_dataset = await load_dataset_from_path(meaning_dataset_path, "words?ml", official_words)

print("Loading triggerword dataset")
triggerword_dataset = await load_dataset_from_path(triggerword_dataset_path, "words?rel_trg", official_words)


print("Done!")

Loading meaning dataset


FileNotFoundError: [Errno 2] No such file or directory: '/Users/jadenrodriguez/Projects/decrypto-ai-research/data/meaning.json'

Let's see if we can use the similar meaning and trigger word datasets to come up with reasonable clues that would be more of a challenge for our Guesser. That is, let's see if we can make clues that follow the rules and that I might be able to guess myself.

In [None]:
import random
import numpy as np

def filter_illegal_cluewords(legal_clue_func, datamuse_dataset):
    """Return a new dataset keeping only the legal clue entries per keyword.

    ``legal_clue_func(keyword, candidate_word)`` decides whether an entry
    stays. Every keyword of the input is kept as a key, even when none of
    its entries survive. The input dataset is not modified.
    """
    return {
        keyword: [
            entry for entry in entries
            if legal_clue_func(keyword, entry["word"])
        ]
        for keyword, entries in datamuse_dataset.items()
    }

def clueword_from_dataset(datamuse_dataset, code_word, seed=400):
    """Pick one clue word for ``code_word``, weighted by its Datamuse score.

    Args:
        datamuse_dataset: Mapping from code word to a list of entries, each
            a dict with ``"word"`` and ``"score"`` keys.
        code_word: The word to find a clue for.
        seed: Seed for the random generator created on each call, so the
            same dataset, word and seed always give the same clue.

    Returns:
        The chosen clue word, or the placeholder ``"garbage"`` when the
        dataset has no candidates for ``code_word`` (missing key or an empty
        list, e.g. after every candidate was filtered out as illegal).
    """
    entries = datamuse_dataset.get(code_word)
    if not entries:
        # An empty list would otherwise crash in random.choices below.
        return "garbage"

    candidate_words = [entry["word"] for entry in entries]
    np_scores = np.asarray([entry["score"] for entry in entries])
    probabilities = np_scores / np.sum(np_scores)
    [clue] = random.Random(seed).choices(candidate_words, probabilities)
    return clue

def clue_from_codewords(datamuse_dataset, codewords, seed=100):
    """Build a clue tuple with one clue word per entry of ``codewords``.

    Each clue word is drawn by ``clueword_from_dataset`` using the same
    ``seed``; the result keeps the order of ``codewords``.
    """
    clue_words = [
        clueword_from_dataset(datamuse_dataset, code_word, seed=seed)
        for code_word in codewords
    ]
    return tuple(clue_words)

def legal(keyword, word):
    """Return whether ``word`` may be used as a clue for ``keyword``.

    A clue is rejected when either string contains the other, when it is one
    of a fixed set of words that have both a British and an American
    spelling, or when it is absent from the loaded word2vec vocabulary.
    """
    if keyword in word or word in keyword:
        return False
    if word in ("armour", "moustache", "theatre", "mustache", "armor", "theater"):
        return False
    return word in google_news_wv

def codewords(keyword_card, code):
    """Return the words for the card positions listed in ``code``, in order.

    Each selected keyword is converted with the loader's
    ``official_keyword_to_word`` helper.
    """
    words = []
    for position in code:
        words.append(wv_loader.official_keyword_to_word(keyword_card[position]))
    return words


# Replace both datasets with versions that only contain legal clue words.
meaning_dataset = filter_illegal_cluewords(legal, meaning_dataset)
triggerword_dataset = filter_illegal_cluewords(legal, triggerword_dataset)

# Number of keywords on each generated keyword card.
keyword_card_length = 4

# Each generator step is unpacked as a one-element sequence; the fixed seeds
# make this sample card and code reproducible.
[test_keyword_card] = next(dg.generators.RandomKeywordCards(card_lengths=[keyword_card_length], seed=200))
[test_code] = next(dg.generators.RandomCodes([test_keyword_card], seed=200))
# Words on the card that the code points at, in code order.
test_codewords = codewords(test_keyword_card, test_code)

print(test_keyword_card)
print(test_code)

# One clue per dataset for the same code words, for side-by-side comparison.
meaning_clue = clue_from_codewords(meaning_dataset, test_codewords)
triggerword_clue = clue_from_codewords(triggerword_dataset, test_codewords)

print(meaning_clue)
print(triggerword_clue)

('WINTER', 'PATH', 'FESTIVAL', 'POISON')
(0, 1, 3)
('weather', 'street', 'drug')
('skiing', 'graph', 'pill')


Those are some reasonable clues! Let's save them to a CSV for ease of use.

In [None]:
import pandas
from itertools import permutations

def all_possible_codes(keyword_card_length=4, clue_length=3):
    """List every ordered pick of ``clue_length`` distinct card positions.

    Positions run from ``0`` to ``keyword_card_length - 1``; each code is a
    tuple, and the list follows the order ``itertools.permutations`` yields.
    """
    card_positions = range(keyword_card_length)
    return [*permutations(card_positions, clue_length)]


meaning_csv_path = cfg.MEANING_CSV_PATH
triggerword_csv_path = cfg.TRIGGERWORD_CSV_PATH

if not meaning_csv_path.exists() or not triggerword_csv_path.exists():
    # At least one CSV is missing: rebuild both from the same keyword cards
    # so the two files stay aligned row for row.
    num_keyword_cards = 1500
    codes = all_possible_codes()
    keyword_card_generator = dg.generators.RandomKeywordCards(card_lengths=[keyword_card_length], seed=100)

    meaning_data = []
    triggerword_data = []
    # zip with range() caps how many cards are taken from the generator.
    for _, [keyword_card] in zip(range(num_keyword_cards), keyword_card_generator):
        for i, code in enumerate(codes):
            # Resolve the code's words once and reuse them for both datasets.
            code_words = codewords(keyword_card, code)

            # Row layout: the card's keywords, the clue words, then the
            # index of the code within `codes`.
            meaning_clue = clue_from_codewords(meaning_dataset, code_words)
            meaning_data.append(keyword_card + meaning_clue + (i,))

            triggerword_clue = clue_from_codewords(triggerword_dataset, code_words)
            triggerword_data.append(keyword_card + triggerword_clue + (i,))

    header = ["keyword1", "keyword2", "keyword3", "keyword4", "clue1",  "clue2",  "clue3", "code_index"]
    meaning_df = pandas.DataFrame(meaning_data, columns=header)
    triggerword_df = pandas.DataFrame(triggerword_data, columns=header)

    meaning_df.to_csv(str(meaning_csv_path), index=False)
    triggerword_df.to_csv(str(triggerword_csv_path), index=False)

else:
    # Both CSVs are cached: load both, so triggerword_df is defined on this
    # branch as well as on the rebuild branch.
    meaning_df = pandas.read_csv(str(meaning_csv_path))
    triggerword_df = pandas.read_csv(str(triggerword_csv_path))

# Show a few shuffled rows as the cell's output.
meaning_df.sample(frac=1).head()

Unnamed: 0,keyword1,keyword2,keyword3,keyword4,clue1,clue2,clue3,code_index
0,BUTTERFLY,PLUMBER,FOOT,BLOOD,throttle,electricians,leg,0
1,BUTTERFLY,PLUMBER,FOOT,BLOOD,throttle,electricians,ancestry,1
2,BUTTERFLY,PLUMBER,FOOT,BLOOD,throttle,leg,electricians,2
3,BUTTERFLY,PLUMBER,FOOT,BLOOD,throttle,leg,ancestry,3
4,BUTTERFLY,PLUMBER,FOOT,BLOOD,throttle,ancestry,electricians,4


Now we have tens of thousands of clues to reference.