In [7]:
import pandas as pd
import spacy as sp
import itertools
from tqdm import tqdm
import random
import math
pd.options.mode.chained_assignment = None  # default='warn'
%config Completer.use_jedi = False

In [None]:
######################################################################
######################################################################
################################### SETUP ############################
######################################################################
######################################################################

In [3]:
# Load 400 words from codenames
codenames_words = pd.read_csv('codenames_words.csv',index_col=False)
codenames_words = codenames_words['word']

In [6]:
# Download word vectors if not downloaded.
# !python -m spacy download en_core_web_lg
# Load word vectors from spacy
nlp = sp.load('en_core_web_lg')

Collecting en-core-web-lg==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.0.0/en_core_web_lg-3.0.0-py3-none-any.whl (778.8 MB)
[K     |████████████████████████████████| 778.8 MB 45 kB/s s eta 0:00:01     |█████████████████████▎          | 517.2 MB 11.2 MB/s eta 0:00:24
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
######################################################################
######################################################################
################################### PREPROCESS #######################
######################################################################
######################################################################

In [8]:
# Create word permutation matrix, will be used to make semantic distance dataset
ab = list(itertools.product(codenames_words,codenames_words))
codenames_pairs = pd.DataFrame(ab,columns=("source","destination"))
codenames_pairs['equi'] = codenames_pairs.source != codenames_pairs.destination
codenames_pairs = codenames_pairs[codenames_pairs['equi']]
codenames_pairs['mixed_string'] = codenames_pairs['source'] + codenames_pairs['destination']

# Cleanup
codenames_pairs.reset_index(inplace=True)
codenames_pairs.drop(columns = 'index', inplace=True)

In [9]:
# Get rid of duplicates to save time
for i in tqdm(range(0, len(codenames_pairs))):
    codenames_pairs['mixed_string'].iloc[i] = ''.join(sorted(codenames_pairs['mixed_string'].iloc[i]))

codenames_pairs.drop_duplicates(subset='mixed_string', keep="last", inplace = True)
codenames_pairs.drop(columns = ['equi'], inplace = True)

100%|██████████| 159600/159600 [03:19<00:00, 801.89it/s]


In [None]:
######################################################################
######################################################################
################################### NLP ##############################
######################################################################
######################################################################

In [10]:
# Import pre-calculated distances, skip to Algo section
codenames_pairs = pd.read_csv('codenames_with_distances.csv')

In [None]:
# NLP time
codenames_pairs['semantic_proximity'] = 0.0

for i in tqdm(range(0, len(codenames_pairs))):
    t1 = nlp(codenames_pairs.source.iloc[i])
    t2 = nlp(codenames_pairs.destination.iloc[i])
    codenames_pairs['semantic_proximity'].iloc[i] = t1.similarity(t2)

In [None]:
codenames_pairs.to_csv('codenames_with_distances.csv')

In [None]:
######################################################################
######################################################################
################################### ALGO #############################
######################################################################
######################################################################

In [None]:
# There are 3 * 10^39 possible board combinations so doing it completely would be impractical...
# We have to use a heuristic

# Heuristic: two words that have little or a lot to do with each other have bias. We leverage this idea.

In [11]:
def generate_distance(word_distance, iterations = 5):
    """
    Returns a list of 25 words for the game of Codenames.
    Choose how close you want your words to be!
    """
    if word_distance < 1 or word_distance > 21:
        raise('Word distance must be between 1 and 21')

    # Create dataframe to store list values and total score
    results_frame = pd.DataFrame(columns = ['word_board', 'score'])
    
    # Algo start
    for i in tqdm(range(1,iterations+1,1), desc="Cooking Words"):
        seed_subset = codenames_pairs[(codenames_pairs.semantic_proximity <= codenames_pairs.semantic_proximity.quantile([1-(word_distance-1)/21]).iloc[0]) & (codenames_pairs.semantic_proximity >= codenames_pairs.semantic_proximity.quantile([1-word_distance/21]).iloc[0])]
        seed_pair = seed_subset.iloc[random.randint(0,len(seed_subset))]
        chosen_values = []
        chosen_values.append(seed_pair.source)
        chosen_values.append(seed_pair.destination)
        
        if word_distance > 10:
            seed_pair = seed_subset.iloc[random.randint(0,len(seed_subset))]
            chosen_values.append(seed_pair.source)
            chosen_values.append(seed_pair.destination)
        
        chosen_values = list(set(chosen_values))

        while len(chosen_values) < 25:
            # get a subset of the first word's pairs to choose from
            leg_1 = codenames_pairs[(codenames_pairs['source'] == chosen_values[-2]) | (codenames_pairs['destination'] == chosen_values[-2])]
            # get the best candidates in this subset
            if word_distance > 10:
                dirty_values = list(leg_1.sort_values(by=['semantic_proximity']).head(45).source) + list(leg_1.sort_values(by=['semantic_proximity']).head(45).destination)
            else:
                dirty_values = list(leg_1.sort_values(by=['semantic_proximity'], ascending=False).head(45).source) + list(leg_1.sort_values(by=['semantic_proximity'], ascending=False).head(45).destination)
            # filter out already existing values
            search_values = [x for x in dirty_values if x not in chosen_values]

            # Create subset of the second word
            leg_2 = codenames_pairs[(codenames_pairs['source'] == chosen_values[-1]) | (codenames_pairs['destination'] == chosen_values[-1])]
            # Get the best candidate out of the candidates and append to chosen_values
            if word_distance > 10:
                leg_2_list = list(leg_2[(leg_2['source'].isin(search_values)) | (leg_2['destination'].isin(search_values))].sort_values(by='semantic_proximity').iloc[random.randint(word_distance,round(word_distance*((1+math.sqrt(5))/2), 0))][['source', 'destination']])
            else:
                leg_2_list = list(leg_2[(leg_2['source'].isin(search_values)) | (leg_2['destination'].isin(search_values))].sort_values(by='semantic_proximity', ascending = False).iloc[random.randint(word_distance, round(word_distance*((1+math.sqrt(5))/2), 0))][['source', 'destination']])
            leg_2_list = [x for x in leg_2_list if x not in chosen_values]
            chosen_values.extend(leg_2_list)

        results_frame = results_frame.append({'word_board' : chosen_values,
                        'score' : 0} , 
                        ignore_index=True)
    
    # Tabulate score 
    if iterations == 1:
        pass
    else:
        for i in tqdm(range(0,len(results_frame)), desc="Garnishing"):
            ab = list(itertools.product(results_frame.word_board[i],results_frame.word_board[i]))
            score_pairs = pd.DataFrame(ab,columns=("source","destination"))
            score_pairs['equi'] = score_pairs.source != score_pairs.destination
            score_pairs = score_pairs[score_pairs['equi']]
            score_pairs['mixed_string'] = score_pairs['source'] + score_pairs['destination']

            for j in range(0, len(score_pairs)):
                score_pairs['mixed_string'].iloc[j] = ''.join(sorted(score_pairs['mixed_string'].iloc[j]))

            score_pairs.drop_duplicates(subset='mixed_string', keep="last", inplace = True)
            score_pairs.drop(columns = ['equi'], inplace = True)

            t = pd.merge(score_pairs, codenames_pairs, how='left', on=['mixed_string'])
            total = sum(t.semantic_proximity)

            if sum(t.semantic_proximity.isnull().values.ravel()) > 0:
                print("One or more missing semantic proximity values. Check if record exists between score_pairs and codenames_words")
                break
            results_frame.iat[i, 1] = total
    
    # Return list
    if word_distance > 10:
        the_result = results_frame.sort_values(by='score').iloc[0].word_board
    else:
        the_result = results_frame.sort_values(by='score', ascending = False).iloc[0].word_board
    random.shuffle(the_result)
    return '[%s]' % ', '.join(map(str, the_result))

In [33]:
generate_distance(21, 20)

Cooking Words: 100%|██████████| 20/20 [00:07<00:00,  2.82it/s]
Garnishing: 100%|██████████| 20/20 [00:04<00:00,  4.26it/s]


'[Venus, Page, Smoothie, Coach, Avalanche, Blacksmith, Kilt, Patient, Sumo, Goldilocks, Mill, Tattoo, Crusader, Hit, Reindeer, Beam, Joker, Director, Book, Saloon, Mile, Manicure, Meter, Disk, Newton]'