In [408]:
import pandas as pd
import spacy as sp
import itertools
from tqdm import tqdm
import random
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
######################################################################
######################################################################
################################### SETUP ############################
######################################################################
######################################################################

In [7]:
# Load 400 words from codenames
codenames_words = pd.read_csv('codenames_words.csv',index_col=False)
codenames_words = codenames_words['word']

In [16]:
# Load word vectors from spacy
nlp = sp.load('en_core_web_lg')

In [None]:
######################################################################
######################################################################
################################### PREPROCESS #######################
######################################################################
######################################################################

In [167]:
# Create word permutation matrix, will be used to make semantic distance dataset
ab = list(itertools.product(codenames_words,codenames_words))
codenames_pairs = pd.DataFrame(ab,columns=("source","destination"))
codenames_pairs['equi'] = codenames_pairs.source != codenames_pairs.destination
codenames_pairs = codenames_pairs[codenames_pairs['equi']]
codenames_pairs['mixed_string'] = codenames_pairs['source'] + codenames_pairs['destination']

# Cleanup
codenames_pairs.reset_index(inplace=True)
codenames_pairs.drop(columns = 'index', inplace=True)

In [351]:
# Get rid of duplicates to save time
for i in tqdm(range(0, len(codenames_pairs))):
    codenames_pairs['mixed_string'].iloc[i] = ''.join(sorted(codenames_pairs['mixed_string'].iloc[i]))

completed_sort_backup = codenames_pairs.copy()
codenames_pairs.drop_duplicates(subset='mixed_string', keep="last", inplace = True)
codenames_pairs.drop(columns = ['equi'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
100%|██████████| 78960/78960 [01:42<00:00, 769.15it/s]


In [None]:
######################################################################
######################################################################
################################### NLP ##############################
######################################################################
######################################################################

In [197]:
# NLP time
codenames_pairs['semantic_proximity'] = 0.0

for i in tqdm(range(0, len(codenames_pairs))):
    t1 = nlp(codenames_pairs.source.iloc[i])
    t2 = nlp(codenames_pairs.destination.iloc[i])
    codenames_pairs['semantic_proximity'].iloc[i] = t1.similarity(t2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
100%|██████████| 79358/79358 [25:44<00:00, 51.40it/s]  


In [198]:
codenames_pairs.to_csv('codenames_with_distances.csv')

In [None]:
######################################################################
######################################################################
################################### ALGO #############################
######################################################################
######################################################################

In [None]:
# There are 3 * 10^39 possible board combinations so doing it completely would be impractical...
# We have to use a heuristic

# Heuristic: two words that have little to do with each other likely do not share words with high proximity
# So we do the following: start with 2 seed words that are very far apart...
# Then from that subset: pick the 30 least connected words to the first word, and select the least connected word with the second word
    # will have to filter out words already in set
    # do this until 25 are reached
    # store as a list for further computation

In [568]:
def gen_codename_words(word_distance, iterations = 20):
    """
    Returns a list of 25 words for the game of Codenames.
    Choose how close you want your words to be!
    """
    if word_distance < 1 or word_distance > 11:
        raise('Word distance must be between 1 and 11')

    # Create dataframe to store list values and total score
    results_frame = pd.DataFrame(columns = ['word_board', 'score'])
    
    # Algo start
    for i in tqdm(range(1,iterations+1,1), desc="Cooking Words"):
        seed_subset = codenames_pairs[(codenames_pairs.semantic_proximity <= codenames_pairs.semantic_proximity.quantile([1-(word_distance-1)/11]).iloc[0]) & (codenames_pairs.semantic_proximity >= codenames_pairs.semantic_proximity.quantile([1-word_distance/11]).iloc[0])]
        seed_pair = seed_subset.iloc[random.randint(0,len(seed_subset))]
        chosen_values = []
        # take the top 3 values from leg 1, then find the score in leg_2, get the lowest value from leg_2
        chosen_values.append(seed_pair.source)
        chosen_values.append(seed_pair.destination)

        while len(chosen_values) < 25:
            # get a subset of the first word's pairs to choose from
            leg_1 = codenames_pairs[(codenames_pairs['source'] == chosen_values[-2]) | (codenames_pairs['destination'] == chosen_values[-2])]
            # get the best 3 candidates in this subset
            if word_distance > 6:
                dirty_values = list(leg_1.sort_values(by=['semantic_proximity']).head(45).source) + list(leg_1.sort_values(by=['semantic_proximity']).head(45).destination)
            else:
                dirty_values = list(leg_1.sort_values(by=['semantic_proximity'], ascending=False).head(45).source) + list(leg_1.sort_values(by=['semantic_proximity'], ascending=False).head(45).destination)
            # filter out already existing values
            search_values = [x for x in dirty_values if x not in chosen_values]

            # Create subset of the second word
            leg_2 = codenames_pairs[(codenames_pairs['source'] == chosen_values[-1]) | (codenames_pairs['destination'] == chosen_values[-1])]
            # Get the best candidate out of the 3 candidates and append to chosen_values
            if word_distance > 6:
                leg_2_list = list(leg_2[(leg_2['source'].isin(search_values)) | (leg_2['destination'].isin(search_values))].sort_values(by='semantic_proximity').iloc[10][['source', 'destination']])
            else:
                leg_2_list = list(leg_2[(leg_2['source'].isin(search_values)) | (leg_2['destination'].isin(search_values))].sort_values(by='semantic_proximity', ascending = False).iloc[10][['source', 'destination']])
            leg_2_list = [x for x in leg_2_list if x not in chosen_values]
            chosen_values.extend(leg_2_list)

        results_frame = results_frame.append({'word_board' : chosen_values,
                        'score' : 0} , 
                        ignore_index=True)
    
    # Tabulate score 
    for i in tqdm(range(0,len(results_frame)), desc="Garnishing"):
        ab = list(itertools.product(results_frame.word_board[i],results_frame.word_board[i]))
        score_pairs = pd.DataFrame(ab,columns=("source","destination"))
        score_pairs['equi'] = score_pairs.source != score_pairs.destination
        score_pairs = score_pairs[score_pairs['equi']]
        score_pairs['mixed_string'] = score_pairs['source'] + score_pairs['destination']

        for j in range(0, len(score_pairs)):
            score_pairs['mixed_string'].iloc[j] = ''.join(sorted(score_pairs['mixed_string'].iloc[j]))

        score_pairs.drop_duplicates(subset='mixed_string', keep="last", inplace = True)
        score_pairs.drop(columns = ['equi'], inplace = True)

        t = pd.merge(score_pairs, codenames_pairs, how='left', on=['mixed_string'])
        total = sum(t.semantic_proximity)

        if sum(t.semantic_proximity.isnull().values.ravel()) > 0:
            print("One or more missing semantic proximity values. Check if record exists between score_pairs and codenames_words")
            break
        results_frame.iat[i, 1] = total
    
    # Return list
    the_result = results_frame.sort_values(by='score').iloc[int(round(iterations/2,0))].word_board
    random.shuffle(the_result)
    return the_result

In [493]:
for i in tqdm(range(1,iters+1,1)):
    seed_subset = codenames_pairs[(codenames_pairs.semantic_proximity >= codenames_pairs.semantic_proximity.quantile([word_distance/11-1/11]).iloc[0]) & (codenames_pairs.semantic_proximity <= codenames_pairs.semantic_proximity.quantile([(word_distance + 1)/11-1/11]).iloc[0])]
    seed_pair = seed_subset.iloc[random.randint(0,len(seed_subset))]
    chosen_values = []
    # take the top 3 values from leg 1, then find the score in leg_2, get the lowest value from leg_2
    chosen_values.append(seed_pair.source)
    chosen_values.append(seed_pair.destination)


    while len(chosen_values) < 25:
        # get a subset of the first word's pairs to choose from
        leg_1 = codenames_pairs[(codenames_pairs['source'] == chosen_values[-2]) | (codenames_pairs['destination'] == chosen_values[-2])]
        # get the best 3 candidates in this subset
        if word_distance < 6:
            dirty_values = list(leg_1.sort_values(by=['semantic_proximity']).head(30).source) + list(leg_1.sort_values(by=['semantic_proximity']).head(30).destination)
        else:
            dirty_values = list(leg_1.sort_values(by=['semantic_proximity'], ascending=False).head(30).source) + list(leg_1.sort_values(by=['semantic_proximity'], ascending=False).head(30).destination)
        # filter out already existing values
        search_values = [x for x in dirty_values if x not in chosen_values]

        # Create subset of the second word
        leg_2 = codenames_pairs[(codenames_pairs['source'] == chosen_values[-1]) | (codenames_pairs['destination'] == chosen_values[-1])]
        # Get the best candidate out of the 3 candidates and append to chosen_values
        if word_distance < 6:
            leg_2_list = list(leg_2[(leg_2['source'].isin(search_values)) | (leg_2['destination'].isin(search_values))].sort_values(by='semantic_proximity').iloc[0][['source', 'destination']])
        else:
            leg_2_list = list(leg_2[(leg_2['source'].isin(search_values)) | (leg_2['destination'].isin(search_values))].sort_values(by='semantic_proximity', ascending = False).iloc[0][['source', 'destination']])
        leg_2_list = [x for x in leg_2_list if x not in chosen_values]
        chosen_values.extend(leg_2_list)
    
    results_frame = results_frame.append({'word_board' : chosen_values,
                    'score' : 0} , 
                    ignore_index=True)

100%|██████████| 20/20 [00:28<00:00,  1.44s/it]


In [494]:
# Now tabulate the score
# the score = total_proximity = sum of all semantic_proximity, the lower the more distant the board
    # we do this by knowing each value pair, should be 300 pairs, then we just sum it
    # can likely reuse matrix logic from earlier

for i in tqdm(range(0,len(results_frame))):
    ab = list(itertools.product(results_frame.word_board[i],results_frame.word_board[i]))
    score_pairs = pd.DataFrame(ab,columns=("source","destination"))
    score_pairs['equi'] = score_pairs.source != score_pairs.destination
    score_pairs = score_pairs[score_pairs['equi']]
    score_pairs['mixed_string'] = score_pairs['source'] + score_pairs['destination']

    for j in range(0, len(score_pairs)):
        score_pairs['mixed_string'].iloc[j] = ''.join(sorted(score_pairs['mixed_string'].iloc[j]))

    score_pairs.drop_duplicates(subset='mixed_string', keep="last", inplace = True)
    score_pairs.drop(columns = ['equi'], inplace = True)

    t = pd.merge(score_pairs, codenames_pairs, how='left', on=['mixed_string'])
    total = sum(t.semantic_proximity)
#     print(total)
    if sum(t.semantic_proximity.isnull().values.ravel()) > 0:
        print("One or more missing semantic proximity values. Check if record exists between score_pairs and codenames_words")
        break
    results_frame.iat[i, 1] = total

100%|██████████| 20/20 [00:04<00:00,  4.26it/s]


In [None]:
# OK, we got to the finish line. We now hard inputted a list of words and then built an approximate algorithm to generate the hardest boards
# Additional features:
# Being able to choose "Word Distance" from 1 to 11
# 1 means close 11 means far. That can just translate into percentiles so we don't even really have to change sorting logic


In [569]:
gen_codename_words(6, iterations = 2)

Cooking Words: 100%|██████████| 2/2 [00:01<00:00,  1.14it/s]
Garnishing: 100%|██████████| 2/2 [00:00<00:00,  5.01it/s]


['Puppet',
 'Valentine',
 'Bowl',
 'Frog',
 'Wizard',
 'Bucket',
 'Rice',
 'Hammer',
 'Cave',
 'Mud',
 'Easter',
 'Snake',
 'Walrus',
 'Troll',
 'Scarecrow',
 'Big Ben',
 'Zombie',
 'Cow',
 'Vampire',
 'Trick',
 'Elephant',
 'Crow',
 'Rainbow',
 'Cowboy',
 'Christmas']