# Cluster the Spire
Will Wright

### Purpose and Context

[todo]

In [2]:
# Load packages
import shutil
from os import listdir
import json
import glob
import os
import numpy as np
import pandas as pd
import random
import copy

# increase viewable dataframe rows and columns
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 20)

# set random seed
random.seed(30)

All the data currently lives in several zipped tar.gz files within the 'zipped' folder.  These need to be extracted into an unzipped folder.

**PROTIP:** If you have the files already extracted (as they are in the repo), skip this step to avoid the lengthy unpacking process

In [2]:
def extract_all(archives, extract_path, zip_format = "gztar"):
    '''
    input: path to zipped file archives, path to extract, and type of zipped file
    output: unzipped contents of each zipped file within the extract path
    '''
    for filename in listdir(archives):
        shutil.unpack_archive(archives+filename, extract_path, zip_format)

In [32]:
extract_all("../data_raw/zipped/","../data_raw/unzipped/", "gztar")

In [3]:
# Start here if the files are already unzipped
read_files = glob.glob("../data_raw/unzipped/*/*.json", recursive = True)

To give more context about the data we're working with, lets see exactly how many raw game runs we have:

In [4]:
len(read_files)

279848

Almost 280K games! We'll need to subset down to games for The Defect on Ascension 20 that resulted in wins before we can determine the relevant sample size though. In order to do that, we'll want to read these files together and use relevant JSON keys to narrow our focus.

In [5]:
# this approach creates a list of JSON strings from all the read_files
output_list = []

for f in read_files:
    try:
        with open(f, "r") as infile:
            # test if the file isn't empty and that the name doesn't contain 'undefined' (1 file, contents are "File doesn't exists)")
            if (os.path.getsize(f)>0) & (('undefined' in f)==False):
                output_list.append(json.load(infile))
            else:
                pass
    except UnicodeDecodeError: # some unicode can't be read so just don't load those games (I think it's a particular monster name)
        pass
    

In [6]:
len(output_list)

279693

In [7]:
len(read_files)-len(output_list)

155

We've excluded 155 games that were either empty, had unreadable unicode, or were 'undefined'.  It's possible that this may introduce some bias (e.g. removing relevant games with particular qualities), but given that this represents such a small volume of games relative to all 280K and I haven't seen any apparent bias in looking through a sample of the files, I don't think this should be a major concern.

After more attempts to get the data into the right format, it looks like there is a single case where the JSON is wrapped in '[ ]'.  Since this game is for Ironclad, I'll simply remove from the dataset.

In [8]:
len(output_list)

279693

In [9]:
output_list[:] = [s for s in output_list if str(s)[0]!='[']

In [10]:
len(output_list)

279692

With that single exception removed, we can now subset to a list of Defect games, which pass the conditions of being the Defect character, a victory, and Ascension 20.  Since it's possible that I'll want to expand this investigation to the other two characters later, I'll also set aside their games in their own lists.

In [11]:
# Winning Ascension 20 games by character
defect_asc20_win_games = []
ironclad_asc20_win_games = []
silent_asc20_win_games = []

# Losing Ascension 20 games by character
defect_asc20_lose_games = []
ironclad_asc20_lose_games = []
silent_asc20_lose_games = []

for i in range(len(output_list)):
    if output_list[i] is not None:
        # test to ensure the game data has all the required elements (character, ascention level, and victory status)
        if ('character_chosen' in dict(output_list[i])) and \
        ('ascension_level' in dict(output_list[i])) and \
        ('victory' in dict(output_list[i])):
            
            # DEFECT WINNING
            if (output_list[i]['character_chosen']=='DEFECT') & \
            (output_list[i]['victory']==True) & \
            (output_list[i]['ascension_level']==20):
                defect_asc20_win_games.append(output_list[i])
            
            # DEFECT LOSING
            if (output_list[i]['character_chosen']=='DEFECT') & \
            (output_list[i]['victory']==False) & \
            (output_list[i]['ascension_level']==20):
                defect_asc20_lose_games.append(output_list[i])
            
            # IRONCLAD WINNING  
            if (output_list[i]['character_chosen']=='IRONCLAD') & \
            (output_list[i]['victory']==True) & \
            (output_list[i]['ascension_level']==20):
                ironclad_asc20_win_games.append(output_list[i])
                
            # IRONCLAD LOSING  
            if (output_list[i]['character_chosen']=='IRONCLAD') & \
            (output_list[i]['victory']==False) & \
            (output_list[i]['ascension_level']==20):
                ironclad_asc20_lose_games.append(output_list[i])
                
            # SILENT WINNING  
            if (output_list[i]['character_chosen']=='THE_SILENT') & \
            (output_list[i]['victory']==True) & \
            (output_list[i]['ascension_level']==20):
                silent_asc20_win_games.append(output_list[i])
                
            # SILENT LOSING
            if (output_list[i]['character_chosen']=='THE_SILENT') & \
            (output_list[i]['victory']==False) & \
            (output_list[i]['ascension_level']==20):
                silent_asc20_lose_games.append(output_list[i])

I'm curious about character winrates.  Lets compare to the total games per character.

In [12]:
# Calculate all summary statistics
defect_winning = len(defect_asc20_win_games)
defect_losing = len(defect_asc20_lose_games)
defect_total = len(defect_asc20_win_games)+len(defect_asc20_lose_games)
defect_winrate = defect_winning/defect_total

# Calculate all summary statistics
ironclad_winning = len(ironclad_asc20_win_games)
ironclad_losing = len(ironclad_asc20_lose_games)
ironclad_total = len(ironclad_asc20_win_games)+len(ironclad_asc20_lose_games)
ironclad_winrate = ironclad_winning/ironclad_total

# Calculate all summary statistics
silent_winning = len(silent_asc20_win_games)
silent_losing = len(silent_asc20_lose_games)
silent_total = len(silent_asc20_win_games)+len(silent_asc20_lose_games)
silent_winrate = silent_winning/silent_total


asc20_games_summary = pd.DataFrame({'Character':['Defect','Ironclad','Silent'],
                                    'Winning Games':[defect_winning,
                                                     ironclad_winning,
                                                     silent_winning],
                                     'Total Games':[defect_total,
                                                    ironclad_total,
                                                    silent_total],
                                     'Winrate':[defect_winrate,
                                               ironclad_winrate,
                                               silent_winrate]})



In [13]:
asc20_games_summary

Unnamed: 0,Character,Winning Games,Total Games,Winrate
0,Defect,1669,16863,0.098974
1,Ironclad,1716,14798,0.115962
2,Silent,1811,14278,0.126838


It would seem that although Defect is the most-played character, it has the lowest winrate.  This supports the claim that Defect is the hardest character (at least on Ascension 20).  In any case, we have 1669 victorious Defect Ascension 20 games, which should be adequate sample for clustering.

Next, we need to convert this list of JSON objects to a dataframe we can cluster.  Ideally, the shape of the data is one-row-per-game with columns for all the cards and relics. In order to do that, we'll want to create a vector of all unique cards and relics.  

#### Getting Unique Cards and Relics  

In order to get all unique cards and relics, we can simply pull all cards and relics from all games, then apply the `unique()` function.

In [14]:
all_game_decks = []
all_game_relics = []

for i in range(len(output_list)):
    if output_list[i] is not None:
        # ensure the run data has the deck and relics to avoid errors in rare cases
        if ('master_deck' in dict(output_list[i])) and ('relics' in dict(output_list[i])):
            all_game_decks.append(output_list[i]['master_deck'])
            all_game_relics.append(output_list[i]['relics'])

In [15]:
# Within each game, each card and relic needs to be pulled out into a flat list.
all_cards = []

for i in range(len(all_game_decks)):
    for j in range(len(all_game_decks[i])):
        all_cards.append(all_game_decks[i][j])
        
all_relics = []

for i in range(len(all_game_relics)):
    for j in range(len(all_game_relics[i])):
        all_relics.append(all_game_relics[i][j])


In [16]:
# create unique lists
unique_cards = list(np.unique(all_cards))
unique_relics = list(np.unique(all_relics))

In [18]:
len(unique_cards)

3164

In [19]:
len(unique_relics)

876

Looks like we have 3164 unique cards and 876 unique relics.  This is a fair bit more than expected, so lets take a look at the head and tail of cards:

In [20]:
unique_cards[0:10]

['6A',
 '6A+1',
 'A Thousand Cuts',
 'A Thousand Cuts+1',
 'Abandon',
 'Abandon+1',
 'AbeCurse',
 'AbsoluteMagnitude+1',
 'Absolvement',
 'Absolvement+1']

In [21]:
unique_cards[-10:-1]

['vexMod:StarBlast',
 'vexMod:StrikeStorm',
 'vexMod:StrikeStorm+1',
 'vexMod:Taunt+1',
 'vexMod:TrainingStrike',
 'vexMod:TrainingStrike+1',
 'vexMod:UltimateCard',
 'vexMod:VenomSigh',
 'vexMod:VolumeVengeance']

This reveals two issues: there are the standard and "+1" versions of cards (players can upgrade cards once) as well as cards from game mods (essentially, player-made extensions of the game).  Thankfully, my domain expertise makes it fairly easy to know which cards aren't in the base game and it seems like most of the modded cards have a ':' in their name so they should be fairly easy to exclude.  

After testing, it looks like there are a few other exceptions for specific mods that use a '\_' in their name.  I'll go ahead and simply remove those cases as well.

In [22]:
unique_cards[:] = [s for s in unique_cards if '+' not in s \
                   and ':' not in s\
                   and '_' not in s]

In [23]:
len(unique_cards)

679

In review of the new card list, I can still see some non-base cards, but I'm not too concerned with this affecting the final results due to the expected low frequency of those cards (0 in cases where the character isn't one of the base characters).  

Next, the same cleansing will be applied to the relics. Generally speaking, relics have the same issue with mods as the cards, but there are not upgrades available.

In [24]:
unique_relics[:] = [s for s in unique_relics if '_' not in s \
                   and ':' not in s]

In [25]:
len(unique_relics)

391

Again, this isn't a perfect methodology, but since there are no flags for being a mod within the game data, it is difficult to use a single signal as a subsetting criteria to only the base game.

___

At this point, we can build a table of all unique cards and relics and then fill in Trues and Falses for whether the card was present per completed game.

In [56]:
def resource_table_generator(resource_input, game_input):
    '''
    input: a list of resources to be included in the rows of the table (i.e. cards and relics)
           a list of JSON game data to go in the columns
    output: a DataFrame indicating if each resource is in each game with a True or False
    '''
    # Build a 1-column table with the resources in each row
    resource_table = pd.DataFrame({'Resource':resource_input})
    
    # determine the number of preceding zeroes to put in the game name for the column names
    game_volume = len(game_input)
    preceding_zeroes = len(str(game_volume)) # so if you have 1669 games, the first game will be 'game_0001'
    
    # For each game append a column and fill with True/False based on if the card is present
    for i in range(len(game_input)):
        # scrub the '+1's from the cards (just remove the string, not the card) so they'll match on the unique_cards list
        game_deck = game_input[i]['master_deck']
        for j in range(len(game_deck)):
            game_deck[j] = game_deck[j].replace('+1','')
        
        # Create column name for the game
        resource_table['game_'+str('{:0'+str(preceding_zeroes)+'d}').format(i+1)] = False # default to false
        # For each resource, update to True if it is in the final game resources
        for j in range(len(resource_table)):
            if((resource_table['Resource'][j] in game_deck) or \
               (resource_table['Resource'][j] in game_input[i]['relics'])):
                resource_table.iloc[j,i+1] = True
    
    return(resource_table)


#### Defect Resource Table and Summary Frequency Tables

Now, simply plug in the resources and the Defect Ascension 20 victorious games to get a complete table of whether each resource is present in each game.

In [57]:
defect_asc20_win_resources = resource_table_generator(unique_cards+unique_relics, defect_asc20_win_games)

In [58]:
defect_asc20_win_resources

Unnamed: 0,Resource,game_0001,game_0002,game_0003,game_0004,game_0005,game_0006,game_0007,game_0008,game_0009,...,game_1660,game_1661,game_1662,game_1663,game_1664,game_1665,game_1666,game_1667,game_1668,game_1669
0,6A,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A Thousand Cuts,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Abandon,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,AbeCurse,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Absolvement,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,Winged Necklace,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1066,WingedGreaves,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1067,WristBlade,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1068,Yang,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


It'd be useful to see the relative frequency of each resource in a separate summary table:

In [59]:
defect_resource_freq = pd.DataFrame({'Resource':defect_asc20_win_resources['Resource'],
                                     'Frequency':defect_asc20_win_resources.sum(axis = 1)})
defect_resource_freq = defect_resource_freq.sort_values(by = ['Frequency'], ascending = False).reset_index(drop=True)
defect_resource_freq['Percent of Wins'] = defect_resource_freq['Frequency']/len(defect_asc20_win_games)

In [60]:
# subset to resources which are >0
defect_resource_freq = defect_resource_freq[defect_resource_freq['Frequency']>0]
defect_resource_freq['Percentile Rank']=defect_resource_freq['Frequency'].rank(pct=True, ascending = False)
defect_resource_freq['Rank']=defect_resource_freq['Frequency'].rank(ascending = False)

In [76]:
defect_resource_freq[0:10]

Unnamed: 0,Resource,Frequency,Percent of Wins,Percentile Rank,Rank,Resource Type
0,AscendersBane,1646,0.986219,0.002488,1.0,card
1,Dualcast,1513,0.906531,0.004975,2.0,card
2,Zap,1401,0.839425,0.007463,3.0,card
3,Cracked Core,1394,0.835231,0.00995,4.0,relic
4,Coolheaded,1357,0.813062,0.012438,5.0,card
5,Defragment,1163,0.696824,0.014925,6.0,card
6,Hologram,1141,0.683643,0.017413,7.0,card
7,Glacier,1039,0.622528,0.0199,8.0,card
8,Cold Snap,931,0.557819,0.022388,9.0,card
9,Capacitor,889,0.532654,0.024876,10.0,card


These results align with expectations.  'AscendersBane' is the curse card which should be present in every winning deck with Defect on Ascension 20 and thus, it makes sense that it's the most frequent card (though I'm not quire sure how some games completed without it).  Below that are the starting cards 'Dualcast' and 'Zap', then the starting relic 'Cracked Core'.  The first non-starting resource is 'Coolheaded' and it appears in a full **81%** of games! I knew it was a good card, but it seems almost critical to success on Ascension 20 with Defect.

After having used this .csv when playing, it seems like it'd be more user-friendly to have a total of 3 .csvs per character.  We've created one (a view containing all relics and cards), but if I only wanted to select cards in the top 20th percentile, I can't because the relics are mixed in.  This being the case, we'll separate into cards and relics, then get the percentiles and ranks within those groupings.  

To do this, we'll use a function to assign 'card' or 'relic' to `defect_resource_freq` in a 'resource_type' column, then subset on that before performing the ranking.

In [64]:
def resource_typer(resource_name_input):
    '''
    input: name of a resource
    output: string of 'relic' or 'card' based on the input
    '''
    if resource_name_input in unique_cards: return('card')
    if resource_name_input in unique_relics: return('relic')

In [73]:
defect_resource_freq['Resource Type']=''
for i in range(len(defect_resource_freq)):
    defect_resource_freq['Resource Type'][i] = resource_typer(defect_resource_freq['Resource'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [75]:
defect_resource_freq[0:10]

Unnamed: 0,Resource,Frequency,Percent of Wins,Percentile Rank,Rank,Resource Type
0,AscendersBane,1646,0.986219,0.002488,1.0,card
1,Dualcast,1513,0.906531,0.004975,2.0,card
2,Zap,1401,0.839425,0.007463,3.0,card
3,Cracked Core,1394,0.835231,0.00995,4.0,relic
4,Coolheaded,1357,0.813062,0.012438,5.0,card
5,Defragment,1163,0.696824,0.014925,6.0,card
6,Hologram,1141,0.683643,0.017413,7.0,card
7,Glacier,1039,0.622528,0.0199,8.0,card
8,Cold Snap,931,0.557819,0.022388,9.0,card
9,Capacitor,889,0.532654,0.024876,10.0,card


Now, we simply subset on 'Resource Type' and re-apply the rank and percentile:

In [81]:
# cards
defect_card_freq = copy.copy(defect_resource_freq[defect_resource_freq['Resource Type']=='card'])
del defect_card_freq['Resource Type'] # redundant column since this is only cards
defect_card_freq['Percentile Rank']=defect_card_freq['Frequency'].rank(pct=True, ascending = False)
defect_card_freq['Rank']=defect_card_freq['Frequency'].rank(ascending = False)

# relics
defect_relic_freq = copy.copy(defect_resource_freq[defect_resource_freq['Resource Type']=='relic'])
del defect_relic_freq['Resource Type'] # redundant column since this is only cards
defect_relic_freq['Percentile Rank']=defect_relic_freq['Frequency'].rank(pct=True, ascending = False)
defect_relic_freq['Rank']=defect_relic_freq['Frequency'].rank(ascending = False)

With the framework complete for Defect, we can simply apply the same methodology to Silent and Ironclad:

#### Silent Resource Table and Summary Frequency Tables

In [98]:
# Create Resource Table
silent_asc20_win_resources = resource_table_generator(unique_cards+unique_relics, silent_asc20_win_games)
silent_resource_freq = pd.DataFrame({'Resource':silent_asc20_win_resources['Resource'],
                                     'Frequency':silent_asc20_win_resources.sum(axis = 1)})
silent_resource_freq = silent_resource_freq.sort_values(by = ['Frequency'], ascending = False).reset_index(drop=True)
silent_resource_freq['Percent of Wins'] = silent_resource_freq['Frequency']/len(silent_asc20_win_games)

In [99]:
# subset to resources which are >0
silent_resource_freq = silent_resource_freq[silent_resource_freq['Frequency']>0]
silent_resource_freq['Percentile Rank']=silent_resource_freq['Frequency'].rank(pct=True, ascending = False)
silent_resource_freq['Rank']=silent_resource_freq['Frequency'].rank(ascending = False)

# add resource type
silent_resource_freq['Resource Type']=''
for i in range(len(silent_resource_freq)):
    silent_resource_freq['Resource Type'][i] = resource_typer(silent_resource_freq['Resource'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [100]:
# create separate card and relic tables
# cards
silent_card_freq = copy.copy(silent_resource_freq[silent_resource_freq['Resource Type']=='card'])
del silent_card_freq['Resource Type'] # redundant column since this is only cards
silent_card_freq['Percentile Rank']=silent_card_freq['Frequency'].rank(pct=True, ascending = False)
silent_card_freq['Rank']=silent_card_freq['Frequency'].rank(ascending = False)

# relics
silent_relic_freq = copy.copy(silent_resource_freq[silent_resource_freq['Resource Type']=='relic'])
del silent_relic_freq['Resource Type'] # redundant column since this is only cards
silent_relic_freq['Percentile Rank']=silent_relic_freq['Frequency'].rank(pct=True, ascending = False)
silent_relic_freq['Rank']=silent_relic_freq['Frequency'].rank(ascending = False)


#### Ironclad Resource Table and Summary Frequency Tables

In [90]:
# Create Resource Table
ironclad_asc20_win_resources = resource_table_generator(unique_cards+unique_relics, ironclad_asc20_win_games)
ironclad_resource_freq = pd.DataFrame({'Resource':ironclad_asc20_win_resources['Resource'],
                                     'Frequency':ironclad_asc20_win_resources.sum(axis = 1)})
ironclad_resource_freq = ironclad_resource_freq.sort_values(by = ['Frequency'], ascending = False).reset_index(drop=True)
ironclad_resource_freq['Percent of Wins'] = ironclad_resource_freq['Frequency']/len(ironclad_asc20_win_games)

In [93]:
# subset to resources which are >0
ironclad_resource_freq = ironclad_resource_freq[ironclad_resource_freq['Frequency']>0]
ironclad_resource_freq['Percentile Rank']=ironclad_resource_freq['Frequency'].rank(pct=True, ascending = False)
ironclad_resource_freq['Rank']=ironclad_resource_freq['Frequency'].rank(ascending = False)

# add resource type
ironclad_resource_freq['Resource Type']=''
for i in range(len(ironclad_resource_freq)):
    ironclad_resource_freq['Resource Type'][i] = resource_typer(ironclad_resource_freq['Resource'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [94]:
# create separate card and relic tables
# cards
ironclad_card_freq = copy.copy(ironclad_resource_freq[ironclad_resource_freq['Resource Type']=='card'])
del ironclad_card_freq['Resource Type'] # redundant column since this is only cards
ironclad_card_freq['Percentile Rank']=ironclad_card_freq['Frequency'].rank(pct=True, ascending = False)
ironclad_card_freq['Rank']=ironclad_card_freq['Frequency'].rank(ascending = False)

# relics
ironclad_relic_freq = copy.copy(ironclad_resource_freq[ironclad_resource_freq['Resource Type']=='relic'])
del ironclad_relic_freq['Resource Type'] # redundant column since this is only cards
ironclad_relic_freq['Percentile Rank']=ironclad_relic_freq['Frequency'].rank(pct=True, ascending = False)
ironclad_relic_freq['Rank']=ironclad_relic_freq['Frequency'].rank(ascending = False)


#### Saving Frequency Table Results

In [101]:
# Write to .csv
# Defect
defect_resource_freq.to_csv('../results/defect_resource_freq.csv', index = False)
defect_card_freq.to_csv('../results/defect_card_freq.csv', index = False)
defect_relic_freq.to_csv('../results/defect_relic_freq.csv', index = False)

# Silent
silent_resource_freq.to_csv('../results/silent_resource_freq.csv', index = False)
silent_card_freq.to_csv('../results/silent_card_freq.csv', index = False)
silent_relic_freq.to_csv('../results/silent_relic_freq.csv', index = False)

# Ironclad
ironclad_resource_freq.to_csv('../results/ironclad_resource_freq.csv', index = False)
ironclad_card_freq.to_csv('../results/ironclad_card_freq.csv', index = False)
ironclad_relic_freq.to_csv('../results/ironclad_relic_freq.csv', index = False)

___
## Simulating Clusters
There are several algorithms out there for clustering binary data, but I'd like to try my hand at developing my own. After much whiteboarding, my plan is to test and see if I can get my own idea to work.

**Clustering Algorithm**
1. Start with clusters equal to the number of resources (k=n)
2. Calculate the distance between each cluster and every other cluster where distance is the sum of resources that don't match (i.e. if a card is in both decks, that resource adds 0 distance, but it if is in one and not the other, then it adds 1 distance.  "Distance" is the sum of all these resource differences)
3. For each cluster, calculate the nearest resource and add to the cluster
4. For each updated cluster calculate, calculate the distance to all other clusters and drop the two which have the closest distance
5. Repeat 2-4 until k=1
6. Plot average distance for k=1 through k=n
7. Select a reasonable k

### Generating Test Data

Lets imagine we have 20 resources in 100 games and 4 relatively-tight clusters.  In order to do this, we'll create 4 medoids by drawing from the binomial distribution with p = 0.2, 0.4, 0.6, and 0.8 for each draw being a 1 instead of a 0 for each resource within a medoid.

In [102]:
first_medoid = np.random.binomial(1, 0.2, 20)
second_medoid = np.random.binomial(1, 0.4, 20)
third_medoid = np.random.binomial(1, 0.6, 20)
fourth_medoid = np.random.binomial(1, 0.8, 20)

print(first_medoid)
print(second_medoid)
print(third_medoid)
print(fourth_medoid)


[0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0]
[1 1 1 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0]
[1 1 1 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 1]
[1 1 1 1 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1]


We'll want a function to measure the distance between games (as described in step 2 of the algorithm):

In [103]:
def cluster_distance_calculator(cluster1_input, cluster2_input):
    '''
    input: arrays of binary data for two clusters
    output: a distance measurement
    method: distance is the sum of differences in the binary data, by position
    '''
    distance = sum(abs(cluster1_input-cluster2_input))
    return(distance)

In [104]:
# test it out for the distance between the first medoid and the others
print(cluster_distance_calculator(first_medoid, first_medoid))
print(cluster_distance_calculator(first_medoid, second_medoid))
print(cluster_distance_calculator(first_medoid, third_medoid))
print(cluster_distance_calculator(first_medoid, fourth_medoid))


0
11
10
13


As expected, there is 0 distance for the first medoid compared to itself. Both the second and the third medoids have 9 total differences and the fourth medoid has 14. We'll want to see what the confusion matrix of each cluster's distance from each other cluster in a function for easy and comprehensive evaluation:

In [105]:
def cluster_confusioner(cluster_list_input):
    '''
    input: a list of clusters of equal length
    output: a matrix which applies the cluster_distance_calculator to each pair of clusters
    '''
    distance_matrix = np.empty((len(cluster_list_input), len(cluster_list_input)))
    
    # iterate through each comparison to populate the matrix
    for i in range(len(cluster_list_input)):
        for j in range(len(cluster_list_input)):
            distance_matrix[i,j] = cluster_distance_calculator(cluster_list_input[i], cluster_list_input[j])
    
    return(distance_matrix)
    

In [106]:
cluster_confusioner([first_medoid, second_medoid, third_medoid, fourth_medoid])

array([[ 0., 11., 10., 13.],
       [11.,  0., 11.,  8.],
       [10., 11.,  0.,  7.],
       [13.,  8.,  7.,  0.]])

Here, we can see those same 0, 9, 9, 14 values across the first row and down the first column as well as the distances between the other clusters.  It looks like the maximum distance is 14 (difference between the first and fourth medoid) and the minimum distance is a three-way tie between medoids 1:2, 1:3, and 2:4 with a distance of 9.

Next, we'll want to create 24 similar games per medoid to simulate a situation in which there were 4 winning sets of resources.  We'll do this by randomly selecting 0-25% of the elements in each cluster and flipping them. such that we'll have 75% to 100% similarity between each game intended for a cluster and its medoid (I say _intended_ because it's possible that, after applying the random changes, it becomes more similar to a different medoid).

In [107]:
def cluster_creator(medoid_input, difference_percent_range, n_games):
    '''
    input: medoid_input is a one-dimensional array of binary data
           difference_percent_range is a list with a min and max percent (e.g. [0,0.25] for 0-25%); cannot do <1% 
           n_games is the number of games needed in the output
    output: a list of n games with the speficied similarity to the medoid_input
    '''
    
    simulated_games = []
    
    for i in range(n_games):
        # select how many elements will be changed
        # must multiply by 100 and add 1 due to randrange needing integers and being exclusive with the high end
        percent_change = random.randrange(difference_percent_range[0]*100, (difference_percent_range[1]+0.01)*100, 1)/100
        
        # convert the percent to an integer by multiplying by the total number of elements and rounding
        element_change = round(len(medoid_input)*percent_change)
        
        # select which elements will be changed
        element_change_positions = []
        for j in range(element_change):
            element_change_positions.append(random.randrange(0,len(medoid_input)))
        
        # change those elements
        simulated_game = copy.copy(medoid_input)
        for k in range(len(element_change_positions)):
            if simulated_game[element_change_positions[k]]==1:
                simulated_game[element_change_positions[k]]=0
            else:
                simulated_game[element_change_positions[k]]=1
        
        # append to list of games
        simulated_games.append(simulated_game)
    
    return(simulated_games)
        

In [111]:
# create the cluster and add the medoid to it
first_medoid_cluster = cluster_creator(first_medoid, [0,0.25], 24)

In [112]:
first_medoid_cluster

[array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0]),
 array([0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0]),
 array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0]),
 array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0]),
 array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0]),
 array([1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0]),
 array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0]),
 array([0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0]),
 array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1]),
 array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0]),
 array([0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0]),
 array([0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
 array([1, 1, 0, 0, 

Looks right, but just to test, let's make sure the differences between each simulated game and the medoid is less than or equal to 25% (5 elements of the 20):

In [150]:
distances = []
for i in range(len(first_medoid_cluster)):
    distances = distances + [cluster_distance_calculator(first_medoid, first_medoid_cluster[i])]
print(max(distances))

5


Perfect! now to create the other clusters of games around the other medoids.

In [117]:
second_medoid_cluster = cluster_creator(second_medoid, [0,0.25], 24)
third_medoid_cluster = cluster_creator(third_medoid, [0,0.25], 24)
fourth_medoid_cluster = cluster_creator(fourth_medoid, [0,0.25], 24)

Finally, we can stitch all the games together into one dataframe:

In [123]:
simulated_games = [first_medoid] + first_medoid_cluster + [second_medoid] + second_medoid_cluster +\
[third_medoid] + third_medoid_cluster + [fourth_medoid] + fourth_medoid_cluster

In [154]:
simulated_games = pd.DataFrame(np.array(simulated_games))

In [155]:
simulated_games

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
1,0,1,0,0,0,1,0,0,0,0,1,0,1,1,0,1,0,1,0,0
2,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0
3,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,1,0,0
4,0,1,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1,0,1,1,1,1
96,1,1,1,1,1,0,1,1,1,0,1,0,1,1,1,1,1,1,1,1
97,1,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,0,1,1,1
98,0,1,1,1,1,0,1,0,1,0,1,1,0,0,1,1,1,1,1,1


### Building the Clustering Algorithm
So now, in total, we have 100 games of 20 resources with 25 in each cluster centered around 4 medoids. This is roughly what we should expect to find with the actual game data so it should serve as a relevant proxy.

To restate the goal of the Algorithm:  
**Clustering Algorithm**
1. Start with nodes equal to the number of resources (k=n)
2. Calculate the distance between each node and every other node where distance is the sum of resources that don't match (i.e. if a card is in both decks, that resource adds 0 distance, but it if is in one and not the other, then it adds 1 distance.  "Distance" is the sum of all these resource differences)
3. For each node, calculate the nearest node and add to the node via averaging. 
4. For each updated node, calculate the distance to all other nodes and drop the two which have the closest distance
5. Repeat 2-4 until k=1
6. Plot average distance for k=1 through k=n
7. Select a reasonable k

In [181]:
def node_distancer(resource_dataframe_input):
    '''
    input: resouce_dataframe_input is a dataframe with resources in the columns and games in the rows with 
            binary data filling the table.
           k_input is the number of clusters
    output: k centroids
    '''
    # create empty array to hold all the distances comparing each combination
    all_node_distances = np.zeros([len(resource_dataframe_input),len(resource_dataframe_input)])
    # calculate distance between the ith game and the jth game
    for i in range(len(all_node_distances)):
        for j in range(len(all_node_distances)):
            all_node_distances[i,j] = cluster_distance_calculator(resource_dataframe_input.iloc[i], resource_dataframe_input.iloc[j])
    
    return(all_node_distances)

In [183]:
simulated_node_distances = node_distancer(simulated_games)

In [286]:
simulated_node_distances

array([[ 0.,  2.,  1., ..., 14., 13., 13.],
       [ 2.,  0.,  3., ..., 14., 13., 13.],
       [ 1.,  3.,  0., ..., 13., 12., 12.],
       ...,
       [14., 14., 13., ...,  0.,  5.,  3.],
       [13., 13., 12., ...,  5.,  0.,  2.],
       [13., 13., 12., ...,  3.,  2.,  0.]])

Above is a 100x100 array which shows the distance of each node to every other node.  Below is just the first row, which compares the first node to all 100 other nodes, separated by cluster:

In [298]:
print(simulated_node_distances[0,0:25])
print(simulated_node_distances[0,25:50])
print(simulated_node_distances[0,50:75])
print(simulated_node_distances[0,75:100])

[0. 2. 1. 1. 3. 1. 2. 1. 3. 4. 1. 2. 4. 1. 2. 2. 5. 3. 3. 1. 3. 4. 4. 2.
 3.]
[11. 10. 10.  9. 11. 11. 11. 12. 10. 11. 11. 11. 12. 12. 11. 11. 11.  9.
  9. 11. 13. 11. 11.  9. 11.]
[10.  8.  8. 12.  9. 11. 11. 11. 13. 10. 11. 11. 10. 11. 10. 10.  9.  9.
 12. 11.  9.  9. 10. 10. 10.]
[13. 13. 17. 12. 12. 12. 12. 11. 14. 13. 14. 15. 13. 11. 15. 14. 15. 11.
 15. 12. 11. 14. 14. 13. 13.]


Above is the first row of what I'm calling a 'node' distance separated into each of the medoid-centered groupings.  In this case, the node is 1 game, but we'll eventually be merging games of resources together so it's easier to call these the more generic 'node' than calling it 'the combination of game 1, game 10,..., game n'.  

What it's showing is that the distance between the first node and itself is, as expected, 0 and the distances between the first node and the others centered around that medoid ranges between 0 and 5.  Compared to the second medoid and the simulated games around it, we see values from 9 to 12. We see a difference of 8-13 for the third medoid cluster and 12-17 for the fourth medoid cluster. In all, it seems to be working as expected.

At step 3 in the algorithm, we calculate the nearest node and combine the two by taking the average of the resources. As can be seen, the nearest of all the nodes (that isn't itself) is 1 and there are 7 nodes that have the same distance.  This means that 7 nodes have only a 1-resource difference to the first node. I'm not exactly sure how to handle ties, but it seems reasonable to simply take an average across all nodes that are equally close.  I can imagine a better (and more complex) strategy doing backtracking and seeing which node-pairing results in a smaller distance in the n+1 iteration and going with that, but I'll keep this version simple. 

As per this plan, let's create a function which creates a new node based on an average of the equally-close nodes:

In [398]:
def node_averager(list_of_nodes):
    return(np.mean((list_of_nodes), axis = 0))

Next, for each row of 100 distances, we'll want to select the node(s) with the equally-minimal distance from the original node to send to the `node_averager()` and store the resulting nodes along with indices for the nodes included.  In other words, in the case of the first iteration when the nodes are equal to the games, we're finding the most similar other game(s) and creating an average new node.

In [407]:
def new_noder(resouce_table_input, node_distance_table_input, node_index):
    '''
    inputs: resource_table_input: a table with games in the rows and resources in the columns (mxn)
            node_distance_table_input: an array of node distances (mxm)
            node_index: an integer value (should range from 0 to m)
    output: a tuple of a new average node and a list of the nodes averaged together
    '''
    node_distances = node_distance_table_input[node_index]
    min_distance = min(node_distances[1:]) # find the closest node(s) that aren't the primary node
    closest_node_indices = np.where(node_distances==min_distance)[0] # [0] since these are 1-dimensional slices
    
    # grab the closest nodes and put into a list
    closest_nodes = []
    for i in range(len(closest_node_indices)):
        closest_nodes.append(resource_table_input.iloc[closest_node_indices[i]].values)
    
    # grab primary_node and add to the list of closest_nodes
    primary_node = resource_table_input.iloc[node_index].values
    closest_nodes.append(primary_node)
    
    
    # take an average of the primary and closest node(s)
    new_node = node_averager(closest_nodes)
    
    return([new_node,[node_index,closest_node_indices]])
    

In [413]:
new_node = new_noder(simulated_games, simulated_node_distances, 0)

In [414]:
new_node

[array([0.   , 0.875, 0.   , 0.   , 0.   , 0.875, 0.125, 0.   , 0.   ,
        0.   , 1.   , 0.125, 0.   , 1.   , 0.   , 0.125, 0.125, 1.   ,
        0.   , 0.125]), [0, array([ 2,  3,  5,  7, 10, 13, 19])]]

Excellent! We now have a way of generating a new node, which is the average of the closest nodes, along with the indices for the primary node and the nodes closest.  

As per the algorithm, we'll want to do this for all nodes before calculating the distances between each of them and removing the closest.

In [420]:
def node_updater(resource_table_input, node_distance_table_input):
    '''
    TODO: add deets
    '''
    new_resource_list = []
    for i in range(len(resource_table_input)):
        new_resource_list.append(new_noder(resource_table_input, node_distance_table_input,i)[0])
    
    # convert to dataframe
    new_resource_table = pd.DataFrame(np.array(new_resource_list))
    
    return(new_resource_table)

In [423]:
simulated_games_step2 = node_updater(simulated_games, simulated_node_distances)

In [428]:
simulated_games_step2[0:50]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.875,0.0,0.0,0.0,0.875,0.125,0.0,0.0,0.0,1.0,0.125,0.0,1.0,0.0,0.125,0.125,1.0,0.0,0.125
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
6,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


Hmm... I'm expecting more fractions here... looks like all nodes except the first have an identical node out there.  

**TODO:** Perhaps I should not look for the min, but look for the 0s + the min of the rest.  This will guarantee that a non-identical node is brought in while also giving the primary node more weight for the number of identical nodes.  

In [437]:
new_noder(resource_table_input, node_distance_table_input,0)[0]

array([0.   , 0.875, 0.   , 0.   , 0.   , 0.875, 0.125, 0.   , 0.   ,
       0.   , 1.   , 0.125, 0.   , 1.   , 0.   , 0.125, 0.125, 1.   ,
       0.   , 0.125])

In [438]:
node_index = 1

In [442]:
node_distances = node_distance_table_input[node_index]

In [440]:
node_distances

array([ 2.,  0.,  3.,  1.,  3.,  3.,  4.,  3.,  5.,  6.,  3.,  4.,  6.,
        3.,  4.,  4.,  5.,  5.,  3.,  3.,  5.,  4.,  6.,  4.,  3., 11.,
       10., 10.,  9., 11., 11.,  9., 12., 10., 11., 11., 11., 10., 12.,
       11., 11., 11., 11.,  9.,  9., 13., 11., 11.,  9., 13., 10.,  8.,
        8., 12.,  9., 11., 11., 11., 13., 12., 11., 11., 10., 11., 10.,
       10., 11.,  9., 10., 11., 11., 11., 12., 10., 10., 13., 13., 15.,
       12., 14., 12., 12., 11., 14., 13., 14., 13., 13., 11., 15., 14.,
       13., 11., 13., 12., 13., 12., 14., 13., 13.])

In [444]:
min_distance = min(node_distances[1:])

In [445]:
min_distance

0.0

In [449]:
simulated_games[0:50]==simulated_games_step2[0:50]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,True,False,True,True,True,False,False,True,True,True,True,False,True,True,True,False,False,True,True,False
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [450]:
simulated_games[50:100]==simulated_games_step2[50:100]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
50,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
51,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
52,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
53,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
54,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
55,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
56,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
57,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
58,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
59,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
