## 3. Cluster-preparation and graph generation

Preparation for clustering(we convert the itemsets of shingles/keywords/descriptors and calculate a
similarity measure between each itemsets -> this in turn forms a fully connected
graph(edges == similarity, nodes == games) which we can then prune by removing the edges where
the weight is below a threshold)

### imports

In [1]:
import polars as pl
import numpy as np
import h5py

from pathlib import Path
from statistics import mean
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
DATA = Path('./data')

GEN_DATA = DATA / "gen"
RAW_DATA = DATA / "raw"

### loading data

In [2]:
df = pl.read_csv(RAW_DATA / 'games_detailed_info2025.csv')
# drop rows with null descriptions
df = df.with_columns(pl.col('description').fill_null(""))
df = df.sample(fraction=0.4)
df.head()

Unnamed: 0_level_0,type,id,thumbnail,image,alternate,description,yearpublished,minplayers,maxplayers,suggested_num_players,suggested_playerage,suggested_language_dependence,playingtime,minplaytime,maxplaytime,minage,boardgamecategory,boardgamemechanic,boardgamefamily,boardgameexpansion,boardgameaccessory,boardgamecompilation,boardgameimplementation,boardgamedesigner,boardgameartist,boardgamepublisher,usersrated,average,bayesaverage,Board Game Rank,Strategy Game Rank,Family Game Rank,stddev,median,owned,trading,wanting,wishing,numcomments,numweights,averageweight,boardgameintegration,Abstract Game Rank,Party Game Rank,Thematic Rank,War Game Rank,Customizable Rank,Children's Game Rank,RPG Item Rank,Accessory Rank,name
i64,str,i64,str,str,str,str,i64,i64,i64,str,str,str,i64,i64,i64,i64,str,str,str,str,str,str,str,str,str,str,i64,f64,f64,i64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,f64,str,f64,f64,f64,f64,f64,str,str,str,str
899,"""boardgame""",142379,"""https://cf.geekdo-images.com/J…","""https://cf.geekdo-images.com/J…","""['逃跑计划', '이스케이프 플랜']""","""After a successful bank heist,…",2019,1,5,"""[{'@numplayers': '1', 'result'…","""[{'@value': '2', '@numvotes': …","""[{'@level': '56', '@value': 'N…",120,60,120,12,"""['Adventure']""","""['Area Movement', 'Enclosure',…","""['Crowdfunding: Kickstarter', …","""['Escape Plan: Upgrade Pack']""",,,,"""['Vital Lacerda']""","""[""Ian O'Toole""]""","""['Eagle-Gryphon Games', 'Angry…",5456,7.50464,6.90473,584,355.0,,1.38429,0,9098,212,416,2350,1122,313,3.6709,,,,,,,,,,"""Escape Plan"""
5081,"""boardgame""",149787,"""https://cf.geekdo-images.com/q…","""https://cf.geekdo-images.com/q…",,"""Perdition's Mouth: Abyssal Rif…",2016,1,6,"""[{'@numplayers': '1', 'result'…","""[{'@value': '2', '@numvotes': …","""[{'@level': '26', '@value': 'N…",180,30,180,10,"""['Fantasy', 'Horror', 'Miniatu…","""['Action Points', 'Campaign / …","""['Category: Dungeon Crawler', …","""['DDP Promodeck 2023', ""Perdit…","""[""Perdition's Mouth: Abyssal R…","""[""Perdition's Mouth: Abyssal R…",,"""['Timo Multamäki', 'David Hlad…","""['Matthias Catrein', 'Jere Kas…","""['Dragon Dawn Productions']""",661,7.34298,5.86057,3949,1684.0,,1.79182,0,1511,75,81,629,266,33,3.7576,,,,669.0,,,,,,"""Perdition's Mouth: Abyssal Rif…"
21450,"""boardgame""",378709,"""https://cf.geekdo-images.com/j…","""https://cf.geekdo-images.com/j…",,"""Every year, the two villages o…",2023,2,2,"""[{'@numplayers': '1', 'result'…","""[{'@value': '2', '@numvotes': …","""[{'@level': '1', '@value': 'No…",20,15,20,8,"""['Dice', 'Sports']""","""['Dice Rolling']""","""['Players: Two-Player Only Gam…",,,,"""['Tug of War']""","""['Bernd Eisenstein']""","""['Klemens Franz']""","""['Irongames']""",53,6.34906,5.52034,17774,,,1.35154,0,142,2,4,37,15,0,0.0,,,,,,,,,,"""Ploc"""
18889,"""boardgame""",156557,"""https://cf.geekdo-images.com/I…","""https://cf.geekdo-images.com/I…",,"""The family vacation isn't goin…",2014,1,4,"""[{'@numplayers': '1', 'result'…","""[{'@value': '2', '@numvotes': …","""[{'@level': '41', '@value': 'N…",30,30,30,13,"""['Abstract Strategy', 'Adventu…","""['Grid Movement', 'Modular Boa…","""['Creatures: Zombies', 'Crowdf…",,,,,"""['François Valentyne']""","""['François Valentyne']""","""['The Flux Capacity']""",69,5.70696,5.50506,22063,,,1.6856,0,200,21,6,14,35,6,1.1667,,,,,,,,,,"""Zombie Island"""
12232,"""boardgame""",70532,"""https://cf.geekdo-images.com/o…","""https://cf.geekdo-images.com/o…","""[""Tonkin: La Guerre d'Indochin…","""(from Legion wargames website:…",2012,2,2,"""[{'@numplayers': '1', 'result'…","""[{'@value': '2', '@numvotes': …","""[{'@level': '51', '@value': 'N…",1800,240,1800,0,"""['Civil War', 'Modern Warfare'…","""['Action Points', 'Dice Rollin…","""['History: First Indochina War…",,,,"""[""Tonkin: La Guerre d'Indochin…","""['Kim Kanger']""","""['Kim Kanger', 'Randy Lein', '…","""['Legion Wargames LLC']""",158,7.91266,5.62435,7757,,,1.34779,0,636,10,32,123,89,19,3.3158,,,,,875.0,,,,,"""Tonkin: The First Indochina Wa…"


In [3]:
vectorizer = TfidfVectorizer(stop_words="english", analyzer="word", lowercase=True, strip_accents="unicode", max_features=256, min_df=1, max_df=0.8, token_pattern=r"[A-Za-z]+")
X = vectorizer.fit_transform(df["description"])
# print the vectorizer's vocabulary
print(vectorizer.vocabulary_)

reduced_X = X.toarray()

# # dimensionality reduction with svd
# from sklearn.decomposition import TruncatedSVD
# svd = TruncatedSVD(n_components=256)
# reduced_X = svd.fit_transform(X)

# map ids to signatures (as dense arrays)
sigs = {gid: reduced_X[i] for i, gid in enumerate(df["id"])}

{'good': 96, 'life': 121, 'city': 40, 'team': 215, 'start': 205, 'rsquo': 183, 's': 185, 'need': 142, 'mdash': 133, 'allows': 11, 'money': 135, 'possible': 166, 'create': 52, 't': 210, 'help': 100, 'don': 64, 'players': 161, 'influence': 110, 'moves': 138, 'turn': 228, 'board': 24, 'trying': 227, 'best': 22, 'play': 158, 'cards': 32, 'actions': 5, 'allow': 10, 'set': 194, 'player': 160, 'end': 73, 'wins': 249, 'great': 97, 'new': 143, 'dungeon': 66, 'strategy': 209, 'enemy': 75, 'deck': 56, 'dice': 60, 'combat': 44, 'tactical': 212, 'including': 109, 'rules': 184, 'minutes': 134, 'round': 180, 'strategic': 208, 'heroes': 102, 'make': 128, 'come': 45, 'designed': 58, 'played': 159, 'story': 207, 'campaign': 30, 'fight': 83, 'scenarios': 188, 'level': 120, 'adventure': 8, 'year': 254, 'war': 244, 'win': 246, 'fast': 81, 'used': 236, 'right': 177, 'taking': 214, 'description': 57, 'publisher': 170, 'family': 80, 'island': 112, 'making': 129, 'time': 219, 'tile': 217, 'piece': 152, 'moveme

In [4]:
print("signatures:", len(sigs), "length:", len(next(iter(sigs.values()))))
# print out hte first one

signatures: 11112 length: 256


## Nearest Neighbour Search using sci-kit

In [5]:
nbrs = NearestNeighbors(n_neighbors=10, metric='cosine').fit(reduced_X)

distances, indices = nbrs.kneighbors(reduced_X)

candidate_pairs = set()
game_ids_list = df["id"].to_list()
for i, neighbors in enumerate(indices):
    gid1 = game_ids_list[i]
    for j in neighbors[1:]:  
        gid2 = game_ids_list[j]
        pair = tuple(sorted((gid1, gid2)))
        candidate_pairs.add(pair)


print("LSH candidate pairs:", len(candidate_pairs))
id_to_title = dict(zip(df["id"], df["name"]))


pairs_named = list(map(
    lambda ab: f"{id_to_title[ab[0]]} <-> {id_to_title[ab[1]]}",
    candidate_pairs
))
print(pairs_named[:10])

LSH candidate pairs: 83137
['Café International: Das Kartenspiel <-> Boo-ty Call', 'All Bridges Burning: Red Revolt and White Guard in Finland, 1917-1918 <-> One Page War', 'Warhammer: Diskwars <-> Desert Rats 1940-42', "The Spirit of Eden <-> Unlock!: Short Adventures – Schrödinger's Cat", 'Operation Felix <-> Into the Bastards!: First tank battle', 'The Swarm <-> Super Slopes', 'Saipan 1944: A Panzer Grenadier Game <-> Hellapagos', 'Moscow Burning: The Next Russian Civil War <-> The South Seas Campaign, 1942-43', 'Dominoes <-> NasconDino', 'Curse You, Robin Hood! <-> Bremen']


In [6]:
game_ids = df["id"].to_list()
id_to_index = {gid: i for i, gid in enumerate(game_ids)}
N = len(game_ids)

S = np.zeros((N, N), dtype=np.float32)


for a, b in candidate_pairs:
    i = id_to_index[a]
    j = id_to_index[b]
    sim = cosine_similarity(sigs[a].reshape(1, -1), sigs[b].reshape(1, -1))[0, 0]
    S[i, j] = sim
    S[j, i] = sim


np.fill_diagonal(S, 0.0)  # INFO -> TECHNICALLY SHOULD BE 1, but does not work for my tool

game_titles = [id_to_title[g] for g in game_ids]
# make_duplicate titles different

game_titles = [f"{i}_{title}" for i, title in enumerate(game_titles)]

with h5py.File(GEN_DATA / "game_similarity_tfidf.h5", "w") as f:
    f.create_dataset("matrix", data=S.astype(np.float32))
    f.create_dataset("node_names", data=np.array(game_titles, dtype=h5py.string_dtype()))