## 3. Cluster-preparation and graph generation

Preparation for clustering: we convert each game's itemset of shingles/keywords/descriptors into a
signature vector and calculate a similarity measure between each pair of itemsets. This in turn forms
a fully connected graph (edge weights = similarity, nodes = games), which we can then prune by
removing the edges whose weight is below a threshold.

### imports

In [1]:
import polars as pl
import numpy as np
import h5py

from pathlib import Path
from statistics import mean
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
# Project data directories, all relative to the notebook's working directory.
DATA = Path("./data")

RAW_DATA = DATA.joinpath("raw")  # input files read by this notebook
GEN_DATA = DATA.joinpath("gen")  # files generated by this notebook

### loading data

In [2]:
# Load the raw games table. Null descriptions are replaced with empty strings;
# no rows are dropped.
games_csv = RAW_DATA / 'games_detailed_info2025.csv'
df = pl.read_csv(games_csv).with_columns(pl.col('description').fill_null(""))
# Optional subsampling for quicker experiments:
# df = df.sample(fraction=0.4)
df.head()

Unnamed: 0_level_0,type,id,thumbnail,image,alternate,description,yearpublished,minplayers,maxplayers,suggested_num_players,suggested_playerage,suggested_language_dependence,playingtime,minplaytime,maxplaytime,minage,boardgamecategory,boardgamemechanic,boardgamefamily,boardgameexpansion,boardgameaccessory,boardgamecompilation,boardgameimplementation,boardgamedesigner,boardgameartist,boardgamepublisher,usersrated,average,bayesaverage,Board Game Rank,Strategy Game Rank,Family Game Rank,stddev,median,owned,trading,wanting,wishing,numcomments,numweights,averageweight,boardgameintegration,Abstract Game Rank,Party Game Rank,Thematic Rank,War Game Rank,Customizable Rank,Children's Game Rank,RPG Item Rank,Accessory Rank,name
i64,str,i64,str,str,str,str,i64,i64,i64,str,str,str,i64,i64,i64,i64,str,str,str,str,str,str,str,str,str,str,i64,f64,f64,i64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,f64,str,f64,f64,f64,f64,f64,str,str,str,str
0,"""boardgame""",13,"""https://cf.geekdo-images.com/P…","""https://cf.geekdo-images.com/P…","""['Catan', 'Catan (Колонизаторы…","""In CATAN (formerly The Settler…",1995,3,4,"""[{'@numplayers': '1', 'result'…","""[{'@value': '2', '@numvotes': …","""[{'@level': '1', '@value': 'No…",120,60,120,10,"""['Economic', 'Negotiation']""","""['Chaining', 'Dice Rolling', '…","""['Animals: Sheep', 'Components…","""['20 Jahre Darmstadt Spielt', …","""['Catan x Goat Simulator 3: Re…","""[""CATAN 3D Collector's Edition…","""['Baden-Württemberg Catan', 'C…","""['Klaus Teuber']""","""['Volkan Baga', 'Tanja Donner'…","""['KOSMOS', '64 Ounce Games', '…",132477,7.09526,6.91526,573,533.0,196.0,1.49966,0,218546,2264,518,7367,22600,8299,2.2881,,,,,,,,,,"""CATAN"""
1,"""boardgame""",822,"""https://cf.geekdo-images.com/o…","""https://cf.geekdo-images.com/o…","""['Carcassonne Jubilee Edition'…","""Carcassonne is a tile placemen…",2000,2,5,"""[{'@numplayers': '1', 'result'…","""[{'@value': '2', '@numvotes': …","""[{'@level': '6', '@value': 'No…",45,30,45,7,"""['Medieval', 'Territory Buildi…","""['Area Majority / Influence', …","""['Category: Dized Tutorial', '…","""['20 Jahre Darmstadt Spielt', …","""['The Adults of Carcassonne', …","""['Carcassonne Big Box', 'Carca…","""['The Ark of the Covenant', 'C…","""['Klaus-Jürgen Wrede']""","""['Marcel Gröber', 'Doris Matth…","""['Hans im Glück', '64 Ounce Ga…",131182,7.41145,7.29556,230,,55.0,1.31135,0,204049,1995,656,9787,22150,8414,1.8894,"""['Carcassonne: Wheel of Fortun…",,,,,,,,,"""Carcassonne"""
2,"""boardgame""",30549,"""https://cf.geekdo-images.com/S…","""https://cf.geekdo-images.com/S…","""['EPIZOotic', 'Pandemia', 'Pan…","""In Pandemic, several virulent …",2008,2,4,"""[{'@numplayers': '1', 'result'…","""[{'@value': '2', '@numvotes': …","""[{'@level': '6', '@value': 'No…",45,45,45,8,"""['Medical']""","""['Action Points', 'Cooperative…","""['Components: Map (Global Scal…","""['Pandemic: Gen Con 2016 Promo…","""['Pandemic: Folded Space Inser…",,"""['Fall of Rome', 'Iberia', 'Pa…","""['Matt Leacock']""","""['Josh Cappel', 'Christian Han…","""['Z-Man Games', '(Unknown)', '…",128935,7.52913,7.42156,158,168.0,32.0,1.33643,0,211600,3228,620,10981,19897,6138,2.3974,,,,,,,,,,"""Pandemic"""
3,"""boardgame""",68448,"""https://cf.geekdo-images.com/3…","""https://cf.geekdo-images.com/3…","""['7 csoda', '7 Cudów Świata', …","""You are the leader of one of t…",2010,2,7,"""[{'@numplayers': '1', 'result'…","""[{'@value': '2', '@numvotes': …","""[{'@level': '1', '@value': 'No…",30,30,30,10,"""['Ancient', 'Card Game', 'City…","""['Closed Drafting', 'Hand Mana…","""['Ancient: Babylon', 'Ancient:…","""['7 Wonders: Armada', '7 Wonde…","""['7 Wonders: Eurohell Design C…",,"""['7 Wonders (Second Edition)',…","""['Antoine Bauza']""","""['Dimitri Chappuis', 'Miguel C…","""['Repos Production', 'ADC Blac…",107506,7.67463,7.56393,101,111.0,18.0,1.27648,0,147129,1896,979,14247,16690,5365,2.3171,,,,,,,,,,"""7 Wonders"""
4,"""boardgame""",167791,"""https://cf.geekdo-images.com/w…","""https://cf.geekdo-images.com/w…","""['A Mars terraformálása', 'Mar…","""In the 2400s, mankind begins t…",2016,1,5,"""[{'@numplayers': '1', 'result'…","""[{'@value': '2', '@numvotes': …","""[{'@level': '91', '@value': 'N…",120,120,120,12,"""['Economic', 'Environmental', …","""['Closed Drafting', 'Contracts…","""['Category: Dized Tutorial', '…","""['Meeple BR Jogos Promo Pack #…","""['Terraforming Mars: reDrewno …",,"""['Terraforming Mars: Ares Expe…","""['Jacob Fryxelius']""","""['Isaac Fryxelius', 'Daniel Fr…","""['FryxGames', 'Arclight Games'…",103923,8.35266,8.2045,7,7.0,,1.42396,0,145458,785,1905,24807,14696,4280,3.2657,,,,,,,,,,"""Terraforming Mars"""


In [3]:
# TF-IDF over the game descriptions: tokens are runs of ASCII letters,
# lower-cased and accent-stripped, with English stop words removed and the
# vocabulary capped at 256 features.
vectorizer = TfidfVectorizer(
    analyzer="word",
    token_pattern=r"[A-Za-z]+",
    lowercase=True,
    strip_accents="unicode",
    stop_words="english",
    min_df=1,
    max_df=0.8,
    max_features=256,
)
X = vectorizer.fit_transform(df["description"])
# Show the term -> column-index mapping that was learned.
print(vectorizer.vocabulary_)

# Dense copy of the sparse TF-IDF matrix (one row per game).
reduced_X = X.toarray()

# Alternative kept for reference: dimensionality reduction with SVD
# from sklearn.decomposition import TruncatedSVD
# svd = TruncatedSVD(n_components=256)
# reduced_X = svd.fit_transform(X)

# Map each game id to its signature (the matching dense row of reduced_X).
sigs = dict(zip(df["id"], reduced_X))

{'players': 160, 'try': 227, 'island': 111, 'building': 28, 'cities': 40, 'turn': 229, 'dice': 61, 'resources': 176, 'build': 27, 'cards': 33, 'land': 116, 'type': 231, 'set': 194, 'includes': 107, 'placing': 156, 'tiles': 219, 'types': 232, 'number': 143, 'value': 240, 'roll': 179, 'sided': 198, 'placed': 155, 'tile': 218, 'player': 159, 'collect': 42, 'hand': 98, 'based': 18, 'mdash': 132, 'possible': 165, 'playing': 161, 'card': 32, 'moves': 138, 'new': 142, 'points': 164, 'army': 15, 'certain': 34, 'victory': 243, 'secret': 192, 's': 185, 'win': 247, 'draw': 66, 'place': 154, 'piece': 151, 'french': 85, 'city': 41, 'played': 158, 'way': 246, 'area': 13, 'complete': 49, 'like': 121, 'quot': 170, 'use': 236, 'opponent': 144, 'instead': 110, 'time': 220, 'score': 189, 'turns': 230, 'series': 193, 'world': 253, 'mission': 134, 'board': 23, 'actions': 4, 'deck': 57, 'abilities': 1, 'second': 191, 'taking': 215, 'unique': 233, 'role': 178, 'team': 216, 'strategy': 210, 'order': 146, 'all

In [4]:
# Report how many signatures were built and the length of one of them.
first_signature = next(iter(sigs.values()))
print("signatures:", len(sigs), "length:", len(first_signature))

signatures: 27780 length: 256


## Nearest Neighbour Search using scikit-learn

In [5]:
# Build candidate edges: link every game to its nearest neighbours in TF-IDF
# space under cosine distance.
# Never ask for more neighbours than there are rows (kneighbors raises otherwise).
n_neighbors = min(10, reduced_X.shape[0])
nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(reduced_X)

distances, indices = nbrs.kneighbors(reduced_X)

candidate_pairs = set()
game_ids_list = df["id"].to_list()
for i, neighbors in enumerate(indices):
    gid1 = game_ids_list[i]
    # Exclude the query row by index, not by position: when distances tie
    # (identical descriptions, all-zero vectors from empty descriptions) the
    # query row is not guaranteed to come back first, so `neighbors[1:]`
    # could keep a self-pair and discard a real neighbour.
    others = [j for j in neighbors if j != i][: n_neighbors - 1]
    for j in others:
        gid2 = game_ids_list[j]
        if gid1 == gid2:
            # Distinct rows sharing one id would still form a self-pair.
            continue
        # Store each undirected pair once, smaller id first.
        candidate_pairs.add(tuple(sorted((gid1, gid2))))


# The pairs come from the k-NN query above, not from LSH.
print("kNN candidate pairs:", len(candidate_pairs))
id_to_title = dict(zip(df["id"], df["name"]))


# Human-readable preview of the candidate pairs.
pairs_named = [
    f"{id_to_title[a]} <-> {id_to_title[b]}"
    for a, b in candidate_pairs
]
print(pairs_named[:10])

LSH candidate pairs: 214758
['Shuttles <-> Volo', '13 Monsters <-> Fate: Defenders of Grimheim', 'Dice 10,000 <-> Monster Pit', 'Operation Felix <-> Into the Bastards!: First tank battle', 'The Swarm <-> Super Slopes', '1000 and One Treasures <-> Sayū', 'Byzantium <-> Pax Pamir', 'Infarkt <-> Bankraub', 'The Lords of Rock <-> Tenkatoitsu', 'Up Scope! Tactical Submarine Warfare in the 20th Century <-> Chickamauga River of Death']


In [6]:
# Assemble the pruned similarity graph as a dense matrix and export it to HDF5.
game_ids = df["id"].to_list()
id_to_index = {gid: i for i, gid in enumerate(game_ids)}
N = len(game_ids)

# Dense N x N float32 matrix; entries stay 0 for pairs that are not candidates.
S = np.zeros((N, N), dtype=np.float32)

# L2-normalise every row once so each pair's cosine similarity is a plain dot
# product, instead of calling sklearn's cosine_similarity once per pair.
# Zero rows keep a divisor of 1 and therefore yield similarity 0.
row_norms = np.linalg.norm(reduced_X, axis=1, keepdims=True)
row_norms[row_norms == 0.0] = 1.0
unit_X = reduced_X / row_norms

for a, b in candidate_pairs:
    i = id_to_index[a]
    j = id_to_index[b]
    sim = unit_X[i] @ unit_X[j]
    # The graph is undirected, so mirror the weight.
    S[i, j] = sim
    S[j, i] = sim


np.fill_diagonal(S, 0.0)  # INFO -> TECHNICALLY SHOULD BE 1, but does not work for my tool

# Prefix each title with its row index so duplicate titles stay distinct.
game_titles = [f"{i}_{id_to_title[g]}" for i, g in enumerate(game_ids)]

# Make sure the output directory exists before writing.
GEN_DATA.mkdir(parents=True, exist_ok=True)

with h5py.File(GEN_DATA / "game_similarity_tfidf.h5", "w") as f:
    # S is already float32; passing it directly avoids a second full-size copy.
    f.create_dataset("matrix", data=S)
    f.create_dataset("node_names", data=np.array(game_titles, dtype=h5py.string_dtype()))