## 3. Cluster-preparation and graph generation

Preparation for clustering: we convert the itemsets of shingles/keywords/descriptors into a
pairwise similarity measure between games. This in turn forms a fully connected graph
(nodes = games, edge weights = similarity), which we can then prune by removing the edges
whose weight is below a threshold.

### imports

In [12]:
import pandas as pd
import numpy as np
from utils import load_itemsets, OutputFormat
import h5py

from itertools import chain
from pathlib import Path

# Directory this notebook reads its inputs from and writes its output to.
GENERATED_PATH = './data/gen'
generated_data_path = Path(GENERATED_PATH)


### loading data

In [None]:
# Load the frequent itemsets in flat form.
itemsets = load_itemsets(
    generated_data_path / "freq_itemsets.json",
    output_format=OutputFormat.FLAT,
)

# Keep only the itemsets that contain at least two items.
itemsets_short = [entry for entry in itemsets if len(entry["items"]) >= 2]

print(f"dropped {len(itemsets) - len(itemsets_short)} / {len(itemsets)} baskets to reduce inflation")

# From here on, `itemsets` refers to the filtered list.
itemsets = itemsets_short

# Association rules table, read from the same directory.
rules_df = pd.read_csv(generated_data_path / "rules.csv")

dropped 713 / 23595 buckets to reduce inflation


In [26]:
# Every distinct item that appears in any itemset, in sorted order; the position
# in this list is the item's row/column in the similarity matrix.
all_games = sorted({game for entry in itemsets for game in entry["items"]})
game_index = dict(zip(all_games, range(len(all_games))))
N = len(all_games)


### Similarity matrix

In [28]:

# Pairwise co-occurrence similarity: sim[a, b] accumulates the relative support
# of every itemset that contains both game a and game b. The matrix is rebuilt
# from zeros here, so re-running this cell is idempotent.
sim = np.zeros((N, N), dtype=np.float32)

n_itemsets = len(itemsets)  # loop-invariant divisor for the relative support

for record in itemsets:
    items = record["items"]
    if len(items) < 2:
        # A single item forms no pair and therefore contributes no edge.
        continue
    support = record["support"] / n_itemsets
    for i in range(len(items)):
        for j in range(i + 1, len(items)):
            a, b = game_index[items[i]], game_index[items[j]]

            sim[a, b] += support
            sim[b, a] += support  # symmetric

# Rescale so the largest entry becomes 1. Skip this when the matrix is empty
# (max() of a zero-size array raises) or all zeros (division would yield NaN).
max_sim = sim.max() if sim.size else 0.0
if max_sim != 0:
    sim /= max_sim


### Association rules

In [16]:
# Weight applied to a rule's confidence before it is added to the similarity.
RULE_CONFIDENCE_WEIGHT = 0.5

# Boost the similarity of every (antecedent, consequent) pair by a share of the
# rule's confidence, symmetrically.
#
# NOTE: this cell mutates `sim` in place, so re-running it without first
# re-running the similarity-matrix cell adds the rule weights a second time.
#
# NOTE(review): eval() executes whatever expression is stored in rules.csv; this
# is acceptable only while that file is produced by our own pipeline. If the
# columns hold plain Python literals, ast.literal_eval would be a safer parser
# -- confirm the stored format before switching.
for antecedents_repr, consequents_repr, confidence in zip(
    rules_df["antecedents"], rules_df["consequents"], rules_df["confidence"]
):
    # Parse each side once per rule instead of once per antecedent.
    antecedents = eval(antecedents_repr)
    consequents = eval(consequents_repr)
    boost = confidence * RULE_CONFIDENCE_WEIGHT
    for a in antecedents:
        for b in consequents:
            # Items absent from the index have no row/column in `sim`; skip them.
            if a in game_index and b in game_index:
                i, j = game_index[a], game_index[b]
                sim[i, j] += boost
                sim[j, i] += boost


### Storing in my format
For use in gxpvis.skumantz.dev.

In [29]:


# Write the similarity matrix and the matching node labels to one HDF5 file:
# "matrix" holds `sim`, "node_names" holds the game names in index order.
similarity_file = generated_data_path / "game_similarity.h5"
with h5py.File(similarity_file, "w") as h5_out:
    h5_out.create_dataset("matrix", data=sim)
    node_names = np.array(all_games, dtype=h5py.string_dtype())
    h5_out.create_dataset("node_names", data=node_names)
