In [None]:
import os
from pprint import pprint
from itertools import accumulate

from dotenv import load_dotenv
from itables import show
import pymongo

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

from PIL import Image

from maths import compute_ppmi

from mlxtend.preprocessing import TransactionEncoder

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Load variables from a local .env file (if present) into the process
# environment so the MONGO_* settings below can be read with os.getenv.
load_dotenv()

# MongoDB's standard port. Used when MONGO_PORT is unset or empty so this cell
# does not fail with an opaque `int(None)` TypeError / `int("")` ValueError.
DEFAULT_MONGO_PORT = 27017

# Connection settings come exclusively from the environment; no credentials
# are hard-coded in the notebook.
mongo_connection = {
    "host": os.getenv("MONGO_HOST"),
    "port": int(os.getenv("MONGO_PORT") or DEFAULT_MONGO_PORT),
    "username": os.getenv("MONGO_USERNAME"),
    "password": os.getenv("MONGO_PASSWORD"),
}

client = pymongo.MongoClient(**mongo_connection)

# Handles used by the rest of the notebook: the `league` database and its
# `matches` collection.
db = client.league
matches = db.matches

In [None]:
# Quick sanity check that the collection is reachable. This uses the
# collection's estimated document count, not a filtered/exact count.
match_total = matches.estimated_document_count()
print("matches count:", match_total)

In [None]:
# Projection paths for the seven item fields (item0 .. item6) of every
# participant; nothing else from the match document is requested.
itemkey = [f"info.participants.item{i}" for i in range(7)]

# Only matches whose game version starts with "16.1." are fetched.
matches_ = matches.find({"info.gameVersion": {"$regex": r"^16\.1\..*$"} }, {*itemkey}).to_list()

# One entry per participant: the values of the projected item fields with
# zeros removed (presumably 0 marks an empty slot - confirm against the data).
dataset = [
    [item_id for item_id in participant.values() if item_id != 0]
    for match in matches_
    for participant in match.get("info").get("participants")
]

print("item sets:", len(dataset))

In [None]:
# Encode every item set as one row with one column per distinct item
# (fit and transform in a single step).
te = TransactionEncoder()
te_ary = te.fit_transform(dataset)
df = pd.DataFrame(data=te_ary, columns=te.columns_).astype(float)

# Item-by-item matrix: entry (a, b) is the dot product of the columns of
# items a and b across all item sets.
co_occurrence_similarity = df.T @ df

# Re-weight the raw co-occurrence counts with PPMI to "ignore" frequency.
ppmi_matrix = compute_ppmi(co_occurrence_similarity)

print(ppmi_matrix.shape)

In [None]:
# Standardize the PPMI matrix before PCA.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(ppmi_matrix)

# Request as many components as the matrix has rows so the variance plots
# below cover the whole spectrum; the 2-D projection is taken from the first
# two columns of `pca_df` later on.
pca = PCA(n_components=ppmi_matrix.shape[0])
pca_results = pca.fit_transform(scaled_data)
pca_df = pd.DataFrame(data=pca_results, index=ppmi_matrix.index)

explained = pca.explained_variance_ratio_
# 0-based component numbers, matching the integer column labels of `pca_df`.
component_numbers = range(len(explained))

# Both diagnostics share the same layout, so they are drawn by one loop:
# (title, y-axis label, values to plot).
variance_plots = (
    ("Scree plot", "Explained Variance Ratio", explained),
    (
        "Cumulative Explained Variance",
        "Cumulative Explained Variance",
        np.cumsum(explained),
    ),
)

for title, ylabel, values in variance_plots:
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel("Component number")
    ax.plot(component_numbers, values)
    plt.show()

In [None]:
plt.figure(figsize=(50, 50))
ax = plt.gca()

# Zero-size scatter: the markers are invisible, but the points are still
# plotted (presumably so the axes scale to the data range - the icons are
# added as plain artists below).
plt.scatter(pca_df[0], pca_df[1], s=0)

# Draw each item's icon at its position on the first two principal components.
for item_id, x0, y0 in zip(pca_df.index, pca_df[0], pca_df[1]):
    # Icon file is looked up by item id, shrunk to 64x64 and given a uniform
    # alpha of 190 so overlapping icons stay partly see-through.
    icon = Image.open(f"item/{item_id}.png").resize((64, 64))
    icon.putalpha(190)

    imagebox = OffsetImage(np.asarray(icon), zoom=0.9)
    ax.add_artist(AnnotationBbox(imagebox, (x0, y0), frameon=False))

plt.axis("off")
plt.show()