In [3]:
import os

import pandas as pd

from data.game_data import GameData
from database.game_database import GameDatabase
from embedding.description_embedder import DescriptionEmbedder


# Configuration for this notebook.
# The database API key is read from the PINECONE_API_KEY environment variable so that
# no credential is stored in the notebook. The key that used to be hardcoded here has
# been shared with the file and should be treated as leaked (rotate it).
API_KEY = os.environ.get("PINECONE_API_KEY")
if not API_KEY:
    # Surface the problem here rather than in a later database cell.
    print("WARNING: PINECONE_API_KEY is not set; the database cells below need it.")
# CSV file with the already cleaned and preprocessed games.
GAMES_FILE = "games_cleaned.csv"

  from .autonotebook import tqdm as notebook_tqdm


## Data

First we need to load the already cleaned and preprocessed data.

In [2]:
# Read the cleaned games table from the CSV file configured in GAMES_FILE.
games = pd.read_csv(filepath_or_buffer=GAMES_FILE)

Any records that we don't want to store in the database should be filtered out now.

In [3]:
# Select the games to store in the database: keep rows whose name contains "Tomb".
# na=False treats a missing name as "no match"; without it the mask contains NaN and
# boolean indexing raises. regex=False matches the text literally, not as a pattern.
games = games[games["name"].str.contains("Tomb", na=False, regex=False)]

## Embedding
After preparing the data, we can set up the transformer for the embedding. We are preparing several embedders that are domain-specific to our data. For now we are using the `DescriptionEmbedder`, which is a transformer that uses the `sentence-transformers` library to embed the description of our games.

In [4]:
# Model name handed to the description embedder.
transformer_name = "all-MiniLM-L6-v2"
embedder = DescriptionEmbedder(transformer_name=transformer_name)

Store the games and embeddings in a data wrapper class for easier access later on. Doing so will use the embedder/transformer to create the embeddings from the data and determine the dimensionality of the embeddings.

In [5]:
# Bundle the filtered games with the embedder in the GameData wrapper.
game_data = GameData(
    embedder=embedder,
    games=games,
)

In [6]:
# Pull the embeddings and their dimensionality out of the wrapper, then report the shape.
embeddings = game_data.embeddings
dimension = game_data.embedding_dimension
shape_report = "Embeddings shape: {}".format(embeddings.shape)
print(shape_report)

Embeddings shape: (84, 384)


## Database
Now, with the data prepared and the embeddings generated, we can store them in the database (currently using Pinecone).

In [4]:
# Create the database client. The dimension comes from the embeddings computed above
# (`dimension`) instead of a hardcoded 384, so it stays correct if the embedder changes.
db = GameDatabase(api_key=API_KEY, dimension=dimension)

In [12]:
# Upload ids, metadata and embedding vectors from the wrapper to the database.
record_ids = game_data.ids
record_metadata = game_data.metadata
record_vectors = game_data.embeddings
db.load_data(record_ids, record_metadata, record_vectors)

It takes some time until the uploaded data becomes available in the Pinecone service.

In [5]:
# Fetch the index statistics and show them as the cell output.
index_stats = db.describe_index()
index_stats

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'steam-games': {'vector_count': 83}},
 'total_vector_count': 83}

Once created, we can query the database to check if the data is stored correctly.

### Get By ID
A record can either be retrieved directly by ID...

In [15]:
# Look up a single record by its ID and print its id and name.
game_id = "391220"
game = db.get_by_id(game_id)
summary_fields = ("ID:", game.id, ", Name:", game.metadata["name"])
print(*summary_fields)

ID: 391220 , Name: Shadow of the Tomb Raider: Definitive Edition


### Get By Embedding
...or we can get all the records that are similar to a given embedding.

In [24]:
# Query the database with the embedding of the record fetched above.
# NOTE(review): the recorded output reports 83 matches although k=10 is passed;
# confirm that get_similar honours k.
query_vector = game.values
matches = db.get_similar(query_vector, k=10)
len(matches)

83

In [25]:
# Print one line per match: id, name and similarity score.
for hit in matches:
    row = ("ID:", hit.id, "- Name:", hit.metadata["name"], ", Similarity:", hit.score)
    print(*row)

ID: 391220 - Name: Shadow of the Tomb Raider: Definitive Edition , Similarity: 1.0011094
ID: 849167 - Name: Tombé Drums VR , Similarity: 0.7962927
ID: 849179 - Name: Tomb of The Lost Sentry , Similarity: 0.7962927
ID: 849200 - Name: Shadow of the Tomb Raider: Definitive Edition , Similarity: 0.7962927
ID: 849163 - Name: Shadow of the Tomb Raider: Definitive Edition , Similarity: 0.7962927
ID: 849165 - Name: Seductive Tombs , Similarity: 0.7962927
ID: 890031 - Name: EURGAVA™: Tomb of Senza , Similarity: 0.7962927
ID: 849166 - Name: Shadow of the Tomb Raider: Definitive Edition , Similarity: 0.7962927
ID: 890030 - Name: Shadow of the Tomb Raider: Definitive Edition , Similarity: 0.7962927
ID: 905320 - Name: Treasure Tomb VR , Similarity: 0.7962927
ID: 849178 - Name: Scoot Kaboom and the Tomb of Doom , Similarity: 0.7962927
ID: 849160 - Name: Tomb Towers , Similarity: 0.7962927
ID: 750920 - Name: Tomb Defender , Similarity: 0.7962927
ID: 890032 - Name: Shadow of the Tomb Raider: Definitiv