In [1]:
import os

import pandas as pd

from data.game_data import GameData
from database.game_database import GameDatabase
from embedding.description_embedder import DescriptionEmbedder


# Configuration for this notebook.
#
# The Pinecone API key is read from the PINECONE_API_KEY environment variable
# so that the secret is never stored in (or committed with) the notebook.
# Export the variable before starting the kernel. API_KEY is None when the
# variable is not set.
# NOTE(review): an API key used to be hardcoded on this line; treat that key as
# leaked and rotate it.
API_KEY = os.environ.get("PINECONE_API_KEY")
if API_KEY is None:
    print("PINECONE_API_KEY is not set; export it before creating the database client.")

# CSV file holding the cleaned games data that the notebook loads.
GAMES_FILE = "games_cleaned.csv"

  from tqdm.autonotebook import tqdm, trange


## Data

First we need to load the already cleaned and preprocessed data.

In [2]:
# Load the cleaned games table from the CSV file named by GAMES_FILE.
games = pd.read_csv(GAMES_FILE)

Any records that we don't want to store in the database should be filtered out now.

In [3]:
# Select the games to store in the database: keep only the rows whose name
# contains the literal text "Tomb".
# - regex=False matches the text verbatim instead of interpreting it as a
#   regular expression.
# - na=False counts rows with a missing name as non-matches, so the boolean
#   mask never contains NaN (a NaN in the mask makes the indexing raise).
games = games[games["name"].str.contains("Tomb", regex=False, na=False)]

## Embedding
After preparing the data, we can set up the transformer for the embedding. We are preparing several embedders that are domain-specific to our data. For now, we are using the `DescriptionEmbedder`, a transformer that uses the `sentence-transformers` library to embed the descriptions of our games.

In [4]:
# Create the description embedder, configured with the "all-MiniLM-L6-v2" transformer.
embedder = DescriptionEmbedder(transformer_name="all-MiniLM-L6-v2")

Store the games and embeddings in a data wrapper class for easier access later on. Doing so will use the embedder/transformer to create the embeddings from the data and determine the dimensionality of the embeddings.

In [2]:
# Bundle the filtered games with the embedder in the GameData wrapper.
# NOTE(review): this cell needs `games` and `embedder` from the cells above to
# exist in the current kernel; run the notebook top to bottom.
game_data = GameData(games=games, embedder=embedder)

NameError: name 'games' is not defined

In [6]:
# Pull the embedding vectors and their dimensionality off the wrapper together,
# then report the shape of the embedding array as a quick sanity check.
embeddings, dimension = game_data.embeddings, game_data.embedding_dimension
print(f"Embeddings shape: {embeddings.shape}")

Embeddings shape: (84, 384)


## Database
Now, with the data prepared and the embeddings generated, we can store them in the database (currently using Pinecone).

In [3]:
# Create the database client. The index dimension is taken from the embeddings
# computed by `game_data` rather than a hardcoded 384, so the two stay in sync
# if the transformer is ever swapped for one with a different output size.
db = GameDatabase(api_key=API_KEY, dimension=game_data.embedding_dimension)

In [8]:
# Hand the ids, metadata and embeddings held by game_data to the database.
# NOTE(review): whether load_data overwrites or duplicates existing records is
# not visible from this notebook — confirm before re-running the cell.
db.load_data(game_data.ids, game_data.metadata, game_data.embeddings)

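It takes some time until the data is available in the Pinecone service, so the vector count reported below may lag behind the number of records that were just uploaded.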

In [19]:
# Display the index statistics reported by the database wrapper.
db.describe_index()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'steam-games': {'vector_count': 65}},
 'total_vector_count': 65}

Once created, we can query the database to check if the data is stored correctly.

### Get By ID
A record can either be retrieved directly by ID...

In [20]:
# Fetch a single record by its id, then print the id next to the name stored
# in the record's metadata.
game = db.get_by_id("Tomb Raider II")
print("ID:", game.id, ", Name:", game.metadata["name"])

ID: Tomb Raider II , Name: Tomb Raider II


### Get By Embedding
...or we can get all the records that are similar to a given embedding.

In [21]:
# Use the values of the record fetched above as the query embedding and ask
# for k=10 similar records.
matches = db.get_similar(game.values, k=10)
# Last expression of the cell, so the number of matches is displayed.
len(matches)

10

In [22]:
# Print one line per match: its id, the name from its metadata, and its score.
for match in matches:
    print("ID:", match.id, "- Name:", match.metadata["name"], ", Similarity:", match.score)

ID: Tomb Raider II - Name: Tomb Raider II , Similarity: 1.0000001
ID: Tomb Raider: Legend - Name: Tomb Raider: Legend , Similarity: 0.6332107
ID: Shadow of the Tomb Raider: Definitive Edition - Name: Shadow of the Tomb Raider: Definitive Edition , Similarity: 0.6131774
ID: Rise of the Tomb Raider - Name: Rise of the Tomb Raider™ , Similarity: 0.60820234
ID: Tomb Raider: Underworld - Name: Tomb Raider: Underworld , Similarity: 0.5933311
ID: Tomb Raider III - Name: Tomb Raider III , Similarity: 0.5847119
ID: Tomb Raider - Name: Tomb Raider , Similarity: 0.57854533
ID: Tomb Raider IV: The Last Revelation - Name: Tomb Raider IV: The Last Revelation , Similarity: 0.5702852
ID: Tomb Raider I-III Remastered Starring Lara Croft - Name: Tomb Raider I-III Remastered Starring Lara Croft , Similarity: 0.5581841
ID: Tomb Raider: Anniversary - Name: Tomb Raider: Anniversary , Similarity: 0.5469401


In [None]:
# Collect the ids returned by the database wrapper.
ids = db.get_ids()
# Last expression of the cell, so the number of ids is displayed.
len(ids)