In [None]:
import os

import nltk
import pandas as pd

nltk.download('punkt_tab')

from database.game_data import GameData
from database.game_database import GameDatabase
from embedding.description_embedder import DescriptionEmbedder
from embedding.tags_embedder import TagsEmbedder

# Prefer the PINECONE_API_KEY environment variable so the secret never has to be
# typed into (and committed with) the notebook. The original placeholder is kept
# as a fallback when the variable is unset or empty.
API_KEY = os.environ.get("PINECONE_API_KEY") or "place your pinecone api key here"

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kybur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


## Data

First we need to load the already cleaned and preprocessed data. The path below should point to the csv file containing all games after the data cleaning and preprocessing steps.

In [None]:
# Relative path to the cleaned and preprocessed games CSV that this notebook loads.
GAMES_FILE = "../data/games_cleaned.csv"

In [2]:
# Load the cleaned games table; the CSV uses ";" as its column separator.
games = pd.read_csv(GAMES_FILE, sep=";")

The dates have to be formatted in ISO 8601 format.

In [3]:
# Parse every release date individually (day-first input), keep only the calendar
# date and store its string form (YYYY-MM-DD) back into the same column.
games["Release date"] = (
    games["Release date"]
    .apply(lambda raw_date: pd.to_datetime(raw_date, dayfirst=True))
    .dt.date
    .apply(str)
)

Any records that we don't want to store in the database should be filtered out now. But since the data preprocessing already filtered out the games that we don't want to store, we can just load the data and store it in the database.

## Embedding
After preparing the data we can set up the transformers for the embedding. We are preparing several embedders that are domain-specific to our data. First we are using a `DescriptionEmbedder`, which is a transformer that uses the `sentence-transformer` library to embed the description of our games. Additionally, we're using a `TagsEmbedder` that creates a word2vec embedding of the tags of the games.

### Description Embedder
Let's start by preparing the embeddings based on the description.

In [5]:
# Embedder used to turn the game descriptions into vectors.
description_embedder = DescriptionEmbedder()

Store the games and embedding in a data wrapper class for easier access later on. Doing so will use the embedder/transformer to create the embeddings from the data and determine the dimensionality of the embeddings.

In [15]:
# Pair the games with the description embedder. The resulting wrapper exposes
# `.ids`, `.metadata` and `.embeddings`, which are used further below.
description_game_data = GameData(games=games, embedder=description_embedder)

In [18]:
# The dimensionality is taken from the length of the first description embedding.
description_dimension = len(description_game_data.embeddings[0])
print(f"Embeddings dimension: {description_dimension}")

Embeddings dimension: 768


### Tags Embedder
Continuing with creating the embeddings from the tags.

In [6]:
# Embedder used to turn each game's tags into a vector.
tags_embedder = TagsEmbedder()

Store the games and embedding in a data wrapper class for easier access later on. Doing so will use the embedder/transformer to create the embeddings from the data and determine the dimensionality of the embeddings.

In [7]:
# Pair the games with the tags embedder. The resulting wrapper exposes
# `.ids`, `.metadata` and `.embeddings`, which are used further below.
# NOTE(review): the recorded output of this cell shows numpy warnings raised from
# a mean/divide step -- possibly games with no embeddable tags; confirm that
# these do not end up as NaN vectors before uploading.
tags_game_data = GameData(games=games, embedder=tags_embedder)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [10]:
# The dimensionality is taken from the length of the first tags embedding.
tags_dimension = len(tags_game_data.embeddings[0])
print(f"Embeddings dimension: {tags_dimension}")

Embeddings dimension: 300


## Database
Now, with the data prepared and the embeddings generated, we can store them in the database (currently using Pinecone).

In [14]:
# One index name per embedding type (descriptions and tags).
description_index_name = "description-index"
tags_index_name = "tags-index"

# Database wrapper, constructed with the API key defined at the top of the notebook.
db = GameDatabase(api_key=API_KEY)

### Description Index

Load the description embeddings into the database by creating an extra index for the description embeddings.

In [17]:
# Upload the ids, metadata and description embeddings into the description index.
db.load_data(
    index_name=description_index_name,
    ids=description_game_data.ids,
    data=description_game_data.metadata,
    embeddings=description_game_data.embeddings,
)

Inserting records 1 to 100
Inserting records 101 to 200
Inserting records 201 to 300
Inserting records 301 to 400
Inserting records 401 to 500
Inserting records 501 to 600
Inserting records 601 to 700
Inserting records 701 to 800
Inserting records 801 to 900
Inserting records 901 to 1000
Inserting records 1001 to 1100
Inserting records 1101 to 1200
Inserting records 1201 to 1300
Inserting records 1301 to 1400
Inserting records 1401 to 1500
Inserting records 1501 to 1600
Inserting records 1601 to 1700
Inserting records 1701 to 1800
Inserting records 1801 to 1900
Inserting records 1901 to 2000
Inserting records 2001 to 2100
Inserting records 2101 to 2200
Inserting records 2201 to 2300
Inserting records 2301 to 2400
Inserting records 2401 to 2500
Inserting records 2501 to 2600
Inserting records 2601 to 2700
Inserting records 2701 to 2800
Inserting records 2801 to 2900
Inserting records 2901 to 3000
Inserting records 3001 to 3100
Inserting records 3101 to 3200
Inserting records 3201 to 330

It takes some time until the data is created in the Pinecone service

In [43]:
# Show the statistics of the description index (dimension, vector counts) to check the upload.
db.describe_index(index_name=description_index_name)

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'steam-games': {'vector_count': 9969}},
 'total_vector_count': 9969}

Once created, we can query the database to check if the data is stored correctly.

#### Get By ID
A record can either be retrieved directly by ID...

In [24]:
# Fetch one record from the description index by its ID and print its name and tags.
game = db.get_by_id(id_="Tomb Raider II", index_name=description_index_name)
print("Name:", game.metadata["Name"], "\nTags:", game.metadata["Tags"])

Name: Tomb Raider II 
Tags: Adventure,Action,Female Protagonist,Classic,Singleplayer,Third Person,Puzzle,Action-Adventure,Platformer,Old School,1990's,Third-Person Shooter,Shooter,Atmospheric,3D Platformer,Retro,Puzzle-Platformer,Exploration


#### Get By Embedding
...or we can get all the records that are similar to a given embedding.

In [25]:
# Query the description index with the fetched game's own vector, asking for k=10 matches.
matches = db.get_similar(index_name=description_index_name, embedding=game.values, k=10)
len(matches)

10

In [28]:
# List each match's ID together with its similarity score.
for match in matches:
    print("ID/Name:", match.id, ", Similarity:", match.score)

ID/Name: Tomb Raider II , Similarity: 0.9999999
ID/Name: Rise of the Tomb Raider , Similarity: 0.63538927
ID/Name: Tomb Raider: Legend , Similarity: 0.60590494
ID/Name: Tomb Raider IV: The Last Revelation , Similarity: 0.58435434
ID/Name: LARA CROFT AND THE TEMPLE OF OSIRIS , Similarity: 0.5633661
ID/Name: Dragon Age: Origins , Similarity: 0.5630411
ID/Name: Tomb Raider I , Similarity: 0.5431722
ID/Name: Tomb Raider , Similarity: 0.5423658
ID/Name: 9Dragons : Kung Fu Arena , Similarity: 0.5415802
ID/Name: Xuan-Yuan Sword VII , Similarity: 0.53093356


### Tags Index

With the description embeddings stored, we can now store the tags embeddings in the database.

In [40]:
# Upload the ids, metadata and tags embeddings into the tags index.
db.load_data(
    index_name=tags_index_name,
    ids=tags_game_data.ids,
    data=tags_game_data.metadata,
    embeddings=tags_game_data.embeddings,
)

Inserting records 1 to 100
Inserting records 101 to 200
Inserting records 201 to 300
Inserting records 301 to 400
Inserting records 401 to 500
Inserting records 501 to 600
Inserting records 601 to 700
Inserting records 701 to 800
Inserting records 801 to 900
Inserting records 901 to 1000
Inserting records 1001 to 1100
Inserting records 1101 to 1200
Inserting records 1201 to 1300
Inserting records 1301 to 1400
Inserting records 1401 to 1500
Inserting records 1501 to 1600
Inserting records 1601 to 1700
Inserting records 1701 to 1800
Inserting records 1801 to 1900
Inserting records 1901 to 2000
Inserting records 2001 to 2100
Inserting records 2101 to 2200
Inserting records 2201 to 2300
Inserting records 2301 to 2400
Inserting records 2401 to 2500
Inserting records 2501 to 2600
Inserting records 2601 to 2700
Inserting records 2701 to 2800
Inserting records 2801 to 2900
Inserting records 2901 to 3000
Inserting records 3001 to 3100
Inserting records 3101 to 3200
Inserting records 3201 to 330

Once again, it takes some time until the data is created in the Pinecone service

In [44]:
# Show the statistics of the tags index (dimension, vector counts) to check the upload.
db.describe_index(index_name=tags_index_name)

{'dimension': 300,
 'index_fullness': 0.0,
 'namespaces': {'steam-games': {'vector_count': 10269}},
 'total_vector_count': 10269}

Once created, we can query the database to check if the data is stored correctly.

In [45]:
# Fetch the same game from the tags index; from here on `game` refers to the
# tags-index record rather than the description-index one.
game = db.get_by_id(id_="Tomb Raider II", index_name=tags_index_name)
print("Name:", game.metadata["Name"])

Name: Tomb Raider II


#### Get By Embedding
...or we can get all the records that are similar to a given embedding.

In [46]:
# Query the tags index with the fetched game's own vector, asking for k=10 matches.
matches = db.get_similar(index_name=tags_index_name, embedding=game.values, k=10)
len(matches)

10

In [47]:
# List each match's ID together with its similarity score.
for match in matches:
    print("ID/Name:", match.id, ", Similarity:", match.score)

ID/Name: Tomb Raider II , Similarity: 1.0
ID/Name: Tomb Raider IV: The Last Revelation , Similarity: 0.9879375
ID/Name: Tomb Raider I , Similarity: 0.979505
ID/Name: Tomb Raider III , Similarity: 0.9678797
ID/Name: Tomb Raider VI: The Angel of Darkness , Similarity: 0.96217847
ID/Name: Tomb Raider: Legend , Similarity: 0.95135796
ID/Name: Tomb Raider: Underworld , Similarity: 0.9410779
ID/Name: Hedon Bloodrite , Similarity: 0.93372333
ID/Name: Tomb Raider: Anniversary , Similarity: 0.9312476
ID/Name: Twinsen's Little Big Adventure 2 Classic , Similarity: 0.9235054
