In [1]:
import pandas as pd

from data.game_data import GameData
from database.game_database import GameDatabase
from embedding.description_embedder import DescriptionEmbedder


API_KEY = "3ba8f200-99a7-4b16-8d49-ba671878b6d9"  # Yeah, security
GAMES_FILE = r"C:\Users\LUKY\OneDrive - Hochschule Luzern\Projects\DSPRO1\Data\clean_data.csv"

  from tqdm.autonotebook import tqdm, trange


## Data

First we need to load the already cleaned and preprocessed data.

In [2]:
games = pd.read_csv(GAMES_FILE, delimiter=";")

Any records that we don't want to store in the database should be filtered out now.

In [3]:
games

Unnamed: 0,Name,Release date,Estimated owners,About the game,Positive,Negative,Developers,Publishers,Categories,Genres,Tags,Popularity
0,Counter-Strike: Global Offensive,21.08.2012,100000000,Counter-Strike: Global Offensive (CS: GO) expa...,5764420,766677,"Valve,Hidden Path Entertainment",Valve,"Multi-player,Steam Achievements,Full controlle...","Action,Free to Play","FPS,Shooter,Multiplayer,Competitive,Action,Tea...",0.785569
1,Dota 2,09.07.2013,200000000,"The most-played game on Steam. Every day, mill...",1477153,300437,Valve,Valve,"Multi-player,Co-op,Steam Trading Cards,Steam W...","Action,Free to Play,Strategy","Free to Play,MOBA,Multiplayer,Strategy,e-sport...",0.566434
2,PUBG: BATTLEGROUNDS,21.12.2017,100000000,"LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...",1154655,895978,"KRAFTON, Inc.","KRAFTON, Inc.","Multi-player,PvP,Online PvP,Stats,Remote Play ...","Action,Adventure,Free to Play,Massively Multip...","Survival,Shooter,Multiplayer,Battle Royale,FPS...",0.434255
3,Team Fortress 2,10.10.2007,100000000,The most fun you can have online - PC Gamer Is...,823693,56683,Valve,Valve,"Multi-player,Cross-Platform Multiplayer,Steam ...","Action,Free to Play","Free to Play,Hero Shooter,Multiplayer,FPS,Shoo...",0.275344
4,Black Myth: Wukong,19.08.2024,100000000,Black Myth: Wukong is an action RPG rooted in ...,663109,28700,Game Science,Game Science,"Single-player,Steam Achievements,Full controll...","Action,Adventure,RPG","Mythology,Action RPG,Action,RPG,Souls-like,Com...",0.257988
...,...,...,...,...,...,...,...,...,...,...,...,...
9574,Deserving Life,13.10.2017,100000,Deserving Life is a room-scale virtual reality...,96,20,"Lukas Paul, Gabriel Mittermair",Deserving Life,"Single-player,Steam Achievements",Free to Play,"Free to Play,Horror,VR",0.000211
9575,Timen runner,26.05.2017,100000,Hardcore pixel 2D platformer with steampunk el...,49,44,REX PEX GAMES,REX PEX GAMES,"Single-player,Steam Achievements,Steam Trading...","Casual,Indie","Indie,Casual,Platformer,2D,Pixel Graphics,Grea...",0.000211
9576,Mahjong Deluxe 2: Astral Planes,20.07.2016,100000,Mahjong Deluxe 2: Astral Planes is an out-of-t...,47,45,EnsenaSoft,EnsenaSoft,Unknown,Casual,"Casual,Puzzle",0.000211
9577,Pizza Frenzy Deluxe,30.08.2006,100000,Pizza Frenzy is a wacky action puzzler that pu...,125,5,"PopCap Games, Inc.","PopCap Games, Inc.",Single-player,Casual,Casual,0.000211


## Embedding
After preparing the data we can set up the transformer for the embedding. We are preparing several embedders that are domain specific to our data. For now we are using the `DescriptionEmbedder` which is a transformer that uses a `sentence-transformer` library to embed the description of our games.

In [4]:
embedder = DescriptionEmbedder(transformer_name="all-MiniLM-L6-v2")

Store the games and embedding in a data wrapper class for easier access later on. Doing so will use the embedder/transformer to create the embeddings from the data and determine the dimensionality of the embeddings.

In [2]:
game_data = GameData(games=games, embedder=embedder)

NameError: name 'games' is not defined

In [6]:
embeddings = game_data.embeddings
dimension = game_data.embedding_dimension
print(f"Embeddings shape: {embeddings.shape}")

Embeddings shape: (9579, 384)


## Database
Now, with the data prepare and the embeddings generated, we can store them in the database (currently using Pinecone).

In [22]:
db = GameDatabase(api_key=API_KEY, dimension=384)

In [23]:
db.load_data(game_data.ids, game_data.metadata, game_data.embeddings)

PineconeException: UNKNOWN:Error received from peer  {grpc_message:"Error, message length too large: found 35585765 bytes, the limit is: 4194304 bytes", grpc_status:11, created_time:"2024-11-29T12:25:42.2927066+00:00"}

It takes some time until the data is created in the Pinecone service

In [18]:
db.describe_index()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'steam-games': {'vector_count': 9570}},
 'total_vector_count': 9570}

Once created, we can query the database to check if the data is stored correctly.

### Get By ID
A record can either be retrieved directly by ID...

In [21]:
game = db.get_by_id("Tomb Raider II")
print("Name:", game.metadata["Name"])

Name: Tomb Raider II


### Get By Embedding
...or we can get all the records that are similar to a given embedding.

In [21]:
matches = db.get_similar(game.values, k=10)
len(matches)

10

In [22]:
for match in matches:
    print("ID:", match.id, "- Name:", match.metadata["name"], ", Similarity:", match.score)

ID: Tomb Raider II - Name: Tomb Raider II , Similarity: 1.0000001
ID: Tomb Raider: Legend - Name: Tomb Raider: Legend , Similarity: 0.6332107
ID: Shadow of the Tomb Raider: Definitive Edition - Name: Shadow of the Tomb Raider: Definitive Edition , Similarity: 0.6131774
ID: Rise of the Tomb Raider - Name: Rise of the Tomb Raider™ , Similarity: 0.60820234
ID: Tomb Raider: Underworld - Name: Tomb Raider: Underworld , Similarity: 0.5933311
ID: Tomb Raider III - Name: Tomb Raider III , Similarity: 0.5847119
ID: Tomb Raider - Name: Tomb Raider , Similarity: 0.57854533
ID: Tomb Raider IV: The Last Revelation - Name: Tomb Raider IV: The Last Revelation , Similarity: 0.5702852
ID: Tomb Raider I-III Remastered Starring Lara Croft - Name: Tomb Raider I-III Remastered Starring Lara Croft , Similarity: 0.5581841
ID: Tomb Raider: Anniversary - Name: Tomb Raider: Anniversary , Similarity: 0.5469401


In [None]:
ids = db.get_ids()
len(ids)