In [None]:
import pandas as pd
import os

In [None]:

# Stage-1 settings: where the raw Steam CSV exports live and where the
# filtered game list is written.
INPUT_DIR = "/content/drive/MyDrive/Steam/"
# INPUT_DIR already ends with "/", so os.path.join is used instead of
# f"{INPUT_DIR}/..." to avoid a doubled separator ("Steam//applications.csv").
APP_FILE = os.path.join(INPUT_DIR, "applications.csv")
REV_FILE = os.path.join(INPUT_DIR, "reviews.csv")
# Relative path: resolved against the kernel's current working directory.
OUTPUT_FILE = "temp_top_10k_games.csv"

In [None]:
def filter_top_games(top_n=10000, rev_file=None, app_file=None, output_file=None):
    """Write the ``top_n`` most-reviewed applications to a CSV file.

    The number of rows per ``appid`` in the reviews CSV is counted, joined
    onto the applications CSV as a ``real_review_count`` column, and the
    applications with the highest counts are saved.

    Args:
        top_n: Maximum number of applications to keep (default 10000).
        rev_file: Path of the reviews CSV; defaults to ``REV_FILE``.
        app_file: Path of the applications CSV; defaults to ``APP_FILE``.
        output_file: Destination CSV path; defaults to ``OUTPUT_FILE``.

    Returns:
        None. The result is written to ``output_file``.

    Raises:
        ValueError: If ``top_n`` is negative.
        KeyError: If either CSV has no ``appid`` column.

    Note:
        Both files are read with ``on_bad_lines='skip'``, so malformed CSV
        rows are dropped silently and do not contribute to the counts.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Module-level defaults are resolved at call time so that re-running the
    # config cell with new paths takes effect without redefining the function.
    rev_file = REV_FILE if rev_file is None else rev_file
    app_file = APP_FILE if app_file is None else app_file
    output_file = OUTPUT_FILE if output_file is None else output_file

    df_reviews = pd.read_csv(rev_file, on_bad_lines='skip')

    # Name both columns explicitly: the labels produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x.
    review_counts = (
        df_reviews['appid']
        .value_counts()
        .rename_axis('appid')
        .reset_index(name='real_review_count')
    )
    # Only the counts are needed from here on; release the review rows before
    # the applications file is loaded.
    del df_reviews

    df_apps = pd.read_csv(app_file, on_bad_lines='skip')

    # Inner join: applications without any counted review are dropped.
    df_merged = pd.merge(df_apps, review_counts, on='appid', how='inner')

    # appid is a secondary key so that ties at the cut-off are broken the same
    # way on every run instead of depending on the sort algorithm.
    df_top = df_merged.sort_values(
        by=['real_review_count', 'appid'], ascending=[False, True]
    ).head(top_n)

    df_top.to_csv(output_file, index=False)
    # Report the real row count: it can be smaller than top_n.
    print(f" saved top {len(df_top)} games to {output_file}")




In [None]:
# Run stage 1: counts reviews per appid and writes the most-reviewed games to OUTPUT_FILE.
filter_top_games()

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import os


In [None]:
# Stage-2 settings for the embedding export.
# Input CSV; same file name that stage 1 writes as OUTPUT_FILE.
SOURCE_FILE = "temp_top_10k_games.csv"
# Folder that receives the vector, appid and metadata files.
OUTPUT_DIR = (
    "/content/drive/MyDrive/Steam/"
    "steam-dataset-2025/steam-dataset-2025-v1/"
    "notebook-data/02-semantic-game-discovery"
)
# Model identifier handed to SentenceTransformer.
MODEL_NAME = "BAAI/bge-m3"

In [None]:
def _build_embedding_text(df):
    """Return one "Name: ...; Description: ...; Content: ..." string per row of ``df``."""
    def _as_text(column):
        # Missing values become empty strings; astype(str) guards against
        # non-string cells (e.g. a purely numeric title), which would make the
        # string concatenation below raise TypeError.
        return df[column].fillna('').astype(str)

    return (
        "Name: " + _as_text('name') + "; " +
        "Description: " + _as_text('short_description') + "; " +
        "Content: " + _as_text('about_the_game')
    )


def generate_files(df=None, output_dir=None, model=None):
    """Embed game descriptions and write vectors, appids and metadata to disk.

    Three files are written into ``output_dir``:

    * ``02_embeddings_vectors.npy``: float32 array, one row per game.
    * ``02_embeddings_appids.csv``: the ``appid`` column, in the same row order.
    * ``01_game_embeddings_sample.csv``: the metadata columns listed in
      ``meta_cols`` below.

    Args:
        df: Games to embed. When omitted, ``SOURCE_FILE`` is read. The frame is
            not modified.
        output_dir: Destination folder; defaults to ``OUTPUT_DIR``. It is
            created if it does not exist.
        model: Object exposing ``encode(texts, batch_size=...,
            show_progress_bar=..., normalize_embeddings=...)``. When omitted,
            ``SentenceTransformer(MODEL_NAME)`` is loaded.

    Returns:
        None.

    Raises:
        KeyError: If ``df`` lacks any column needed for the text or the
            metadata export. Raised before the model is loaded, so no
            embedding work is done and no file is written.

    Note:
        ``about_the_game`` is embedded as-is. NOTE(review): the sample record
        elsewhere in this notebook contains HTML tags in that field; confirm
        whether markup should be stripped before encoding.
    """
    if df is None:
        # load top 10k file
        df = pd.read_csv(SOURCE_FILE)
    if output_dir is None:
        output_dir = OUTPUT_DIR

    text_cols = ['name', 'short_description', 'about_the_game']
    meta_cols = [
        'appid', 'name', 'short_description', 'type', 'release_date',
        'price_usd', 'currency', 'mat_supports_windows', 'mat_supports_mac',
        'mat_supports_linux', 'mat_achievement_count', 'metacritic_score',
        'review_count', 'primary_genre', 'all_genres'
    ]

    # Fail fast: a missing column would otherwise only surface after the
    # encode step, with some output files already written.
    required = dict.fromkeys(text_cols + meta_cols)  # de-duplicated, order kept
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"input is missing required columns: {missing}")

    # Make sure the destination exists before doing any expensive work.
    os.makedirs(output_dir, exist_ok=True)

    # Kept as a local Series rather than a new column so that a frame passed
    # in by the caller is left untouched.
    texts = _build_embedding_text(df).tolist()

    if model is None:
        print("loading model")
        model = SentenceTransformer(MODEL_NAME)

    print("generating vectors.")
    embeddings = model.encode(texts, batch_size=32, show_progress_bar=True, normalize_embeddings=True)

    embeddings = np.array(embeddings, dtype='float32')
    print(f"vector shape: {embeddings.shape}")

    vec_path = os.path.join(output_dir, "02_embeddings_vectors.npy")
    print(f"saving vectors to {vec_path}")
    np.save(vec_path, embeddings)

    # Row i of this file identifies row i of the vector array.
    id_path = os.path.join(output_dir, "02_embeddings_appids.csv")
    print(f"saving appids to {id_path}")
    df[['appid']].to_csv(id_path, index=False)

    meta_path = os.path.join(output_dir, "01_game_embeddings_sample.csv")
    print(f"saving metadata to {meta_path}")
    df[meta_cols].to_csv(meta_path, index=False)




In [None]:
# Run stage 2: loads MODEL_NAME, encodes every row of SOURCE_FILE and writes the
# vector, appid and metadata files under OUTPUT_DIR.
generate_files()

In [None]:

# Two hand-written sample records with the columns used by the embedding
# export. Every list must have the same length (one entry per game),
# otherwise pd.DataFrame raises "All arrays must be of the same length".
mock_data = {
    'appid': [10, 570],
    'name': ['Counter-Strike', 'Dota 2'],
    'short_description': ['Action shooter.', 'Strategy MOBA.'],
    'about_the_game': ["Play the world's number 1 online action game. Engage in an incredibly realistic brand of terrorist warfare in this wildly popular team-based game. Ally with teammates to complete strategic missions. Take out enemy sites. Rescue hostages. Your role affects your team's success. Your team's success affects your role.", "<strong>The most-played game on Steam.</strong><br>Every day, millions of players worldwide enter battle as one of over a hundred Dota heroes. And no matter if it's their 10th hour of play or 1,000th, there's always something new to discover. With regular updates that ensure a constant evolution of gameplay, features, and heroes, Dota 2 has truly taken on a life of its own.<br><br><strong>One Battlefield. Infinite Possibilities.</strong><br>When it comes to diversity of heroes, abilities, and powerful items, Dota boasts an endless array—no two games are the same. Any hero can fill multiple roles, and there's an abundance of items to help meet the needs of each game. Dota doesn't provide limitations on how to play, it empowers you to express your own style.<br><br><strong>All heroes are free.</strong><br>Competitive balance is Dota's crown jewel, and to ensure everyone is playing on an even field, the core content of the game—like the vast pool of heroes—is available to all players. Fans can collect cosmetics for heroes and fun add-ons for the world they inhabit, but everything you need to play is already included before you join your first match.<br><br><strong>Bring your friends and party up.</strong><br>Dota is deep, and constantly evolving, but it's never too late to join. <br>Learn the ropes playing co-op vs. bots. Sharpen your skills in the hero demo mode. Jump into the behavior- and skill-based matchmaking system that ensures you'll <br>be matched with the right players each game."],
    'type': ['game', 'game'],
    'release_date': ['2000-11-01', '2013-07-09'],
    'price_usd': [9.99, 0.00],
    'currency': ['USD', 'USD'],
    'mat_supports_windows': [True, True],
    'mat_supports_mac': [True, True],
    'mat_supports_linux': [True, True],
    'mat_achievement_count': [0, 0],
    'metacritic_score': [88, 90],
    'review_count': [150000, 200000],
    'primary_genre': ['Action', 'Strategy'],
    # Was five entries for two games, which made the DataFrame constructor fail.
    'all_genres': ['Action', 'Strategy, MOBA']
}

df_raw = pd.DataFrame(mock_data)
# generate_files(df_raw)