In [None]:
# Report the installed NumPy version. Because it is the last expression in the
# cell, the version string is shown as the cell's output.
import numpy as np
np.__version__

In [None]:
# Attach Google Drive to the Colab filesystem so that files stored there can be
# reached through the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

## Save steam_data.csv under the "data" folder of your Google Drive ("My Drive/data")

### Load environment

In [None]:
import os
from tqdm import tqdm
import pandas as pd
from sentence_transformers import SentenceTransformer

# Work from the data folder on the mounted Drive. The path is absolute (under
# the /content/drive mount point) so this cell can be re-run safely: a relative
# path stops resolving once the working directory has already been changed.
os.chdir('/content/drive/My Drive/data')
tqdm.pandas()  # Register progress_apply on pandas objects (progress bar)

### Clean data

In [None]:
# Build the cleaned games table in a single chain:
#   1. Read the raw csv and drop rows that have no Title.
#   2. original_price: replace cells that are exactly "Free" with "0", strip
#      "$" and ",", then convert to float.
#   3. release_date: parse dates written like "1 Jan, 2000" (day, abbreviated
#      month, comma, year). errors="coerce" turns anything else into NaT.
#   4. search_text: Title, Description, Tags and Features joined with spaces.
#      Missing text fields are treated as "" because string concatenation
#      propagates NaN: one missing field would otherwise turn the whole
#      search_text into NaN, which would then be handed to the encoder
#      instead of text.
df = (
    pd.read_csv("steam_data.csv")
    .dropna(subset=["Title"])
    .assign(
        original_price=lambda df_: df_["Original Price"]
            .replace("Free", "0")
            .str.replace(r"[$,]", "", regex=True)
            .astype(float),
        release_date=lambda df_: pd.to_datetime(
            df_["Release Date"], format="%d %b, %Y", errors="coerce"
        ),
        # Title is guaranteed present by the dropna above; the other three
        # text columns may be empty for some games.
        search_text=lambda df_: df_["Title"]
            + " " + df_["Game Description"].fillna("")
            + " " + df_["Popular Tags"].fillna("")
            + " " + df_["Game Features"].fillna(""),
    )
)

### Sentence transformer

In [None]:
# Initialize Sentence Transformer model
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# Other more computationally complex models
# model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
# model = SentenceTransformer("xlm-r-bert-base-nli-stsb-mean-tokens")
# model = SentenceTransformer("LaBSE")

# Add embedding column.
# All texts are encoded in one call so the model can process them in batches
# rather than running one forward pass per row; show_progress_bar=True keeps a
# progress bar on screen while encoding.
# The result is stored as one plain Python list per row, wrapped in a Series
# that carries df's own index so each vector lines up with its row even though
# the index has gaps after the earlier dropna.
# NOTE(review): batched and per-row encoding may differ in the last float
# digits because of padding inside a batch -- confirm this is acceptable.
df["embedding"] = pd.Series(
    model.encode(df["search_text"].tolist(), show_progress_bar=True).tolist(),
    index=df.index,
)

### Select columns + Write csv

In [None]:
# NOTE(review): every line of this cell is commented out, so running it selects
# nothing and writes no csv. The block below only lists column names --
# presumably the ones meant for the export named in the section heading; no
# output path or write call is present here. Confirm before relying on it.
# df[
#     [
#         "Title",
#         "original_price",
#         "release_date",
#         "Game Description",
#         "All Reviews Summary",
#         "Developer",
#         "Supported Languages",
#         "Popular Tags",
#         "Game Features",
#         "embedding"
#     ]
# ]