# Sentiment analysis of BoardGameGeek reviews

In [None]:
import polars as pl
import torch

# Select the torch device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [9]:
# Load the raw review dump and report its overall size.
data = pl.read_csv("data/bgg-26m-reviews.csv")
print("All reviews:", data.height)
print("Unique games:", data["ID"].n_unique())

# Keep only the reviews that carry comment text, then shuffle the rows.
# In polars, `sample(fraction=1)` only yields a random permutation when
# `shuffle=True`; with the default the row order is neither stable nor
# fully random. Pass `seed=<int>` as well if a reproducible order is needed.
data = data.filter(pl.col("comment").is_not_null())
data = data.sample(fraction=1, shuffle=True)

print("Reviews with text:", data.height)
# count the reviews with text that have a low score
print("Reviews with text and score <= 5:", data.filter(pl.col("rating") <= 5).height)
print("Unique games that have > 0 review text :", data["ID"].n_unique())



All reviews: 26200012
Unique games: 27865
Reviews with text: 4215806
Reviews with text and score <= 5: 720616
Unique games that have > 0 review text : 27851


### Using a pretrained emotion-detection model trained on the GoEmotions dataset

Run the script `emotions_gpu.py` to generate the file `data/bgg-26m-reviews-with-emotions.csv` which contains the emotion probabilities for each review in the BGG dataset. This notebook aggregates these probabilities by game ID and normalizes them to obtain a sentiment profile for each game.

**Note** - processing the dataset takes a long time even with a GPU. The generated CSV file is also quite large (~3.5 GB). 

In [None]:
# Load the per-review emotion probabilities written by the emotions_gpu.py
# script. The file must already exist: there is no fallback when it is missing.

emotions_data = pl.read_csv("data/bgg-26m-reviews-with-emotions.csv")

positive_emotion_columns = ["prob_admiration", "prob_amusement", "prob_approval", "prob_caring", "prob_curiosity", "prob_excitement", "prob_desire", "prob_gratitude", "prob_joy", "prob_love", "prob_pride", "prob_optimism", "prob_relief"]

negative_emotion_columns = ["prob_anger", "prob_annoyance", "prob_disappointment", "prob_disapproval", "prob_disgust", "prob_embarrassment", "prob_fear", "prob_grief", "prob_nervousness", "prob_remorse", "prob_sadness"]

ambiguous_emotion_columns = ["prob_confusion", "prob_realization", "prob_surprise", "prob_neutral"]

all_emotion_cols = positive_emotion_columns + negative_emotion_columns + ambiguous_emotion_columns
summary_columns = ["positive_emotion", "negative_emotion", "ambiguous_emotion"]

# aggregate emotions by game id (mean probability per emotion); group_by does
# not guarantee any row order, so sort by ID to keep the written CSV
# reproducible between runs
aggregated_emotions = (
    emotions_data.group_by("ID")
    .agg(pl.col(all_emotion_cols).mean())
    .sort("ID")
)

# l1 normalize emotions so that each game's probabilities sum to 1;
# values are kept at full precision here and rounded only once, at the end
aggregated_emotions = (
    aggregated_emotions.with_columns(
        pl.sum_horizontal(all_emotion_cols).alias("total_mass")
    )
    .with_columns(
        [(pl.col(c) / pl.col("total_mass")).alias(c) for c in all_emotion_cols]
    )
    .drop("total_mass")
)

# create sum of negative, positive and ambiguous emotions from the unrounded
# shares: summing already-rounded columns would accumulate up to 0.005 of
# error per column and silently drop emotions whose share is below 0.005
aggregated_emotions = aggregated_emotions.with_columns(
    pl.sum_horizontal(positive_emotion_columns).alias("positive_emotion"),
    pl.sum_horizontal(negative_emotion_columns).alias("negative_emotion"),
    pl.sum_horizontal(ambiguous_emotion_columns).alias("ambiguous_emotion"),
)

# round every reported value to two decimals as the final step
aggregated_emotions = aggregated_emotions.with_columns(
    pl.col(all_emotion_cols + summary_columns).round(2)
)

aggregated_emotions.write_csv("data/bgg-26m-aggregated-emotions.csv")

The output of this notebook is saved to `data/bgg-26m-aggregated-emotions.csv` and is provided in the repository to avoid the need for reprocessing.