# Sentiment analysis of BoardGameGeek reviews

In [1]:
import polars as pl
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
from tqdm import tqdm
from afinn import Afinn
import time

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [2]:
# Load the raw review dump and keep only the reviews that carry free text.
data = pl.read_csv("data/bgg-26m-reviews.csv")
print("All reviews:", data.shape[0])
data = data.filter(pl.col("comment").is_not_null())
# Shuffle the rows. `shuffle=True` is needed for a real permutation (with the
# default, the returned order is neither stable nor fully random), and the
# fixed seed keeps the order identical across kernel restarts.
data = data.sample(fraction=1, shuffle=True, seed=42)

print("Reviews with text:", data.shape[0])
# Count (not list) the text reviews whose rating is at most 5.
print("Reviews with text and score <= 5:", data.filter(pl.col("rating") <= 5).shape[0])



All reviews: 26200012
Reviews with text: 4215806
Reviews with text and score <= 5: 720616


### Loading pretrained sentiment analysis model

In [3]:
# Hugging Face checkpoint id; the tokenizer and classifier are loaded from the
# same checkpoint so their vocabularies match.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Move the weights onto the selected device (GPU if available, else CPU).
model = model.to(device)

### Example evaluation

In [4]:
# Encode text
inputs = tokenizer("I don't love this movie!", return_tensors="pt").to(device)

# Forward pass. This is inference only, so disable gradient tracking to avoid
# building an autograd graph that is never used.
with torch.no_grad():
    outputs = model(**inputs)

# Get prediction: index of the largest logit
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=-1).item()
print(predicted_class)  # 0 -> negative, 1 -> positive

0


In [None]:
def batched_predict_sentiment(data, batch_size=32):
    """Add a model-predicted ``"sentiment"`` column to ``data``.

    The ``"comment"`` column is read into a list, tokenized in slices of
    ``batch_size`` with the notebook-level ``tokenizer`` and classified with
    the notebook-level ``model`` on ``device``. For each comment the index of
    the largest logit is multiplied by 10 and stored as the score.

    Args:
        data: Frame with a ``"comment"`` column of review texts.
        batch_size: Number of comments tokenized and classified per step.

    Returns:
        ``data`` with an extra ``"sentiment"`` column, one value per row.
    """
    texts = data["comment"].to_list()
    scores = []

    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts):
        stop = start + batch_size
        # Pad to the longest comment in the slice; truncate over-long ones.
        encoded = tokenizer(
            texts[start:stop],
            return_tensors="pt",
            truncation=True,
            padding=True,
        )
        encoded = encoded.to(device)
        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = model(**encoded).logits
            class_ids = logits.argmax(dim=-1).cpu()
        scores += (class_ids * 10).tolist()

    sentiment_column = pl.Series("sentiment", scores)
    return data.with_columns(sentiment_column)


# Score every review with the transformer classifier, then persist the result.
data_with_sentiment = batched_predict_sentiment(data=data, batch_size=64)
data_with_sentiment.write_csv("data/bgg-26m-reviews-with-nnet-sentiment.csv")

The neural-network approach takes too long to process all reviews (~17 hours with a batch size of 512 on Google Colab). We therefore develop our own, simpler model.

In [None]:
import time
import polars as pl
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# ensure VADER lexicon is downloaded
nltk.download("vader_lexicon")

# initialize VADER sentiment analyzer
sentiment_lexicon = SentimentIntensityAnalyzer()

# Progress-tracking state shared with lexicon_sentiment():
#   i   - number of reviews scored so far
#   t_0 - wall-clock time at which scoring state was initialised
# No `global` statement is needed here: names assigned at module level are
# already global, and `global` only has an effect inside a function body.
i = 0
t_0 = time.time()

def lexicon_sentiment(text):
    """Score ``text`` with VADER and rescale the result to the 0-10 range.

    Side effects: increments the module-level call counter ``i``. Whenever the
    number of previous calls is a positive multiple of 20000, the seconds
    elapsed since ``t_0`` are printed first.

    Args:
        text: Review text. Falsy values and non-strings are not scored.

    Returns:
        ``5.0`` for empty/invalid input; otherwise VADER's compound score
        (in [-1, 1]) mapped linearly onto [0, 10].
    """
    global i, t_0
    report_due = i > 0 and i % 20000 == 0
    if report_due:
        print(f"Time taken for {i} reviews:", time.time() - t_0)
    i += 1

    # Guard clause: anything that is not a non-empty string is treated as neutral.
    if not (text and isinstance(text, str)):
        return 5.0

    polarity = sentiment_lexicon.polarity_scores(text)
    # Shift [-1, 1] to [0, 2], then stretch to [0, 10].
    return (polarity["compound"] + 1) * 5

# Score every comment with VADER (one Python call per row), then put the
# columns in reading order with the new score next to the user's rating.
lexicon_score = (
    pl.col("comment")
    .map_elements(lexicon_sentiment, return_dtype=pl.Float64)
    .alias("lexicon_sentiment")
)
lexicon_data = data.with_columns(lexicon_score).select(
    "user",
    "rating",
    "lexicon_sentiment",
    "comment",
    "name",
    "ID",
)

# Persist the scored reviews.
lexicon_data.write_csv("data/bgg-26m-reviews-with-vader-sentiment.csv")


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import polars as pl

# TF-IDF features over the review texts, vocabulary capped at 10,000 terms
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(data["comment"].to_list())
print("Finished vectorizing")

# cluster into 3 groups (negative / neutral / positive)
kmeans = KMeans(n_clusters=3, random_state=42)
labels = kmeans.fit_predict(X).tolist()

# KMeans cluster ids are arbitrary: id 0 is not "the negative cluster".
# Rank the clusters by the mean user rating of their reviews so the
# lowest-rated cluster gets 0, the middle one 5 and the highest-rated one 10.
cluster_ratings = (
    data.with_columns(pl.Series("cluster", labels))
    .group_by("cluster")
    .agg(pl.col("rating").mean().alias("mean_rating"))
    .sort("mean_rating")
)

# map cluster labels to sentiment score (0, 5, 10)
label_to_score = dict(zip(cluster_ratings["cluster"].to_list(), (0, 5, 10)))
scores = [label_to_score[label] for label in labels]

unsupervised_data = data.with_columns(pl.Series("unsupervised_sentiment", scores))

unsupervised_data.write_csv("data/bgg-26m-reviews-with-unsupervised-sentiment.csv")

Finished vectorizing
