# Sentiment analysis of BoardGameGeek reviews

In [10]:
import polars as pl
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
from tqdm import tqdm

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Read the raw review dump and report how many rows it contains.
data = pl.read_csv('data/bgg-26m-reviews.csv')
print("All reviews:", data.height)

# Keep only the reviews whose comment field is not null, then report again.
data = data.filter(pl.col('comment').is_not_null())
print("Reviews with text:", data.height)

### Loading pretrained sentiment analysis model

In [None]:
# Checkpoint identifier shared by the tokenizer and the classifier.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the classification model, then move its weights to the chosen device.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

0


### Example evaluation

In [None]:
# Encode text and move the resulting tensors to the same device as the model
inputs = tokenizer("I don't love this movie!", return_tensors="pt").to(device)

# Forward pass. This is inference only, so gradient tracking is disabled,
# matching what batched_predict_sentiment does.
with torch.no_grad():
    outputs = model(**inputs)

# Get prediction: the index of the largest logit is the predicted class
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=-1).item()
# Legend for the printed index: 1 -> positive (presumably 0 -> negative;
# confirm against model.config.id2label).
print(predicted_class)

In [None]:
def batched_predict_sentiment(data, batch_size=32, *, scale=10):
    """Predict a sentiment class for every comment and attach it as a column.

    The texts are taken from the ``comment`` column of ``data`` and scored in
    consecutive slices of ``batch_size`` using the module-level ``tokenizer``,
    ``model`` and ``device``. For each comment the index of the largest logit
    is taken as the predicted class and multiplied by ``scale``.

    Args:
        data: polars DataFrame with a ``comment`` column. Rows are assumed to
            hold text (null comments are expected to be filtered out by the
            caller beforehand).
        batch_size: Number of comments tokenized and scored per forward pass.
            Must be at least 1.
        scale: Integer factor applied to the predicted class index. With the
            default of 10 the resulting values are multiples of 10.

    Returns:
        A new DataFrame equal to ``data`` plus a ``sentiment`` column holding
        one value per row, in the original row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make the loop below run zero times and only
        # fail later with an unrelated length mismatch; fail early instead.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    comments = data["comment"].to_list()
    if not comments:
        # An empty list carries no integer type information, so spell out the
        # dtype to keep the column type the same as for non-empty input.
        return data.with_columns(pl.Series("sentiment", [], dtype=pl.Int64))

    sentiments = []
    for start in tqdm(range(0, len(comments), batch_size)):
        batch = comments[start : start + batch_size]
        # Pad to the longest text in the batch and truncate over-long texts so
        # the batch forms one rectangular tensor.
        inputs = tokenizer(
            batch, return_tensors="pt", truncation=True, padding=True
        ).to(device)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=-1).cpu() * scale
        sentiments.extend(preds.tolist())

    return data.with_columns(pl.Series("sentiment", sentiments))


# Score every review in `data`, 64 comments per forward pass; the result is
# the same frame with an added "sentiment" column.
data_with_sentiment = batched_predict_sentiment(data, batch_size=64)

In [None]:
# Write the reviews together with their sentiment column to a new CSV file.
data_with_sentiment.write_csv('data/bgg-26m-reviews-with-sentiment.csv')