# Sentiment analysis of BoardGameGeek reviews

In [16]:
import polars as pl
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
from tqdm import tqdm
from afinn import Afinn
import time

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [21]:
# Fixed seed so the 1% subsample (and every count derived from it) is
# reproducible under Restart Kernel -> Run All.
SAMPLE_SEED = 42

data = pl.read_csv("data/bgg-26m-reviews.csv")
print("All reviews:", data.shape[0])
# Keep only the reviews that actually carry comment text.
data = data.filter(pl.col("comment").is_not_null())
# Work on a 1% random subsample to keep experimentation fast.
data = data.sample(fraction=0.01, seed=SAMPLE_SEED)

# NOTE: both counts below are taken AFTER sampling, i.e. they describe the
# 1% subsample, not the full set of reviews with text.
print("Reviews with text:", data.shape[0])
# Number of sampled reviews that have a low score.
print("Reviews with text and score <= 5:", data.filter(pl.col("rating") <= 5).shape[0])



All reviews: 26200012
Reviews with text: 42158
Reviews with text and score <= 5: 7179


### Loading pretrained sentiment analysis model

In [18]:
# Pretrained checkpoint used for sentiment classification; per the example
# cell below its two classes are 0 -> negative, 1 -> positive.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the weights first, then move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

### Example evaluation

In [19]:
# Encode text
inputs = tokenizer("I don't love this movie!", return_tensors="pt").to(device)

# Forward pass. This is pure inference, so disable autograd (same as in
# batched_predict_sentiment) to avoid building a gradient graph.
with torch.no_grad():
    outputs = model(**inputs)

# Get prediction: index of the highest-scoring class
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=-1).item()
print(predicted_class)  # 0 -> negative, 1 -> positive

0


In [20]:
def batched_predict_sentiment(data, batch_size=32):
    """Classify every comment in ``data`` with the module-level ``model``.

    Comments are read from the ``"comment"`` column, tokenized with the
    module-level ``tokenizer`` and scored in chunks of ``batch_size``.
    The predicted class index is multiplied by 10, so each review ends up
    with either 0 or 10.

    Args:
        data: polars DataFrame with a ``"comment"`` column.
        batch_size: number of comments sent through the model per step.

    Returns:
        ``data`` with an additional ``"sentiment"`` column.
    """
    texts = data["comment"].to_list()
    scores = []

    for start in tqdm(range(0, len(texts), batch_size)):
        chunk = texts[start : start + batch_size]
        # Pad to the longest comment in the chunk and truncate over-long ones.
        encoded = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
        encoded = encoded.to(device)
        # Inference only: no gradients needed.
        with torch.no_grad():
            result = model(**encoded)
            labels = result.logits.argmax(dim=-1).cpu() * 10
        scores += labels.tolist()

    sentiment_column = pl.Series("sentiment", scores)
    return data.with_columns(sentiment_column)


# Score the sampled reviews with the transformer model, 64 comments per step.
data_with_sentiment = batched_predict_sentiment(data=data, batch_size=64)
# Persist the scored reviews next to the input data.
data_with_sentiment.write_csv("data/bgg-26m-reviews-with-nnet-sentiment.csv")

  4%|▎         | 24/659 [00:19<08:32,  1.24it/s]


KeyboardInterrupt: 

The neural-network approach would take too long to process all reviews (~17 hours with a batch size of 512 running in Google Colab). We therefore use a simpler, lexicon-based model instead.

In [None]:
# AFINN lexicon scorer used by lexicon_sentiment.
afinn = Afinn()
# Progress state for lexicon_sentiment: number of reviews scored so far and
# the wall-clock start time used for the periodic timing printout.
i, t_0 = 0, time.time()


def lexicon_sentiment(text, max_abs_score=10):
    """Score ``text`` with AFINN and map the result onto a 0-10 scale.

    A raw AFINN score of 0 maps to 5; ``+max_abs_score`` maps to 10 and
    ``-max_abs_score`` maps to 0. Anything beyond that range is clipped.

    Side effect: increments the module-level counter ``i`` on every call and
    prints the elapsed time since ``t_0`` after every 20000 scored reviews.

    Args:
        text: the review text to score.
        max_abs_score: raw AFINN magnitude that maps to the ends of the scale.

    Returns:
        The scaled score, clipped to the range 0-10.
    """
    global i, t_0
    # Periodic progress report (skipped on the very first call).
    if i > 0 and i % 20000 == 0:
        print(f"Time taken for {i} reviews:", time.time() - t_0)
    i += 1

    # Original (unbounded) AFINN score.
    afinn_total = afinn.score(text)
    # Linear rescale so that 0 -> 5 and +/-max_abs_score -> 10 / 0.
    rescaled = 5 * (1 + afinn_total / max_abs_score)

    # Clip to 0-10 in case the raw score exceeds max_abs_score.
    if rescaled <= 0:
        return 0
    if rescaled >= 10:
        return 10
    return rescaled

# Score every comment with the lexicon model, then put the columns into a
# fixed order (columns not listed in the select are dropped).
# (Alternative kept for reference: afinn.score(x) + 5, i.e. no scaling or clipping.)
data = data.with_columns(
    pl.col("comment")
    .map_elements(lexicon_sentiment, return_dtype=pl.Float64)
    .alias("lexicon_sentiment")
).select(
    "user",
    "rating",
    "lexicon_sentiment",
    "comment",
    "name",
    "ID",
)

# Save the scored sample to disk.
data.write_csv("data/bgg-26m-reviews-with-lexicon-sentiment.csv")


# TODO: parallelize everything by using vectorized operations

Time taken for 20000 reviews: 10.986568689346313
Time taken for 40000 reviews: 21.7632052898407


In [None]:
# Shape of the subset where the user rating is low (<= 5) but the lexicon
# score is above the neutral midpoint of 5, i.e. where the two disagree.
(
    data
    .filter(pl.col("rating") <= 5)
    .filter(pl.col("lexicon_sentiment") > 5)
    .shape
)

(3820, 6)