## Sentiment analysis for post comments

In [None]:
from pathlib import Path
import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Hugging Face checkpoint identifier handed to `from_pretrained` for both the
# tokenizer and the sequence-classification model.
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
# Input CSV of forum threads, read with `pd.read_csv`; relative to the working directory.
DATA_PATH = Path('dataset/final_cleaned_dataset_bb.csv')
# Destination CSV for the thread table plus the `comments_sentiment` column.
OUTPUT_PATH = Path('dataset/beyondb_comments_sentiment.csv')
# Separator between comments inside one `cleaned_comments_content` cell; the
# per-comment sentiment labels are joined with the same separator.
COMMENT_DELIMITER = ' || '


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the thread table and replace missing comment cells (NaN) with an empty
# string so every value in the column is safe to split as text.
df = pd.read_csv(DATA_PATH).assign(
    cleaned_comments_content=lambda frame: frame['cleaned_comments_content'].fillna('')
)
print(f'Total threads: {len(df):,}')
# Preview the identifying columns of the first few threads.
df.head()[['forum_name', 'thread_id']]


Total threads: 12,763


Unnamed: 0,forum_name,thread_id
0,anxiety,610659
1,anxiety,611578
2,anxiety,611573
3,anxiety,611587
4,anxiety,180185


In [3]:
# Load the tokenizer and the classification model for the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Map generic `LABEL_<id>` names onto the lower-cased class names stored in the
# model config, for pipelines that report labels in the generic form.
id2label = model.config.id2label
label_mapping = dict(
    (f'LABEL_{idx}', label.lower()) for idx, label in id2label.items()
)

# Derive the per-input token budget. Prefer the model's position-embedding
# size minus two (NOTE(review): presumably RoBERTa's position-id offset —
# confirm); otherwise fall back to the tokenizer's limit, using 512 when that
# limit is missing, non-positive, or an "unbounded" sentinel (>= 10,000).
max_position_embeddings = getattr(model.config, 'max_position_embeddings', None)
if not max_position_embeddings or max_position_embeddings <= 2:
    model_max = 512 if tokenizer.model_max_length >= 10_000 else tokenizer.model_max_length
    MAX_SEQ_LENGTH = 512 if model_max <= 0 else model_max
else:
    MAX_SEQ_LENGTH = max_position_embeddings - 2

# Pipeline device index: first CUDA device when available, otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

classifier = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    device=device,
)
label_mapping

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


{'LABEL_0': 'negative', 'LABEL_1': 'neutral', 'LABEL_2': 'positive'}

In [4]:
def split_comments(text: str, delimiter: 'str | None' = None) -> list:
    """Split one thread's concatenated comments into individual comments.

    Args:
        text: Comments joined by ``delimiter``. ``None``/NaN is treated as
            "no comments"; any other non-string value is converted with ``str``.
        delimiter: Separator between comments. When omitted, the notebook-wide
            ``COMMENT_DELIMITER`` is used.

    Returns:
        The comments in their original order, stripped of surrounding
        whitespace, with empty fragments removed.
    """
    if delimiter is None:
        delimiter = COMMENT_DELIMITER
    if not isinstance(text, str):
        # A missing cell reaches this point as None/NaN if the column was not
        # blank-filled first; treat it as a thread without comments.
        if pd.isna(text):
            return []
        text = str(text)
    stripped_parts = (part.strip() for part in text.split(delimiter))
    return [part for part in stripped_parts if part]

# Flatten every thread's comments into one list so the classifier can be fed
# fixed-size chunks; `comment_lists` keeps the per-thread grouping needed to
# put the labels back together afterwards.
comment_lists = df['cleaned_comments_content'].apply(split_comments)
flat_comments = [comment for comments in comment_lists for comment in comments]
print(f'Total comments: {len(flat_comments):,}')

results = []
batch_size = 32
for start in tqdm(range(0, len(flat_comments), batch_size), desc='Scoring comments'):
    batch = flat_comments[start:start + batch_size]
    # Pass `batch_size` explicitly: pipelines do not batch by default, so
    # without it each chunk is run through the model one text at a time.
    # `padding=True` lets texts of different lengths share a batch.
    outputs = classifier(
        batch,
        batch_size=batch_size,
        truncation=True,
        padding=True,
        max_length=MAX_SEQ_LENGTH,
    )
    results.extend(outputs)

# The regrouping below is purely positional, so a missing or extra prediction
# would silently shift every later label onto the wrong comment.
assert len(results) == len(flat_comments), 'expected exactly one prediction per comment'

# Normalise labels: translate generic `LABEL_<id>` names through the mapping,
# otherwise just lower-case whatever the pipeline reported.
normalized = [
    label_mapping.get(item['label'], item['label'].lower())
    for item in results
]

# Walk the flat label list with a cursor, handing each thread as many labels
# as it has comments. A thread without comments takes an empty slice, which
# joins to ''.
comments_sentiment = []
cursor = 0
for comments in comment_lists:
    count = len(comments)
    sentiments = normalized[cursor:cursor + count]
    cursor += count
    comments_sentiment.append(COMMENT_DELIMITER.join(sentiments))

assert cursor == len(normalized), 'not every prediction was assigned to a thread'

df['comments_sentiment'] = comments_sentiment
df[['forum_name', 'thread_id', 'comments_sentiment']].head()


Total comments: 50,318


Scoring comments:   1%|          | 10/1573 [00:01<03:42,  7.01it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Scoring comments: 100%|██████████| 1573/1573 [03:33<00:00,  7.35it/s]


Unnamed: 0,forum_name,thread_id,comments_sentiment
0,anxiety,610659,positive || positive
1,anxiety,611578,positive || neutral || negative || positive ||...
2,anxiety,611573,neutral || positive
3,anxiety,611587,neutral || positive
4,anxiety,180185,negative || negative || negative || neutral ||...


In [None]:
# Export the thread identifiers, the cleaned text columns and the per-comment
# sentiment labels as one CSV, leaving the DataFrame index out of the file.
output_df = df.loc[
    :,
    [
        'forum_name',
        'thread_id',
        'cleaned_post_content',
        'cleaned_comments_content',
        'comments_sentiment',
    ],
].copy()
output_df.to_csv(OUTPUT_PATH, index=False)
OUTPUT_PATH


Per-comment sentiment labels (one label per comment, joined with ` || ` in the same order as the comments) are saved to `project_B/dataset/beyondb_comments_sentiment.csv`.