Pre-trained model from: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

In [140]:
from transformers import pipeline
import pandas as pd

MODEL_PATH = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# get the comments and prepare them into a list
df_comments = pd.read_parquet(path="data/comments.parquet.brotli", engine="pyarrow")
comment_contents = df_comments["comment_content"]
comment_contents = comment_contents.to_list()

# run the twitter-roberta-base model
sentiment_task = pipeline("sentiment-analysis", model=MODEL_PATH, tokenizer=MODEL_PATH)
tokenizer_kwargs = {"truncation": True, "max_length": 512}

start, end = 0, 10000
max_row_idx = 983114

while start <= max_row_idx:
    results = sentiment_task(comment_contents[start:end], **tokenizer_kwargs)

    # convert results into a pandas dataframe
    results = pd.DataFrame(results)
    results = results.rename(columns={"label": "sentiment", "score": "sentiment_score"})
    results["sentiment"] = results["sentiment"].replace(
        {"neutral": 0, "negative": -1, "positive": 1}
    )
    results = pd.concat(
        [df_comments.loc[start : end - 1, "comment_id"].reset_index(drop=True), results], axis=1
    )
    results["sentiment"] = results["sentiment"].astype("category")

    # save results to a csv file
    results.to_csv(f"data/comment_sentiment_{start}_{end-1}.csv", index=False)
    
    start += 10000
    end = min(end + 10000, max_row_idx + 1)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
  results["sentiment"] = results["sentiment"].replace(
  results["sentiment"] = results["sentiment"].replace(
  results["sentiment"] = resu

In [141]:
# combine all the comments sentments csv files
import glob

# Define the file pattern
file_pattern = "data/comment_sentiment_*_*.csv"

# Use glob to find all files matching the pattern
file_list = glob.glob(file_pattern)

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through the files and read them into DataFrames
for file in file_list:
    df = pd.read_csv(file)
    dataframes.append(df)

# Concatenate all DataFrames row-wise
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv("data/comment_sentiment.csv", index=False)