In [4]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Define function for sentiment analysis
def analyze_sentiment(text):
    if not isinstance(text, str):  # Ensure the input is a string
        text = ""  # Replace invalid inputs with an empty string
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=1).item()
    labels = ["Negative", "Neutral", "Positive"]
    return labels[predicted_class]

# Load your dataset
input_file = "../data/review_texts.csv"  # Path to your CSV file
df = pd.read_csv(input_file)

# Handle missing or non-string values
df['review_text'] = df['review_text'].fillna("").astype(str)

# Apply the sentiment analysis function with a progress bar
tqdm.pandas(desc="Processing Sentiments")
df['sentiment'] = df['review_text'].progress_apply(analyze_sentiment)

# Save the updated dataset
output_file = "product_comments_with_sentiment.csv"  # Output file path
df.to_csv(output_file, index=False)

print(f"Sentiment analysis completed. Saved to {output_file}")


Processing Sentiments:   0%|          | 0/4926 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing Sentiments: 100%|██████████| 4926/4926 [06:52<00:00, 11.94it/s]

Sentiment analysis completed. Saved to product_comments_with_sentiment.csv



