In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
import numpy as np
from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Hugging Face Hub identifier of the pretrained sentiment checkpoint used below.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
# Tokenizer and sequence-classification model for that checkpoint; both are
# module-level globals that predict_sentiment() reads.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
def map_to_sentiment(score_idx):
    """Collapse a class index into a coarse sentiment label.

    Args:
        score_idx: Numeric class index (presumably the 0-based argmax over
            the classifier's star-rating classes -- confirm against the
            model card).

    Returns:
        'negative' for values <= 1, 'neutral' for exactly 2, and
        'positive' for anything else.
    """
    # The middle bucket is a single value, so handle it first.
    if score_idx == 2:
        return 'neutral'
    return 'negative' if score_idx <= 1 else 'positive'

In [7]:
def predict_sentiment(text):
    """Classify one piece of text as 'negative', 'neutral' or 'positive'.

    Tokenizes ``text`` with the module-level ``tokenizer``, runs the
    module-level ``model`` without gradient tracking, and maps the index of
    the highest-probability class through ``map_to_sentiment``.

    Args:
        text: The string to classify.

    Returns:
        The label ``map_to_sentiment`` assigns to the top class index.
    """
    # Make the truncation limit explicit. With only ``truncation=True`` the
    # cut-off comes from the tokenizer's own ``model_max_length``; when that
    # is not configured, truncation is disabled with just a warning and an
    # over-long text would exceed the model's position-embedding limit.
    # Falls back to the original behaviour (max_length=None) if the config
    # does not expose the attribute.
    max_len = getattr(model.config, "max_position_embeddings", None)
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=max_len
    )
    with torch.no_grad():
        logits = model(**inputs).logits
    # A single text was passed in, so row 0 is the only row of the batch.
    probs = softmax(logits.numpy()[0])
    # Plain int rather than a NumPy scalar for the downstream comparison.
    pred_idx = int(np.argmax(probs))
    return map_to_sentiment(pred_idx)

In [9]:
# Read the preprocessed comments and discard rows whose 'body' is missing,
# so every remaining row has text to classify.
comments = (
    pd.read_csv("../data/processed/comments_processed.csv")
    .dropna(subset=['body'])
)

In [10]:
# Label each comment body; str() guards against non-string cell values.
# Runs the model once per row, so this cell is the slow step of the notebook.
comments["sentiment"] = comments["body"].map(
    lambda body: predict_sentiment(str(body))
)

In [12]:
# Persist the labelled comments; index=False keeps the row index out of the CSV.
comments.to_csv(
    path_or_buf="../data/analice/comments_labeled.csv",
    index=False,
)

In [14]:
# Reload the labelled file from disk (rather than reusing `comments`) so the
# counts below reflect what was actually written.
df = pd.read_csv('../data/analice/comments_labeled.csv')
# Last expression of the cell: displays how many comments fall in each label.
df['sentiment'].value_counts()

sentiment
negative    354
positive    307
neutral     139
Name: count, dtype: int64