In [1]:
import os
import re

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import pipeline

This notebook replicates `preprocess_sent_vectors`, but tests whether it is worth fixing known issues with the juries dataset. For example, 'asshole' is a loaded term, so it is replaced here with the phrase 'individual at fault' before sentiment scoring.

In [2]:
def get_sentiment(text, max_length=512):
    """Score one message with the module-level sentiment `model` and `tokenizer`.

    Args:
        text: The message to score. Values that `pd.isnull` flags as missing
            are not sent to the model.
        max_length: Maximum number of tokens passed to the model; longer
            inputs are truncated instead of raising inside the forward pass.
            NOTE(review): 512 is presumably the positional limit of the
            RoBERTa-base checkpoint loaded in this notebook -- confirm
            against `model.config.max_position_embeddings`.

    Returns:
        A dict with keys 'positive', 'negative' and 'neutral' holding the
        softmax probabilities of the model's logits, or NaN for every key
        when `text` is missing.
    """
    if pd.isnull(text):
        return {'positive': np.nan, 'negative': np.nan, 'neutral': np.nan}

    # Truncate explicitly: without it an over-long message reaches the model
    # untouched and the forward pass fails.
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )

    # Inference only -- skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded)

    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(output[0][0].numpy())

    # Index mapping as used by this notebook: 0 -> negative, 1 -> neutral,
    # 2 -> positive (assumed to match the checkpoint's label order --
    # TODO confirm against `model.config.id2label`).
    return {'positive': scores[2], 'negative': scores[0], 'neutral': scores[1]}

In [3]:
# Hugging Face Hub identifier of the sentiment checkpoint used in this notebook.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Module-level tokenizer and classifier; `get_sentiment` reads both as globals.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Load the raw chat messages from the juries dataset.
messages_raw = pd.read_csv(
    '../../data/raw_data/jury_conversations_with_outcome_var.csv',
    encoding='mac_roman',
)['message']

# Stringify the messages but keep missing entries missing: a blanket
# `.astype(str)` turns NaN into the literal text 'nan', which `get_sentiment`
# would then score as if it were a real message.
messages = messages_raw.astype(str).mask(messages_raw.isna(), np.nan)

# Censor the loaded term regardless of capitalisation ("Asshole", "ASSHOLE", ...).
messages = messages.str.replace(
    "asshole",
    "individual at fault",
    flags=re.IGNORECASE,
    regex=True,
)

In [None]:
# Score every message; each element is a dict with 'positive', 'negative'
# and 'neutral' entries.
sentiments = messages.apply(get_sentiment)

# Select the scores by key so the column mapping does not depend on the
# insertion order of the dict returned by `get_sentiment` (and avoid
# shadowing the builtin `dict` as a loop variable).
sent_arr = [
    [scores['positive'], scores['negative'], scores['neutral']]
    for scores in sentiments
]

sent_df = pd.DataFrame(sent_arr, columns=['positive_bert', 'negative_bert', 'neutral_bert'])

output_csv_folder = '../../sentiment_bert/'

# `to_csv` does not create missing directories; make sure the target exists.
os.makedirs(output_csv_folder, exist_ok=True)

sent_df.to_csv(os.path.join(output_csv_folder, 'jury_censored_conversations_with_outcome_var.csv'))