In [28]:
import nltk
from nltk.tokenize import sent_tokenize
import torch
from torch.nn import functional as F
import numpy as np
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
import json

# Fetch NLTK's 'punkt' package (presumably the data sent_tokenize needs further
# down — confirm for the installed NLTK version). The call reports the package
# as already up-to-date when it has been downloaded before.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/m/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Single source of truth for the checkpoint id used by both the tokenizer and
# the classifier, so the two can never drift apart.
emoroberta_checkpoint = "arpanghoshal/EmoRoBERTa"

tokenizer = RobertaTokenizerFast.from_pretrained(emoroberta_checkpoint)
# from_tf=True: the classifier weights are initialised from the checkpoint's
# TensorFlow 2.0 weights.
model = RobertaForSequenceClassification.from_pretrained(
    emoroberta_checkpoint,
    from_tf=True,
)

2023-02-20 13:41:28.150726: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


In [9]:
def format_float(num):
    """Render a float as a plain positional decimal string.

    Scientific notation is never used, and trailing zeros (together with a
    dangling decimal point) are trimmed.

    Args:
        num: The floating point value to format.

    Returns:
        The positional string representation of ``num``.
    """
    rendered = np.format_float_positional(num, trim="-")
    return rendered

In [38]:
# Read the source text; the context manager guarantees the handle is closed.
with open("../data/mask_16.txt", encoding="utf-8") as infile:
    raw_sentences_text = infile.read()
sentences = sent_tokenize(raw_sentences_text)

scores = []
top_n = 5

# The id -> label mapping does not change between sentences, so build it once.
# NOTE: this cell mirrors the paragraph-level cell further down; the shared
# logic is a good candidate for a single helper function.
labels = [model.config.id2label[i] for i in range(model.config.num_labels)]

# detect emotion in sentences
with torch.no_grad():
    for sentence in sentences:
        # sentence string to tokens; truncation keeps over-long inputs within
        # the tokenizer's configured maximum length instead of crashing the
        # model (assumes the tokenizer defines model_max_length)
        tokens = tokenizer(sentence, return_tensors="pt", truncation=True)
        # predict emotions
        logits = model(**tokens).logits
        # logits to probabilities (batch of one -> take the first row)
        probabilities = F.softmax(logits, dim=-1)[0].numpy()
        # rank on the numeric values (not on their string rendering), most
        # probable first; never ask for more entries than there are labels
        keep = min(top_n, len(probabilities))
        top_n_indexes = np.argsort(probabilities)[::-1][:keep]

        score = {"sentence": sentence,
                 # formatted as strings only at output time, for readability
                 "probabilities": [format_float(probabilities[i]) for i in top_n_indexes],
                 "labels": [labels[i] for i in top_n_indexes]}
        scores.append(score)

with open('../data/mask16_sentences_emotions.json', 'w') as outfile:
    json.dump(scores, outfile, indent=2)

In [47]:
# Read the source text; the context manager guarantees the handle is closed.
with open("../data/mask_16_paragraphs.txt", encoding="utf-8") as infile:
    raw_paragraphs_text = infile.read()
# Paragraphs are separated by a blank line; drop whitespace-only fragments
# (e.g. from extra blank lines) so that empty text is never scored.
paragraphs = [p for p in raw_paragraphs_text.split('\n\n') if p.strip()]

scores = []
top_n = 10

# The id -> label mapping does not change between paragraphs, so build it once.
# NOTE: this cell mirrors the sentence-level cell above; the shared logic is a
# good candidate for a single helper function.
labels = [model.config.id2label[i] for i in range(model.config.num_labels)]

# detect emotion in paragraphs
with torch.no_grad():
    for paragraph in paragraphs:
        # paragraph string to tokens; paragraphs can exceed the model's input
        # limit, so truncate to the tokenizer's configured maximum length
        # instead of crashing (assumes the tokenizer defines model_max_length)
        tokens = tokenizer(paragraph, return_tensors="pt", truncation=True)
        # predict emotions
        logits = model(**tokens).logits
        # logits to probabilities (batch of one -> take the first row)
        probabilities = F.softmax(logits, dim=-1)[0].numpy()
        # rank on the numeric values (not on their string rendering), most
        # probable first; never ask for more entries than there are labels
        keep = min(top_n, len(probabilities))
        top_n_indexes = np.argsort(probabilities)[::-1][:keep]

        # the "sentence" key is kept as-is so existing consumers of the JSON
        # file keep working, even though the value is a paragraph
        score = {"sentence": paragraph,
                 # formatted as strings only at output time, for readability
                 "probabilities": [format_float(probabilities[i]) for i in top_n_indexes],
                 "labels": [labels[i] for i in top_n_indexes]}
        scores.append(score)

with open('../data/mask16_paragraphs_emotions.json', 'w') as outfile:
    json.dump(scores, outfile, indent=2)