In [26]:
# Capture a wall-clock start timestamp before the heavier library imports run,
# so anything measured against start_time includes their load time.
# NOTE(review): start_time is not read anywhere in this section — presumably a
# later cell reports the elapsed time; confirm it is still needed.
import time
start_time = time.time()

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

In [27]:
# Load the ground-truth file and label its three columns in a single expression:
# 'id' (source identifier), 'text' (the sentence) and 'gt' (the reference label).
gtdf = pd.read_csv('./sentiment-analysis/gt.csv').set_axis(['id', 'text', 'gt'], axis=1)

# Report the frame's dimensions, then preview the first rows as the cell output.
print(gtdf.shape)
gtdf.head()

(1500, 3)


Unnamed: 0,id,text,gt
0,6,But sadly this is not working.,-1
1,78,"So, everything builds fine, but when we try to...",-1
2,90,That is what is causing your null pointer exce...,-1
3,139,"All attempts I've made were, in a shortcut, un...",-1
4,162,Don't use.,-1


In [28]:
# Score every ground-truth sentence with the pretrained sentiment classifier.
#
# Produces `list_of_sentences_with_sentiment`: one dict per row of `gtdf`, in
# row order, holding the sentence under 'sentence' plus one probability
# (rounded to 4 decimals) per model label, inserted from most to least likely.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
BATCH_SIZE = 32  # sentences per forward pass; bounds peak memory

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()  # inference only: make sure dropout is disabled

# The tokenizer for this checkpoint declares no maximum length, so
# `truncation=True` on its own truncates nothing. Derive the limit from the
# model config instead; RoBERTa reserves two position ids for its padding offset.
max_length = config.max_position_embeddings - 2

list_of_sentences = gtdf['text'].tolist()

# Run the model in small batches with autograd switched off, instead of pushing
# the whole dataset through a single gradient-tracking forward pass.
logit_batches = []
with torch.no_grad():
    for start in range(0, len(list_of_sentences), BATCH_SIZE):
        batch = list_of_sentences[start:start + BATCH_SIZE]
        encoded = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        logit_batches.append(model(**encoded)[0])

if logit_batches:
    logits = torch.cat(logit_batches)
else:
    # No sentences at all: keep the (n_sentences, n_labels) shape.
    logits = torch.empty((0, config.num_labels))

# Kept so that any later cell indexing `output[0]` still finds the logits.
output = (logits,)

probabilities = softmax(logits.numpy(), axis=1)

# Exactly one record per input row. Records are NOT keyed by sentence text:
# repeated sentences must stay separate so that position k in this list always
# corresponds to row k of gtdf.
list_of_sentences_with_sentiment = []
for sentence, scores in zip(list_of_sentences, probabilities):
    record = {'sentence': sentence}
    for label_id in np.argsort(scores)[::-1]:  # highest probability first
        label = config.id2label[int(label_id)]
        record[label] = np.round(float(scores[label_id]), 4)
    list_of_sentences_with_sentiment.append(record)
      


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [29]:
# Analyze results: pick the predicted class for every scored sentence and pair
# it with the ground-truth label taken from the matching gtdf row.
#
# Produces `sentences_with_gt` (list of dicts) and `df` with the columns
# sentence / <label percentages> / sentiment / gt.
SENTIMENT_BY_LABEL = {'positive': 1, 'neutral': 0, 'negative': -1}

# With one scored record per gtdf row, the ground truth is aligned by position.
# If the scored list is shorter (repeated sentences were merged upstream),
# positions no longer line up, so fall back to looking the label up by sentence
# text, using the first gtdf row that contains that text.
scored_rows_match_gtdf = len(list_of_sentences_with_sentiment) == len(gtdf)
gt_by_text = gtdf.drop_duplicates(subset='text').set_index('text')['gt']

sentences_with_gt = []

for position, scored in enumerate(list_of_sentences_with_sentiment):
    # Decide the winner on the raw probabilities; comparing the truncated
    # integer percentages would turn near-ties into artificial draws.
    raw_scores = {label: scored[label] for label in SENTIMENT_BY_LABEL}
    top_score = max(raw_scores.values())
    top_labels = [label for label, value in raw_scores.items() if value == top_score]

    # Work on a copy so re-running this cell never rescales the stored scores.
    result = dict(scored)
    for label, value in raw_scores.items():
        result[label] = int(value * 100)  # whole-number percentage for display

    # An exact tie between top classes is reported as neutral.
    result['sentiment'] = SENTIMENT_BY_LABEL[top_labels[0]] if len(top_labels) == 1 else 0

    if scored_rows_match_gtdf:
        result['gt'] = gtdf['gt'].iloc[position]
    else:
        result['gt'] = gt_by_text[scored['sentence']]

    sentences_with_gt.append(result)

# convert to dataframe
df = pd.DataFrame(sentences_with_gt)
df

Unnamed: 0,sentence,negative,neutral,positive,sentiment,gt
0,But sadly this is not working.,84,14,1,-1,-1
1,"So, everything builds fine, but when we try to...",50,46,2,-1,-1
2,That is what is causing your null pointer exce...,46,51,2,0,-1
3,"All attempts I've made were, in a shortcut, un...",78,19,1,-1,-1
4,Don't use.,44,51,4,0,-1
...,...,...,...,...,...,...
1495,Very good example of steady pooling readHere.,0,6,92,1,1
1496,Now we're getting to the good part.,0,6,93,1,1
1497,So far i've done this for Twitter and it works...,0,5,94,1,1
1498,I solved the earlier problem.,3,58,38,0,1


In [30]:
# Macro-averaged precision, recall and F1 of the predicted sentiment against
# the ground-truth labels; the fourth returned value (support) is not used.
from sklearn.metrics import precision_recall_fscore_support

precision, recall, f1, _ = precision_recall_fscore_support(
    df['gt'],
    df['sentiment'],
    average='macro',
)

# Report each metric with two decimals.
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1: {f1:.2f}')

Precision: 0.68
Recall: 0.69
F1: 0.68
