### Import models

In [1]:
import pandas as pd
from models import CardiffnlpTwitterRobertaBaseSentimentLatest, DistilrobertaFinetunedFinancialNewsSentimentAnalysis

### Instantiate & test models

In [2]:
twitter_roberta = CardiffnlpTwitterRobertaBaseSentimentLatest()
financial_news_roberta = DistilrobertaFinetunedFinancialNewsSentimentAnalysis()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
input_text = "I love the weather today!"
print("Twitter sentiment:", twitter_roberta.evaluate(input_text))
print("Financial news sentiment:", financial_news_roberta.evaluate(input_text))

Twitter sentiment: [{'label': 'positive', 'score': 0.9834305047988892}]
Financial news sentiment: [{'label': 'neutral', 'score': 0.9990911483764648}]


# Load Datasets

In [4]:
from datasets import load_from_disk

tweet_eval = load_from_disk("../data/processed/tweet_eval_homoglyph")
financial_phrasebank = load_from_disk("../data/processed/financial_phrasebank_homoglyph")


In [5]:
print(f'Tweet eval dataset: {tweet_eval}')
print(f'Financial phrasebank dataset: {financial_phrasebank}')

Tweet eval dataset: Dataset({
    features: ['text', 'label'],
    num_rows: 1252
})
Financial phrasebank dataset: Dataset({
    features: ['sentence', 'label'],
    num_rows: 303
})


# Evaluate models on datasets

In [6]:
display(tweet_eval.features)
display(financial_phrasebank.features)

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['non-hate', 'hate'], id=None)}

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'neutral', 'positive'], id=None)}

In [7]:
tweet_eval_results = {
    'twitter_roberta': twitter_roberta.evaluate_dataset(tweet_eval, feature='text'),
    'financial_news_roberta': financial_news_roberta.evaluate_dataset(tweet_eval, feature='text')
}

In [8]:
display(tweet_eval_results)

{'twitter_roberta': [[{'label': 'neutral', 'score': 0.7997302412986755}],
  [{'label': 'neutral', 'score': 0.6987196207046509}],
  [{'label': 'neutral', 'score': 0.8277277946472168}],
  [{'label': 'neutral', 'score': 0.8162309527397156}],
  [{'label': 'neutral', 'score': 0.8549603819847107}],
  [{'label': 'neutral', 'score': 0.813928484916687}],
  [{'label': 'neutral', 'score': 0.7251846194267273}],
  [{'label': 'neutral', 'score': 0.7976133227348328}],
  [{'label': 'neutral', 'score': 0.802094578742981}],
  [{'label': 'neutral', 'score': 0.7974727749824524}],
  [{'label': 'neutral', 'score': 0.8239505887031555}],
  [{'label': 'neutral', 'score': 0.7930537462234497}],
  [{'label': 'neutral', 'score': 0.8318130373954773}],
  [{'label': 'neutral', 'score': 0.8250578045845032}],
  [{'label': 'neutral', 'score': 0.7436833381652832}],
  [{'label': 'neutral', 'score': 0.8257061243057251}],
  [{'label': 'neutral', 'score': 0.8056649565696716}],
  [{'label': 'neutral', 'score': 0.5827032923698

In [9]:
financial_phrasebank_results = {
    'twitter_roberta': twitter_roberta.evaluate_dataset(financial_phrasebank, feature='sentence'),
    'financial_news_roberta': financial_news_roberta.evaluate_dataset(financial_phrasebank, feature='sentence')
}

In [10]:
display(financial_phrasebank_results)

{'twitter_roberta': [[{'label': 'neutral', 'score': 0.8576387763023376}],
  [{'label': 'neutral', 'score': 0.782501757144928}],
  [{'label': 'neutral', 'score': 0.7774349451065063}],
  [{'label': 'neutral', 'score': 0.8919249176979065}],
  [{'label': 'neutral', 'score': 0.919111430644989}],
  [{'label': 'neutral', 'score': 0.8051819801330566}],
  [{'label': 'neutral', 'score': 0.8023940920829773}],
  [{'label': 'neutral', 'score': 0.8336440324783325}],
  [{'label': 'neutral', 'score': 0.7885507941246033}],
  [{'label': 'neutral', 'score': 0.8087793588638306}],
  [{'label': 'neutral', 'score': 0.8387629389762878}],
  [{'label': 'neutral', 'score': 0.8315011858940125}],
  [{'label': 'neutral', 'score': 0.8861855268478394}],
  [{'label': 'neutral', 'score': 0.8856730461120605}],
  [{'label': 'neutral', 'score': 0.8353820443153381}],
  [{'label': 'neutral', 'score': 0.8219995498657227}],
  [{'label': 'neutral', 'score': 0.8201838135719299}],
  [{'label': 'neutral', 'score': 0.8530777692794

In [11]:
# Map labels from negative, neutral, positive to non-hate, hate
def label_to_unified(label):
    if label == 'neutral' or label == 'positive':
        return 'non-hate'
    elif label == 'negative':
        return 'hate'
    return label

def map_to_hate_labels(dataset):
    for model, results in dataset.items():
        for i, result in enumerate(results):
            # Result is a list of dictionaries, where each dictionary has a 'label' key
            result = result[0]
            dataset[model][i] = label_to_unified(result['label'])
    return dataset

tweet_eval_results = map_to_hate_labels(tweet_eval_results)
financial_phrasebank_results = map_to_hate_labels(financial_phrasebank_results)

display(tweet_eval_results)
display(financial_phrasebank_results)

{'twitter_roberta': ['non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
 

{'twitter_roberta': ['non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
 

In [12]:
%pwd

'D:\\UzK\\OpenLLMs-SS24-SentinAL\\notebooks'

In [13]:
from sklearn.metrics import classification_report

# Evaluate models on tweet eval dataset
for model, results in tweet_eval_results.items():
    y_true = [tweet_eval.features['label'].int2str(r) for r in tweet_eval['label']]
    y_pred = results
    print(f"Classification report tweet_eval for {model}")
    print(classification_report(y_true, y_pred))
    
    
# Evaluate models on financial phrasebank dataset
for model, results in financial_phrasebank_results.items():
    y_true = [label_to_unified(financial_phrasebank.features['label'].int2str(r)) for r in financial_phrasebank['label']]
    y_pred = results
    print(f"Classification report financial_phrasebank for {model}")
    print(classification_report(y_true, y_pred))

Classification report tweet_eval for twitter_roberta
              precision    recall  f1-score   support

        hate       1.00      0.00      0.00      1252
    non-hate       0.00      0.00      0.00         0

    accuracy                           0.00      1252
   macro avg       0.50      0.00      0.00      1252
weighted avg       1.00      0.00      0.00      1252

Classification report tweet_eval for financial_news_roberta
              precision    recall  f1-score   support

        hate       0.00      0.00      0.00    1252.0
    non-hate       0.00      0.00      0.00       0.0

    accuracy                           0.00    1252.0
   macro avg       0.00      0.00      0.00    1252.0
weighted avg       0.00      0.00      0.00    1252.0

Classification report financial_phrasebank for twitter_roberta
              precision    recall  f1-score   support

        hate       0.00      0.00      0.00     303.0
    non-hate       0.00      0.00      0.00       0.0

    ac

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [14]:
import dill

with open('../data/raw/tweet_homoglyph_eval_results.pkl', 'wb') as f:
    dill.dump(tweet_eval_results, f)
    
with open('../data/raw/financial_homoglyph_phrasebank_results.pkl', 'wb') as f:
    dill.dump(financial_phrasebank_results, f)