### Import models

In [1]:
import pandas as pd
from models import CardiffnlpTwitterRobertaBaseSentimentLatest, DistilrobertaFinetunedFinancialNewsSentimentAnalysis

### Instantiate & test models

In [2]:
twitter_roberta = CardiffnlpTwitterRobertaBaseSentimentLatest()
financial_news_roberta = DistilrobertaFinetunedFinancialNewsSentimentAnalysis()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
input_text = "I love the weather today!"
print("Twitter sentiment:", twitter_roberta.evaluate(input_text))
print("Financial news sentiment:", financial_news_roberta.evaluate(input_text))

Twitter sentiment: [{'label': 'positive', 'score': 0.9834305047988892}]
Financial news sentiment: [{'label': 'neutral', 'score': 0.9990911483764648}]


# Load Datasets

In [4]:
from datasets import load_dataset

tweet_eval = load_dataset("cardiffnlp/tweet_eval", "hate", split='test')
financial_phrasebank = load_dataset("takala/financial_phrasebank", "sentences_allagree", split='train', trust_remote_code=True)


In [5]:
print(f'Tweet eval dataset: {tweet_eval}')
print(f'Financial phrasebank dataset: {financial_phrasebank}')

Tweet eval dataset: Dataset({
    features: ['text', 'label'],
    num_rows: 2970
})
Financial phrasebank dataset: Dataset({
    features: ['sentence', 'label'],
    num_rows: 2264
})


# Evaluate models on datasets

In [6]:
display(tweet_eval.features)
display(financial_phrasebank.features)

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['non-hate', 'hate'], id=None)}

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'neutral', 'positive'], id=None)}

In [7]:
tweet_eval_results = {
    'twitter_roberta': twitter_roberta.evaluate_dataset(tweet_eval, feature='text'),
    'financial_news_roberta': financial_news_roberta.evaluate_dataset(tweet_eval, feature='text')
}

In [8]:
display(tweet_eval_results)

{'twitter_roberta': [[{'label': 'negative', 'score': 0.8241130709648132}],
  [{'label': 'negative', 'score': 0.9301699995994568}],
  [{'label': 'negative', 'score': 0.914478063583374}],
  [{'label': 'negative', 'score': 0.9087288975715637}],
  [{'label': 'negative', 'score': 0.904155969619751}],
  [{'label': 'positive', 'score': 0.48943912982940674}],
  [{'label': 'negative', 'score': 0.518504798412323}],
  [{'label': 'negative', 'score': 0.9179902672767639}],
  [{'label': 'negative', 'score': 0.8886039853096008}],
  [{'label': 'negative', 'score': 0.9379819631576538}],
  [{'label': 'negative', 'score': 0.8902706503868103}],
  [{'label': 'negative', 'score': 0.8986079096794128}],
  [{'label': 'neutral', 'score': 0.5105807185173035}],
  [{'label': 'negative', 'score': 0.9617209434509277}],
  [{'label': 'negative', 'score': 0.5023629069328308}],
  [{'label': 'positive', 'score': 0.9225370287895203}],
  [{'label': 'negative', 'score': 0.9287520051002502}],
  [{'label': 'negative', 'score'

In [9]:
financial_phrasebank_results = {
    'twitter_roberta': twitter_roberta.evaluate_dataset(financial_phrasebank, feature='sentence'),
    'financial_news_roberta': financial_news_roberta.evaluate_dataset(financial_phrasebank, feature='sentence')
}

In [10]:
display(financial_phrasebank_results)

{'twitter_roberta': [[{'label': 'neutral', 'score': 0.8878433108329773}],
  [{'label': 'neutral', 'score': 0.7289584875106812}],
  [{'label': 'positive', 'score': 0.4968768060207367}],
  [{'label': 'neutral', 'score': 0.6082512140274048}],
  [{'label': 'neutral', 'score': 0.6216087937355042}],
  [{'label': 'positive', 'score': 0.5012307167053223}],
  [{'label': 'positive', 'score': 0.6087701916694641}],
  [{'label': 'neutral', 'score': 0.6339824199676514}],
  [{'label': 'neutral', 'score': 0.5943388342857361}],
  [{'label': 'positive', 'score': 0.571374237537384}],
  [{'label': 'neutral', 'score': 0.8820133805274963}],
  [{'label': 'neutral', 'score': 0.559410810470581}],
  [{'label': 'positive', 'score': 0.5084101557731628}],
  [{'label': 'positive', 'score': 0.7037743926048279}],
  [{'label': 'neutral', 'score': 0.5853444933891296}],
  [{'label': 'positive', 'score': 0.6692134737968445}],
  [{'label': 'neutral', 'score': 0.557774543762207}],
  [{'label': 'neutral', 'score': 0.5656400

In [11]:
# Map labels from negative, neutral, positive to non-hate, hate
def label_to_unified(label):
    if label == 'neutral' or label == 'positive':
        return 'non-hate'
    elif label == 'negative':
        return 'hate'
    return label

def map_to_hate_labels(dataset):
    for model, results in dataset.items():
        for i, result in enumerate(results):
            # Result is a list of dictionaries, where each dictionary has a 'label' key
            result = result[0]
            dataset[model][i] = label_to_unified(result['label'])
    return dataset

tweet_eval_results = map_to_hate_labels(tweet_eval_results)
financial_phrasebank_results = map_to_hate_labels(financial_phrasebank_results)

display(tweet_eval_results)
display(financial_phrasebank_results)

{'twitter_roberta': ['hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'non-hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'non-hate',
  'hate',
  'hate',
  'non-hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'non-hate',
  'non-hate',
  'hate',
  'non-hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'non-hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'non-hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'non-hate',
  'hate',
  'non-hate',
  'hate',
  'hate',
  'hate',
  'non-hate',
  'non-hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'non-hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'non-hate',
  'hate',
  'non-hate',
  'non-hate',
  'hate',
  'non-hate',
  'non-hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'hate',
  'non-ha

{'twitter_roberta': ['non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
  'non-hate',
 

In [12]:
%pwd

'D:\\UzK\\OpenLLMs-SS24-SentinAL\\notebooks'

In [13]:
from sklearn.metrics import classification_report

# Evaluate models on tweet eval dataset
for model, results in tweet_eval_results.items():
    y_true = [tweet_eval.features['label'].int2str(r) for r in tweet_eval['label']]
    y_pred = results
    print(f"Classification report tweet_eval for {model}")
    print(classification_report(y_true, y_pred))
    
    
# Evaluate models on financial phrasebank dataset
for model, results in financial_phrasebank_results.items():
    y_true = [label_to_unified(financial_phrasebank.features['label'].int2str(r)) for r in financial_phrasebank['label']]
    y_pred = results
    print(f"Classification report financial_phrasebank for {model}")
    print(classification_report(y_true, y_pred))

Classification report tweet_eval for twitter_roberta
              precision    recall  f1-score   support

        hate       0.50      0.93      0.65      1252
    non-hate       0.86      0.32      0.46      1718

    accuracy                           0.58      2970
   macro avg       0.68      0.62      0.56      2970
weighted avg       0.71      0.58      0.54      2970

Classification report tweet_eval for financial_news_roberta
              precision    recall  f1-score   support

        hate       0.34      0.03      0.06      1252
    non-hate       0.57      0.96      0.72      1718

    accuracy                           0.57      2970
   macro avg       0.46      0.49      0.39      2970
weighted avg       0.47      0.57      0.44      2970

Classification report financial_phrasebank for twitter_roberta
              precision    recall  f1-score   support

        hate       0.89      0.34      0.50       303
    non-hate       0.91      0.99      0.95      1961

    ac

In [14]:
import dill

with open('../data/raw/tweet_eval_results.pkl', 'wb') as f:
    dill.dump(tweet_eval_results, f)
    
with open('../data/raw/financial_phrasebank_results.pkl', 'wb') as f:
    dill.dump(financial_phrasebank_results, f)

In [None]:
import pandas as pd

# Data for tweet_eval dataset
tweet_eval_data = {
    "Model": ["twitter_roberta", "twitter_roberta", "twitter_roberta", "twitter_roberta", "twitter_roberta", 
              "financial_news_roberta", "financial_news_roberta", "financial_news_roberta", "financial_news_roberta", "financial_news_roberta"],
    "Class": ["hate", "non-hate", "accuracy", "macro avg", "weighted avg", 
              "hate", "non-hate", "accuracy", "macro avg", "weighted avg"],
    "Precision": [0.50, 0.86, None, 0.68, 0.71, 0.34, 0.57, None, 0.46, 0.47],
    "Recall": [0.93, 0.32, None, 0.62, 0.58, 0.03, 0.96, None, 0.49, 0.57],
    "F1-score": [0.65, 0.46, 0.58, 0.56, 0.54, 0.06, 0.72, 0.57, 0.39, 0.44],
    "Support": [1252, 1718, 2970, 2970, 2970, 1252, 1718, 2970, 2970, 2970]
}

# Data for financial_phrasebank dataset
financial_phrasebank_data = {
    "Model": ["twitter_roberta", "twitter_roberta", "twitter_roberta", "twitter_roberta", "twitter_roberta", 
              "financial_news_roberta", "financial_news_roberta", "financial_news_roberta", "financial_news_roberta", "financial_news_roberta"],
    "Class": ["hate", "non-hate", "accuracy", "macro avg", "weighted avg", 
              "hate", "non-hate", "accuracy", "macro avg", "weighted avg"],
    "Precision": [0.89, 0.91, None, 0.90, 0.90, 1.00, 1.00, None, 1.00, 1.00],
    "Recall": [0.34, 0.99, None, 0.67, 0.91, 0.99, 1.00, None, 0.99, 1.00],
    "F1-score": [0.50, 0.95, 0.91, 0.72, 0.89, 0.99, 1.00, 1.00, 1.00, 1.00],
    "Support": [303, 1961, 2264, 2264, 2264, 303, 1961, 2264, 2264, 2264]
}

# Create DataFrames
tweet_eval_df = pd.DataFrame(tweet_eval_data)
financial_phrasebank_df = pd.DataFrame(financial_phrasebank_data)

# Save the DataFrames as SVG images
tweet_eval_svg = tweet_eval_df.style.set_table_styles([
    {"selector": "table", "props": [("border-collapse", "collapse")]},
    {"selector": "th, td", "props": [("border", "1px solid black"), ("padding", "8px"), ("text-align", "center")]}
]).render()

financial_phrasebank_svg = financial_phrasebank_df.style.set_table_styles([
    {"selector": "table", "props": [("border-collapse", "collapse")]},
    {"selector": "th, td", "props": [("border", "1px solid black"), ("padding", "8px"), ("text-align", "center")]}
]).render()

with open("/mnt/data/tweet_eval.svg", "w") as f:
    f.write(tweet_eval_svg)

with open("/mnt/data/financial_phrasebank.svg", "w") as f:
    f.write(financial_phrasebank_svg)