# Sentiment Analysis with FinBERT

Install necesssary libraries

In [None]:
!pip install --upgrade accelerate datasets



In [48]:
!pip install transformers
!pip install datasets



In [None]:
pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.52.3-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalling transformers-4.51.3:
      Successfully uninstalled transformers-4.51.3
Successfully installed transformers-4.52.3


In [1]:
# Pytorch Deep Learning
import torch

# Pandas + Numpy
import numpy as np
import pandas as pd

# Sklearn metrics
from sklearn.metrics import balanced_accuracy_score, accuracy_score

# Hugging Face Transformer Libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer, TrainingArguments

# Hugging Face
from datasets import Dataset


In [2]:
if torch.cuda.is_available():
  print("CUDA available. GPU will be used for computation.")
  device = 0
else:
  print("CUDA not available. CPU will be used for computation.")
  device = -1

CUDA available. GPU will be used for computation.


## Load sentiment dataset

This dataset consists of financial tweets labeled with sentiments: bullish (1), bearish (2), and neutral (0). It includes 17,368 bullish, 8,542 bearish, and 12,181 neutral tweets, sourced from various reputable financial datasets. The data is preprocessed for consistency and quality, making it ideal for fine-tuning machine learning models to predict sentiment trends in financial markets and stock discussions.

In [3]:
df = pd.read_parquet("hf://datasets/TimKoornstra/financial-tweets-sentiment/data/train-00000-of-00001.parquet")
df['label_name'] = df['sentiment'].map({0: 'Neutral', 1: 'Positive', 2: 'Negative'})
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,tweet,sentiment,url,label_name
0,$BYND - JPMorgan reels in expectations on Beyo...,2,https://huggingface.co/datasets/zeroshot/twitt...,Negative
1,$CCL $RCL - Nomura points to bookings weakness...,2,https://huggingface.co/datasets/zeroshot/twitt...,Negative
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",2,https://huggingface.co/datasets/zeroshot/twitt...,Negative
3,$ESS: BTIG Research cuts to Neutral https://t....,2,https://huggingface.co/datasets/zeroshot/twitt...,Negative
4,$FNKO - Funko slides after Piper Jaffray PT cu...,2,https://huggingface.co/datasets/zeroshot/twitt...,Negative


## Looking at the distirbution of the dataset

In [None]:
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Calculate sentiment counts and percentages
label_count = df['label_name'].value_counts()
label_distribution = df['label_name'].value_counts(normalize=True)

data = pd.DataFrame({
    'Sentiment Label': label_count.index,
    'Count': label_count.values,
    'Percentage': label_distribution.values * 100
})

# Create subplots: one xy for bar chart, one domain for pie chart
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Sentiment Labels Distribution", "Sentiment Distribution (%)"),
    specs=[[{"type": "xy"}, {"type": "domain"}]]
)

# Add horizontal bar chart with count labels
fig.add_trace(go.Bar(
    x=data['Count'],
    y=data['Sentiment Label'],
    orientation='h',
    marker_color='gray',
    text=data['Count'],
    textposition='auto'
), row=1, col=1)

# Update axis labels for the horizontal bar chart
fig.update_xaxes(title_text="Number of Tweets", row=1, col=1)
fig.update_yaxes(title_text="Sentiment", row=1, col=1)

# Define a minimal, neutral color palette (using shades of gray)
pie_colors = ['#808080', '#A9A9A9', '#C0C0C0'][:len(data)]

# Add pie chart with both label and percent shown
fig.add_trace(go.Pie(
    labels=data['Sentiment Label'],
    values=data['Percentage'],
    marker=dict(colors=pie_colors),
    textinfo='label+percent'
), row=1, col=2)

fig.update_layout(title="Sentiment Analysis Overview", showlegend=False)
fig.show()




The dataset is imbalanced. The positive class is represented much more than the neutral or negative classes as it makes up around half of all datapoints.
There is a large disparity between positive and negative sentiment counts as positive sentiment is almost twice as frequent.
The finBERT model may be biased towards predicting positive sentiment more frequently, since it's the majority class, and struggle to effectively identify negative sentiment as it’s underrepresented.


## Using the Transformer Pipeline

The transformer pipeline for NLP streamlines tasks by Auto Tokenizing text, perofrm model inference like text analysis or generation, and provide straightforward results.

In [9]:
# Load sentiment analysis pipeline with FinBERT model

# Model name from Model Hub
model_name = 'yiyanghkust/finbert-tone'

# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [None]:
model.config

BertConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Neutral",
    "1": "Positive",
    "2": "Negative"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Negative": 2,
    "Neutral": 0,
    "Positive": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30873
}

In [None]:
id_2_label = model.config.id2label
id_2_label

{0: 'Neutral', 1: 'Positive', 2: 'Negative'}

In [10]:
sentiment_pipeline = pipeline(task="sentiment-analysis",
                              model=model,
                              tokenizer = tokenizer,
                              device=device,
                              padding=True,  # Automatically pad sequences to the max length
                              truncation=True,  # Automatically truncate sequences that exceed the max length
                              max_length=512)  # Ensure sequences are capped at 512 tokens



Device set to use cuda:0


In [None]:
# Test on a positive sentence
sentence1 = "The market outlook is very positive thanks to the new economic policies"
sentence2 = "The market outlook is very negative thanks to the new economic policies"
sentence3 = "The market outlook is neutral thanks to the new economic policies"

print(sentiment_pipeline(sentence1))
print(sentiment_pipeline(sentence2))
print(sentiment_pipeline(sentence3))

[{'label': 'Positive', 'score': 1.0}]
[{'label': 'Negative', 'score': 0.9999997615814209}]
[{'label': 'Negative', 'score': 0.9709043502807617}]


Here we can already see that the model is not perfect. it does not seem to be very good at predicting neutral

### Make predictions on entire dataset

In [None]:
preds = sentiment_pipeline(df['tweet'].tolist())

In [None]:
preds[0:20]

[{'label': 'Neutral', 'score': 0.9998947381973267},
 {'label': 'Negative', 'score': 0.9999996423721313},
 {'label': 'Negative', 'score': 0.9999918937683105},
 {'label': 'Neutral', 'score': 0.9985872507095337},
 {'label': 'Neutral', 'score': 0.9999823570251465},
 {'label': 'Neutral', 'score': 0.920272946357727},
 {'label': 'Neutral', 'score': 0.9910681843757629},
 {'label': 'Neutral', 'score': 0.9992935657501221},
 {'label': 'Negative', 'score': 0.9579724073410034},
 {'label': 'Negative', 'score': 0.5939654111862183},
 {'label': 'Neutral', 'score': 0.9825224876403809},
 {'label': 'Neutral', 'score': 0.9999781847000122},
 {'label': 'Neutral', 'score': 0.9987220168113708},
 {'label': 'Neutral', 'score': 0.9997292160987854},
 {'label': 'Neutral', 'score': 0.9996645450592041},
 {'label': 'Positive', 'score': 0.6515815854072571},
 {'label': 'Negative', 'score': 0.9835765957832336},
 {'label': 'Neutral', 'score': 0.9943749904632568},
 {'label': 'Neutral', 'score': 0.9999785423278809},
 {'labe

In [None]:
# Extract prediction name from label key
df['prediction'] = [pred['label'] for pred in preds]

In [None]:
df.groupby(['label_name', 'prediction']).size()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
label_name,prediction,Unnamed: 2_level_1
Negative,Negative,2244
Negative,Neutral,5784
Negative,Positive,514
Neutral,Negative,609
Neutral,Neutral,10575
Neutral,Positive,997
Positive,Negative,460
Positive,Neutral,12095
Positive,Positive,4813


In [None]:
import plotly.express as px

# Pivot to wide format
conf_matrix = df.groupby(['label_name', 'prediction']).size().unstack().fillna(0)

# Plot heatmap
fig = px.imshow(
    conf_matrix,
    text_auto=True,
    color_continuous_scale='Blues',
    labels=dict(x="Predicted Label", y="True Label", color="Count"),
    x=conf_matrix.columns,
    y=conf_matrix.index,
    title="Confusion Matrix"
)
fig.update_layout(xaxis_side="top")
fig.show()

In [None]:
import plotly.express as px

# Prepare data
grouped = df.groupby(['label_name', 'prediction']).size().reset_index(name='count')

# Plot grouped bar chart
fig = px.bar(
    grouped,
    x='label_name',
    y='count',
    color='prediction',
    barmode='group',
    title='Prediction Distribution per True Label',
    labels={'label_name': 'True Label', 'count': 'Number of Tweets', 'prediction': 'Predicted'}
)
fig.show()



In [None]:
import plotly.express as px
import pandas as pd

# Group the data
grouped = df.groupby(['label_name', 'prediction']).size().reset_index(name='count')

# Add correctness column
grouped['correct'] = grouped['label_name'].str.lower() == grouped['prediction'].str.lower()

# Map patterns and legend labels
def get_pattern_and_label(row):
    if row['correct']:
        return '', 'Correct'
    pred = row['prediction'].lower()
    if pred == 'neutral':
        return '/', 'Misclassified as Neutral'
    elif pred == 'positive':
        return '.', 'Misclassified as Positive'
    elif pred == 'negative':
        return 'x', 'Misclassified as Negative'
    else:
        return 'x', f'Misclassified as {row["prediction"]}'

# Apply to get both pattern and label
grouped[['pattern', 'legend_label']] = grouped.apply(
    get_pattern_and_label, axis=1, result_type='expand'
)

# Plot
fig = px.bar(
    grouped,
    x='label_name',
    y='count',
    pattern_shape='legend_label',  # legend will use this
    pattern_shape_sequence=['', '/', '.', 'x'],
    color='legend_label',          # ensures consistent mapping
    color_discrete_sequence=['lightblue'] * 10,
    hover_data=['prediction', 'count', 'correct'],
    title='Prediction Distribution per True Label',
    labels={'label_name': 'True Label', 'count': 'Number of Tweets', 'legend_label': 'Prediction Type'}
)

# Final layout cleanup
fig.update_layout(
    legend_title_text='Prediction Type',
    legend=dict(traceorder="normal")
)

fig.show()

In [None]:
from sklearn.metrics import classification_report

# Map string labels to numeric
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
id2label = {v: k.capitalize() for k, v in label_mapping.items()}

y_true = df['label_name'].str.lower().map(label_mapping)
y_pred = df['prediction'].str.lower().map(label_mapping)

# Get per-class scores using classification_report
report = classification_report(y_true, y_pred, target_names=[id2label[i] for i in range(3)], output_dict=True, zero_division=0)

# Convert to DataFrame
report_df = pd.DataFrame(report).transpose()

# Round and clean
report_df = report_df[['precision', 'recall', 'f1-score', 'support']]
report_df = report_df.round(4)

# Optional: Rename index
report_df.index.name = 'Label'
report_df.reset_index(inplace=True)

# Show the table
print(report_df.to_string(index=False))


       Label  precision  recall  f1-score    support
    Negative     0.6773  0.2627    0.3786  8542.0000
     Neutral     0.3717  0.8682    0.5205 12181.0000
    Positive     0.7611  0.2771    0.4063 17368.0000
    accuracy     0.4629  0.4629    0.4629     0.4629
   macro avg     0.6034  0.4693    0.4351 38091.0000
weighted avg     0.6178  0.4629    0.4366 38091.0000


The classification report indicates that the model performs unevenly across sentiment classes. It demonstrates a strong bias toward predicting the Neutral class, achieving high recall (86.8%) but low precision (37.2%), suggesting that many instances are labeled Neutral even when they are not. While the Positive class shows high precision (76.1%), its recall is very low (27.7%), meaning the model correctly predicts Positive only when it's very confident, but misses most actual Positive instances. Similarly, the Negative class has decent precision (67.7%) but poor recall (26.3%). Overall accuracy is 46.3%, and both macro and weighted F1-scores are low (around 0.44), reflecting limited general effectiveness across all classes. These results suggest the model struggles with class imbalance and may benefit from improved sampling strategies or class weighting to better capture underrepresented sentiments.

In [None]:
balanced_accuracy_score(df['label_name'], df['prediction'])

0.46932536881614

The accruacy score does beat random as random would be 33.33%. Using BERT an accuracy score of 0.47% is achieved. Let's finetune the model to make it better.

## Finetuning

For finetuning we need indices of 0, 1, 2 for labels insted of names.

In [4]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,17368
0,12181
2,8542


In [5]:
# Split into train/val/tests for later comparison
train_end_point = int(df.shape[0]*0.6) # 60% train, 20% rest
val_end_point = int(df.shape[0]*0.8)

df_train = df.iloc[:train_end_point,:]
df_val = df.iloc[train_end_point:val_end_point,:]
df_test = df.iloc[val_end_point:,:]

print(df_train.shape, df_val.shape, df_test.shape)

(22854, 4) (7618, 4) (7619, 4)


In [11]:
# Test accuracy before fine-tuning
preds = sentiment_pipeline(df_test['tweet'].tolist())
df_test['prediction'] = [pred['label'] for pred in preds]
balanced_accuracy_score(df_test['label_name'], df_test['prediction'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['prediction'] = [pred['label'] for pred in preds]


np.float64(0.4239523852704969)

In [13]:
from sklearn.metrics import classification_report

# Map string labels to numeric
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
id2label = {v: k.capitalize() for k, v in label_mapping.items()}

y_true = df_test['label_name'].str.lower().map(label_mapping)
y_pred = df_test['prediction'].str.lower().map(label_mapping)

# Get per-class scores using classification_report
report = classification_report(y_true, y_pred, target_names=[id2label[i] for i in range(3)], output_dict=True, zero_division=0)

# Convert to DataFrame
report_df = pd.DataFrame(report).transpose()

# Round and clean
report_df = report_df[['precision', 'recall', 'f1-score', 'support']]
report_df = report_df.round(4)

# Optional: Rename index
report_df.index.name = 'Label'
report_df.reset_index(inplace=True)

# Show the table
print(report_df.to_string(index=False))

       Label  precision  recall  f1-score   support
    Negative     0.7532  0.1702    0.2777 1398.0000
     Neutral     0.1524  0.9455    0.2625 1027.0000
    Positive     0.8702  0.1561    0.2648 5194.0000
    accuracy     0.2651  0.2651    0.2651    0.2651
   macro avg     0.5919  0.4240    0.2683 7619.0000
weighted avg     0.7520  0.2651    0.2668 7619.0000


In [None]:
import plotly.express as px

# Pivot to wide format
conf_matrix = df.groupby(['label_name', 'prediction']).size().unstack().fillna(0)

# Plot heatmap
fig = px.imshow(
    conf_matrix,
    text_auto=True,
    color_continuous_scale='Blues',
    labels=dict(x="Predicted Label", y="True Label", color="Count"),
    x=conf_matrix.columns,
    y=conf_matrix.index,
    title="Confusion Matrix"
)
fig.update_layout(xaxis_side="top")
fig.show()

Convert to huggingface datasets for prepration for fine-tuning

In [None]:
# Converting pandas df into hugging face dataset objects:
dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)
dataset_test = Dataset.from_pandas(df_test)

# Tokenizing the datasets:
dataset_train = dataset_train.map(lambda e: tokenizer(e['tweet'], truncation=True, padding = 'max_length', max_length = 512), batched=True)
dataset_val = dataset_val.map(lambda e: tokenizer(e['tweet'], truncation=True, padding = 'max_length', max_length = 512), batched=True)
dataset_test = dataset_test.map(lambda e: tokenizer(e['tweet'], truncation=True, padding = 'max_length', max_length = 512), batched=True)

# Shuffle the training dataset
dataset_train_shuffled = dataset_train.shuffle(seed=42)

Map:   0%|          | 0/22854 [00:00<?, ? examples/s]

Map:   0%|          | 0/7618 [00:00<?, ? examples/s]

Map:   0%|          | 0/7619 [00:00<?, ? examples/s]

In [None]:
dataset_train_shuffled = dataset_train_shuffled.rename_column("sentiment", "labels")
dataset_val = dataset_val.rename_column("sentiment", "labels")
dataset_test = dataset_test.rename_column("sentiment", "labels")

In [None]:
dataset_train_shuffled.column_names

['tweet',
 'labels',
 'url',
 'label_name',
 'input_ids',
 'token_type_ids',
 'attention_mask']

Define trainer to fine tune model

In [None]:
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
 )

def compute_metrics(eval_pred):
    prediction, labels = eval_pred
    prediction = np.argmax(prediction, axis=-1)
    return {"balanced_accuracy": balanced_accuracy_score(labels, prediction), "accuracy":accuracy_score(prediction,labels)}

args = TrainingArguments(
    output_dir='temp/',
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='steps', # logs every x steps
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model = 'balanced_accuracy',
    push_to_hub=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset_train_shuffled,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Balanced Accuracy,Accuracy
1,0.6785,0.851435,0.607808,0.637044
2,0.4361,0.793749,0.663847,0.674193
3,0.2111,0.986941,0.674397,0.667367


TrainOutput(global_step=2145, training_loss=0.4013297483359739, metrics={'train_runtime': 6757.8742, 'train_samples_per_second': 10.145, 'train_steps_per_second': 0.317, 'total_flos': 1.8039582146267136e+16, 'train_loss': 0.4013297483359739, 'epoch': 3.0})

In [None]:
predictions = trainer.predict(dataset_test)
predictions

Save locally

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Save the model and tokenizer after training
model_save_path = '/content/drive/My Drive/Masters Thesis/Colab notebook/Sentiment analysis/ '
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

trainer.save_model(model_save_path)
trainer.state.save_to_json(f"{model_save_path}/trainer_state.json")


In [None]:
# Save the model and tokenizer after training
model_save_path = '/content/drive/My Drive/Masters Thesis/Colab notebook/Sentiment analysis/ '
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/content/drive/My Drive/Masters Thesis/Colab notebook/Sentiment analysis/ /tokenizer_config.json',
 '/content/drive/My Drive/Masters Thesis/Colab notebook/Sentiment analysis/ /special_tokens_map.json',
 '/content/drive/My Drive/Masters Thesis/Colab notebook/Sentiment analysis/ /vocab.txt',
 '/content/drive/My Drive/Masters Thesis/Colab notebook/Sentiment analysis/ /added_tokens.json',
 '/content/drive/My Drive/Masters Thesis/Colab notebook/Sentiment analysis/ /tokenizer.json')

In [None]:
# Load the model and tokenizer from Google Drive when needed
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
tokenizer = AutoTokenizer.from_pretrained(model_save_path)


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/content/drive/My Drive/Masters Thesis/Colab notebook/Sentiment analysis/FinBERT Model'. Use `repo_type` argument if needed.

Load trained model into the pipeline

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer


In [4]:
model_save_path = '/content/drive/My Drive/Masters Thesis/Colab notebook/Sentiment analysis/ '

# Load the model and tokenizer from Google Drive when needed
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_save_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_save_path, local_files_only=True)

In [9]:
trained_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer,device=device)

Device set to use cuda:0


Predict and evaluate accuracy

In [13]:
preds=trained_pipeline(df_test['tweet'].tolist())
df_test['prediction']=[pred['label'] for pred in preds]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['prediction']=[pred['label'] for pred in preds]


In [14]:
from sklearn.metrics import classification_report

# Map string labels to numeric
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
id2label = {v: k.capitalize() for k, v in label_mapping.items()}

y_true = df_test['label_name'].str.lower().map(label_mapping)
y_pred = df_test['prediction'].str.lower().map(label_mapping)

# Get per-class scores using classification_report
report = classification_report(y_true, y_pred, target_names=[id2label[i] for i in range(3)], output_dict=True, zero_division=0)

# Convert to DataFrame
report_df = pd.DataFrame(report).transpose()

# Round and clean
report_df = report_df[['precision', 'recall', 'f1-score', 'support']]
report_df = report_df.round(4)

# Optional: Rename index
report_df.index.name = 'Label'
report_df.reset_index(inplace=True)

# Show the table
print(report_df.to_string(index=False))

       Label  precision  recall  f1-score   support
    Negative     0.3078  0.6266    0.4128 1398.0000
     Neutral     0.2225  0.6504    0.3316 1027.0000
    Positive     0.8283  0.2824    0.4212 5194.0000
    accuracy     0.3952  0.3952    0.3952    0.3952
   macro avg     0.4529  0.5198    0.3886 7619.0000
weighted avg     0.6512  0.3952    0.4076 7619.0000


In [None]:
# Calculate the balanced accuracy score
score = balanced_accuracy_score(df_test['label_name'], df_test['prediction'])
print(f"Balanced Accuracy Score: {score}")

Balanced Accuracy Score: 0.5198296299612496


In [None]:
# Calculate the balanced accuracy score
score = accuracy_score(df_test['label_name'], df_test['prediction'])
print(f"Accuracy Score: {score}")


Accuracy Score: 0.39519621997637483


# Load in scraped dataset

In [None]:
df_scraped = pd.read_excel( '/content/drive/My Drive/Masters Thesis/Colab notebook/final_SPX500_data.xlsx')

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Define model path (where it was saved previously)
model_save_path = '/content/drive/My Drive/Masters Thesis/Colab notebook/Sentiment analysis/FinBERT Model'

# Load the trained model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
tokenizer = AutoTokenizer.from_pretrained(model_save_path)

classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)


Device set to use cuda:0


In [None]:
df_scraped.columns

Index(['Author_Handle', 'Date', 'X_Post', 'Reply_Count', 'Repost_Count',
       'Like_Count', 'View_Count', 'Follower_Count', 'Verified_Status'],
      dtype='object')

In [None]:
# Convert text column to a list
X_Posts = df_scraped['X_Post'].tolist()

# Make predictions
predictions = classifier(X_Posts)

# Convert predictions to DataFrame format
df_scraped['Prediction'] = [pred['label'] for pred in predictions]
df_scraped['Confidence'] = [pred['score'] for pred in predictions]


In [None]:
df_scraped.to_excel("/content/drive/My Drive/Masters Thesis/Colab notebook/SPX500_final.xlsx", index=False)
