In [1]:
!pip install transformers torch pandas openpyxl

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

## Loading the model and tokenizer

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Check if GPU is available and set device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Load model and tokenizer
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move model to GPU if available
model.to(device)

# Define labels for interpretation
labels = ["negative", "neutral", "positive"]

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

## Loading in the excel file

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import pandas as pd

# Load the Excel file
#file_path = "/content/drive/MyDrive/Colab-Notebooks/Thesis/Kopi af final_SPX500_data.xlsx"  # Change to your file path

# bubs
file_path = '/content/drive/My Drive/Masters Thesis/Colab notebook/final_SPX500_data.xlsx'
df = pd.read_excel(file_path)

if "X_Post" not in df.columns:
    raise ValueError("Column 'X_Post' not found in the Excel file.")

df["X_Post"] = df["X_Post"].astype(str)


## Process the Tweets and Get Sentiment

In [11]:
def get_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        scores = outputs.logits
        probabilities = F.softmax(scores, dim=-1)  # Normalize logits (if needed)
        predicted_class = torch.argmax(probabilities).item()
    return labels[predicted_class]

# Apply sentiment analysis to all posts
df["sentiment"] = df["X_Post"].apply(get_sentiment)

## Exporting to excel

In [None]:
# Save the DataFrame with sentiment labels back to an Excel file
output_path = "/content/drive/MyDrive/Colab-Notebooks/Thesis/x_posts_with_sentiment_finbert.xlsx"
df.to_excel(output_path, index=False)

print("Sentiment analysis completed and saved to:", output_path)

Sentiment analysis completed and saved to: /content/drive/MyDrive/Colab-Notebooks/Thesis/x_posts_with_sentiment_finbert.xlsx


# Testing on manually labelled data

In [29]:
# Load labeled dataset
file_path = '/content/drive/My Drive/Masters Thesis/Colab notebook/Sentiment analysis/labeled_sentiment.xlsx'
df = pd.read_excel(file_path)

In [23]:
df_test = df[df['Manual Sentiment'].notna()]

In [26]:
df_test.columns

Index(['Author_Handle', 'Date', 'X_Post', 'Reply_Count', 'Repost_Count',
       'Like_Count', 'View_Count', 'Follower_Count', 'Verified_Status',
       'Manual Sentiment'],
      dtype='object')

In [32]:
# Run predictions
df_test["predicted_sentiment"] = df_test["X_Post"].apply(get_sentiment)

# Convert to numeric for metric calculation
label_to_int = {label: idx for idx, label in enumerate(labels)}
y_true = df_test["Manual Sentiment"].map(label_to_int)
y_pred = df_test["predicted_sentiment"].map(label_to_int)

from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score
)

# Calculate metrics
accuracy = accuracy_score(y_true, y_pred)
balanced_acc = balanced_accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average="weighted")
mcc = matthews_corrcoef(y_true, y_pred)
precision = precision_score(y_true, y_pred, average="weighted")
recall = recall_score(y_true, y_pred, average="weighted")

# Print results
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Balanced Accuracy: {balanced_acc:.4f}")
print(f"F1 Score (weighted): {f1:.4f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted): {recall:.4f}")

Evaluation Metrics:
Accuracy: 0.1604
Balanced Accuracy: 0.1574
F1 Score (weighted): 0.1069
Matthews Correlation Coefficient (MCC): -0.3123
Precision (weighted): 0.0802
Recall (weighted): 0.1604
