In [7]:
# Mount Google Drive to access and save files persistently across sessions
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Disable Weights & Biases (wandb) to allow training without requiring an API key or logging
import os
os.environ["WANDB_DISABLED"] = "true"

In [3]:
# Load the pre-processed datasets required for model training and evaluation
import pandas as pd

train_df = pd.read_csv('/content/drive/MyDrive/bert-sentiment-analysis/data/processed/train_processed.csv')
val_df = pd.read_csv('/content/drive/MyDrive/bert-sentiment-analysis/data/processed/val_processed.csv')
test_df = pd.read_csv('/content/drive/MyDrive/bert-sentiment-analysis/data/processed/test_processed.csv')

In [4]:
# Initialize the pretrained BERT tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
'''
Tokenizes a list of texts using the pre-defined tokenizer.

Args:
    texts (list or pd.Series): List or Series of sentences to tokenize.
    max_length (int): Maximum sequence length after padding/truncation.

    Returns:
    dict: Dictionary of tokenized outputs as PyTorch tensors.
'''
def tokenize_texts(texts, max_length=128):
    return tokenizer(
        list(texts),
        padding='max_length', # pad shorter texts to max_length
        truncation=True, # truncate longer texts
        max_length=max_length,
        return_tensors='pt' # return PyTorch tensors
    )
# Tokenize the cleaned sentences from each dataset split for model input
train_encodings = tokenize_texts(train_df['cleaned_sentence'])
val_encodings = tokenize_texts(val_df['cleaned_sentence'])
test_encodings = tokenize_texts(test_df['cleaned_sentence'])

In [6]:
'''
Convert label columns from each dataset split into PyTorch tensors
For compatibility with model training and loss calculation
'''
import torch

train_labels = torch.tensor(train_df['label'].values)
val_labels = torch.tensor(val_df['label'].values)
test_labels = torch.tensor(test_df['label'].values)

In [None]:
from torch.utils.data import Dataset
# Custom dataset class for sentiment analysis data
class SentimentDataset(Dataset):
    """
    PyTorch Dataset to handle input encodings and labels for sentiment analysis.
    Provides data in a format compatible with DataLoader for batching during training.
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

     # Retrieve encodings and label for a single example at index idx
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        # Total number of examples in the dataset
        return len(self.labels)

# Instantiate Dataset objects for training and validation splits
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)


In [None]:
from transformers import BertForSequenceClassification

# Load the pretrained BERT base model with a classification head for 3 sentiment classes: negative, neutral, positive)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Set up the optimizer and learning rate scheduler for training

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

batch_size = 16
num_epochs = 3

# AdamW optimizer is commonly used with transformer models for weight decay regularization
optimizer = AdamW(model.parameters(), lr=5e-5)

# Calculate total number of training steps (batches * epochs)
num_training_steps = len(train_dataset) // batch_size * num_epochs

# Linear learning rate scheduler with optional warmup steps (none here)
# Gradually decreases the learning rate from the initial value to zero over training
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [None]:
# Upgrade transformer to latest version
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m107.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.53.1
    Uninstalling transformers-4.53.1:
      Successfully uninstalled transformers-4.53.1
Successfully installed transformers-4.53.2


In [None]:
# Confirm the version of transformer
import transformers
print(transformers.__version__) # Print transformers library version
print(transformers.__file__) # Print path to the transformers module

4.53.2
/usr/local/lib/python3.11/dist-packages/transformers/__init__.py


In [None]:
from transformers import TrainingArguments

# Define training configuration and hyperparameters for the Trainer API
args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    eval_strategy='epoch'
)

print(args)

TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False

In [None]:
import transformers

# Print the installed transformers library version and file path
print(transformers.__version__)
print(transformers.__file__)

from transformers import TrainingArguments

# Print the module where TrainingArguments is defined and its file path (if available)
print(TrainingArguments.__module__)
print(TrainingArguments.__file__ if hasattr(TrainingArguments, '__file__') else 'No __file__ attribute')

4.53.2
/usr/local/lib/python3.11/dist-packages/transformers/__init__.py
transformers.training_args
No __file__ attribute


In [None]:
import inspect
from transformers import TrainingArguments

# Print the signature of the TrainingArguments constructor
print(inspect.signature(TrainingArguments.__init__))



In [None]:
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define training arguments for the Hugging Face Trainer API
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/bert-financial-sentiment-classifier/results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # training batch size per device
    per_device_eval_batch_size=16,   # evaluation batch size
    eval_strategy='epoch',           # evaluate at the end of each epoch
    save_strategy='epoch',           # save checkpoint every epoch
    learning_rate=5e-5,              # learning rate
    weight_decay=0.01,               # weight decay
    logging_dir='/content/drive/MyDrive/bert-financial-sentiment-classifier/logs',            # logging directory
    logging_steps=50,                # log every 50 steps
    load_best_model_at_end=True,     # load best model at end of training
    metric_for_best_model='accuracy' # metric to use for best model selection
)

# Define metric computation function
def compute_metrics(eval_pred):
    """
    Compute accuracy, precision, recall, and F1-score for model evaluation.

    Args:
        eval_pred (tuple): Tuple containing logits (model outputs) and true labels.

    Returns:
        dict: Dictionary with 'accuracy', 'f1', 'precision', and 'recall' scores.
    """

    logits, labels = eval_pred
    # Convert logits to predicted class indices
    predictions = np.argmax(logits, axis=-1)

    # Calculate precision, recall, f1-score with weighted average (handles class imbalance)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

    # Calculate overall accuracy
    acc = accuracy_score(labels, predictions)

    # Return all metrics in a dictionary format expected by Trainer
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Initialize the Hugging Face Trainer with model, datasets, tokenizer, and evaluation metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Start training process
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5215,0.541435,0.755705,0.768427,0.813518,0.755705
2,0.2841,0.595131,0.766443,0.775856,0.792335,0.766443


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5215,0.541435,0.755705,0.768427,0.813518,0.755705
2,0.2841,0.595131,0.766443,0.775856,0.792335,0.766443
3,0.213,0.645472,0.758389,0.762035,0.76705,0.758389


TrainOutput(global_step=792, training_loss=0.3971877760357327, metrics={'train_runtime': 19541.3894, 'train_samples_per_second': 0.648, 'train_steps_per_second': 0.041, 'total_flos': 832753967109120.0, 'train_loss': 0.3971877760357327, 'epoch': 3.0})

In [None]:
# Save the fine-tuned model and tokenizer to the specified directory
# This allows loading the trained model/tokenizer later for inference or further training
model.save_pretrained('/content/drive/MyDrive/bert-financial-sentiment-classifier/model')
tokenizer.save_pretrained('/content/drive/MyDrive/bert-financial-sentiment-classifier/model')

('/content/drive/MyDrive/bert-financial-sentiment-classifier/model/tokenizer_config.json',
 '/content/drive/MyDrive/bert-financial-sentiment-classifier/model/special_tokens_map.json',
 '/content/drive/MyDrive/bert-financial-sentiment-classifier/model/vocab.txt',
 '/content/drive/MyDrive/bert-financial-sentiment-classifier/model/added_tokens.json')

In [None]:
import os

model_dir = '/content/drive/MyDrive/bert-financial-sentiment-classifier/model'

# Check if the saved model directory exists and list its contents
print("Exists:", os.path.exists(model_dir))
print("Contents:", os.listdir(model_dir) if os.path.exists(model_dir) else "Folder not found")

Exists: True
Contents: ['config.json', 'model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'vocab.txt']


In [None]:
import torch
import pandas as pd

# Load processed test data CSV
test_df = pd.read_csv('/content/drive/MyDrive/bert-financial-sentiment-classifier/data/processed/test_processed.csv')


# Tokenize test texts (make sure tokenizer is initialized)
test_encodings = tokenizer(
    list(test_df['cleaned_sentence']),
    padding='max_length', # Pad all sequences to max_length
    truncation=True,      # Truncate sequences longer than max_length
    max_length=128,
    return_tensors='pt'   # Return PyTorch tensors for model input
)

# Convert test labels to PyTorch tensor for compatibility with the model
test_labels = torch.tensor(test_df['label'].values)

# Define dataset class to wrap encodings and labels for PyTorch DataLoader compatibility
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item
    def __len__(self):
        return len(self.labels)

# Instantiate the test dataset
test_dataset = SentimentDataset(test_encodings, test_labels)

# Evaluate the fine-tuned model on the test dataset using the Hugging Face Trainer
metrics = trainer.evaluate(test_dataset)

# Print the evaluation metrics
print(metrics)


KeyError: 'cleaned_sentence'

In [None]:
import pandas as pd

# Load the processed test dataset from CSV
test_df = pd.read_csv('/content/drive/MyDrive/bert-financial-sentiment-classifier/data/processed/test_processed.csv')

# Display the column names and first few rows to verify data structure and content
print("Columns:", test_df.columns)
print(test_df.head())


Columns: Index(['Sentence', 'Sentiment', 'label'], dtype='object')
                                            Sentence Sentiment  label
0  The number of bodily injury cases quadrupled i...  negative      0
1  Net sales decreased to EUR 91.6 mn from EUR 10...   neutral      1
2   $aapl high of day just hit. Back at it tomorrow.  positive      2
3  According to CEO Kai Telanne , the company 's ...  positive      2
4  Finland 's dominating rail company VR is plann...   neutral      1


In [None]:
print(test_df.head())

                                            Sentence Sentiment  label  \
0  The number of bodily injury cases quadrupled i...  negative      0   
1  Net sales decreased to EUR 91.6 mn from EUR 10...   neutral      1   
2   $aapl high of day just hit. Back at it tomorrow.  positive      2   
3  According to CEO Kai Telanne , the company 's ...  positive      2   
4  Finland 's dominating rail company VR is plann...   neutral      1   

                                    cleaned_sentence  
0    The number of bodily injury cases quadrupled in  
1  Net sales decreased to EUR  mn from EUR mn in ...  
2           high of day just hit Back at it tomorrow  
3  According to CEO Kai Telanne  the company s ne...  
4  Finland s dominating rail company VR is planni...  


In [None]:
import re

def clean_text(text):
    """
    Clean input text by:
    - Removing URLs (http, https, www)
    - Removing stock ticker symbols starting with '$'
    - Removing non-alphabetic characters (keeping spaces)
    - Stripping leading/trailing whitespace
    """
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\$\w*', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.strip()

# Check if 'cleaned_sentence' column exists
if 'cleaned_sentence' not in test_df.columns:
    test_df['cleaned_sentence'] = test_df['Sentence'].apply(clean_text)

In [None]:
'''
Tokenize the cleaned test sentences using the pretrained tokenizer
	- Pads all sequences to max_length (128)
	- Truncates sequences longer than max_length
	- Returns PyTorch tensors for input to the model
'''
test_encodings = tokenizer(
    list(test_df['cleaned_sentence']),
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

In [None]:
import torch

# Convert test labels to PyTorch tensor for compatibility with the model during evaluation
test_labels = torch.tensor(test_df['label'].values)

In [None]:
from torch.utils.data import Dataset

class SentimentDataset(Dataset):

    """
    Custom PyTorch Dataset for handling tokenized inputs and labels for sentiment analysis.
    Enables easy batching and iteration over the dataset during training or evaluation.
    """

    def __init__(self, encodings, labels):
        """
        Args:
            encodings (dict): Tokenized inputs (e.g., input_ids, attention_mask) as tensors.
            labels (torch.Tensor): Corresponding labels tensor.
        """
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        """
        Retrieve a single data point with its tokenized inputs and label.

        Args:
            idx (int): Index of the data point to retrieve.

        Returns:
            dict: Dictionary containing input tensors and label for the given index.
        """
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item
    def __len__(self):
        """
        Returns the total number of samples in the dataset.
        """
        return len(self.labels)

In [None]:
# Create the test dataset object using the tokenized inputs and label tensors
test_dataset = SentimentDataset(test_encodings, test_labels)

In [None]:
# Evaluate the fine-tuned model on the test dataset using the Hugging Face Trainer
metrics = trainer.evaluate(test_dataset)

# Print evaluation metrics such as accuracy, precision, recall, and F1-score
print(metrics)

{'eval_loss': 0.5291472673416138, 'eval_accuracy': 0.8027366020524516, 'eval_f1': 0.8093768490883736, 'eval_precision': 0.823593416797985, 'eval_recall': 0.8027366020524516, 'eval_runtime': 386.8626, 'eval_samples_per_second': 2.267, 'eval_steps_per_second': 0.142, 'epoch': 3.0}
