In [56]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import DataCollatorWithPadding, AutoTokenizer
import config

In [57]:
# 1. Load the dataset from the Hugging Face Hub.
# The dataset ships three splits. Note that the validation split is keyed 'valid'
# (not 'validation'), as the printed split list below shows.
print("Loading dataset 'TheFinAI/fiqa-sentiment-classification' from Hugging Face Hub...")
ds = load_dataset("TheFinAI/fiqa-sentiment-classification")


# Print the split names actually present so later cells index them correctly.
print(f"Dataset loaded. Splits: {list(ds.keys())}")

Loading dataset 'TheFinAI/fiqa-sentiment-classification' from Hugging Face Hub...
Dataset loaded. Splits: ['train', 'test', 'valid']


In [58]:
# Build a two-way lookup between the string 'type' field and an integer ID.
# IDs follow the order in which the train split reports its unique types.
unique_types = ds['train'].unique('type')
id2type = dict(enumerate(unique_types))
type2id = {name: idx for idx, name in id2type.items()}
print(f"Document types found and mapped: {type2id}")

Document types found and mapped: {'headline': 0, 'post': 1}


In [59]:
# 2. Define the preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of examples for aspect-based sentiment regression.

    Each example is encoded as an (aspect, sentence) text pair so that the
    tokenizer itself inserts the model-specific special tokens between the two
    segments. Splicing ``tokenizer.sep_token`` into the text by hand would
    produce the literal string "None" for a tokenizer without a separator token
    and ignores whatever pair layout the checkpoint expects.

    Relies on the notebook-level ``tokenizer`` and ``type2id`` objects.

    Args:
        examples: A batch (dict of column name -> list) providing the 'aspect',
            'sentence', 'score' and 'type' columns.

    Returns:
        The tokenizer output with two extra keys: 'labels' (the raw scores, used
        as regression targets) and 'type_ids' (integer document-type IDs).
    """
    # str() mirrors the coercion that f-string formatting applied previously.
    aspects = [str(aspect) for aspect in examples['aspect']]
    sentences = [str(sentence) for sentence in examples['sentence']]

    # Tokenize as text pairs. Padding is handled later by the data collator.
    tokenized_inputs = tokenizer(
        aspects,
        sentences,
        max_length=256,
        truncation=True
    )

    # For regression, the 'labels' are the float scores from the dataset.
    tokenized_inputs["labels"] = examples['score']

    # Add the type_id to our processed data
    tokenized_inputs["type_ids"] = [type2id[t] for t in examples['type']]

    return tokenized_inputs



In [60]:
# Instantiate the tokenizer for the checkpoint named in config.MODEL_NAME.
# preprocess_function reads this module-level `tokenizer`, so this cell must run
# before the dataset is mapped.
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)

In [61]:
# 3. Tokenize every split and drop the raw columns.
print("Tokenizing and formatting dataset...")
# All original columns are removed, including 'score': preprocess_function has
# already copied it into 'labels', so no rename step is needed.
columns_to_remove = ['_id', 'sentence', 'target', 'aspect', 'type', 'score']
tokenized_datasets = ds.map(preprocess_function, batched=True, remove_columns=columns_to_remove)

# Expose only the columns the training/evaluation code reads, as torch tensors.
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels", "type_ids"])

# 4. Dynamic padding: each batch is padded to its own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 5. One DataLoader per split; only the training loader shuffles.
loader_kwargs = dict(batch_size=16, collate_fn=data_collator)
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
valid_dataloader = DataLoader(tokenized_datasets["valid"], **loader_kwargs)
test_dataloader = DataLoader(tokenized_datasets["test"], **loader_kwargs)

print("DataLoaders created successfully.")



Tokenizing and formatting dataset...


Map: 100%|██████████| 822/822 [00:00<00:00, 10804.10 examples/s]

DataLoaders created successfully.





In [62]:
# Sanity check: display the first processed training example (input_ids,
# attention_mask, labels and type_ids, as selected by set_format).
tokenized_datasets["train"][0]

{'input_ids': tensor([  101, 15978,   120,   138,  8661, 21506,  1880,   102,  1787, 11508,
          3931,  5554,   139,  1616,  3842,  1383,  1106,  2585,  1205,   102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor(-0.3740),
 'type_ids': tensor(0)}

In [63]:
# Save the type mapping on the config module so evaluate() can resolve type names
# through config.ID2TYPE. This only mutates the imported module in memory, so the
# cell must be re-run after a kernel restart and before evaluate() is called.
config.ID2TYPE = id2type

In [64]:
# Pull a single batch from the training loader and show the shape of each tensor.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([16, 57]),
 'attention_mask': torch.Size([16, 57]),
 'labels': torch.Size([16]),
 'type_ids': torch.Size([16])}

In [65]:
from transformers import AutoModelForSequenceClassification

# num_labels=1 requests a single-output head for score regression. This
# module-level `model` is the object that run() trains and evaluates.
model = AutoModelForSequenceClassification.from_pretrained(config.MODEL_NAME, num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
# Smoke test: run one forward pass on the sample batch to check the loss and logits.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The model must live on the same device as its inputs. The freshly loaded model
# has not been moved yet, so without this the forward pass fails whenever CUDA is available.
model.to(device)

model_inputs = {
    'input_ids': batch['input_ids'].to(device),
    'attention_mask': batch['attention_mask'].to(device)
}
# Give the labels a trailing dimension so they line up with the [batch_size, 1] logits.
labels = batch["labels"].to(device).unsqueeze(1)

# Pass model_inputs using **kwargs and labels separately
outputs = model(**model_inputs, labels=labels)

  return forward_call(*args, **kwargs)


In [67]:
# Inspect the smoke-test result: the scalar loss and the shape of the logits.
print(outputs.loss, outputs.logits.shape)

tensor(0.1856, grad_fn=<MseLossBackward0>) torch.Size([16, 1])


In [68]:
from tqdm.auto import tqdm
from sklearn.metrics import classification_report, mean_squared_error, mean_absolute_error
from collections import defaultdict
import pandas as pd


In [69]:
def train_epoch(model, data_loader, optimizer, device, scheduler=None):
    """Train the model for one epoch and return the mean per-batch loss.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss``. It is expected to already be on ``device``.
        data_loader: Iterable of batches holding 'input_ids', 'attention_mask'
            and 'labels' tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Optional learning-rate scheduler, stepped once per batch after
            the optimizer. Skipped when ``None``.

    Returns:
        float: Mean of the per-batch losses, or NaN if the loader yielded no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    progress_bar = tqdm(data_loader, desc="Training", leave=False)

    for batch in progress_bar:
        optimizer.zero_grad()  # prevents gradient accumulation across batches

        # Only forward the tensors the model accepts; metadata such as type_ids stays out.
        model_inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device)
        }
        labels = batch["labels"].to(device).unsqueeze(1)

        outputs = model(**model_inputs, labels=labels)
        loss = outputs.loss

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        # Read the scalar once; each .item() call forces a host sync.
        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1
        progress_bar.set_postfix({'MSE_loss': batch_loss})

    # Avoid ZeroDivisionError on an empty loader.
    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches

In [70]:
def score_to_label_eval(score, positive_threshold=None, negative_threshold=None):
    """Convert a sentiment score into a discrete class label.

    Args:
        score: Predicted or true sentiment score.
        positive_threshold: Scores strictly above this are Positive. Defaults to
            ``config.POSITIVE_THRESHOLD`` when omitted.
        negative_threshold: Scores strictly below this are Negative. Defaults to
            ``config.NEGATIVE_THRESHOLD`` when omitted.

    Returns:
        int: 2 (Positive), 0 (Negative) or 1 (Neutral). A score equal to either
        threshold maps to Neutral, as does NaN, which fails both comparisons.
    """
    # Only consult config for thresholds the caller did not supply.
    if positive_threshold is None:
        positive_threshold = config.POSITIVE_THRESHOLD
    if negative_threshold is None:
        negative_threshold = config.NEGATIVE_THRESHOLD

    if score > positive_threshold:
        return 2  # Positive
    if score < negative_threshold:
        return 0  # Negative
    return 1  # Neutral

In [71]:
def evaluate(model, data_loader, device):
    """Evaluate the regression model with overall and per-type metrics.

    Predicted and true scores are compared directly (MSE, MAE) and are also
    bucketed into Negative/Neutral/Positive labels via ``score_to_label_eval``
    to produce classification reports, both overall and per document type.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)`` whose
            output exposes ``.logits`` with a trailing dimension of size 1.
        data_loader: Iterable of batches holding 'input_ids', 'attention_mask',
            'labels' and 'type_ids' tensors.
        device: Device the model inputs are moved to.

    Returns:
        dict with keys 'overall_mse', 'overall_mae', 'overall_report' (a
        classification-report dict) and 'per_type_reports' (type name ->
        classification-report dict, for types listed in ``config.ID2TYPE`` that
        occur in the data).
    """
    # The label set is passed explicitly: without `labels`, classification_report
    # raises ValueError when a class is missing from both y_true and y_pred,
    # because the inferred classes no longer match the three target names.
    class_ids = [0, 1, 2]
    class_names = ["Negative", "Neutral", "Positive"]

    def build_report(y_true, y_pred):
        """Return a three-class classification report as a dict."""
        return classification_report(
            y_true, y_pred,
            labels=class_ids,
            target_names=class_names, output_dict=True, zero_division=0
        )

    model.eval()

    all_predicted_scores = []
    all_true_scores = []
    all_type_ids = []  # Document type IDs, kept for the stratified analysis

    with torch.no_grad():
        progress_bar = tqdm(data_loader, desc="Evaluating", leave=False)
        for batch in progress_bar:
            # As in training, only forward the tensors the model accepts.
            model_inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device)
            }
            outputs = model(**model_inputs)

            # [batch_size, 1] -> [batch_size]
            predicted_scores = outputs.logits.squeeze(-1)

            # .tolist() yields plain Python numbers, so type IDs are ordinary ints
            # that match the integer keys of config.ID2TYPE.
            all_predicted_scores.extend(predicted_scores.cpu().tolist())
            all_true_scores.extend(batch["labels"].cpu().tolist())
            all_type_ids.extend(batch["type_ids"].cpu().tolist())

    # --- 1. Overall Metrics ---
    overall_mse = mean_squared_error(all_true_scores, all_predicted_scores)
    overall_mae = mean_absolute_error(all_true_scores, all_predicted_scores)

    # Post-hoc classification: bucket both predictions and targets with the same thresholds.
    predicted_labels = [score_to_label_eval(s) for s in all_predicted_scores]
    true_labels = [score_to_label_eval(s) for s in all_true_scores]

    overall_class_report = build_report(true_labels, predicted_labels)

    # --- 2. Per-Type Metrics ---
    per_type_reports = {}
    preds_by_type = defaultdict(list)
    labels_by_type = defaultdict(list)

    # Group predictions and labels by their document type
    for type_id, pred_label, true_label in zip(all_type_ids, predicted_labels, true_labels):
        preds_by_type[type_id].append(pred_label)
        labels_by_type[type_id].append(true_label)

    # One report per document type that is present in this split
    for type_id, type_name in config.ID2TYPE.items():
        if type_id in preds_by_type:
            per_type_reports[type_name] = build_report(
                labels_by_type[type_id], preds_by_type[type_id]
            )

    # --- 3. Return a comprehensive dictionary of results ---
    return {
        "overall_mse": overall_mse,
        "overall_mae": overall_mae,
        "overall_report": overall_class_report,
        "per_type_reports": per_type_reports
    }

In [72]:
import random
import logging
import numpy as np

def set_seed(seed_value=42):
    """Seed the Python, NumPy and PyTorch random generators for repeatable runs."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)
    # Additionally seed every CUDA device when a GPU is present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

def get_logger(name):
    """Apply the notebook's logging configuration and return the logger called `name`."""
    log_settings = {
        "level": logging.INFO,
        "format": '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        "datefmt": '%Y-%m-%d %H:%M:%S',
    }
    logging.basicConfig(**log_settings)
    named_logger = logging.getLogger(name)
    return named_logger

In [73]:
from transformers import get_linear_schedule_with_warmup
import copy



In [74]:
# Initialize the module-level logger that run() writes its progress and results to.
logger = get_logger(__name__)

In [75]:
def _log_test_results(test_results):
    """Log the overall and per-document-type metrics from an evaluate() result dict."""
    logger.info("--- Overall Test Results ---")
    logger.info(f"Regression MSE: {test_results['overall_mse']:.4f}")
    logger.info(f"Regression MAE: {test_results['overall_mae']:.4f}")

    overall_df = pd.DataFrame(test_results['overall_report']).transpose()
    logger.info("Overall Classification Report:\n" + overall_df.to_string())

    logger.info("\n--- Per-Type Test Results ---")
    for type_name, report in test_results['per_type_reports'].items():
        logger.info(f"\n--- Report for Document Type: '{type_name}' ---")
        type_df = pd.DataFrame(report).transpose()
        logger.info("\n" + type_df.to_string())


def run():
    """Orchestrate training, model selection and final test evaluation.

    Trains the notebook-level ``model`` for ``config.NUM_EPOCHS`` epochs on
    ``train_dataloader``, evaluates on ``valid_dataloader`` after every epoch,
    keeps the weights with the highest validation weighted F1, then reloads
    them and reports metrics on ``test_dataloader``.

    Notes:
        * This works on the module-level ``model`` in place, so calling run()
          again continues from the already fine-tuned weights rather than from
          the pretrained checkpoint.
        * The seed is set here, after the model and data loaders were created
          in earlier cells, so it does not cover their initialization.
    """
    # For reproducibility
    set_seed(config.RANDOM_SEED)

    logger.info("--- Starting Financial Sentiment Analysis Pipeline ---")
    logger.info(f"Configuration: Model={config.MODEL_NAME}, Task={config.TASK_TYPE}, Device={config.DEVICE}")

    # Model and batches must be on the same device.
    model.to(config.DEVICE)

    # --- 3. Setup Optimizer and Scheduler ---
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE)
    total_steps = len(train_dataloader) * config.NUM_EPOCHS
    # Linear decay over the whole run, no warmup.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )
    # Routed through the logger (not print) so it stays in order with the other messages.
    logger.info(f"Number of training steps: {total_steps}")

    # --- 4. Training Loop ---
    best_val_f1 = 0
    best_model_state = None

    logger.info(f"--- Starting Training for {config.NUM_EPOCHS} Epochs ---")
    for epoch in range(config.NUM_EPOCHS):
        logger.info(f"Epoch {epoch + 1}/{config.NUM_EPOCHS}")

        avg_train_loss = train_epoch(model, train_dataloader, optimizer, config.DEVICE, scheduler)
        logger.info(f"Average Training MSE Loss: {avg_train_loss:.4f}")

        # Evaluate on the validation set
        val_results = evaluate(model, valid_dataloader, config.DEVICE)
        val_f1 = val_results['overall_report']['weighted avg']['f1-score']
        val_mse = val_results['overall_mse']

        logger.info(f"Validation MSE: {val_mse:.4f} | Validation Weighted F1: {val_f1:.4f}")

        # Keep an in-memory snapshot of the best weights by validation F1.
        # deepcopy is required because state_dict() returns references to live tensors.
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_model_state = copy.deepcopy(model.state_dict())
            logger.info(f"New best model found with F1: {best_val_f1:.4f} and saved.")

    # --- 5. Final Evaluation on Test Set ---
    logger.info("--- Training Finished. Starting Final Evaluation on Test Set ---")
    # Restore the best snapshot; if no epoch beat an F1 of 0, fall back to the last epoch.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    else:
        logger.warning("No best model was saved. Evaluating the model from the last epoch.")

    test_results = evaluate(model, test_dataloader, config.DEVICE)

    # Display results in a clean format
    _log_test_results(test_results)

    logger.info("--- Pipeline Finished ---")


In [76]:
# Execute the full pipeline: fine-tune, select the best epoch by validation F1,
# then report metrics on the test split.
run()

2025-07-30 22:55:44 - __main__ - INFO - --- Starting Financial Sentiment Analysis Pipeline ---
2025-07-30 22:55:44 - __main__ - INFO - Configuration: Model=bert-base-cased, Task=absa, Device=cpu
2025-07-30 22:55:44 - __main__ - INFO - --- Starting Training for 4 Epochs ---
2025-07-30 22:55:44 - __main__ - INFO - Epoch 1/4


Number of training steps: 208


2025-07-30 22:57:36 - __main__ - INFO - Average Training MSE Loss: 0.1683 
2025-07-30 22:57:40 - __main__ - INFO - Validation MSE: 0.1756 | Validation Weighted F1: 0.5325
2025-07-30 22:57:40 - __main__ - INFO - New best model found with F1: 0.5325 and saved.
2025-07-30 22:57:40 - __main__ - INFO - Epoch 2/4
  return forward_call(*args, **kwargs)
2025-07-30 22:59:32 - __main__ - INFO - Average Training MSE Loss: 0.0977 
2025-07-30 22:59:36 - __main__ - INFO - Validation MSE: 0.0792 | Validation Weighted F1: 0.7416
2025-07-30 22:59:36 - __main__ - INFO - New best model found with F1: 0.7416 and saved.
2025-07-30 22:59:36 - __main__ - INFO - Epoch 3/4
  return forward_call(*args, **kwargs)
2025-07-30 23:01:35 - __main__ - INFO - Average Training MSE Loss: 0.0553 
2025-07-30 23:01:39 - __main__ - INFO - Validation MSE: 0.0836 | Validation Weighted F1: 0.7174
2025-07-30 23:01:39 - __main__ - INFO - Epoch 4/4
  return forward_call(*args, **kwargs)
2025-07-30 23:03:43 - __main__ - INFO - Aver