In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the preprocessed sentiment table into a DataFrame.
# NOTE(review): the file is named *.xls but is parsed as CSV — confirm the
# extension is just a misnomer and consider renaming the file.
data = pd.read_csv(filepath_or_buffer="processed_financial_data.xls")

# Print the column index, then preview the first five rows.
print(data.columns)
data.head(5)


Index(['label', 'sentence', 'processed_text', 'sentiment_score',
       'predicted_label'],
      dtype='object')


Unnamed: 0,label,sentence,processed_text,sentiment_score,predicted_label
0,neutral,"According to Gran , the company has no plans t...",according gran company plan move production ru...,0.0593,positive
1,neutral,Technopolis plans to develop in stages an area...,technopolis plan develop stage area less 100 0...,-0.055556,negative
2,negative,The international electronic industry company ...,international electronic industry company elco...,-0.053333,negative
3,positive,With the new production plant the company woul...,new production plant company would increase ca...,0.595251,positive
4,positive,According to the company 's updated strategy f...,according company updated strategy year 2009 2...,0.539287,positive


In [2]:
# Integer class ids used as training targets for each sentiment label.
label_map = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

# Map the string labels to ids; labels missing from `label_map` become NaN.
label_ids = data["label"].map(label_map)

# Report, rather than silently discard, rows whose label is not in `label_map`.
n_unmapped = int(label_ids.isna().sum())
if n_unmapped:
    print(f"Dropping {n_unmapped} rows with labels outside label_map")

# Build the final frame in one chained expression so the cell never assigns
# into the frame returned by `dropna` and can be re-run safely:
# add the id column -> drop unmapped rows -> cast the ids to int.
data = (
    data
    .assign(label_id=label_ids)
    .dropna(subset=["label_id"])
    .astype({"label_id": int})
)

In [3]:
# Hold out 15% of the rows for validation, stratified on label_id, with a
# fixed seed so the split is repeatable.
all_sentences = data["sentence"].astype(str).tolist()
all_label_ids = data["label_id"].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_sentences,
    all_label_ids,
    stratify=data["label_id"],
    test_size=0.15,
    random_state=42,
)

print("Train size:", len(train_texts))
print("Validation size:", len(val_texts))


Train size: 4119
Validation size: 727


In [4]:

import numpy as np
import torch
from datasets import Dataset
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments

class FinBERTFineTuner:
    """Fine-tune a pretrained sequence-classification checkpoint with the HF Trainer.

    The constructor loads the tokenizer and model named by ``model_name``,
    ``prepare_dataset`` tokenizes raw texts into a ``datasets.Dataset``, and
    ``fine_tune`` trains on one split while evaluating on another.
    """

    def __init__(self, model_name="ProsusAI/finbert"):
        """Load the tokenizer and classification model for ``model_name``.

        Parameters:
        - model_name: identifier passed to ``from_pretrained`` for both the
          tokenizer and the model.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        # Informational only: nothing in this class moves the model to this device.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def prepare_dataset(self, texts, labels, max_length=128):
        """
        Tokenize texts and pair them with their labels for training/evaluation.

        Parameters:
        - texts: sequence of raw strings
        - labels: sequence of class ids, one per text
        - max_length: every example is padded/truncated to this many tokens

        Returns:
        - a tokenized ``datasets.Dataset`` holding the tokenizer outputs and
          the 'label' column (the raw 'text' column is removed)

        Raises:
        - ValueError: if ``texts`` and ``labels`` differ in length
        """
        # Fail early with a readable message instead of a low-level error
        # from the dataset constructor.
        if len(texts) != len(labels):
            raise ValueError(
                "texts and labels must have the same length "
                f"(got {len(texts)} texts and {len(labels)} labels)"
            )

        def tokenize_function(examples):
            return self.tokenizer(
                examples['text'],
                padding='max_length',
                truncation=True,
                max_length=max_length
            )

        dataset = Dataset.from_dict({
            'text': texts,
            'label': labels
        })

        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=['text']
        )

        return tokenized_dataset

    @staticmethod
    def _compute_metrics(eval_preds):
        """Turn (logits, labels) into accuracy and weighted precision/recall/F1."""
        predictions, labels = eval_preds
        # Predicted class = index of the largest score for each example.
        predictions = np.argmax(predictions, axis=1)
        accuracy = (predictions == labels).mean()
        precision = precision_score(labels, predictions, average='weighted', zero_division=0)
        recall = recall_score(labels, predictions, average='weighted', zero_division=0)
        f1 = f1_score(labels, predictions, average='weighted', zero_division=0)
        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

    def fine_tune(self, train_texts, train_labels, val_texts, val_labels,
                  num_epochs=3, batch_size=16, learning_rate=2e-5,
                  warmup_steps=500, output_dir='./finbert_finetuned',
                  max_length=128):
        """
        Fine-tune the loaded model on the given data.

        Parameters:
        - train_texts / train_labels: training examples and their class ids
        - val_texts / val_labels: examples evaluated at the end of every epoch
        - num_epochs: 3 is typical (more tends to overfit, fewer to underfit)
        - batch_size: per-device batch size for both training and evaluation
          (16 for a Colab GPU, 8 if memory limited)
        - learning_rate: 2e-5 is standard for fine-tuning
        - warmup_steps: learning-rate warm-up steps. The default (500) is kept
          for backward compatibility; on small datasets it can be a large
          share of all optimizer steps (roughly
          num_epochs * ceil(n_train / batch_size)), so consider lowering it.
        - output_dir: directory where checkpoints are written
        - max_length: token length passed to ``prepare_dataset``

        Returns:
        - the ``Trainer`` after training; because ``load_best_model_at_end``
          is set, it holds the checkpoint with the best validation F1

        Raises:
        - ValueError: if a texts/labels pair differs in length
        """

        # Prepare datasets
        train_dataset = self.prepare_dataset(train_texts, train_labels, max_length=max_length)
        val_dataset = self.prepare_dataset(val_texts, val_labels, max_length=max_length)

        # Define training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            warmup_steps=warmup_steps,
            weight_decay=0.01,
            logging_steps=100,
            # Evaluate and save on the same schedule so the best checkpoint
            # (by weighted F1) can be restored at the end.
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            save_total_limit=2,
        )

        # Create trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self._compute_metrics,
        )

        # Train
        print("Starting fine-tuning...")
        trainer.train()
        print("Fine-tuning complete!")

        return trainer

In [5]:
# Hugging Face
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import torch

In [6]:
!pip install -U accelerate datasets transformers




[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score


In [8]:
# Initialize fine-tuner
finbert_trainer = FinBERTFineTuner()

# The loaded checkpoint carries its own id -> label names in its config, and
# they need not match the ids assigned by `label_map` in this notebook.
# Overwrite that metadata so saved checkpoints report the labels trained here.
# This changes only the names; the classification head's weights are untouched.
assert finbert_trainer.model.config.num_labels == len(label_map), \
    "label_map must define exactly as many classes as the model head"
finbert_trainer.model.config.id2label = {idx: name for name, idx in label_map.items()}
finbert_trainer.model.config.label2id = dict(label_map)

# Start fine-tuning (one epoch with a small batch to keep the run short)
trainer = finbert_trainer.fine_tune(
    train_texts=train_texts,
    train_labels=train_labels,
    val_texts=val_texts,
    val_labels=val_labels,
    num_epochs=1,
    batch_size=4,
    learning_rate=2e-5
)

Map:   0%|          | 0/4119 [00:00<?, ? examples/s]

Map:   0%|          | 0/727 [00:00<?, ? examples/s]

Starting fine-tuning...




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4136,0.508773,0.855571,0.859625,0.855571,0.856323


Fine-tuning complete!
