In [None]:
import re
import os
import time
import random
import datetime
import numpy as np
import pandas as pd
import torch

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup

In [None]:
!pip install transformers datasets kagglehub --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.2 kB[0m [31m22.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


Using device: cuda


In [None]:
import random
import numpy as np
import torch

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    When CUDA is available the CUDA generators are seeded too; an error raised
    while doing so is printed instead of propagated.

    Args:
        seed: Value handed to every generator (default 42).
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    try:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    except Exception as err:
        print("CUDA seeding error:", err)


# Seed everything with the default value as soon as the cell runs.
set_seed()


In [None]:
import kagglehub

def load_datasets():
    """Fetch and merge the FiQA and Kaggle financial-news sentiment datasets.

    Both sources are reduced to two columns, ``text`` and ``sentiment``, where
    ``sentiment`` is an integer class id: 0 = neutral, 1 = positive,
    2 = negative.

    Returns:
        pandas.DataFrame: FiQA rows followed by the Kaggle rows, re-indexed
        from 0.

    Raises:
        ValueError: if the Kaggle file contains a sentiment label other than
            "negative", "neutral" or "positive".
    """
    fiqa_path = "hf://datasets/TheFinAI/fiqa-sentiment-classification/data/train-00000-of-00001-aeefa1eadf5be10b.parquet"
    fiqa_df = pd.read_parquet(fiqa_path)
    fiqa_df = fiqa_df[["sentence", "score"]].rename(columns={"sentence": "text", "score": "sentiment"})

    def convert_score_to_label(score):
        # The sign of the continuous FiQA score selects the class; anything
        # that is neither below nor above zero is treated as neutral.
        if score < 0:
            return 2
        if score > 0:
            return 1
        return 0

    fiqa_df["sentiment"] = fiqa_df["sentiment"].apply(convert_score_to_label)

    kaggle_path = kagglehub.dataset_download("ankurzing/sentiment-analysis-for-financial-news")
    kaggle_df = pd.read_csv(os.path.join(kaggle_path, "all-data.csv"), encoding="ISO-8859-1", header=None)
    kaggle_df.columns = ["sentiment", "text"]

    label_to_id = {"negative": 2, "neutral": 0, "positive": 1}
    mapped = kaggle_df["sentiment"].map(label_to_id)
    # Series.map produces NaN for keys missing from the mapping. Left alone,
    # that would turn the label column into floats and only fail much later
    # (inside the loss), so unknown labels are rejected here instead.
    unmapped = mapped.isna()
    if unmapped.any():
        unknown = sorted(kaggle_df.loc[unmapped, "sentiment"].astype(str).unique())
        raise ValueError(f"Unexpected sentiment labels in Kaggle data: {unknown}")
    kaggle_df["sentiment"] = mapped.astype("int64")

    combined_df = pd.concat([fiqa_df, kaggle_df], ignore_index=True)
    print("Dataset shape:", combined_df.shape)
    return combined_df

# Build the combined frame; the bare `df.head()` makes the notebook display its first rows.
df = load_datasets()
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset shape: (5668, 2)


Unnamed: 0,text,sentiment
0,Royal Mail chairman Donald Brydon set to step ...,2
1,Slump in Weir leads FTSE down from record high,2
2,AstraZeneca wins FDA approval for key new lung...,1
3,UPDATE 1-Lloyds to cut 945 jobs as part of 3-y...,2
4,Standard Chartered Shifts Emerging-Markets Str...,2


In [None]:
from transformers import AutoTokenizer

# Tokenizer of the same checkpoint that is loaded as the model later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

def tokenize_data(df, max_length=64):
    """Tokenize ``df.text`` and return tensors ready for a ``TensorDataset``.

    Uses the module-level ``tokenizer``.

    Args:
        df: DataFrame with a ``text`` column (strings) and a ``sentiment``
            column (labels).
        max_length: Every sequence is truncated or padded to this many tokens.

    Returns:
        tuple: ``(input_ids, attention_masks, labels)``. The first two hold one
        row per text and ``max_length`` columns; ``labels`` is
        ``torch.tensor(df.sentiment.values)``.
    """
    # A single batched tokenizer call replaces the former per-row
    # `encode_plus` loop followed by `torch.cat`: same tensors, one call, and
    # no reliance on the deprecated `encode_plus` entry point.
    encoded = tokenizer(
        df.text.tolist(),
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    return (
        encoded["input_ids"],
        encoded["attention_mask"],
        torch.tensor(df.sentiment.values)
    )

# Encode the whole frame once; all three tensors follow the row order of `df`.
input_ids, attention_masks, labels = tokenize_data(df)


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
# Bundle ids, masks and labels so each dataset item is an (ids, mask, label) triple.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 80 % / 10 % / remainder split; the test part absorbs the rounding so the
# three sizes always add up to len(dataset).
train_size = int(len(dataset) * 0.8)
val_size = int(len(dataset) * 0.1)
test_size = len(dataset) - (train_size + val_size)

train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
)

batch_size = 32

# Training batches are drawn in random order; evaluation batches keep dataset order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(val_dataset),
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(test_dataset),
)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a 3-way classification head and move it to `device`.
# NOTE(review): the class ids used in this notebook (0 neutral, 1 positive,
# 2 negative) may not follow the label order the checkpoint's head was trained
# with — confirm; fine-tuning below re-learns the mapping either way.
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=3)
# `.to()` returns the module itself; assigning it keeps the cell from echoing
# the full model repr into the notebook output.
model = model.to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# AdamW over every model parameter.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 4
# One scheduler step is taken per training batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule without any warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: 1-D array of class ids with one entry per row of ``preds``.

    Returns:
        Number of matching rows divided by ``len(labels)``.
    """
    predicted_classes = np.argmax(preds, axis=1)
    num_correct = np.sum(predicted_classes == labels)
    return num_correct / len(labels)

# Fine-tuning loop: one pass over `train_dataloader` per epoch, reporting the
# mean of the per-batch losses at the end of each epoch.
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Each batch is an (input_ids, attention_mask, labels) triple; move all of it to the device.
        b_input_ids, b_input_mask, b_labels = [x.to(device) for x in batch]

        # Clear gradients accumulated by the previous step.
        model.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        # NOTE: `logits` is unpacked here but not used inside this loop.
        loss, logits = outputs.loss, outputs.logits

        total_loss += loss.item()
        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss:.2f}")



Epoch 1/4
Average training loss: 0.78

Epoch 2/4
Average training loss: 0.44

Epoch 3/4
Average training loss: 0.32

Epoch 4/4
Average training loss: 0.26


In [None]:
# Validation accuracy over the whole split. Correct predictions are counted
# across batches and divided by the number of examples, so a smaller final
# batch is not over-weighted the way a mean of per-batch accuracies would be.
model.eval()
val_correct = 0
val_total = 0
with torch.no_grad():
    for batch in val_dataloader:
        b_input_ids, b_input_mask, b_labels = [x.to(device) for x in batch]

        # Labels are not passed: only the logits are needed, so no loss is computed.
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        predictions = outputs.logits.argmax(dim=-1)

        val_correct += (predictions == b_labels).sum().item()
        val_total += b_labels.size(0)

# max(..., 1) keeps an empty split from raising ZeroDivisionError.
val_accuracy = val_correct / max(val_total, 1)
print(f"Validation Accuracy: {val_accuracy:.2f}")


Validation Accuracy: 0.85


In [None]:
# Test accuracy over the whole split. Correct predictions are counted across
# batches and divided by the number of examples, so a smaller final batch is
# not over-weighted the way a mean of per-batch accuracies would be.
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for batch in test_dataloader:
        b_input_ids, b_input_mask, b_labels = [x.to(device) for x in batch]

        # Labels are not passed: only the logits are needed, so no loss is computed.
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        predictions = outputs.logits.argmax(dim=-1)

        test_correct += (predictions == b_labels).sum().item()
        test_total += b_labels.size(0)

# max(..., 1) keeps an empty split from raising ZeroDivisionError.
test_accuracy = test_correct / max(test_total, 1)
print(f"Test Accuracy: {test_accuracy:.2f}")


Test Accuracy: 0.85


In [None]:
def predict(text):
    """Classify a single text with the module-level ``model`` and ``tokenizer``.

    Args:
        text: The text to classify.

    Returns:
        tuple: ``(label, probs)`` where ``label`` is "neutral", "positive" or
        "negative" and ``probs`` is the softmax of the model's logits (a
        tensor that stays on ``device``).
    """
    label_map = {0: "neutral", 1: "positive", 2: "negative"}

    model.eval()
    encoding = tokenizer.encode_plus(
        text,
        max_length=64,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    ids = encoding["input_ids"].to(device)
    mask = encoding["attention_mask"].to(device)

    # Inference only: no gradients are tracked for the forward pass.
    with torch.no_grad():
        logits = model(ids, attention_mask=mask).logits

    probs = torch.softmax(logits, dim=-1)
    best_class = probs.argmax(dim=-1).item()
    return label_map[best_class], probs

# Manual spot check on one headline; the bare call makes the notebook display the returned tuple.
predict("Donald Trump annnounced new tariffs")


('positive', tensor([[0.0257, 0.7547, 0.2195]], device='cuda:0'))

In [None]:
import re
import os
import time
import random
import datetime
import numpy as np
import pandas as pd
import torch

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup

# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    When CUDA is available the CUDA generators are seeded too; an error raised
    while doing so is printed instead of propagated.

    Args:
        seed: Value handed to every generator (default 42).
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    try:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    except Exception as err:
        print("CUDA seeding error:", err)


# Seed everything with the default value before any data is split or any model is built.
set_seed()

def load_financial_dataset(dataset_path=None):
    """Load the financial-news sentiment CSV into a DataFrame.

    Google Drive is mounted on a best-effort basis (this only works inside
    Colab). When ``dataset_path`` is omitted, the file
    ``all_financial_news_sentiment_datasets.csv`` is looked up in the two
    Colab Drive locations, then in a list of alternative locations, and the
    user is finally prompted for a path.

    Args:
        dataset_path: Optional explicit path to the CSV. When given, no search
            and no prompt take place.

    Returns:
        pandas.DataFrame with the columns ``text`` and ``sentiment``, or
        ``None`` when the file exists but cannot be read/parsed (the error is
        printed).

    Raises:
        FileNotFoundError: if the CSV cannot be located.
    """
    try:
        from google.colab import drive
        drive.mount('/content/drive')
        print("Google Drive mounted successfully")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
        # are no longer swallowed; any ordinary failure stays best-effort.
        print("Not running in Colab or Drive already mounted")

    if dataset_path is not None:
        if not os.path.exists(dataset_path):
            raise FileNotFoundError(f"Dataset not found at {dataset_path}")
    else:
        drive_paths = [
            '/content/drive/My Drive/all_financial_news_sentiment_datasets.csv',
            '/content/drive/MyDrive/all_financial_news_sentiment_datasets.csv',
        ]
        alternative_paths = [
            'all_financial_news_sentiment_datasets.csv',
            '/all_financial_news_sentiment_datasets.csv',
            './all_financial_news_sentiment_datasets.csv',
            'drive/My Drive/all_financial_news_sentiment_datasets.csv',
            'drive/MyDrive/all_financial_news_sentiment_datasets.csv'
        ]

        # First existing candidate wins; Drive locations take precedence.
        dataset_path = next((p for p in drive_paths if os.path.exists(p)), None)
        if dataset_path is None:
            dataset_path = next((p for p in alternative_paths if os.path.exists(p)), None)
            if dataset_path is not None:
                print(f"Found dataset at: {dataset_path}")

        if dataset_path is None:
            # Last resort: ask the user. This blocks when no stdin is attached.
            print("Warning: Could not find dataset at expected paths.")
            custom_path = input("Please enter the full path to your dataset file: ")
            if not os.path.exists(custom_path):
                raise FileNotFoundError(f"Could not find dataset file at {custom_path}")
            dataset_path = custom_path

    print(f"Loading dataset from: {dataset_path}")
    try:
        # The file has no header row; the two columns are named here.
        df = pd.read_csv(dataset_path, header=None, encoding='utf-8')
        df.columns = ['text', 'sentiment']

        print(f"Dataset loaded with {len(df)} rows")
        print("Sample data:")
        print(df.head())
        print("\nSentiment score distribution:")
        print(df['sentiment'].describe())

        return df
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

# NOTE: load_financial_dataset() returns None when the CSV cannot be read, in
# which case the tokenize_data(df) call further down fails.
df = load_financial_dataset()

# Tokenizer of the same checkpoint that is loaded as the model further down.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

def tokenize_data(df, max_length=128):
    """Tokenize ``df.text`` and return tensors ready for a ``TensorDataset``.

    Uses the module-level ``tokenizer``.

    Args:
        df: DataFrame with a ``text`` column (strings) and a numeric
            ``sentiment`` column (regression targets).
        max_length: Every sequence is truncated or padded to this many tokens.

    Returns:
        tuple: ``(input_ids, attention_masks, labels)``. The first two hold one
        row per text and ``max_length`` columns; ``labels`` is a float tensor
        built from ``df.sentiment.values``.
    """
    # A single batched tokenizer call replaces the former per-row
    # `encode_plus` loop followed by `torch.cat`: same tensors, one call, and
    # no reliance on the deprecated `encode_plus` entry point.
    encoded = tokenizer(
        df.text.tolist(),
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    return (
        encoded["input_ids"],
        encoded["attention_mask"],
        # Float targets are what the regression loss expects.
        torch.tensor(df.sentiment.values, dtype=torch.float)
    )

# Encode the whole frame once and bundle the tensors so each dataset item is an
# (input_ids, attention_mask, label) triple.
input_ids, attention_masks, labels = tokenize_data(df)
dataset = TensorDataset(input_ids, attention_masks, labels)

# 80 % / 10 % / remainder split; the test part absorbs the rounding so the
# three sizes always add up to len(dataset).
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

batch_size = 16

# Training batches are drawn in random order; evaluation batches keep dataset order.
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# Single-output regression head. The checkpoint ships a 3-output head, so
# `ignore_mismatched_sizes=True` is needed to load it; the mismatching output
# layer is then newly initialised and has to be trained.
model = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment",
    num_labels=1,
    problem_type="regression",
    ignore_mismatched_sizes=True
)
model.to(device)

# AdamW over every model parameter.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

epochs = 4
# One scheduler step is taken per training batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule without any warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def mse_loss(preds, labels):
    """Return the mean of the squared differences between ``preds`` and ``labels``."""
    squared_errors = (preds - labels) ** 2
    return np.mean(squared_errors)

def mae_loss(preds, labels):
    """Return the mean of the absolute differences between ``preds`` and ``labels``."""
    absolute_errors = np.abs(preds - labels)
    return np.mean(absolute_errors)

# Fine-tuning loop: each epoch trains on `train_dataloader`, then reports
# MSE/MAE on the validation split.
print("Starting training...\n")
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        b_input_ids, b_input_mask, b_labels = [x.to(device) for x in batch]

        # Clear gradients accumulated by the previous step.
        model.zero_grad()
        # Passing `labels` makes the model return the loss.
        outputs = model(
            b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss:.4f}")

    model.eval()
    val_mse = 0
    val_mae = 0
    val_count = 0

    for batch in val_dataloader:
        b_input_ids, b_input_mask, b_labels = [x.to(device) for x in batch]

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        # squeeze(-1) drops the trailing size-1 output dimension so the
        # predictions line up element-wise with the labels.
        logits = outputs.logits.squeeze(-1).detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        # Weight every batch metric by its batch size: a plain mean of batch
        # means would over-weight a smaller final batch.
        n_batch = len(label_ids)
        val_mse += mse_loss(logits, label_ids) * n_batch
        val_mae += mae_loss(logits, label_ids) * n_batch
        val_count += n_batch

    # max(..., 1) keeps an empty validation split from raising ZeroDivisionError.
    avg_val_mse = val_mse / max(val_count, 1)
    avg_val_mae = val_mae / max(val_count, 1)
    print(f"Validation MSE: {avg_val_mse:.4f}")
    print(f"Validation MAE: {avg_val_mae:.4f}")

# Final evaluation on the held-out test split. Predictions and targets are
# collected for every example and the metrics are computed once over all of
# them, so a smaller final batch is not over-weighted the way a mean of
# per-batch metrics would be.
model.eval()
test_preds = []
test_true = []

for batch in test_dataloader:
    b_input_ids, b_input_mask, b_labels = [x.to(device) for x in batch]

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    # squeeze(-1) drops the trailing size-1 output dimension so the
    # predictions line up element-wise with the labels.
    logits = outputs.logits.squeeze(-1).detach().cpu().numpy()
    label_ids = b_labels.cpu().numpy()

    test_preds.extend(logits)
    test_true.extend(label_ids)

# float64 keeps the reduction over the whole split numerically clean.
all_test_preds = np.asarray(test_preds, dtype=np.float64)
all_test_true = np.asarray(test_true, dtype=np.float64)
avg_test_mse = mse_loss(all_test_preds, all_test_true)
avg_test_mae = mae_loss(all_test_preds, all_test_true)
print("\nTest Results:")
print(f"Test MSE: {avg_test_mse:.4f}")
print(f"Test MAE: {avg_test_mae:.4f}")

def predict_sentiment(text):
    """Score a single text with the module-level regression ``model``.

    Args:
        text: The text to score.

    Returns:
        tuple: ``(score, sentiment)``. ``score`` is the model output limited to
        the range -1..1; ``sentiment`` is "negative" below -0.25, "positive"
        above 0.25 and "neutral" otherwise.
    """
    model.eval()
    encoding = tokenizer.encode_plus(
        text,
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    ids = encoding["input_ids"].to(device)
    mask = encoding["attention_mask"].to(device)

    # Inference only: no gradients are tracked for the forward pass.
    with torch.no_grad():
        raw_output = model(ids, attention_mask=mask)

    # Limit the raw prediction to -1..1 before bucketing it.
    score = max(-1, min(1, raw_output.logits.item()))

    if score < -0.25:
        return score, "negative"
    if score > 0.25:
        return score, "positive"
    return score, "neutral"

# Persist the fine-tuned model and its tokenizer side by side in one directory.
# NOTE: this is a hard-coded Google Drive path, so it relies on Drive being mounted.
model_save_path = '/content/drive/MyDrive/financial_sentiment_model'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"\nModel saved to {model_save_path}")

# NOTE(review): these headlines appear to match the first rows of the loaded
# CSV (see the "Sample data" printout), so they were probably seen during
# training and are not a held-out check — confirm.
test_headlines = [
    "AMD reduces debt significantly, improves balance sheet",
    "Economic indicators point to contraction in telecom sector",
    "telecom sector rallies as Tesla leads gains",
    "Investors maintain hold rating on Google stock"
]

# Print the clamped score and the derived category for every sample headline.
print("\nTesting model with sample headlines:")
for headline in test_headlines:
    score, sentiment = predict_sentiment(headline)
    print(f"Text: {headline}")
    print(f"Predicted sentiment score: {score:.4f}")
    print(f"Sentiment category: {sentiment}\n")

Using device: cuda
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully
Loading dataset from: /content/drive/My Drive/all_financial_news_sentiment_datasets.csv
Dataset loaded with 14338 rows
Sample data:
                                                text  sentiment
0  AMD reduces debt significantly, improves balan...       0.93
1  Economic indicators point to contraction in te...      -0.96
2        telecom sector rallies as Tesla leads gains       0.86
3     Investors maintain hold rating on Google stock       0.01
4  Meta restructuring fails to address fundamenta...      -0.30

Sentiment score distribution:
count    14338.000000
mean         0.204137
std          0.650038
min         -0.970000
25%         -0.260000
50%          0.350000
75%          0.820000
max          0.980000
Name: sentiment, dtype: float64


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([1, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch 1/4
Average training loss: 0.1624
Validation MSE: 0.1246
Validation MAE: 0.2518

Epoch 2/4
Average training loss: 0.0814
Validation MSE: 0.0889
Validation MAE: 0.1992

Epoch 3/4
Average training loss: 0.0464
Validation MSE: 0.0849
Validation MAE: 0.1841

Epoch 4/4
Average training loss: 0.0280
Validation MSE: 0.0881
Validation MAE: 0.1814

Test Results:
Test MSE: 0.1037
Test MAE: 0.1972

Model saved to /content/drive/MyDrive/financial_sentiment_model

Testing model with sample headlines:
Text: AMD reduces debt significantly, improves balance sheet
Predicted sentiment score: 0.9507
Sentiment category: positive

Text: Economic indicators point to contraction in telecom sector
Predicted sentiment score: -0.9894
Sentiment category: negative

Text: telecom sector rallies as Tesla leads gains
Predicted sentiment score: 0.6067
Sentiment category: positive

Text: Investors maintain hold rating on Google stock
Predicted sentiment score: -0.0282
Sentiment category: n