In [15]:
import pandas as pd
import kagglehub
import nltk

# nltk.download('punkt')
# nltk.download('punkt_tab')
nltk.download('stopwords')
# import re
# import requests
# from bs4 import BeautifulSoup
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report, accuracy_score
# from datasets import load_dataset
# import nltk
# from nltk.tokenize import sent_tokenize
# from nltk.corpus import stopwords
# import matplotlib.pyplot as plt
# import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
def load_fiqa_dataset():
    splits = {
        'train': 'data/train-00000-of-00001-aeefa1eadf5be10b.parquet',
        'test': 'data/test-00000-of-00001-0fb9f3a47c7d0fce.parquet',
        'valid': 'data/valid-00000-of-00001-51867fe1ac59af78.parquet'
    }

    df_fiqa = pd.read_parquet("hf://datasets/TheFinAI/fiqa-sentiment-classification/" + splits["train"])

    df_fiqa = df_fiqa[['sentence', 'score']].rename(columns={'sentence': 'text', 'score': 'sentiment'})

    def convert_score_to_label(score):
        if score < 0:
            return -1
        elif score > 0:
            return 1
        else:
            return 0

    df_fiqa['sentiment'] = df_fiqa['sentiment'].apply(convert_score_to_label)
    print("FiQA:", df_fiqa.shape)

    # for label in [-1, 0, 1]:
    #     example = df_fiqa[df_fiqa["sentiment"] == label].iloc[2]["text"]
    #     print(f"Kaggle - First example for sentiment {label}:\n{example}\n")

    return df_fiqa


def load_kaggle_dataset():
    path = kagglehub.dataset_download("ankurzing/sentiment-analysis-for-financial-news")
    kaggle_df = pd.read_csv(f"{path}/all-data.csv", encoding="ISO-8859-1", header=None)

    kaggle_df.columns = ["sentiment", "text"]
    sentiment_mapping = {"negative": -1, "neutral": 0, "positive": 1}
    kaggle_df["sentiment"] = kaggle_df["sentiment"].map(sentiment_mapping)

    print("Kaggle dataset:", kaggle_df.shape)

    # for label in [-1, 0, 1]:
    #     example = kaggle_df[kaggle_df["sentiment"] == label].iloc[0]["text"]
    #     print(f"Kaggle - First example for sentiment {label}:\n{example}\n")

    return kaggle_df


def load_all_datasets():
    df_fiqa = load_fiqa_dataset()
    df_kaggle = load_kaggle_dataset()
    print("fiqa distirbution: ", df_fiqa['sentiment'].value_counts())
    print("kaggle distirbution: ", df_kaggle['sentiment'].value_counts())


    df_combined = pd.concat([df_fiqa, df_kaggle], ignore_index=True)
    print("Sentiment class distribution:", df_combined['sentiment'].value_counts())
    return df_combined

df = load_all_datasets()

print(df.iloc[1]["text"], df.iloc[1].sentiment)

FiQA: (822, 2)
Kaggle dataset: (4846, 2)
fiqa distirbution:  sentiment
 1    546
-1    264
 0     12
Name: count, dtype: int64
kaggle distirbution:  sentiment
 0    2879
 1    1363
-1     604
Name: count, dtype: int64
Sentiment class distribution: sentiment
 0    2891
 1    1909
-1     868
Name: count, dtype: int64
Slump in Weir leads FTSE down from record high -1


In [19]:
#preprocess data
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import string
import re

nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

stop_words = set(stopwords.words('english'))
important_negations = {"not", "no", "nor", "never", "n't"}
stop_words = stop_words - important_negations

lemmatizer = WordNetLemmatizer()


def preprocess(text):
    if pd.isna(text) or text == "":
        return ""

    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

df = df.dropna(subset=['text'])
df['text'] = df['text'].apply(preprocess)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [21]:
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
import torch.optim as optim
from sklearn.model_selection import train_test_split


embedder = SentenceTransformer('all-MiniLM-L6-v2')
sampled_df = df.sample(n=len(df), random_state=42)
embeddings = embedder.encode(sampled_df["text"].tolist(), batch_size=32, show_progress_bar=True)

X = np.array(embeddings)
y = sampled_df['sentiment'].values.astype(np.float32)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/178 [00:00<?, ?it/s]

In [24]:
class SentimentRegressorClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=128):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1),
            nn.Tanh()
        )

    def forward(self, x):
        return self.model(x)

model = SentimentRegressorClassifier(input_dim=X_train.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)

train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=16, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_tensor, y_val_tensor), batch_size=32)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [26]:
for epoch in range(10):
  model.train()
  train_loss = 0
  all_train_preds = []

  for inputs, targets in train_loader:
    inputs, targets = inputs.to(device), targets.to(device)
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs.squeeze(), targets)
    loss.backward()
    optimizer.step()

    train_loss += loss.item() * inputs.size(0)
    all_train_preds.extend(outputs.view(-1).detach().cpu().numpy())

  avg_train_loss = train_loss / len(train_loader.dataset)
  model.eval()
  val_loss = 0
  all_val_preds = []

  with torch.no_grad():
    for inputs, targets in val_loader:
      inputs, targets = inputs.to(device), targets.to(device)
      outputs = model(inputs)
      loss = criterion(outputs.squeeze(), targets)

      val_loss += loss.item() * inputs.size(0)
      all_val_preds.extend(outputs.view(-1).detach().cpu().numpy())

  avg_val_loss = val_loss / len(val_loader.dataset)
  scheduler.step(avg_val_loss)

In [33]:
def predict_sentiment_score(text):
  model.eval()
  vectorizer = embedder.encode([text])
  tensor = torch.tensor(vectorizer, dtype=torch.float32).to(device)

  with torch.no_grad():
    raw_score = model(tensor).item()
    print(raw_score)

  return round(raw_score, 3)

test_texts = [
    "US stock market couldn't crash anytime soon",
    "Trump increases chances of major stock prices downfall in US due to new tariffs",
    "Germany will spend more money on defend industry, weapons and army",
    "Trump administration could cause a major crash due to their new tariffs for all the countries."
]

print("\nTesting model predictions:")
for text in test_texts:
    score = predict_sentiment_score(text)
    print(f"Text: {text}...")
    print(f"Credibility score: {score}")
    print("---")


Testing model predictions:
-0.9539057016372681
Text: US stock market couldn't crash anytime soon...
Credibility score: -0.954
---
-0.9071109294891357
Text: Trump increases chances of major stock prices downfall in US due to new tariffs...
Credibility score: -0.907
---
0.03765174746513367
Text: Germany will spend more money on defend industry, weapons and army...
Credibility score: 0.038
---
-0.9583342671394348
Text: Trump administration could cause a major crash due to their new tariffs for all the countries....
Credibility score: -0.958
---
