In [1]:
!pip install -U pyarrow --quiet
!pip install datasets transformers torch numpy seqeval --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.
ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from tqdm import tqdm

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Generate synthetic financial data
def generate_financial_data(num_samples=1000):
    np.random.seed(42)
    revenues = np.random.randint(100000, 10000000, num_samples)
    expenses = np.random.randint(50000, 9000000, num_samples)
    profits = revenues - expenses

    data = []
    labels = []

    for i in tqdm(range(num_samples)):
        financial_text = f"Revenue: ${revenues[i]}, Expenses: ${expenses[i]}, Profit: ${profits[i]}"

        if profits[i] > 1000000:
            interpretation = "The company is performing exceptionally well with high profits."
            label = 2
        elif profits[i] > 0:
            interpretation = "The company is profitable but there's room for improvement."
            label = 1
        else:
            interpretation = "The company is operating at a loss and needs immediate attention."
            label = 0

        data.append(financial_text + " " + interpretation)
        labels.append(label)

    return data, labels

# Create a custom dataset
class FinancialDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Generate synthetic data
texts, labels = generate_financial_data()

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(device)

# Create dataset and dataloader
dataset = FinancialDataset(texts, labels, tokenizer, max_length=128)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Set up optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# Training loop
num_epochs = 3
for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

# Function to interpret new financial data
def interpret_financial_data(financial_text):
    model.eval()
    encoding = tokenizer.encode_plus(
        financial_text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, dim=1)

    interpretations = [
        "The company is operating at a loss and needs immediate attention.",
        "The company is profitable but there's room for improvement.",
        "The company is performing exceptionally well with high profits."
    ]

    return interpretations[predicted.item()]

# Example usage
new_financial_data = "Revenue: $8500000, Expenses: $7000000, Profit: $1500000"
interpretation = interpret_financial_data(new_financial_data)
print(f"Financial Data: {new_financial_data}")
print(f"Interpretation: {interpretation}")

Using device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/63 [00:00<?, ?it/s][A
  2%|▏         | 1/63 [00:27<27:59, 27.08s/it][A
  3%|▎         | 2/63 [00:48<23:52, 23.48s/it][A
  5%|▍         | 3/63 [01:08<22:19, 22.33s/it][A
  6%|▋         | 4/63 [01:28<20:52, 21.22s/it][A
  8%|▊         | 5/63 [01:48<19:55, 20.61s/it][A
 10%|▉         | 6/63 [02:08<19:28, 20.50s/it][A
 11%|█         | 7/63 [02:27<18:44, 20.08s/it][A
 13%|█▎        | 8/63 [02:47<18:27, 20.14s/it][A
 14%|█▍        | 9/63 [03:06<17:49, 19.81s/it][A
 16%|█▌        | 10/63 [03:27<17:37, 19.95s/it][A
 17%|█▋        | 11/63 [03:46<17:06, 19.74s/it][A
 19%|█▉        | 12/63 [04:06<16:52, 19.86s/it][A
 21%|██        | 13/63 [

Epoch 1/3, Average Loss: 0.2660



  0%|          | 0/63 [00:00<?, ?it/s][A
  2%|▏         | 1/63 [00:19<20:01, 19.38s/it][A
  3%|▎         | 2/63 [00:39<20:13, 19.90s/it][A
  5%|▍         | 3/63 [01:00<20:09, 20.16s/it][A
  6%|▋         | 4/63 [01:19<19:31, 19.85s/it][A
  8%|▊         | 5/63 [01:39<19:24, 20.08s/it][A
 10%|▉         | 6/63 [01:59<18:51, 19.85s/it][A
 11%|█         | 7/63 [02:19<18:42, 20.04s/it][A
 13%|█▎        | 8/63 [02:39<18:09, 19.81s/it][A
 14%|█▍        | 9/63 [02:59<17:55, 19.92s/it][A
 16%|█▌        | 10/63 [03:19<17:41, 20.03s/it][A
 17%|█▋        | 11/63 [03:38<17:11, 19.83s/it][A
 19%|█▉        | 12/63 [03:59<16:57, 19.94s/it][A
 21%|██        | 13/63 [04:18<16:27, 19.75s/it][A
 22%|██▏       | 14/63 [04:38<16:16, 19.93s/it][A
 24%|██▍       | 15/63 [04:58<15:48, 19.75s/it][A
 25%|██▌       | 16/63 [05:18<15:38, 19.97s/it][A
 27%|██▋       | 17/63 [05:38<15:20, 20.01s/it][A
 29%|██▊       | 18/63 [05:58<14:51, 19.81s/it][A
 30%|███       | 19/63 [06:18<14:38, 19.97s/it]

Epoch 2/3, Average Loss: 0.0174



  0%|          | 0/63 [00:00<?, ?it/s][A
  2%|▏         | 1/63 [00:19<19:50, 19.20s/it][A
  3%|▎         | 2/63 [00:39<20:14, 19.91s/it][A
  5%|▍         | 3/63 [00:59<20:05, 20.09s/it][A
  6%|▋         | 4/63 [01:19<19:26, 19.77s/it][A
  8%|▊         | 5/63 [01:39<19:20, 20.01s/it][A
 10%|▉         | 6/63 [01:59<18:48, 19.80s/it][A
 11%|█         | 7/63 [02:19<18:39, 19.98s/it][A
 13%|█▎        | 8/63 [02:38<18:08, 19.79s/it][A
 14%|█▍        | 9/63 [02:58<17:56, 19.93s/it][A
 16%|█▌        | 10/63 [03:19<17:39, 20.00s/it][A
 17%|█▋        | 11/63 [03:38<17:08, 19.77s/it][A
 19%|█▉        | 12/63 [03:58<16:57, 19.95s/it][A
 21%|██        | 13/63 [04:18<16:28, 19.76s/it][A
 22%|██▏       | 14/63 [04:38<16:19, 19.99s/it][A
 24%|██▍       | 15/63 [04:58<15:51, 19.83s/it][A
 25%|██▌       | 16/63 [05:18<15:40, 20.00s/it][A
 27%|██▋       | 17/63 [05:38<15:25, 20.11s/it][A
 29%|██▊       | 18/63 [05:58<14:56, 19.93s/it][A
 30%|███       | 19/63 [06:18<14:43, 20.08s/it]

Epoch 3/3, Average Loss: 0.0062
Financial Data: Revenue: $8500000, Expenses: $7000000, Profit: $1500000
Interpretation: The company is performing exceptionally well with high profits.


In [4]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.model_selection import train_test_split
from seqeval.metrics import accuracy_score

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define compute metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Generate synthetic financial data
def generate_financial_data(num_samples=1000):
    np.random.seed(42)
    revenues = np.random.randint(100000, 10000000, num_samples)
    expenses = np.random.randint(50000, 9000000, num_samples)
    profits = revenues - expenses

    data = []
    labels = []

    for i in range(num_samples):
        financial_text = f"Revenue: ${revenues[i]}, Expenses: ${expenses[i]}, Profit: ${profits[i]}"

        if profits[i] > 1000000:
            interpretation = "The company is performing exceptionally well with high profits."
            label = 2
        elif profits[i] > 0:
            interpretation = "The company is profitable but there's room for improvement."
            label = 1
        else:
            interpretation = "The company is operating at a loss and needs immediate attention."
            label = 0

        data.append(financial_text + " " + interpretation)
        labels.append(label)

    return data, labels

# Create a custom dataset
class FinancialDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Generate synthetic data
texts, labels = generate_financial_data()

# Split the data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Create datasets
train_dataset = FinancialDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = FinancialDataset(val_texts, val_labels, tokenizer, max_length=128)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_steps=100,
    save_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

eval_results = trainer.evaluate()
print(f"Before Training Evaluation results: {eval_results}")

# Train the model
trainer.train()

eval_results = trainer.evaluate()
print(f"Post Training Evaluation results: {eval_results}")

# Function to interpret new financial data
def interpret_financial_data(financial_text):
    encoding = tokenizer.encode_plus(
        financial_text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, dim=1)

    interpretations = [
        "The company is operating at a loss and needs immediate attention.",
        "The company is profitable but there's room for improvement.",
        "The company is performing exceptionally well with high profits."
    ]

    return interpretations[predicted.item()]

# Example usage
new_financial_data = "Revenue: $8500000, Expenses: $7000000, Profit: $1500000"
interpretation = interpret_financial_data(new_financial_data)
print(f"Financial Data: {new_financial_data}")
print(f"Interpretation: {interpretation}")

Using device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6609,0.502858
2,0.1125,0.064867
3,0.0104,0.008454


Financial Data: Revenue: $8500000, Expenses: $7000000, Profit: $1500000
Interpretation: The company is performing exceptionally well with high profits.
