In [1]:
# Block 1: Import Libraries
import os
import json
import pandas as pd
import numpy as np
import torch
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, mean_squared_error

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)


In [2]:
# Block 2: Define Paths for Data and Output

# Define the root path of your project on Google Drive
project_root = '../../'

# Define the path to the golden dataset
golden_path = os.path.join(project_root, 'Data', 'Historical Reddit', 'golden_dataset_sentiment.csv')

# Define the specific input directory for FinBERT data
finbert_input_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'FinBERT_Data')

# Define the base output directory structure within the project root
base_output_dir = os.path.join(project_root, 'outputs', 'sentiment_analysis', 'finbert')
results_output_dir = os.path.join(base_output_dir, 'results')   # For saving per-subreddit predictions
model_output_dir = os.path.join(base_output_dir, 'models')      # Optional: If saving fine-tuned models
# evaluation_output_dir = os.path.join(base_output_dir, 'evaluations') # Optional: If saving evaluations

print(f"Project root (Google Drive): {project_root}")
print(f"Looking for FinBERT input data in: {finbert_input_dir}")
print(f"Using golden dataset from: {golden_path}")
print(f"Saving outputs to: {base_output_dir}") # Base output path for FinBERT Sentiment

# Create all necessary output directories if they don't exist
os.makedirs(model_output_dir, exist_ok=True)
os.makedirs(results_output_dir, exist_ok=True)
# os.makedirs(evaluation_output_dir, exist_ok=True) # Uncomment if needed later

# Set device for PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Project root (Google Drive): ../../
Looking for FinBERT input data in: ../../Data\Historical Reddit\FinBERT_Data
Using golden dataset from: ../../Data\Historical Reddit\golden_dataset_sentiment.csv
Saving outputs to: ../../outputs\sentiment_analysis\finbert
Using device: cuda


In [3]:
# Block 3: Load and Merge Data from JSON Files and the Golden Dataset
dfs = []
for fname in os.listdir(finbert_input_dir):
    if fname.startswith('finbert_r_') and fname.endswith('.json'):
        file_path = os.path.join(finbert_input_dir, fname)
        with open(file_path, 'r', encoding='utf-8') as f:
            data_json = json.load(f)
        df = pd.DataFrame(data_json)
        dfs.append(df)
all_posts = pd.concat(dfs, ignore_index=True)

# Load the golden dataset that contains sentiment labels
golden = pd.read_csv(golden_path)
data = all_posts.merge(golden[['id', 'sentiment']], on='id', how='inner')

# Create text field for BERT processing
data['processed_text'] = data['processed_text_finbert']
# Display info about dataset
print(f"Total samples in merged dataset: {len(data)}")
print(f"Distribution of sentiment classes:")
print(data['sentiment'].value_counts().sort_index())


Total samples in merged dataset: 49175
Distribution of sentiment classes:
sentiment
1     3613
2    14914
3     9968
4    15114
5     5566
Name: count, dtype: int64


In [4]:
# Block 4: Split Data into Training, Validation, and Test Sets
# First split: 80% train+val, 20% test
X_trainval, X_test, y_trainval, y_test = train_test_split(
    data['processed_text'], data['sentiment'], stratify=data['sentiment'], test_size=0.2, random_state=42
)

# Second split: Split train+val into train and val
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, stratify=y_trainval, test_size=0.2, random_state=42
)

# Create DataFrames for each split
train_df = pd.DataFrame({'text': X_train, 'sentiment': y_train})
val_df = pd.DataFrame({'text': X_val, 'sentiment': y_val})
test_df = pd.DataFrame({'text': X_test, 'sentiment': y_test})

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")


Training samples: 31472
Validation samples: 7868
Test samples: 9835


In [5]:
# Block 5: Load FinBERT Tokenizer and Define Model Initialization Function

model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def model_init():
    """Initializes a fresh model for each hyperparameter trial."""
    # Load the base model
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Resize the model's classification layer to handle 5 classes
    # This is the most reliable way to change the output size
    model.resize_token_embeddings(len(tokenizer))  # Ensure token embeddings match tokenizer
    model.classifier = torch.nn.Linear(model.config.hidden_size, 5)  # Replace classifier
    model.num_labels = 5  # Update the model's num_labels attribute

    # Update the model config
    model.config.num_labels = 5  # Update the config's num_labels
    model.config.id2label = {0: "1", 1: "2", 2: "3", 3: "4", 4: "5"}
    model.config.label2id = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4}
    model.config.problem_type = "single_label_classification"

    # Move to device
    return model.to(device)

print(f"Tokenizer loaded from {model_name}.")
print("Model initialization function `model_init` defined.")
# Verify a model can be initialized (optional)
# temp_model = model_init()
# print(f"Test model initialized with {temp_model.num_labels} classes.")
# del temp_model # Clean up memory

Tokenizer loaded from yiyanghkust/finbert-tone.
Model initialization function `model_init` defined.




In [6]:
# Block 6: Prepare Datasets for the Transformer Model
def preprocess_function(examples):
    # Tokenize the texts and prepare for the model
    result = tokenizer(
        examples['text'],
        truncation=True,
        max_length=256,
        padding="max_length",
    )

    # Convert 1-5 sentiment labels to 0-4 for the model (0-indexed)
    result["labels"] = [label - 1 for label in examples['sentiment']]
    return result

# Convert pandas DataFrames to HuggingFace Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Look at a sample to verify the data structure
print("Sample from dataset:")
print(train_dataset[0])

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Check processed data
print("\nSample after preprocessing:")
print(train_dataset[0])
print(f"Label range: min={min(train_dataset['labels'])}, max={max(train_dataset['labels'])}")

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

# Data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Sample from dataset:
{'text': 'Be Careful Out There Folks: It is Beginning to Feel a Lot More Like 2017 (SAFE.. This, Finance... That) Mirroring Old ICO Days Do not be left holding worthless bags! Without mincing words, the current state of the cryptocurrency market is looking no different from what it was in late 2017, just before the bubble burst and people went back to their normal lives like crypto never existed. There were shiny ICOs and whitepapers here and there and hundreds of tokens being released every day, sometimes on websites with fake photos of humans that exist nowhere. Today, we are living a reminiscence with many shit projects hitting under the umbrella of Defi, .Finance domain websites, and practically no live product. Do not get me wrong, I like and do DeFi, but there are just so many scams out there and new investors will get burned when all of this euphoria ends. Worse is the recent proliferation of meme coins claiming to be the next doge and that investors can mak

Map:   0%|          | 0/31472 [00:00<?, ? examples/s]

Map:   0%|          | 0/7868 [00:00<?, ? examples/s]

Map:   0%|          | 0/9835 [00:00<?, ? examples/s]


Sample after preprocessing:
{'text': 'Be Careful Out There Folks: It is Beginning to Feel a Lot More Like 2017 (SAFE.. This, Finance... That) Mirroring Old ICO Days Do not be left holding worthless bags! Without mincing words, the current state of the cryptocurrency market is looking no different from what it was in late 2017, just before the bubble burst and people went back to their normal lives like crypto never existed. There were shiny ICOs and whitepapers here and there and hundreds of tokens being released every day, sometimes on websites with fake photos of humans that exist nowhere. Today, we are living a reminiscence with many shit projects hitting under the umbrella of Defi, .Finance domain websites, and practically no live product. Do not get me wrong, I like and do DeFi, but there are just so many scams out there and new investors will get burned when all of this euphoria ends. Worse is the recent proliferation of meme coins claiming to be the next doge and that investors

In [7]:
# Block 7: Define Training Arguments and Metrics Function

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Convert back to 1-5 scale for evaluation
    predictions_1to5 = predictions + 1
    labels_1to5 = labels + 1

    # Calculate metrics
    accuracy = (predictions == labels).mean()
    weighted_f1 = f1_score(labels_1to5, predictions_1to5, average='weighted')
    mse = mean_squared_error(labels_1to5, predictions_1to5)

    return {
        'accuracy': accuracy,
        'weighted_f1': weighted_f1,
        'mse': mse
    }

# Set training arguments with less logging
training_args = TrainingArguments(
    output_dir="./finbert_finetuned",
    # --- Reduced Logging ---
    logging_dir='./logs',        # Keep log directory
    logging_strategy="epoch",    # Log at the end of each epoch
    disable_tqdm=True,           # Disable detailed progress bars for cleaner output
    report_to=["none"],          # Disable external reporting (like wandb)
    # --- Evaluation and Saving ---
    evaluation_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch",       # Save model at the end of each epoch
    load_best_model_at_end=True, # Load the best model found during training
    metric_for_best_model="weighted_f1", # Use weighted_f1 to determine the best model
    greater_is_better=True,
    # --- Other Training Parameters ---
    num_train_epochs=3, # Default epochs (will be overridden by hyperparameter search)
    per_device_train_batch_size=16, # Default batch size (will be overridden)
    per_device_eval_batch_size=16,  # Default eval batch size (will be linked to train size during search)
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(), # Use mixed precision if GPU available
)

print("TrainingArguments configured for reduced logging.")


TrainingArguments configured for reduced logging.




In [8]:
# Block 8: Initialize Trainer and Run Hyperparameter Search (with Reduced Logging)

# --- Initialize Trainer using model_init ---
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stops training if 'weighted_f1' doesn't improve for 3 evaluation steps.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# --- Define the Hyperparameter Space for Optuna ---
def optuna_hp_space(trial):
  """Defines the hyperparameter search space for Optuna."""
  return {
      "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
      "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 5),
      "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16]),
  }

# ---  Set Optuna's logging verbosity ---
optuna.logging.set_verbosity(optuna.logging.INFO)

# --- Run Hyperparameter Search ---
print("Starting hyperparameter search...")
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=5
)

print("\nHyperparameter search finished.")
print(f"Best trial found:")
print(f"  Value (eval_weighted_f1): {best_trial.objective}")
print(f"  Params: {best_trial.hyperparameters}")

# --- Train Final Model with Best Hyperparameters ---
print("\nTraining final model with the best hyperparameters found...")
best_params = best_trial.hyperparameters
for param, value in best_params.items():
    if param == "per_device_train_batch_size":
        setattr(training_args, "per_device_eval_batch_size", value)
    setattr(training_args, param, value)

# Update args to disable tqdm for final training too
setattr(training_args, "disable_tqdm", True)

final_model = model_init()
trainer = Trainer(
    model=final_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Add callback here too for the final training run
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

model_save_path = os.path.join(model_output_dir, "finbert_finetuned_best_hparams")
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Final model trained with best hyperparameters saved to {model_save_path}")


[I 2025-04-08 15:40:14,761] A new study created in memory with name: no-name-46878936-912a-47ab-a0b5-20cb7837bf4e


Starting hyperparameter search...


  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 1.2007, 'grad_norm': 7.9891180992126465, 'learning_rate': 4.104864579076952e-06, 'epoch': 1.0}
{'eval_loss': 1.0374835729599, 'eval_accuracy': 0.5638027452974073, 'eval_weighted_f1': 0.5539302016816158, 'eval_mse': 0.8942552109811897, 'eval_runtime': 22.3269, 'eval_samples_per_second': 352.399, 'eval_steps_per_second': 22.036, 'epoch': 1.0}
{'loss': 0.9885, 'grad_norm': 16.531944274902344, 'learning_rate': 2.737967629765613e-06, 'epoch': 2.0}
{'eval_loss': 0.9953482747077942, 'eval_accuracy': 0.5878240976105745, 'eval_weighted_f1': 0.5784566894285187, 'eval_mse': 0.8462125063548551, 'eval_runtime': 21.8089, 'eval_samples_per_second': 360.771, 'eval_steps_per_second': 22.56, 'epoch': 2.0}
{'loss': 0.8974, 'grad_norm': 11.49541187286377, 'learning_rate': 1.3703750585971182e-06, 'epoch': 3.0}
{'eval_loss': 0.9971779584884644, 'eval_accuracy': 0.5884595831215048, 'eval_weighted_f1': 0.582159351168216, 'eval_mse': 0.8160904931367565, 'eval_runtime': 22.1168, 'eval_samples_per_secon

[I 2025-04-08 16:03:05,888] Trial 0 finished with value: 1.990439441544706 and parameters: {'learning_rate': 4.925698370520911e-06, 'num_train_epochs': 4, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 1.990439441544706.


{'train_runtime': 1369.7743, 'train_samples_per_second': 91.904, 'train_steps_per_second': 5.744, 'train_loss': 0.9795921027024022, 'epoch': 4.0}
{'loss': 1.1782, 'grad_norm': 8.861656188964844, 'learning_rate': 2.619500977385232e-06, 'epoch': 1.0}
{'eval_loss': 1.0437047481536865, 'eval_accuracy': 0.5622775800711743, 'eval_weighted_f1': 0.5524934865253643, 'eval_mse': 0.9070920183019827, 'eval_runtime': 22.1205, 'eval_samples_per_second': 355.689, 'eval_steps_per_second': 22.242, 'epoch': 1.0}
{'loss': 0.997, 'grad_norm': 16.76319122314453, 'learning_rate': 2.6620944892126343e-09, 'epoch': 2.0}
{'eval_loss': 1.0202585458755493, 'eval_accuracy': 0.5748601931875953, 'eval_weighted_f1': 0.5676926426268532, 'eval_mse': 0.874936451448907, 'eval_runtime': 22.2627, 'eval_samples_per_second': 353.417, 'eval_steps_per_second': 22.1, 'epoch': 2.0}


[I 2025-04-08 16:14:38,205] Trial 1 finished with value: 2.0174892872633556 and parameters: {'learning_rate': 4.711907245906362e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 16}. Best is trial 1 with value: 2.0174892872633556.


{'train_runtime': 691.4695, 'train_samples_per_second': 91.029, 'train_steps_per_second': 5.689, 'train_loss': 1.0876436917140633, 'epoch': 2.0}
{'loss': 1.1548, 'grad_norm': 6.6695051193237305, 'learning_rate': 1.3133695625411714e-05, 'epoch': 1.0}
{'eval_loss': 1.0336283445358276, 'eval_accuracy': 0.5707930859176411, 'eval_weighted_f1': 0.554256717995848, 'eval_mse': 0.8104982206405694, 'eval_runtime': 22.0818, 'eval_samples_per_second': 356.312, 'eval_steps_per_second': 22.281, 'epoch': 1.0}
{'loss': 0.8933, 'grad_norm': 17.87555694580078, 'learning_rate': 9.852358022214431e-06, 'epoch': 2.0}
{'eval_loss': 0.9710701704025269, 'eval_accuracy': 0.6072699542450433, 'eval_weighted_f1': 0.6014555378054353, 'eval_mse': 0.7714794102694459, 'eval_runtime': 22.1039, 'eval_samples_per_second': 355.955, 'eval_steps_per_second': 22.259, 'epoch': 2.0}
{'loss': 0.6795, 'grad_norm': 16.667139053344727, 'learning_rate': 6.5710204190171456e-06, 'epoch': 3.0}
{'eval_loss': 1.0387054681777954, 'eval_a

[I 2025-04-08 16:43:07,174] Trial 2 finished with value: 1.924825393552997 and parameters: {'learning_rate': 1.477269538448584e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16}. Best is trial 1 with value: 2.0174892872633556.


{'train_runtime': 1708.0781, 'train_samples_per_second': 92.127, 'train_steps_per_second': 5.758, 'train_loss': 0.7186684294769954, 'epoch': 5.0}
{'loss': 1.1299, 'grad_norm': 6.544529914855957, 'learning_rate': 2.527768512690331e-05, 'epoch': 1.0}
{'eval_loss': 1.0242236852645874, 'eval_accuracy': 0.5734621250635485, 'eval_weighted_f1': 0.5540170985690781, 'eval_mse': 0.838840874428063, 'eval_runtime': 22.1013, 'eval_samples_per_second': 355.997, 'eval_steps_per_second': 22.261, 'epoch': 1.0}
{'loss': 0.8403, 'grad_norm': 17.557580947875977, 'learning_rate': 1.68575006200426e-05, 'epoch': 2.0}
{'eval_loss': 0.9758158326148987, 'eval_accuracy': 0.6096847991865786, 'eval_weighted_f1': 0.6025341302268693, 'eval_mse': 0.7594051855617692, 'eval_runtime': 23.2171, 'eval_samples_per_second': 338.889, 'eval_steps_per_second': 21.191, 'epoch': 2.0}
{'loss': 0.5469, 'grad_norm': 15.823457717895508, 'learning_rate': 8.437316113181891e-06, 'epoch': 3.0}
{'eval_loss': 1.140053391456604, 'eval_accu

[I 2025-04-08 17:06:30,910] Trial 3 finished with value: 1.9109947574011619 and parameters: {'learning_rate': 3.0327226090071557e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16}. Best is trial 1 with value: 2.0174892872633556.


{'train_runtime': 1402.8735, 'train_samples_per_second': 89.736, 'train_steps_per_second': 5.608, 'train_loss': 0.7073396295839357, 'epoch': 4.0}
{'loss': 1.2054, 'grad_norm': 14.891928672790527, 'learning_rate': 1.2500083223165004e-06, 'epoch': 1.0}
{'eval_loss': 1.0683526992797852, 'eval_accuracy': 0.5565582104728012, 'eval_weighted_f1': 0.5398929372761283, 'eval_mse': 0.9027707168276563, 'eval_runtime': 21.7933, 'eval_samples_per_second': 361.029, 'eval_steps_per_second': 22.576, 'epoch': 1.0}
{'loss': 1.0386, 'grad_norm': 24.194337844848633, 'learning_rate': 9.527502456680644e-10, 'epoch': 2.0}
{'eval_loss': 1.047190546989441, 'eval_accuracy': 0.5654550076258261, 'eval_weighted_f1': 0.5558092640218915, 'eval_mse': 0.900482968988307, 'eval_runtime': 22.2502, 'eval_samples_per_second': 353.615, 'eval_steps_per_second': 22.112, 'epoch': 2.0}


[I 2025-04-08 17:20:48,715] Trial 4 finished with value: 2.0217472406360244 and parameters: {'learning_rate': 2.2488081631918547e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 8}. Best is trial 4 with value: 2.0217472406360244.


{'train_runtime': 856.783, 'train_samples_per_second': 73.466, 'train_steps_per_second': 9.183, 'train_loss': 1.1219899769537207, 'epoch': 2.0}

Hyperparameter search finished.
Best trial found:
  Value (eval_weighted_f1): 2.0217472406360244
  Params: {'learning_rate': 2.2488081631918547e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 8}

Training final model with the best hyperparameters found...
{'loss': 1.2022, 'grad_norm': 13.969533920288086, 'learning_rate': 1.2500083223165004e-06, 'epoch': 1.0}
{'eval_loss': 1.0687209367752075, 'eval_accuracy': 0.5537620742247077, 'eval_weighted_f1': 0.541377037935951, 'eval_mse': 0.9203101169293341, 'eval_runtime': 26.51, 'eval_samples_per_second': 296.793, 'eval_steps_per_second': 37.118, 'epoch': 1.0}
{'loss': 1.036, 'grad_norm': 26.22003746032715, 'learning_rate': 1.2703336608907526e-09, 'epoch': 2.0}
{'eval_loss': 1.0493544340133667, 'eval_accuracy': 0.5704117946110828, 'eval_weighted_f1': 0.5624692667652869, 'eval_mse': 0.9065836

In [9]:
# Block 9: Evaluate the Fine-tuned Model on Test Set
print("Evaluating on test set...")
eval_results = trainer.evaluate(test_dataset)
print(f"Test set evaluation results:\n{eval_results}")

# Get predictions for confusion matrix and detailed metrics
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# Convert back to 1-5 scale
preds_1to5 = preds + 1
labels_1to5 = labels + 1

# Print classification report
print("\nFinBERT Deep Learning Sentiment Analysis Evaluation:")
print(classification_report(labels_1to5, preds_1to5))
print("Confusion Matrix:")
print(confusion_matrix(labels_1to5, preds_1to5))

# Calculate MSE
mse = mean_squared_error(labels_1to5, preds_1to5)
print(f"\nMean Squared Error: {mse:.4f}")

# Calculate per-class MSE
for i in range(1, 6):
    class_mask = labels_1to5 == i
    if np.any(class_mask):
        class_mse = mean_squared_error(labels_1to5[class_mask], preds_1to5[class_mask])
        print(f"MSE for class {i}: {class_mse:.4f}")


Evaluating on test set...
{'eval_loss': 1.054720401763916, 'eval_accuracy': 0.5630910015251652, 'eval_weighted_f1': 0.5542054329863343, 'eval_mse': 0.9019827147941027, 'eval_runtime': 32.7867, 'eval_samples_per_second': 299.97, 'eval_steps_per_second': 37.515, 'epoch': 2.0}
Test set evaluation results:
{'eval_loss': 1.054720401763916, 'eval_accuracy': 0.5630910015251652, 'eval_weighted_f1': 0.5542054329863343, 'eval_mse': 0.9019827147941027, 'eval_runtime': 32.7867, 'eval_samples_per_second': 299.97, 'eval_steps_per_second': 37.515, 'epoch': 2.0}

FinBERT Deep Learning Sentiment Analysis Evaluation:
              precision    recall  f1-score   support

           1       0.53      0.27      0.36       723
           2       0.56      0.71      0.62      2983
           3       0.56      0.44      0.49      1993
           4       0.56      0.62      0.59      3023
           5       0.63      0.44      0.52      1113

    accuracy                           0.56      9835
   macro avg 

In [10]:
# Block 10: Batch Prediction Function for JSON Files
def predict_batch(texts, model, tokenizer, device, batch_size=32):
    """Process a list of texts in batches and return predictions"""
    all_predictions = []
    all_scores = []

    # Process in batches
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]

        # Tokenize
        inputs = tokenizer(
            batch_texts,
            truncation=True,
            max_length=256,
            padding=True,
            return_tensors="pt"
        ).to(device)

        # Get predictions
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)

        # Convert to CPU for numpy operations
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        scores = torch.max(probabilities, dim=-1).values.cpu().numpy()

        # Convert predictions back to 1-5 scale
        preds = preds + 1

        all_predictions.extend(preds)
        all_scores.extend(scores)

    return all_predictions, all_scores


In [11]:
# # Block 11: Perform Inference on All Processed Files and Save Results
# print("Performing inference on all processed files...")
# model.eval()  # Set model to evaluation mode

# for fname in os.listdir(finbert_input_dir):
#     if fname.startswith('processed_r_') and fname.endswith('.json'):
#         file_path = os.path.join(finbert_input_dir, fname)

#         print(f"Processing {fname}...")

#         with open(file_path, 'r', encoding='utf-8') as f:
#             df = pd.DataFrame(json.load(f))

#         if 'processed_tokens_ml' not in df.columns:
#             continue

#         # Prepare text for model
#         texts = df['processed_tokens_ml'].apply(lambda toks: ' '.join(toks)).tolist()

#         # Get predictions in batches
#         predictions, scores = predict_batch(texts, model, tokenizer, device)

#         # Add to DataFrame
#         df['finbert_sentiment'] = predictions
#         df['finbert_confidence'] = scores

#         # Save results
#         out_file = fname.replace('.json', '_finbert_sentiment.csv')
#         out_path = os.path.join(output_dir, out_file)
#         df.to_csv(out_path, index=False)
#         print(f"Saved: {out_file}")
