In [1]:
!pip install datasets
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-tone")
finbert = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone", num_labels=3)

from datasets import load_dataset
ds = load_dataset("kdave/Indian_Financial_News")

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m480.6/480.6 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.51k [00:00<?, ?B/s]

training_data_26000.csv:   0%|          | 0.00/115M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26961 [00:00<?, ? examples/s]

In [2]:
df = ds["train"].to_pandas()

X = df[["Content"]]
y = df["Sentiment"]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [3]:
def preprocess_function(examples):
    """
    Tokenize the inputs and truncate/pad them to a fixed length.
    """
    max_length = 512
    return tokenizer(
        examples["Content"],
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

In [4]:
train_dataset = Dataset.from_pandas(X_train.assign(labels=y_train))
val_dataset = Dataset.from_pandas(X_val.assign(labels=y_val))

# To tokenize datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Removing conetent now that we have info contained in the other columns
train_dataset = train_dataset.remove_columns(["Content"])
val_dataset = val_dataset.remove_columns(["Content"])

'''
train_dataset = train_dataset.rename_column("labels", "labels")
val_dataset = val_dataset.rename_column("labels", "labels")
'''

# Define a mapping for sentiment labels
label_mapping = {"Positive": 2, "Neutral": 1, "Negative": 0}

train_dataset = train_dataset.map(lambda x: {"labels": label_mapping[x["labels"]]})
val_dataset = val_dataset.map(lambda x: {"labels": label_mapping[x["labels"]]})

from datasets import Value
train_dataset = train_dataset.cast_column("labels", Value("int64"))
val_dataset = val_dataset.cast_column("labels", Value("int64"))


Map:   0%|          | 0/18872 [00:00<?, ? examples/s]

Map:   0%|          | 0/4044 [00:00<?, ? examples/s]

Map:   0%|          | 0/18872 [00:00<?, ? examples/s]

Map:   0%|          | 0/4044 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/18872 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4044 [00:00<?, ? examples/s]

In [5]:
def compute_metrics(pred):
    """
    Compute evaluation metrics for the model.
    """
    predictions = pred.predictions.argmax(axis=-1)
    labels = pred.label_ids

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',             # Directory to save the model and logs
    num_train_epochs=4,                 # Number of training epochs
    per_device_train_batch_size=16,     # Batch size for training
    per_device_eval_batch_size=32,      # Batch size for evaluation
    warmup_steps=500,                   # Warmup steps for learning rate scheduler
    weight_decay=0.01,                  # Weight decay for optimization
    logging_dir='./logs',               # Directory to save logs
    logging_steps=50,                   # Log every 50 steps
    evaluation_strategy="epoch",        # Evaluate the model at the end of each epoch
    save_strategy="epoch",              # Save model at the end of each epoch
    load_best_model_at_end=True,        # Load the best model when training finishes
    metric_for_best_model="accuracy",   # Metric to use for selecting the best model
    greater_is_better=True,             # Higher accuracy is better
    learning_rate=2e-5,                 # Initial learning rate
    lr_scheduler_type="linear",         # Linear learning rate scheduler
    save_total_limit=2,                 # Keep only the 2 most recent model checkpoints
    gradient_accumulation_steps=2,      # Accumulate gradients over 2 steps to handle large batches
    fp16=True,                          # Use mixed precision for faster training (requires CUDA)
    seed=42,                            # Set random seed for reproducibility
    report_to="none"                    # Disable reporting to external logging services (e.g., WandB)
)

trainer = Trainer(
    model=finbert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.697,0.693051,0.692631,0.687798,0.692631,0.686636
2,1.1525,0.593402,0.75816,0.76698,0.75816,0.756815
3,0.677,0.461111,0.843225,0.844316,0.843225,0.843601
4,0.402,0.4842,0.860287,0.860244,0.860287,0.860252


TrainOutput(global_step=2360, training_loss=1.2950223580255347, metrics={'train_runtime': 1240.8959, 'train_samples_per_second': 60.833, 'train_steps_per_second': 1.902, 'total_flos': 1.9861905677451264e+16, 'train_loss': 1.2950223580255347, 'epoch': 4.0})

In [19]:
import pandas as pd
from datasets import Dataset

test_file_path = '/content/balanced_sentiment_dataset.csv'
test_df = pd.read_csv(test_file_path)

print(test_df.head())

test_df['Sentiment'].value_counts()

test_df.tail()



                                             Content Sentiment
0  Chugh would be responsible for driving growth ...   Neutral
1  Chugh would be responsible for driving growth ...   Neutral
2  Chugh would be responsible for driving growth ...   Neutral
3  BSE Sensex and Nifty 50 were trading nearly on...   Neutral
4  Apart from a postgraduate degree in management...   Neutral


Unnamed: 0,Content,Sentiment
6379,['Inventory Market Live: Sensex down 200 pts. ...,Negative
6380,"['At the close of barter, the Dow Daniel jones...",Negative
6381,['Thousand - cap of top - 10 business firm div...,Negative
6382,['Bharat demand to follow watchful of downside...,Negative
6383,['The unsecured loaning holy scripture for the...,Negative


In [9]:
test_df.columns

Index(['Content', 'Sentiment'], dtype='object')

In [12]:
def preprocess_function(examples):
    max_length = 512  # Maximum sequence length for FinBERT
    return tokenizer(examples["Content"], padding="max_length", truncation=True, max_length=max_length)

# This is to convert pd.dataframe to a hugging dataset
test_dataset = Dataset.from_pandas(test_df)

test_dataset = test_dataset.map(preprocess_function, batched=True)

test_dataset = test_dataset.rename_column("Sentiment", "labels")

label_mapping = {"Positive": 0, "Neutral": 1, "Negative": 2}
test_dataset = test_dataset.map(lambda x: {"labels": label_mapping[x["labels"]]})

test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

results = trainer.evaluate(test_dataset)

print("Test Set Evaluation Results:")
print(results)

Map:   0%|          | 0/6384 [00:00<?, ? examples/s]

Map:   0%|          | 0/6384 [00:00<?, ? examples/s]

Test Set Evaluation Results:
{'eval_loss': 4.030726432800293, 'eval_accuracy': 0.25, 'eval_precision': 0.2546633180195966, 'eval_recall': 0.25, 'eval_f1': 0.2476172926912327, 'eval_runtime': 27.5326, 'eval_samples_per_second': 231.871, 'eval_steps_per_second': 7.264, 'epoch': 4.0}


In [22]:
val_dataset

Dataset({
    features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4044
})

In [23]:

test_dataset = Dataset.from_pandas(X_test.assign(labels=y_test))

In [24]:

test_dataset = test_dataset.map(preprocess_function, batched=True)

test_dataset = test_dataset.remove_columns(["Content"])

Map:   0%|          | 0/4045 [00:00<?, ? examples/s]

In [25]:

test_results = trainer.evaluate(test_dataset)


print("Test Set Evaluation Results:", test_results)

Test Set Evaluation Results: {'eval_runtime': 19.2881, 'eval_samples_per_second': 209.714, 'eval_steps_per_second': 6.584, 'epoch': 4.0}


In [31]:
test_dataset = Dataset.from_pandas(X_test.assign(labels=y_test))

# Tokenize the dataset
test_dataset = test_dataset.map(preprocess_function, batched=True)

test_dataset = test_dataset.remove_columns(["Content"])

label_mapping = {"Positive": 2, "Neutral": 1, "Negative": 0}

test_dataset = test_dataset.map(lambda x: {"labels": label_mapping[x["labels"]]})

from datasets import Value
test_dataset = test_dataset.cast_column("labels", Value("int64"))



Map:   0%|          | 0/4045 [00:00<?, ? examples/s]

Map:   0%|          | 0/4045 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4045 [00:00<?, ? examples/s]

In [32]:
predictions = trainer.predict(test_dataset)


preds = predictions.predictions.argmax(axis=-1)

true_labels = predictions.label_ids

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

accuracy = accuracy_score(true_labels, preds)

precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds, average="weighted")


print(f"Test Accuracy: {accuracy:.4f}")

print(f"Test Precision: {precision:.4f}")

print(f"Test Recall: {recall:.4f}")

print(f"Test F1-score: {f1:.4f}")

Test Accuracy: 0.8722
Test Precision: 0.8716
Test Recall: 0.8722
Test F1-score: 0.8718
