In [1]:
import numpy as np
import pandas as pd
from helpers.clean_data import IndianNewsDataCleaner
from helpers.tokenizer_indian import tokenize_function
from datasets import Dataset
from sklearn.model_selection import train_test_split
from postgres_scripts.read_data import load_financial_news
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!docker start pg-finance

pg-finance


In [3]:
user = os.getenv('DB_USER')
password = os.getenv('DB_PASS')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
database = os.getenv('DB_NAME')


In [4]:
df = load_financial_news(database=database, host=host, password=password, port=port, user=user)
cleaner = IndianNewsDataCleaner(df, country="India", label='Sentiment')
df_clean = (
    cleaner
    .map_sentiment()
    .add_country()
    .clean_text()
    .filter_data()
    .get_clean_data()
)


In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Content,Summary,Sentiment,country
0,0,https://www.moneycontrol.com/news/business/eco...,US consumer spending dropped by a record in Ap...,consumer spending plunges 13.6 percent in Apri...,0,India
1,1,https://www.businesstoday.in/top-story/state-r...,State-run lenders require an urgent Rs 1.2 tri...,government will have to take a bulk of the tab...,0,India
2,2,https://www.financialexpress.com/economy/covid...,Apparel exporters on Wednesday urged the gover...,exporters are facing issues in terms of raw ma...,0,India
3,3,https://www.moneycontrol.com/news/business/mar...,Asian shares battled to extend a global reboun...,the dollar loses some ground on the safe haven...,0,India
4,4,https://www.financialexpress.com/industry/six-...,After India’s sovereign credit rating fell to ...,six Indian public-sector undertakings have tak...,0,India


In [6]:
dataset = Dataset.from_pandas(df_clean)
train_df, test_df = train_test_split(df_clean, test_size=0.2, stratify=df_clean['Sentiment'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['Sentiment'], random_state=42)

In [29]:
train_dataset = Dataset.from_pandas(train_df[:70])
val_dataset = Dataset.from_pandas(val_df[:10])
test_dataset = Dataset.from_pandas(test_df[:20])


In [28]:
train_dataset[1]

{'Content': 'Never before in recent history have we had life turned upside down in the manner Covid-19’s danse macabre has done. Never before, in war or calamity, can one recall the cessation of all of the Railways’ 13,500 daily passenger trains and most of its 8,000 freight trains. Bruised like the rest of nation’s economy, Indian Railways (IR) will need to adapt its moribund apparatus to a new paradigm. A kind of state-within-the-state, IR directly provides livelihood to nearly 1 crore of the country’s population. It uniquely touches the life of aam aadmi. By dint of its traditional resilience, amidst the raging pandemic, IR moved record volumes of grains, industrial and consumer goods, large contingents of soldiers to the northern and eastern borders and tens of lakhs of stranded workers, apart from ferrying miscellaneous parcels and inter-city passengers as people resumed their social and economic engagements after the lifting of the lockdown. As pithily put by Donald Rumsfeld, the

In [9]:
import torch

if torch.cuda.is_available():
    print("CUDA is available. Running on GPU.")
else:
    print("CUDA is NOT available. Running on CPU.")

CUDA is available. Running on GPU.


In [11]:
torch.cuda.empty_cache()

In [12]:
params = {
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 1,
    "weight_decay": 0.01,
    "model_name": "yiyanghkust/finbert-tone"
}

In [13]:
def evaluate_finbert_batch(dataloader, model_dir='./finbert-india'):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = BertForSequenceClassification.from_pretrained(model_dir)
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    model.to(device)
    model.eval()

    true_labels = []
    pred_labels = []
    pred_probs = []

    for batch in dataloader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].cpu().numpy()

        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()
            preds = np.argmax(probs, axis=1)

        true_labels.extend(labels)
        pred_labels.extend(preds)
        pred_probs.append(probs)

    true_labels = np.array(true_labels)
    pred_labels = np.array(pred_labels)
    pred_probs = np.vstack(pred_probs)

    acc = accuracy_score(true_labels, pred_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='weighted')

    try:
        auc = roc_auc_score(true_labels, pred_probs, multi_class='ovo')
    except Exception:
        auc = None

    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

In [24]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertForSequenceClassification, BertTokenizer
import torch

def evaluate_finbert(test_dataset, model_dir='./finbert-india'):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = BertForSequenceClassification.from_pretrained(model_dir, use_safetensors=True)
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    model.to(device)
    model.eval()

    true_labels = []
    pred_labels = []
    pred_probs = []

    for batch in test_dataset:
        inputs = {k: torch.tensor(v).unsqueeze(0).to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels']
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=1)
            pred = torch.argmax(probs, dim=1).cpu().item()

        true_labels.append(labels)
        pred_labels.append(pred)
        pred_probs.append(probs.cpu().numpy())

    true_labels = np.array(true_labels)
    pred_labels = np.array(pred_labels)
    pred_probs = np.vstack(pred_probs)

    acc = accuracy_score(true_labels, pred_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='weighted')
    # For multiclass roc_auc, use 'ovo' or 'ovr'
    try:
        auc = roc_auc_score(true_labels, pred_probs, multi_class='ovo')
    except Exception:
        auc = None

    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    if auc is not None:
        print(f"ROC AUC: {auc:.4f}")

    cm = confusion_matrix(true_labels, pred_labels)
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Negative', 'Neutral', 'Positive'],
                yticklabels=['Negative', 'Neutral', 'Positive'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

In [40]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, BertTokenizer

def train_finbert(
    train_dataset,
    val_dataset,
    output_dir='./finbert-india',
    model_name='yiyanghkust/finbert-tone',
    num_labels=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    logging_steps=500,
    seed=42
):
    # Load model and tokenizer
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, use_safetensors=True)
    tokenizer = BertTokenizer.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy=evaluation_strategy,
        save_strategy=save_strategy,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        logging_dir=logging_dir,
        load_best_model_at_end=load_best_model_at_end,
        metric_for_best_model=metric_for_best_model,
        logging_steps=logging_steps,
        seed=seed,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Training completed. Global step: {train_output.global_step}, Training loss: {train_output.training_loss:.4f}")

    # Evaluate on validation dataset and print metrics
    eval_metrics = trainer.evaluate_finbert_batch()
    print("Evaluation metrics:", eval_metrics)

    # return model and tokenizer if needed for further usage
    return model, tokenizer

In [25]:
def get_tokenized_datasets(country='India', label='Sentiment', test_size=0.2, val_size=0.1, random_state=42):
    # Load env vars
    user = os.getenv('DB_USER')
    password = os.getenv('DB_PASS')
    host = os.getenv('DB_HOST')
    port = os.getenv('DB_PORT')
    database = os.getenv('DB_NAME')

    # Load and clean raw data
    df = load_financial_news(database=database, host=host, password=password, port=port, user=user)
    cleaner = IndianNewsDataCleaner(df, country=country, label=label)
    df_clean = (
        cleaner
        .map_sentiment()
        .add_country()
        .clean_text()
        .filter_data()
        .get_clean_data()
    )

    # Split data
    train_df, test_df = train_test_split(df_clean, test_size=test_size, stratify=df_clean[label], random_state=random_state)
    train_df, val_df = train_test_split(train_df, test_size=val_size, stratify=train_df[label], random_state=random_state)

    # Tokenize
    train_dataset = Dataset.from_pandas(train_df[:70])
    val_dataset = Dataset.from_pandas(val_df[:10])
    test_dataset = Dataset.from_pandas(test_df[:20])

    train_tokenized = train_dataset.map(tokenize_function, batched=True)
    val_tokenized = val_dataset.map(tokenize_function, batched=True)
    test_tokenized = test_dataset.map(tokenize_function, batched=True)

    return train_tokenized, val_tokenized, test_tokenized

ERROR! Session/line number was not unique in database. History logging moved to new session 517


In [32]:
def run_mlflow_experiment(params):
    train_dataset, val_dataset, test_dataset = get_tokenized_datasets()
    with mlflow.start_run(run_name="FinBERT_India"):
        mlflow.log_params(params)

        model, tokenizer, metrics = train_finbert(
            train_dataset,
            val_dataset,
            learning_rate=params["learning_rate"],
            per_device_train_batch_size=params["per_device_train_batch_size"],
            per_device_eval_batch_size=params["per_device_eval_batch_size"],
            num_train_epochs=params["num_train_epochs"],
            weight_decay=params["weight_decay"],
            model_name=params["model_name"]
        )

        mlflow.log_metrics(metrics)
        test_metrics = eval_metrics(test_dataset, model_dir=params["model_name"])
        mlflow.log_metrics({f"test_{k}": v for k, v in test_metrics.items()})
        mlflow.transformers.log_model(
            transformers_model=model,
            artifact_path="finbert-india-model",
            tokenizer=tokenizer,
            input_example={"text": "The market outlook is positive"}
        )


In [22]:
torch.cuda.empty_cache()

In [30]:
train_tokenized = train_dataset.map(tokenize_function, batched=True)
val_tokenized = val_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 71.44 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 57.14 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 68.71 examples/s]


In [42]:
with mlflow.start_run(run_name="FinBERT_India_TEST"):
    mlflow.log_params(params)

    model, tokenizer, metrics = train_finbert(
        train_dataset,
        val_dataset,
        learning_rate=params["learning_rate"],
        per_device_train_batch_size=params["per_device_train_batch_size"],
        per_device_eval_batch_size=params["per_device_eval_batch_size"],
        num_train_epochs=params["num_train_epochs"],
        weight_decay=params["weight_decay"],
        model_name=params["model_name"]
    )

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
mlflow.log_metrics(metrics)

In [43]:
!pip install -U transformers



In [44]:
import transformers
print(transformers.__version__)

4.52.3
