<a href="https://colab.research.google.com/github/NinaMwangi/finance_chatbot/blob/main/FinBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finance Chatbot

In [None]:
pip install evaluate

In [None]:
pip install gradio

In [None]:
pip install tf-keras

In [None]:
import pandas as pd
import numpy as np
import re
import string
import random
import evaluate
import gradio as gr
from tqdm import tqdm
import tensorflow as tf
from transformers import DataCollatorForSeq2Seq
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, create_optimizer
import warnings
warnings.filterwarnings("ignore", message="The initializer RandomNormal is unseeded*")

In [None]:
pip install --upgrade datasets

# Loading Dataset

In [None]:
# Download the financial Q&A dataset and take its "train" split, then hold
# out 10% of it for validation. The fixed seed keeps the split reproducible.
dataset = load_dataset("virattt/financial-qa-10K")["train"]
split_dataset = dataset.train_test_split(seed=42, test_size=0.1)
train_data, val_data = split_dataset["train"], split_dataset["test"]

# Show one training record as a quick sanity check
print(train_data[0])

# Selecting my pretrained model
The code loads a pretrained instruction-tuned language model (flan-t5-base) from Hugging Face:

Defines the model checkpoint name (google/flan-t5-base)

Loading the tokenizer to convert text into tokens the model understands

Loading the model itself (in TensorFlow) for sequence-to-sequence tasks like question answering, summarization, or translation

In [None]:
# One checkpoint name drives both loads so the tokenizer and the model
# always come from the same pretrained release.
model_checkpoint = "google/flan-t5-base"
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Preprocessing the dataset
- The preprocess() function is preparing each dataset example for training the sequence-to-sequence model
- Tokenizing both the input and the target (answer)

- Setting the target tokens as labels, which the model uses to learn during training

- Returning a tokenized dictionary ready for use in model training

In [None]:
def preprocess(example):
    """Tokenize a batch of examples into model inputs and training labels.

    Intended for ``Dataset.map(preprocess, batched=True)``.

    Args:
        example: A batch mapping with parallel ``question``, ``context`` and
            ``answer`` sequences.

    Returns:
        The tokenizer output for the prompts with an added ``labels`` entry:
        the tokenized answers, with every padding position set to -100.
    """
    inputs = [
        f"Q: {q} Context: {c} A:"
        for q, c in zip(example["question"], example["context"])
    ]
    targets = example["answer"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")

    # The labels are padded to a fixed length here with the tokenizer's pad
    # id. Left as-is, those positions would count towards the training loss.
    # Replace them with -100, the ignore index used by the Transformers loss,
    # so only real answer tokens are learned.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Run the batched preprocessing over the training and validation splits
train_dataset, val_dataset = (
    split.map(preprocess, batched=True) for split in (train_data, val_data)
)

# Batching
Converting the tokenized dataset into a batched and ready to train format for TensorFlow.

In [None]:
# Collator that assembles seq2seq batches as TensorFlow tensors
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="tf",
)

# Training pipeline: batches of 8, reshuffled
tf_train_dataset = train_dataset.to_tf_dataset(
    batch_size=8,
    shuffle=True,
    columns=['input_ids', 'attention_mask', 'labels'],
    collate_fn=data_collator,
)

# Validation pipeline: same batching, but kept in dataset order
tf_val_dataset = val_dataset.to_tf_dataset(
    batch_size=8,
    shuffle=False,
    columns=["input_ids", "attention_mask", 'labels'],
    collate_fn=data_collator,
)

# Training

In [None]:
# Creating an optimiser and a learning rate schedule
# Training configuration
# NOTE: batch_size must match the batch_size passed to to_tf_dataset above.
batch_size = 8
epochs = 10
learning_rate = 3e-5
train_data_size = len(train_dataset)

# Steps per epoch, rounded UP: to_tf_dataset is called without
# drop_remainder, so the final partial batch is also an optimizer step.
# Floor division would make the schedule a few steps too short and the
# learning rate would reach zero before training actually ends.
steps_per_epoch = (train_data_size + batch_size - 1) // batch_size
total_train_steps = steps_per_epoch * epochs

# Warm up over the first 10% of training
warmup_steps = total_train_steps // 10

# Create optimizer and scheduler
optimizer, schedule = create_optimizer(
    init_lr=learning_rate,
    num_train_steps=total_train_steps,
    num_warmup_steps=warmup_steps,
    weight_decay_rate=0.01
)

In [None]:
# Only the optimizer is supplied; no loss argument is passed.
# NOTE(review): presumably the model falls back to its built-in loss — confirm.
model.compile(optimizer=optimizer)

In [None]:
# Fine-tune for `epochs` passes, validating on the held-out split each time;
# the returned object is kept in `history`.
history = model.fit(tf_train_dataset, epochs=epochs, validation_data=tf_val_dataset)

# Saving the Model and Tokenizer

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in Drive
save_dir = "/content/drive/MyDrive/Finance Chatbot"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Reload the fine-tuned model and its tokenizer from the saved directory
model_name = "/content/drive/MyDrive/Finance Chatbot"
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
!pip install rouge_score

# Evaluating the model

In [None]:
# Loading evaluation metrics
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")
em_metric = evaluate.load("exact_match")

# Parallel lists for metrics: predictions[i] answers the question whose
# gold answer is references[i]
predictions = []
references = []
# Number of validation items that failed and were left out of the metrics
skipped = 0

# Generating responses
for item in tqdm(val_data):
    try:
        question = item["question"]
        context = item["context"]
        reference_answer = item["answer"].strip()

        # Same prompt template as used in preprocessing
        prompt = f"Q: {question} Context: {context} A:"

        # Tokenizing input. max_length matches the 256-token limit used for
        # training and in the chatbot, so the scores reflect the inputs the
        # model was actually trained on and will be served with.
        inputs = tokenizer(
            prompt,
            return_tensors="tf",
            truncation=True,
            padding="max_length",
            max_length=256
        )

        # Generating output
        outputs = model.generate(**inputs, max_new_tokens=64)
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    except Exception as e:
        # Best-effort evaluation: report the failure and move on
        skipped += 1
        print(f"Error processing item: {e}")
        continue

    # Saving for metrics. Both lists are appended together, only after the
    # whole item succeeded, so they can never get out of step.
    predictions.append(answer)
    references.append(reference_answer)

if skipped:
    print(f"Skipped {skipped} item(s) due to errors; metrics cover {len(predictions)} item(s).")

In [None]:
def normalize_text(text):
    """Normalize text for metric comparison.

    Lowercases the text, removes every ASCII punctuation character,
    collapses runs of whitespace to a single space, and trims the ends.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = text.lower()
    text = re.sub(rf"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r'\s+', ' ', text)
    # Trim last: removing punctuation can expose new leading/trailing spaces
    # (e.g. "yes ." -> "yes "), which would otherwise break exact match.
    return text.strip()

# Put predictions and references through the same normalization so the
# metrics compare like with like
norm_preds = list(map(normalize_text, predictions))
norm_refs = list(map(normalize_text, references))

# ROUGE (with stemming) on the normalized text
rouge_result = rouge_metric.compute(
    predictions=norm_preds, references=norm_refs, use_stemmer=True
)

# BLEU takes a list of references per prediction, so each single
# reference is wrapped in its own list
bleu_references = [[ref] for ref in norm_refs]
bleu_result = bleu_metric.compute(predictions=norm_preds, references=bleu_references)

# Exact Match on the normalized text
em_result = em_metric.compute(references=norm_refs, predictions=norm_preds)

# Report each score as a percentage rounded to two decimals
print("ROUGE-1 Score:", round(rouge_result['rouge1'] * 100, 2))
print("BLEU Score:", round(bleu_result['bleu'] * 100, 2))
print("Exact Match Score:", round(em_result['exact_match'] * 100, 2))

# Inference

In [None]:
# Load the fine-tuned model and tokenizer used to serve answers
model_path = "/content/drive/MyDrive/Finance Chatbot"
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Function to retrieve matching context
def get_context_for_question(question):
    """Return the context of the dataset entry whose question matches.

    Questions are compared after trimming surrounding whitespace and
    lowercasing. The first matching entry in ``dataset`` wins.

    Args:
        question: The user's question text.

    Returns:
        The matching entry's ``context``, or the string
        "No relevant context found." when no question matches.
    """
    # Normalize the query once instead of on every loop iteration
    wanted = question.strip().lower()
    return next(
        (
            item["context"]
            for item in dataset
            if item["question"].strip().lower() == wanted
        ),
        "No relevant context found.",
    )

# Define the prediction function (inference)
def generate_answer(question, chat_history):
    """Answer a question and record the exchange in the chat history.

    Args:
        question: The text typed by the user.
        chat_history: List of ``(question, answer)`` tuples. It is extended
            in place; ``None`` is treated as an empty history.

    Returns:
        A ``("", chat_history)`` tuple: an empty string and the updated
        history (the UI binds these to the input box and the chat display).
    """
    if chat_history is None:
        chat_history = []

    # Nothing to answer: skip the model call and leave the history untouched
    # rather than logging an empty turn.
    if not question or not question.strip():
        return "", chat_history

    context = get_context_for_question(question)
    # Same prompt template as used in preprocessing
    prompt = f"Q: {question} Context: {context} A:"

    inputs = tokenizer(
        prompt,
        return_tensors="tf",
        padding="max_length",
        truncation=True,
        max_length=256
    )

    outputs = model.generate(
        **inputs,
        max_new_tokens=64,
        num_beams=4,
        early_stopping=True
    )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    chat_history.append((question, answer))
    return "", chat_history


In [None]:
# Launching the chatbot interface
with gr.Blocks(theme=gr.themes.Base()) as interface:
    gr.Markdown(
        """
        # 💬 Finance QA Chatbot
        Ask a finance-related question and get an accurate, concise response.
        Built using a fine-tuned T5 Transformer on financial Q&A data.
        """,
    )

    chatbot = gr.Chatbot(label="Finance Chatbot", height=400, bubble_full_width=False)
    with gr.Row():
        with gr.Column(scale=8):
            question_box = gr.Textbox(
                placeholder="Ask a finance question...", show_label=False, lines=2
            )
        with gr.Column(scale=1):
            submit_btn = gr.Button("Send")

    clear_btn = gr.Button("Clear Chat")

    # Chat state: the list of (question, answer) turns
    state = gr.State([])

    # Bind function: generate_answer returns ("", history), which clears the
    # textbox and refreshes the chat display.
    submit_btn.click(
        generate_answer,
        inputs=[question_box, state],
        outputs=[question_box, chatbot],
    )

    # Two outputs need two return values. Returning a bare [] would be read
    # as zero values and rejected, so return one empty list per output.
    clear_btn.click(lambda: ([], []), inputs=[], outputs=[chatbot, state])

# Run app
interface.launch(share=True)