# Financial Entity Recognition Model Development

This notebook focuses on developing a Named Entity Recognition (NER) model specifically for financial entities in both English and Swahili text. The model will be used to extract key financial information from user queries in the PesaGuru chatbot system.

## Key Entities to Recognize:
- MONEY_AMOUNT (e.g., "KES 5000", "5k", "1M")
- TIME_PERIOD (e.g., "2 years", "6 months")
- FINANCIAL_INSTRUMENT (e.g., "stocks", "bonds", "T-bills")
- INSTITUTION (e.g., "KCB", "Equity Bank")
- CURRENCY (e.g., "KES", "USD")
- PERCENTAGE (e.g., "5%", "10.5%")

In [None]:
# Import required libraries
import spacy
import pandas as pd
import numpy as np
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding
from pathlib import Path
import json
import random

In [None]:
# Read the annotated English financial corpus (UTF-8 encoded JSON)
financial_data = json.loads(
    Path('../data/external/financial_corpus.json').read_text(encoding='utf-8')
)

# Read the annotated Swahili corpus, used for multilingual support
swahili_data = json.loads(
    Path('../data/external/swahili_corpus.json').read_text(encoding='utf-8')
)

In [None]:
def prepare_training_data(data):
    """Turn annotated records into ``(text, {'entities': ...})`` training pairs.

    Args:
        data: Iterable of mappings, each providing a 'text' entry and an
            'entities' entry.

    Returns:
        A list with one ``(text, {'entities': entities})`` tuple per record,
        in the same order as *data*. The entities object is passed through
        as-is (not copied).
    """
    return [
        (record['text'], {'entities': record['entities']})
        for record in data
    ]

# Convert each corpus into (text, annotations) training pairs
english_training_data = prepare_training_data(financial_data)
swahili_training_data = prepare_training_data(swahili_data)

# Merge both languages into a single list, then randomise its order in place
all_training_data = [*english_training_data, *swahili_training_data]
random.shuffle(all_training_data)

## 2. Model Configuration and Training

In [None]:
def create_spacy_model():
    """Create a blank spaCy pipeline with an NER component ready for training.

    The NER component is registered with every entity label that occurs in
    the module-level ``all_training_data`` list.

    Returns:
        The spaCy ``Language`` object containing a single 'ner' component.
    """
    nlp = spacy.blank('en')  # we'll handle multilingual text in preprocessing

    # In spaCy v3, add_pipe() takes the factory name and returns the component
    # instance that is actually part of the pipeline. Labels must be added to
    # that instance; a component built separately via create_pipe() is never
    # attached to the pipeline, so labels added to it would be lost.
    # A blank pipeline has no components, so no "already present" check is needed.
    ner = nlp.add_pipe('ner', last=True)

    # Register every label seen in the annotations (entity = start, end, label)
    for _, annotations in all_training_data:
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    return nlp

In [None]:
def train_model(nlp, training_data, n_iter=100):
    """Train the NER component of *nlp* on annotated examples (spaCy v3 API).

    Args:
        nlp: spaCy ``Language`` object that contains an 'ner' component.
        training_data: Sequence of ``(text, {'entities': [...]})`` pairs, where
            each entity is ``(start_char, end_char, label)``. The sequence is
            not modified.
        n_iter: Number of passes over the training data.

    Returns:
        The same ``nlp`` object, updated in place by training.
    """
    # Local import keeps this cell self-contained; spaCy is already a
    # dependency of this notebook.
    from spacy.training import Example

    # spaCy v3's nlp.update() consumes Example objects rather than separate
    # text/annotation sequences. Building them once also gives us a private
    # list to shuffle, so the caller's data keeps its order.
    examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in training_data
    ]

    # Get names of other pipes to disable during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    with nlp.select_pipes(disable=other_pipes):  # only train NER
        # initialize() replaces the deprecated begin_training(); passing the
        # examples lets spaCy set up the model from real data. With no
        # examples we fall back to spaCy's default initialization.
        optimizer = nlp.initialize((lambda: examples) if examples else None)

        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}

            # Batch the examples and iterate over them; the batch size grows
            # from 4 towards 32 within each pass
            batches = minibatch(examples, size=compounding(4., 32., 1.001))
            for batch in batches:
                nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)

            print(f'Iteration {itn+1}, Losses:', losses)

    return nlp

In [None]:
# Create the model and train it on the training portion only.
# The 80% cut-off mirrors the train/test split used in the evaluation section,
# so the last 20% of all_training_data stays unseen during training and the
# reported metrics are measured on genuinely held-out examples.
nlp = create_spacy_model()
train_cutoff = int(len(all_training_data) * 0.8)
# Slicing passes a copy, so any reordering done during training cannot change
# which items fall into the evaluation split.
trained_model = train_model(nlp, all_training_data[:train_cutoff])

## 3. Model Evaluation

In [None]:
def evaluate_model(model, test_data):
    """Score entity predictions against gold annotations using exact matching.

    An entity counts as correct only when its start offset, end offset and
    label all match a gold entity of the same text.

    Args:
        model: Callable that takes a text and returns an object whose ``ents``
            items expose ``start_char``, ``end_char`` and ``label_``.
        test_data: Iterable of ``(text, {'entities': [(start, end, label), ...]})``.

    Returns:
        Dict with 'precision', 'recall' and 'f1'. A metric is 0 when its
        denominator is zero.
    """
    hits = 0      # predicted and present in gold
    spurious = 0  # predicted but absent from gold
    missed = 0    # in gold but not predicted

    for text, annot in test_data:
        predicted = {
            (ent.start_char, ent.end_char, ent.label_)
            for ent in model(text).ents
        }
        expected = {(start, end, label) for start, end, label in annot['entities']}

        hits += len(expected & predicted)
        spurious += len(predicted - expected)
        missed += len(expected - predicted)

    predicted_total = hits + spurious
    expected_total = hits + missed

    precision = hits / predicted_total if predicted_total > 0 else 0
    recall = hits / expected_total if expected_total > 0 else 0

    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
# 80/20 split: the first 80% is the training portion, the remainder is the test set
split = int(len(all_training_data) * 0.8)
train_data, test_data = all_training_data[:split], all_training_data[split:]

# Score the trained model on the test portion.
# NOTE: these numbers are only meaningful if the model was not trained on test_data.
metrics = evaluate_model(trained_model, test_data)
print(
    "Model Performance Metrics:",
    f"Precision: {metrics['precision']:.3f}",
    f"Recall: {metrics['recall']:.3f}",
    f"F1 Score: {metrics['f1']:.3f}",
    sep="\n",
)

## 4. Model Testing and Examples

In [None]:
def test_entity_recognition(model, text):
    """Run *model* on *text* and return the entities it recognises.

    Args:
        model: Callable that takes a text and returns an object whose ``ents``
            items expose ``text`` and ``label_``.
        text: The input string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in the order the
        model reports them.
    """
    recognised = []
    for ent in model(text).ents:
        recognised.append((ent.text, ent.label_))
    return recognised

# Sample queries covering money amounts, institutions, percentages and Swahili input
test_cases = [
    "I want to invest KES 50000 in government bonds for 2 years",
    "What is the current interest rate at Equity Bank?",
    # Swahili: "I want to invest 10000 shillings in Safaricom shares"
    "Nataka kuwekeza shilingi 10000 kwa hisa za Safaricom",
    "The stock gained 5.5% in value today",
]

# Print each query followed by the (text, label) pairs the model extracted from it
for text in test_cases:
    print(f"\nText: {text}")
    entities = test_entity_recognition(trained_model, text)
    print("Recognized entities:", entities)

## 5. Save the Model

In [None]:
# Save the trained model
output_dir = Path('../server/ai/models/entity_recognition')
# Create the target directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory is already there and avoids the race between
# a separate exists() check and mkdir().
output_dir.mkdir(parents=True, exist_ok=True)

trained_model.to_disk(output_dir)
print(f"Model saved to {output_dir}")

## 6. Usage Instructions

To use the trained model in the PesaGuru application:

1. Load the model:
```python
import spacy
nlp = spacy.load('../server/ai/models/entity_recognition')
```

2. Process text:
```python
text = "I want to invest KES 50000 in government bonds"
doc = nlp(text)
entities = [(ent.text, ent.label_) for ent in doc.ents]
```

The model will identify and classify financial entities in both English and Swahili text, making it suitable for the multilingual requirements of PesaGuru.