In [1]:
# Import libraries
# We need basics like os, sys, time, and datetime,logging, warnings
# Data handling libraries like pandas, numpy, and scipy
# Data visualization libraries like matplotlib and seaborn
# Presidio libraries for data anonymization and PII detection
# and the custom Presidio configuration file
# Import Open AI libraries for LLMs and OpenAI API
import os
import sys
import time
import datetime
import logging
import warnings


In [2]:

import pandas as pd
import numpy as np

In [3]:

import json
import re
import random
import string
import requests
import glob
import asyncio,aiofiles
import io
import multiprocessing


In [4]:
# Now import the Presidio libraries for PII detection and anonymization
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer import PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer import Pattern
from presidio_analyzer.predefined_recognizers import SpacyRecognizer

# Raise the "presidio-analyzer" logger threshold to ERROR so only its errors are emitted.
logging.getLogger("presidio-analyzer").setLevel(logging.ERROR)
# Configure root logging: INFO level, "timestamp - level - message" line format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')





In [5]:
# Install tesseract for OCR and pytesseract for image processing
# Import pdfPlumber, pdf2Image

import pytesseract
from PIL import Image
import pdfplumber
from pdf2image import convert_from_path

In [6]:
# Import NLP package spaCy for text processing and NER
import spacy
from spacy import displacy
# Import OpenAI libraries for LLMs and OpenAI API
import openai

In [7]:
# Ensure the spaCy models (en_core_web_sm and en_core_web_trf) are installed
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf

# Load the small English spaCy pipeline (en_core_web_sm) for NER.
# NOTE(review): en_core_web_trf is downloaded above but not loaded here — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 5.6 MB/s eta 0:00:03
     ------------- -------------------------- 4.5/12.8 MB 12.2 MB/s eta 0:00:01
     ---------------------------------- ---- 11.3/12.8 MB 19.6 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 18.7 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')




Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.8/457.4 MB 4.8 MB/s eta 0:01:35
     ---------------------------------------- 3.4/457.4 MB 9.6 MB/s eta 0:00:48
      -------------------------------------- 9.4/457.4 MB 17.3 MB/s eta 0:00:26
     - ------------------------------------ 16.5/457.4 MB 21.7 MB/s eta 0:00:21
     - ------------------------------------ 23.3/457.4 MB 24.2 MB/s eta 0:00:18
     -- ----------------------------------- 29.9/457.4 MB 25.3 MB/s eta 0:00:17
     --- ---------------------------------- 37.0/457.4 MB 26.4 MB/s eta 0:00:16
     --- ---------------------------------- 43.3/457.4 MB 27.2 MB/s eta 0:00:16
     ---- ------------------------

In [8]:
def extract_text_from_pdf(file_path, dpi=300, tesseract_cmd=None):
    """OCR every page of a PDF and return the concatenated text.

    The PDF is rasterised with ``convert_from_path`` and each page image is
    passed to ``pytesseract.image_to_string``; the page texts are joined in
    page order with no separator.

    Args:
        file_path: Path of the PDF file to read.
        dpi: Rendering resolution handed to ``convert_from_path``.
        tesseract_cmd: Optional explicit path of the Tesseract executable.
            When omitted, the ``TESSERACT_CMD`` environment variable is used
            if set; otherwise the default Windows install location is used
            only if it exists on this machine, and pytesseract's own
            configured command is left untouched when it does not.

    Returns:
        str: Text of all pages; an empty string when the PDF yields no pages.
    """
    # Default install location on Windows; only applied when actually present so
    # that the function also works where Tesseract is found via pytesseract's default.
    default_windows_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    resolved_cmd = tesseract_cmd or os.environ.get('TESSERACT_CMD')
    if not resolved_cmd and os.path.isfile(default_windows_cmd):
        resolved_cmd = default_windows_cmd
    if resolved_cmd:
        pytesseract.pytesseract.tesseract_cmd = resolved_cmd

    images = convert_from_path(file_path, dpi=dpi)
    # join() avoids repeated string re-allocation for many-page documents.
    return ''.join(pytesseract.image_to_string(image) for image in images)


In [9]:
# Create a function named read_pdf_files(), this should loop through ../data/fake_email_data folder and get all the pdf files in the folder using glob
def read_pdf_files(folder_path):
    """Extract the text of every ``*.pdf`` file directly inside a folder.

    Args:
        folder_path: Directory to scan (non-recursive) for PDF files.

    Returns:
        list[dict]: One ``{'file_name': <base name>, 'text': <extracted text>}``
        entry per PDF, ordered by path; empty when the folder has no PDFs.
    """
    # glob.glob returns paths in filesystem order, which varies between
    # machines; sorting keeps positions such as element [0] reproducible.
    pdf_files = sorted(glob.glob(os.path.join(folder_path, "*.pdf")))
    return [
        {
            'file_name': os.path.basename(pdf_file),
            'text': extract_text_from_pdf(pdf_file),
        }
        for pdf_file in pdf_files
    ]
    

In [10]:
# Folder holding the sample email PDFs, relative to the notebook's working directory.
path = '../data/fake_email_data'
# Extract the text of every PDF once; later cells reuse this list.
email_files_txt = read_pdf_files(path)

In [11]:
# Custom presidio analyzer for Account Number detection
from presidio_analyzer import Pattern




def custom_account_number_recognizer():
    """Build the Presidio recognizer used for ``ACCOUNT_NUMBER`` entities.

    The single pattern accepts an optional keyword prefix (Account / Ac / Act /
    transfer) with separators, an optional ``0230`` prefix, five digits, an
    optional ``.``/``-`` separator and a 2-3 character alphanumeric suffix
    that must end on a word boundary.
    NOTE(review): the second lookahead additionally constrains the digits
    that may follow on the rest of the line/text — confirm intended scope.

    Returns:
        PatternRecognizer: recognizer with one pattern scored 0.85.
    """
    # Earlier, simpler variant of the expression, kept for reference:
    # r"(?:\b(?:Account|Ac|Act|transfer)\b[\s:,-]*)?(?:0230[.-]?)?\d{5}[.-]?[A-Za-z\d]{2,3}"
    return PatternRecognizer(
        supported_entity="ACCOUNT_NUMBER",
        patterns=[
            Pattern(
                name="account_number_pattern",
                regex=r'(?:\b(?:Account|Ac|Act|transfer)\b[\s:,-]*)?(?:0230[.-]?)?\d{5}[.-]?(?=[A-Za-z\d]{2,3}\b)(?=(?:[^0-9]*[0-9])?[^0-9]*$)[A-Za-z\d]{2,3}',
                score=0.85,
            )
        ],
    )



In [12]:
def custom_isin_recognizer():
    """Build the Presidio recognizer used for ``ISIN`` entities.

    The pattern is an alternation of seven layouts: plain 11-13 character
    identifiers (2-3 letters followed by digits/alphanumerics) and
    hyphen-separated variants such as ``xx-#########-#``.

    Returns:
        PatternRecognizer: recognizer with one pattern scored 0.90.
    """
    # The alternatives are wrapped in one non-capturing group so the leading
    # word boundary applies to ALL of them. Without the group, `\b` bound only
    # to the first alternative and the others could match inside longer tokens.
    # The character classes are lowercase, so uppercase ISINs rely on
    # case-insensitive matching (the recorded results show uppercase matches).
    isin_regex = (
        r"\b(?:"
        r"([a-z]{2}\d{10})"
        r"|([a-z]{3}[0-9]{1}[0-9a-z]{9})"
        r"|([a-z]{2}[0-9]{1}[0-9a-z]{9})"
        r"|([a-z]{3}[0-9]{1}[0-9a-z]{8})"
        r"|([a-z]{2}\-\d{9}\-\d)"
        r"|([a-z]{2}\-[0-9]{1}[0-9a-z]{8}\-\d)"
        r"|([a-z]{3}\-[0-9]{1}[0-9a-z]{7}\-\d)"
        r")"
    )
    isin_pattern = Pattern(name="isin_pattern",
                           regex=isin_regex,
                           score=0.90
                           )
    # Create a Pattern Recognizer
    isin_recognizer = PatternRecognizer(supported_entity="ISIN",
                                        patterns=[isin_pattern]
                                        )

    return isin_recognizer

In [13]:
def custom_address_recognizer():
    """Create a loaded ``SpacyRecognizer`` limited to location-type entities.

    Returns:
        SpacyRecognizer: instance on which ``load()`` has been called and whose
        ``supported_entities`` is overridden to LOCATION, GPE and LOC.
    """
    location_labels = ["LOCATION", "GPE", "LOC"]
    recognizer = SpacyRecognizer()
    recognizer.load()
    # Overwrite the recognizer's entity list after loading, as the original did.
    recognizer.supported_entities = location_labels
    return recognizer


## Transformer Model Usage 
---

In [14]:
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [15]:
# Checkpoint identifier passed to from_pretrained for both the tokenizer and the model.
model_name = "dslim/bert-base-NER"
# Load the matching tokenizer and token-classification model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# Save the already-loaded model and tokenizer to local folders under ../models
# so later cells can load them from disk.


model.save_pretrained("../models/bert-base-NER")
tokenizer.save_pretrained("../models/bert-base-NER-tokenizer")

('../models/bert-base-NER-tokenizer\\tokenizer_config.json',
 '../models/bert-base-NER-tokenizer\\special_tokens_map.json',
 '../models/bert-base-NER-tokenizer\\vocab.txt',
 '../models/bert-base-NER-tokenizer\\added_tokens.json',
 '../models/bert-base-NER-tokenizer\\tokenizer.json')

In [27]:
# Reload the tokenizer and model from the local copies saved under ../models,
# replacing the instances that were loaded by checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("../models/bert-base-NER-tokenizer")
model = AutoModelForTokenClassification.from_pretrained("../models/bert-base-NER")

In [28]:
# Token-classification pipeline built from the locally loaded model/tokenizer.
# With aggregation_strategy="simple" each result carries an 'entity_group' key,
# which perform_ner relies on.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
# Define a function to perform NER using the transformers pipeline
def perform_ner(text):
    """Run the transformers NER pipeline on ``text`` and expand short labels.

    Args:
        text: String passed unchanged to ``ner_pipeline``.

    Returns:
        The list returned by ``ner_pipeline``, with each entry's
        ``'entity_group'`` rewritten in place: PER -> PERSON,
        ORG -> ORGANIZATION, LOC -> LOCATION. Other labels are left as they are.
    """
    descriptive_labels = {
        'PER': 'PERSON',
        'ORG': 'ORGANIZATION',
        'LOC': 'LOCATION',
    }
    detected = ner_pipeline(text)
    for item in detected:
        short_label = item['entity_group']
        if short_label in descriptive_labels:
            item['entity_group'] = descriptive_labels[short_label]
    return detected

In [29]:
# Smoke-test the transformer NER on the first extracted email and show its first entity.
x = perform_ner(email_files_txt[0]['text'])
x[0]

{'entity_group': 'PERSON',
 'score': 0.98676986,
 'word': 'Guy Cesar Pittman',
 'start': 31,
 'end': 49}

In [20]:
# Ensure the custom recognizer is registered before running the analysis
def analyze_pdf_files(path, email_files_txt):
    """Detect PII entities in the email PDFs with Presidio and the BERT NER pipeline.

    Args:
        path: Folder containing the ``*.pdf`` files.
        email_files_txt: Previously extracted texts as a list of
            ``{'file_name', 'text'}`` dicts, or ``None``. The PDFs are
            (re-)read when this is ``None`` or when its length differs from
            the number of PDFs currently in ``path``.

    Returns:
        pandas.DataFrame | None: One row per detected entity with columns
        ``file_name``, ``entity``, ``start``, ``end``, ``score``, ``text``
        (full document text), ``CID`` (the detected value) and ``source``
        (``'presidio'`` or ``'transformer'``). ``None`` when no file was
        analyzed successfully.
    """
    print("Starting PDF analysis...")

    # Re-extract when there is no cached text or the cache no longer matches
    # the number of PDFs on disk (the glob is skipped when the cache is None).
    if email_files_txt is None or len(email_files_txt) != len(glob.glob(os.path.join(path, "*.pdf"))):
        print(f"Reading PDF files from {path}, start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
        email_files_txt = read_pdf_files(path)
        print(f"Finished reading PDF files, end time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    else:
        print(f"PDF files already read, start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Custom recognizers: account numbers, ISINs and location-type entities.
    account_recognizer = custom_account_number_recognizer()
    isin_recognizer = custom_isin_recognizer()
    address_recognizer = custom_address_recognizer()

    # spaCy-backed NLP engine configuration (engine name and model per language).
    spacy_config = {
        "nlp_engine_name": "spacy",
        "models": [
            {
                "lang_code": "en",
                "model_name": "en_core_web_sm"
            }
        ]
    }
    nlp_engine = NlpEngineProvider(nlp_configuration=spacy_config).create_engine()
    analyzer = AnalyzerEngine(nlp_engine=nlp_engine)

    # Register the custom recognizers before any text is analyzed.
    for recognizer in (account_recognizer, isin_recognizer, address_recognizer):
        analyzer.registry.add_recognizer(recognizer)

    # Entities requested from Presidio. "ORGANIZATION" was previously misspelled
    # ("ORGANIATION") and "LOCATION" was listed twice.
    # NOTE(review): "BANK_ACCOUNT" and "IBAN" may not match Presidio's built-in
    # entity names (e.g. US_BANK_NUMBER / IBAN_CODE) — confirm against the docs.
    entities = ["ACCOUNT_NUMBER", "ISIN", "EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD",
                "BANK_ACCOUNT", "IBAN", "PERSON", "LOCATION", "ORGANIZATION", "GPE", "LOC"]

    results = []
    analyzed_any = False
    for email_file in email_files_txt:
        text = email_file.get('text')
        file_name = email_file.get('file_name')
        try:
            print(f"Analyzing file: {file_name}, time started: {time.strftime('%Y-%m-%d %H:%M:%S')}")

            # Presidio pass (built-in plus custom recognizers).
            for analysis_result in analyzer.analyze(text=text, entities=entities, language="en"):
                results.append({
                    'file_name': file_name,
                    'entity': analysis_result.entity_type,
                    'start': analysis_result.start,
                    'end': analysis_result.end,
                    'score': analysis_result.score,
                    'text': text,
                    'CID': text[analysis_result.start:analysis_result.end],
                    'source': 'presidio'
                })

            # Transformer (BERT) NER pass over the same text.
            for bert_result in perform_ner(text):
                results.append({
                    'file_name': file_name,
                    'entity': bert_result['entity_group'],
                    'start': bert_result['start'],
                    'end': bert_result['end'],
                    'score': bert_result['score'],
                    'text': text,
                    'CID': bert_result['word'],
                    'source': 'transformer'
                })

            analyzed_any = True
        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            print(f"Error analyzing file {file_name}: {e}")
            continue

    if not analyzed_any:
        return None

    # Build the frame once from all rows. The previous per-file concat re-appended
    # the cumulative `results` list on every iteration, duplicating earlier rows.
    return pd.DataFrame(results)



In [21]:

# Run the combined Presidio + transformer analysis over the sample email PDFs.
path = '../data/fake_email_data'
# Suppress UserWarning messages raised from presidio_analyzer modules.
warnings.filterwarnings("ignore", category=UserWarning, module="presidio_analyzer")
# Pass the texts extracted earlier so the PDFs are only re-read if the count changed.
presidio_results_df = analyze_pdf_files(path,email_files_txt)


Starting PDF analysis...
PDF files already read, start time: 2025-04-21 13:46:51
Analyzing file: unsettled_trade_email_0.pdf, time started: 2025-04-21 13:46:52
Analyzing file: unsettled_trade_email_1.pdf, time started: 2025-04-21 13:46:53
Analyzing file: unsettled_trade_email_2.pdf, time started: 2025-04-21 13:46:54
Analyzing file: unsettled_trade_email_3.pdf, time started: 2025-04-21 13:46:54
Analyzing file: unsettled_trade_email_4.pdf, time started: 2025-04-21 13:46:55
Analyzing file: unsettled_trade_email_5.pdf, time started: 2025-04-21 13:46:56
Analyzing file: unsettled_trade_email_6.pdf, time started: 2025-04-21 13:46:56
Analyzing file: unsettled_trade_email_7.pdf, time started: 2025-04-21 13:46:57
Analyzing file: unsettled_trade_email_8.pdf, time started: 2025-04-21 13:46:58
Analyzing file: unsettled_trade_email_9.pdf, time started: 2025-04-21 13:46:58
Analyzing file: unsettled_trade_email_with_table.pdf, time started: 2025-04-21 13:46:59


In [22]:
# Preview the first rows of the combined results table.
presidio_results_df.head()

Unnamed: 0,file_name,entity,start,end,score,text,CID,source
0,unsettled_trade_email_0.pdf,EMAIL_ADDRESS,976,994,1.0,Unsettled Trade Notification - Guy\n\nCesar Pi...,paul94@example.net,presidio
1,unsettled_trade_email_0.pdf,ISIN,1087,1099,0.9,Unsettled Trade Notification - Guy\n\nCesar Pi...,US0563377191,presidio
2,unsettled_trade_email_0.pdf,ISIN,1128,1140,0.9,Unsettled Trade Notification - Guy\n\nCesar Pi...,GB9310278618,presidio
3,unsettled_trade_email_0.pdf,ISIN,1170,1182,0.9,Unsettled Trade Notification - Guy\n\nCesar Pi...,FR6165831749,presidio
4,unsettled_trade_email_0.pdf,ISIN,1211,1223,0.9,Unsettled Trade Notification - Guy\n\nCesar Pi...,DE1756193227,presidio
