In [None]:
# Import libraries
# We need basics like os, sys, time, datetime, logging, and warnings
# Data handling libraries like pandas, numpy, and scipy
# Data visualization libraries like matplotlib and seaborn
# Presidio libraries for data anonymization and PII detection
# and the custom Presidio configuration file
# Import Open AI libraries for LLMs and OpenAI API
import os
import sys
import time
import datetime
import logging
import warnings
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
import random
import string
import requests
import glob
import asyncio,aiofiles
import io
import multiprocessing



In [33]:
# Now import the Presidio libraries for PII detection and anonymization
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer import PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer import Pattern
from presidio_analyzer.nlp_engine import TransformersNlpEngine
from presidio_analyzer.predefined_recognizers import SpacyRecognizer

# Only let ERROR-and-above records through for the "presidio-analyzer" logger
logging.getLogger("presidio-analyzer").setLevel(logging.ERROR)
# Configure root logging: INFO level, each record formatted as "<timestamp> - <level> - <message>"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')



In [None]:
# Import pytesseract for OCR (requires Tesseract to be installed) and PIL for image processing
# Import pdfplumber and pdf2image for reading PDFs
import pytesseract
from PIL import Image
import pdfplumber
from pdf2image import convert_from_path

In [None]:
# Import NLP package spaCy for text processing and NER
import spacy
from spacy import displacy
# Import OpenAI libraries for LLMs and OpenAI API
import openai

In [None]:
# Load spaCy's small English pipeline for NER.
# NOTE(review): `nlp` is not referenced by the other cells visible here; the Presidio
# analyzer in analyze_pdf_files is configured with en_core_web_trf instead - confirm
# this model is still needed.
nlp = spacy.load("en_core_web_sm")


In [None]:
def extract_text_from_pdf(file_path, dpi=300, tesseract_cmd=None):
    """OCR every page of a PDF and return the concatenated text.

    The PDF is rasterised with ``convert_from_path`` and each page image is
    passed to ``pytesseract.image_to_string``.

    Args:
        file_path: Path of the PDF to read.
        dpi: Resolution used when rasterising the pages (default 300).
        tesseract_cmd: Optional path to the Tesseract executable. When omitted,
            the conventional Windows install location is used if it exists on
            this machine; otherwise pytesseract's own setting is left untouched.

    Returns:
        The OCR text of all pages joined in page order ('' for a PDF with no pages).
    """
    # Previously this Windows path was forced on every call, which broke OCR on
    # any machine where Tesseract lives elsewhere. Only apply it when it is real.
    default_windows_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    if tesseract_cmd is None and os.path.isfile(default_windows_cmd):
        tesseract_cmd = default_windows_cmd
    if tesseract_cmd is not None:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    images = convert_from_path(file_path, dpi=dpi)
    return ''.join(pytesseract.image_to_string(image) for image in images)


In [None]:
def read_pdf_files(folder_path):
    """OCR every ``*.pdf`` directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive), e.g. '../data/fake_email_data'.

    Returns:
        A list of ``{'file_name': <base name>, 'text': <OCR text>}`` dicts, one per
        PDF, ordered by path so repeated runs produce the same row order.
    """
    # glob.glob yields files in arbitrary, filesystem-dependent order; sort for reproducibility.
    pdf_files = sorted(glob.glob(os.path.join(folder_path, "*.pdf")))
    return [
        {
            'file_name': os.path.basename(pdf_file),
            'text': extract_text_from_pdf(pdf_file),
        }
        for pdf_file in pdf_files
    ]
    

In [None]:
# Folder holding the sample email PDFs (relative to the notebook's working directory)
path = '../data/fake_email_data'
# OCR every PDF in the folder once; analyze_pdf_files reuses this list when its
# length matches the number of PDFs in the folder
email_files_txt = read_pdf_files(path)

In [28]:
# Custom presidio analyzer for Account Number detection
from presidio_analyzer import Pattern




def custom_account_number_recognizer():
    """Build the Presidio pattern recognizer that reports ``ACCOUNT_NUMBER`` entities.

    Returns:
        A ``PatternRecognizer`` carrying a single pattern named
        ``account_number_pattern`` with score 0.85.
    """
    # The expression is assembled from adjacent raw-string fragments; the
    # concatenated value is exactly the single-literal regex used before.
    acct_regex = (
        r'(?:\b(?:Account|Ac|Act|transfer)\b[\s:,-]*)?'  # optional keyword plus separators
        r'(?:0230[.-]?)?'                                # optional "0230" prefix
        r'\d{5}[.-]?'                                    # five digits, optional "." or "-"
        r'(?=[A-Za-z\d]{2,3}\b)'                         # suffix must end on a word boundary
        r'(?=(?:[^0-9]*[0-9])?[^0-9]*$)'                 # at most one more digit before `$`
        r'[A-Za-z\d]{2,3}'                               # 2-3 character alphanumeric suffix
    )
    # NOTE(review): where `$` matches depends on the regex flags the recognizer
    # applies when compiling the pattern - confirm against the Presidio version in use.

    pattern = Pattern(name="account_number_pattern", regex=acct_regex, score=0.85)
    return PatternRecognizer(supported_entity="ACCOUNT_NUMBER", patterns=[pattern])



In [29]:
def custom_isin_recognizer():
    """Build the Presidio pattern recognizer that reports ``ISIN`` entities.

    The pattern accepts the same seven layouts as before (plain and hyphenated
    variants), but the word boundaries now apply to every alternative. Previously
    the leading ``\\b`` bound only to the first alternative, so the others could
    match in the middle of a longer alphanumeric token.

    The character classes are lowercase only.
    NOTE(review): the uppercase ISINs in the notebook's saved output imply the
    recognizer matches case-insensitively - confirm against the Presidio version in use.

    Returns:
        A ``PatternRecognizer`` carrying a single pattern named ``isin_pattern``
        with score 0.90.
    """
    isin_alternatives = (
        r"[a-z]{2}\d{10}",
        r"[a-z]{3}[0-9]{1}[0-9a-z]{9}",
        r"[a-z]{2}[0-9]{1}[0-9a-z]{9}",
        r"[a-z]{3}[0-9]{1}[0-9a-z]{8}",
        r"[a-z]{2}\-\d{9}\-\d",
        r"[a-z]{2}\-[0-9]{1}[0-9a-z]{8}\-\d",
        r"[a-z]{3}\-[0-9]{1}[0-9a-z]{7}\-\d",
    )
    # Group the alternatives so both boundaries anchor the whole match,
    # whichever alternative is taken.
    isin_regex = r"\b(?:" + "|".join(isin_alternatives) + r")\b"

    isin_pattern = Pattern(name="isin_pattern", regex=isin_regex, score=0.90)
    return PatternRecognizer(supported_entity="ISIN", patterns=[isin_pattern])

In [35]:
def custom_address_recognizer():
    """Return a loaded ``SpacyRecognizer`` restricted to location-type entities.

    After ``load()`` is called, the recognizer's ``supported_entities`` is
    overwritten with LOCATION, GPE and LOC.
    """
    location_labels = ["LOCATION", "GPE", "LOC"]

    recognizer = SpacyRecognizer()
    recognizer.load()
    recognizer.supported_entities = location_labels
    return recognizer


In [37]:
# Ensure the custom recognizer is registered before running the analysis
def _build_pii_analyzer():
    """Create the Presidio AnalyzerEngine (spaCy en_core_web_trf) with the custom recognizers registered."""
    # A Hugging Face transformers engine (dslim/bert-base-NER) was tried earlier and
    # dropped because it threw a compilation error; the spaCy transformer model is used instead.
    spacy_config = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_trf"}],
    }
    nlp_engine = NlpEngineProvider(nlp_configuration=spacy_config).create_engine()
    analyzer = AnalyzerEngine(nlp_engine=nlp_engine)

    # Account number, ISIN and address recognizers are added on top of the defaults.
    for recognizer in (
        custom_account_number_recognizer(),
        custom_isin_recognizer(),
        custom_address_recognizer(),
    ):
        analyzer.registry.add_recognizer(recognizer)
    return analyzer


def analyze_pdf_files(path, email_files_txt):
    """Run Presidio PII detection over the OCR text of every email PDF.

    Args:
        path: Folder containing the ``*.pdf`` files.
        email_files_txt: Previously extracted documents, as a list of
            ``{'file_name', 'text'}`` dicts, or ``None``. The folder is re-read
            with ``read_pdf_files`` when this is ``None`` or when its length
            differs from the number of PDFs currently in ``path``.

    Returns:
        A DataFrame with one row per detected entity and the columns
        ``file_name, entity, start, end, score, text, CID``, where ``text`` is
        the full document text and ``CID`` is the matched substring. The frame
        is empty (same columns) when nothing was analyzed or detected.

    Files whose analysis raises are reported on stdout and skipped.
    """
    result_columns = ['file_name', 'entity', 'start', 'end', 'score', 'text', 'CID']

    print("Starting PDF analysis...")

    # Re-read when no cached text was supplied or the cache is out of step with the folder.
    if email_files_txt is None or len(email_files_txt) != len(glob.glob(os.path.join(path, "*.pdf"))):
        print(f"Reading PDF files from {path}, start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
        email_files_txt = read_pdf_files(path)
        print(f"Finished reading PDF files, end time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    else:
        print(f"PDF files already read, start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    analyzer = _build_pii_analyzer()

    # Entities to request. "ORGANIZATION" was previously misspelled "ORGANIATION"
    # (so it was never requested) and "LOCATION" was listed twice.
    # NOTE(review): Presidio's built-in names appear to be IBAN_CODE and US_BANK_NUMBER;
    # "IBAN" and "BANK_ACCOUNT" have no recognizer registered here - confirm intent.
    entities = [
        "ACCOUNT_NUMBER", "ISIN", "EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD",
        "BANK_ACCOUNT", "IBAN", "PERSON", "LOCATION", "ORGANIZATION", "GPE", "LOC",
    ]

    rows = []
    for email_file in email_files_txt:
        text = email_file.get('text')
        file_name = email_file.get('file_name')
        try:
            print(f"Analyzing file: {file_name}, time started: {time.strftime('%Y-%m-%d %H:%M:%S')}")
            findings = analyzer.analyze(text=text, entities=entities, language="en")
            file_rows = [
                {
                    'file_name': file_name,
                    'entity': finding.entity_type,
                    'start': finding.start,
                    'end': finding.end,
                    'score': finding.score,
                    'text': text,
                    'CID': text[finding.start:finding.end],
                }
                for finding in findings
            ]
        except Exception as e:
            # Best effort: one unreadable document must not abort the whole batch.
            print(f"Error analyzing file {file_name}: {e}")
            continue
        rows.extend(file_rows)

    # Build the frame once. The earlier version concatenated the cumulative list
    # after every file, so rows from earlier files were duplicated in the result.
    return pd.DataFrame(rows, columns=result_columns)



In [38]:

# Run the Presidio analysis over the email PDFs (the custom recognizers are
# registered inside analyze_pdf_files)
path = '../data/fake_email_data'
# Suppress UserWarning messages raised from presidio_analyzer modules
warnings.filterwarnings("ignore", category=UserWarning, module="presidio_analyzer")
# Analyze the PDF files, reusing the text already extracted into email_files_txt
presidio_results_df = analyze_pdf_files(path,email_files_txt)


Starting PDF analysis...
PDF files already read, start time: 2025-04-08 01:23:26
Analyzing file: unsettled_trade_email_0.pdf, time started: 2025-04-08 01:23:30


  with torch.cuda.amp.autocast(self._mixed_precision):


Analyzing file: unsettled_trade_email_1.pdf, time started: 2025-04-08 01:23:31


  with torch.cuda.amp.autocast(self._mixed_precision):


Analyzing file: unsettled_trade_email_2.pdf, time started: 2025-04-08 01:23:32


  with torch.cuda.amp.autocast(self._mixed_precision):


Analyzing file: unsettled_trade_email_3.pdf, time started: 2025-04-08 01:23:33


  with torch.cuda.amp.autocast(self._mixed_precision):


Analyzing file: unsettled_trade_email_4.pdf, time started: 2025-04-08 01:23:36


  with torch.cuda.amp.autocast(self._mixed_precision):


Analyzing file: unsettled_trade_email_5.pdf, time started: 2025-04-08 01:23:40


  with torch.cuda.amp.autocast(self._mixed_precision):


Analyzing file: unsettled_trade_email_6.pdf, time started: 2025-04-08 01:23:42


  with torch.cuda.amp.autocast(self._mixed_precision):


Analyzing file: unsettled_trade_email_7.pdf, time started: 2025-04-08 01:23:44


  with torch.cuda.amp.autocast(self._mixed_precision):


Analyzing file: unsettled_trade_email_8.pdf, time started: 2025-04-08 01:23:46


  with torch.cuda.amp.autocast(self._mixed_precision):


Analyzing file: unsettled_trade_email_9.pdf, time started: 2025-04-08 01:23:48


  with torch.cuda.amp.autocast(self._mixed_precision):


Analyzing file: unsettled_trade_email_with_table.pdf, time started: 2025-04-08 01:23:50


  with torch.cuda.amp.autocast(self._mixed_precision):


In [39]:
# Preview the first detections (one row per detected entity)
presidio_results_df.head()

Unnamed: 0,file_name,entity,start,end,score,text,CID
0,unsettled_trade_email_0.pdf,EMAIL_ADDRESS,976,994,1.0,Unsettled Trade Notification - Guy\n\nCesar Pi...,paul94@example.net
1,unsettled_trade_email_0.pdf,ISIN,1087,1099,0.9,Unsettled Trade Notification - Guy\n\nCesar Pi...,US0563377191
2,unsettled_trade_email_0.pdf,ISIN,1128,1140,0.9,Unsettled Trade Notification - Guy\n\nCesar Pi...,GB9310278618
3,unsettled_trade_email_0.pdf,ISIN,1170,1182,0.9,Unsettled Trade Notification - Guy\n\nCesar Pi...,FR6165831749
4,unsettled_trade_email_0.pdf,ISIN,1211,1223,0.9,Unsettled Trade Notification - Guy\n\nCesar Pi...,DE1756193227
