In [None]:
#I will use NER of spacy to extract the entities from the text
#I will use the spacy library to extract the entities from the text
import spacy
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

In [None]:
def extract_text_from_pdf(file_path):
    # Set the tesseract executable path
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update this path as per your installation
    images = convert_from_path(file_path, dpi=300)
    text = ''
    for i, image in enumerate(images):
        text += pytesseract.image_to_string(image)
    return text    

file_path = r'C:\Users\sakpa\Projects\Upgrad\Gen AI\QTS\Data\unsettled_trade_email_final.pdf'
email_content = extract_text_from_pdf(file_path)

In [None]:
# Perform NER using spaCy
nlp = spacy.load("en_core_web_sm")
doc = nlp(email_content)
entities = [(ent.text, ent.label_) for ent in doc.ents]
print('==================================',email_content,'==========================================\n')
# Print the extracted entities
for entity in entities:
    print(f"Entity: {entity[0]}, Label: {entity[1]}")


From: John Doe, Settlements Team, Bank 1
To: Operations Team, Bank 2

CC: Trade Support Team, Bank 1

Dear Bank 2 Ops Team,

We would like to bring to your attention that the following trade remains unsettled as of today. Kindly

review the details and provide an update on the settlement status at the earliest.

Trade Details:

[See Screenshot Below]

Issue: The trade was expected to settle on [Settlement Date], but it has not been processed
successfully. We request you to investigate the reason for the delay and advise on the next steps

required to complete the settlement.

Requested Action:
1. Please confirm the status of the trade.
2. Provide any required actions from our end to facilitate the settlement.

3. If there are any discrepancies, kindly highlight them for resolution.

Your prompt attention to this matter would be highly appreciated. Kindly acknowledge receipt of this

email and provide an update by [Response Deadline].

Best regards,

John Doe

Settlements Team

Bank 1


In [None]:
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer


In [None]:
# Initialize Presidio Analyzer for sensitive data detection
analyzer = AnalyzerEngine()

In [None]:
# Call analyzer to get results
results = analyzer.analyze(text=email_content,
                           language='en')
for result in results:
    print(f"Type: {result.entity_type}, Text: {email_content[result.start:result.end]}, Score: {result.score}")

Type: EMAIL_ADDRESS, Text: johndoe@bank1.com, Score: 1.0
Type: PERSON, Text: John Doe, Score: 0.85
Type: DATE_TIME, Text: today, Score: 0.85
Type: PERSON, Text: John Doe

, Score: 0.85
Type: LOCATION, Text: Zurich, Score: 0.85
Type: LOCATION, Text: Switzerland, Score: 0.85
Type: DATE_TIME, Text: 2025-03-18, Score: 0.85
Type: DATE_TIME, Text: 2025-03-19, Score: 0.85
Type: NRP, Text: GB0987654321, Score: 0.85
Type: DATE_TIME, Text: 2025-03-20 750 FR5678901234, Score: 0.85
Type: DATE_TIME, Text: 2025-03-21, Score: 0.85
Type: DATE_TIME, Text: 2025-03-22, Score: 0.85
Type: PHONE_NUMBER, Text: +41 22 123 4567, Score: 0.75
Type: URL, Text: bank1.com, Score: 0.5
Type: IN_PAN, Text: resolution, Score: 0.4
Type: PHONE_NUMBER, Text: 023089313956.10, Score: 0.4
Type: PHONE_NUMBER, Text: 023010874934, Score: 0.4
Type: PHONE_NUMBER, Text: 023081243784, Score: 0.4
Type: PHONE_NUMBER, Text: 023091308470, Score: 0.4
Type: PHONE_NUMBER, Text: 023083536513, Score: 0.4
Type: US_DRIVER_LICENSE, Text: L3, S

In [None]:

# Initialize Presidio Analyzer
analyzer = AnalyzerEngine()

#define the regex pattern for account number
acct_regex = r"(?:\b(?:Account|Ac|Act|transfer)\b[\s:,-]*)?(?:0230[.-]?)?\d{5}[.-]?[A-Za-z\d]{1,2}"

# Define updated regex pattern for Account Number
account_number_pattern = Pattern(name="account_number_pattern",
                                 regex= acct_regex,
                                 score=0.95)

# Create a Pattern Recognizer
account_recognizer = PatternRecognizer(supported_entity="ACCOUNT_NUMBER", patterns=[account_number_pattern])

# Add the custom recognizer to the analyzer
analyzer.registry.add_recognizer(account_recognizer)

# Sample Texts for Testing
texts = [
    "Please check Account 023012345-AZ for the transfer.",
    "My Ac. 023056789.B1 needs approval.",
    "Act 56789-XY is pending verification.",
    "Transfer 023078543-2X was successful.",
    "The account number 987654-AB is incorrect.",  # Should NOT match (missing 5 digits)
]

# Run the analyzer on each text
for text in texts:
    results = analyzer.analyze(text=text, entities=["ACCOUNT_NUMBER"], language="en")
    print(f"\nText: {text}")
    for result in results:
        print(f"Type: {result.entity_type}, Text: {text[result.start:result.end]}, Score: {result.score}")



Text: Please check Account 023012345-AZ for the transfer.
Type: ACCOUNT_NUMBER, Text: Account 023012345-AZ, Score: 0.95

Text: My Ac. 023056789.B1 needs approval.
Type: ACCOUNT_NUMBER, Text: 023056789.B1, Score: 0.95

Text: Act 56789-XY is pending verification.
Type: ACCOUNT_NUMBER, Text: Act 56789-XY, Score: 0.95

Text: Transfer 023078543-2X was successful.
Type: ACCOUNT_NUMBER, Text: Transfer 023078543-2X, Score: 0.95

Text: The account number 987654-AB is incorrect.
Type: ACCOUNT_NUMBER, Text: 987654, Score: 0.95


In [None]:
# Detect CID using Presidio
results = analyzer.analyze(text=email_content, entities=["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "ACCOUNT_NUMBER"], language="en")

# Extract detected entities
cid_data = []
for result in results:
    entity_text = email_content[result.start:result.end]
    cid_data.append(f"{result.entity_type}: {entity_text}")

print("Presidio detected:", cid_data)

Presidio detected: ['EMAIL_ADDRESS: johndoe@bank1.com', 'ACCOUNT_NUMBER: 1234567', 'ACCOUNT_NUMBER: 02308931395', 'ACCOUNT_NUMBER: 0987654', 'ACCOUNT_NUMBER: 02301087493', 'ACCOUNT_NUMBER: 5678901', 'ACCOUNT_NUMBER: 02308124378', 'ACCOUNT_NUMBER: 4321098', 'ACCOUNT_NUMBER: 02309130847', 'ACCOUNT_NUMBER: 6789012', 'ACCOUNT_NUMBER: 02308353651', 'PERSON: John Doe', 'PERSON: John Doe\n\n', 'PHONE_NUMBER: +41 22 123 4567', 'PHONE_NUMBER: 023089313956.10', 'PHONE_NUMBER: 023010874934', 'PHONE_NUMBER: 023081243784', 'PHONE_NUMBER: 023091308470', 'PHONE_NUMBER: 023083536513']


In [None]:
email_content

'Subject: Urgent: Trade Unsettlement Notification\n\nFrom: John Doe, Settlements Team, Bank 1\nTo: Operations Team, Bank 2\n\nCC: Trade Support Team, Bank 1\n\nDear Bank 2 Ops Team,\n\nWe would like to bring to your attention that the following trade remains unsettled as of today. Kindly\n\nreview the details and provide an update on the settlement status at the earliest.\n\nTrade Details:\n\n[See Screenshot Below]\n\nIssue: The trade was expected to settle on [Settlement Date], but it has not been processed\nsuccessfully. We request you to investigate the reason for the delay and advise on the next steps\n\nrequired to complete the settlement.\n\nRequested Action:\n1. Please confirm the status of the trade.\n2. Provide any required actions from our end to facilitate the settlement.\n\n3. If there are any discrepancies, kindly highlight them for resolution.\n\nYour prompt attention to this matter would be highly appreciated. Kindly acknowledge receipt of this\n\nemail and provide an up

In [None]:
# Define OpenAI prompt
prompt = f"""
Extract customer-identifying data (CID) from the following text:
"{email_content}"

Detect:
- Names
- Emails
- Phone numbers
- Account numbers
- Addresses

Ensure accuracy and correct any errors. Respond in JSON format.
"""

# Call OpenAI
response = openai.ChatCompletion.create(
    model="gpt-4-turbo",
    messages=[{"role": "user", "content": prompt}]
)

# Get extracted data
cid_openai = response["choices"][0]["message"]["content"]
print("OpenAI detected:", cid_openai)


In [None]:
import json

# Convert OpenAI response to JSON
cid_openai_dict = json.loads(cid_openai)

# Merge with Presidio results
merged_cid = {
    "Presidio": cid_data,
    "OpenAI": cid_openai_dict
}

print("Final CID Extraction:", merged_cid)


In [None]:

# Initialize Presidio Analyzer
analyzer = AnalyzerEngine()

def detect_cid_with_presidio(text):
    """Extracts customer-identifying data using Presidio."""
    
    entities = ["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "ACCOUNT_NUMBER", "ORGANIZATION", "LOCATION", "DATE_TIME"]
    
    results = analyzer.analyze(text=text, entities=entities, language="en")
    
    detected_cid = {}
    for result in results:
        entity_text = text[result.start:result.end]
        detected_cid[result.entity_type] = entity_text
    
    return detected_cid


In [None]:
import json

def detect_cid_with_openai(text):
    """Extracts customer-identifying data using OpenAI GPT."""
    
    prompt = f"""
    Extract customer-identifying data (CID) from the following text:
    "{text}"

    Detect:
    - Names
    - Emails
    - Phone numbers
    - Account numbers
    - Addresses
    - Company names
    - Dates of birth

    Ensure accuracy and correct any errors. Respond in JSON format.
    """

    response = openai.ChatCompletion.create(
        model="gpt-4-turbo",
        messages=[{"role": "user", "content": prompt}]
    )

    return json.loads(response["choices"][0]["message"]["content"])


In [None]:
def merge_cid_results(presidio_cid, openai_cid):
    """Merges CID results from Presidio and OpenAI."""
    
    merged_cid = {**presidio_cid}  # Start with Presidio results

    for key, value in openai_cid.items():
        if key not in merged_cid:
            merged_cid[key] = value  # Add missing values
        elif merged_cid[key] != value:
            merged_cid[key] += f", {value}"  # Combine both
    
    return merged_cid


In [None]:
from presidio_anonymizer import AnonymizerEngine

# Initialize Anonymizer
anonymizer = AnonymizerEngine()

def anonymize_text(text, detected_cid):
    """Anonymizes detected CID in the text."""
    
    anonymized_text = text
    for entity, value in detected_cid.items():
        anonymized_text = anonymized_text.replace(value, f"[{entity}]")
    
    return anonymized_text


In [None]:
def process_email(email_text):
    """Processes an email, extracts CID, and anonymizes it."""

    # Step 1: Detect CID using Presidio
    presidio_cid = detect_cid_with_presidio(email_text)
    print("Presidio CID:", presidio_cid)

    # Step 2: Detect CID using OpenAI
    openai_cid = detect_cid_with_openai(email_text)
    print("OpenAI CID:", openai_cid)

    # Step 3: Merge results
    merged_cid = merge_cid_results(presidio_cid, openai_cid)
    print("Merged CID:", merged_cid)

    # Step 4: Anonymize text
    anonymized_text = anonymize_text(email_text, merged_cid)
    
    return anonymized_text


In [None]:
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, LemmaContextAwareEnhancer
from presidio_analyzer.nlp_engine import SpacyNlpEngine

# create a function that return an account patternrecognizer
def create_account_pattern_recognizer():
    """Creates a custom Account Number recognizer."""
    
    # Define regex pattern for account number 
    # may start with 0230, followed by 5 digits and an optional suffix of 1 or 2 alphanumeric characters
    # The pattern allows for optional separators (., -) between the digits and the suffix
    # and between the prefix and the digits.

    # Example: 023012345-AZ, 023056789.B1, 56789-XY, 023078543-2X
    # Updated regex pattern to match the new format

    acct_regex = r'\b(?:0230[.-]?)?\d{5}[.-]?[A-Z0-9]{1,2}\b'
    
    # Define updated regex pattern for Account Number
    account_number_pattern = Pattern(name="account_number_pattern",
                                     regex=acct_regex,
                                     score=0.95
                                     )
    
    # Create a Pattern Recognizer
    account_recognizer = PatternRecognizer(supported_entity="ACCOUNT_NUMBER", 
                                           patterns=[account_number_pattern]
                                           # context_enhancers = [LemmaContextAwareEnhancer()]
                                           )
    
    
    return account_recognizer

In [None]:
# Example usage
print(email_content)

In [None]:
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, LemmaContextAwareEnhancer
from presidio_analyzer.nlp_engine import SpacyNlpEngine
import spacy

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading spaCy model...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Initialize SpacyNlpEngine
spacy_nlp_engine = SpacyNlpEngine(nlp, "en")

def analyze_account_number(text):
    # Create PatternRecognizer for account numbers
   
    account_number_recognizer = create_account_pattern_recognizer()

    # Initialize AnalyzerEngine with SpacyNlpEngine
    analyzer = AnalyzerEngine(nlp_engine=spacy_nlp_engine)
    analyzer.registry.add_recognizer(account_number_recognizer)

    # Analyze the text
    results = analyzer.analyze(text=text, language="en")
    return results

# Example usage
text1 = "My account number is 1234567890. Please process payment."
text2 = "Acct number: 98765432. Bank account details are important."
text3 = "Random numbers like 12345678 are not account numbers."

results1 = analyze_account_number(email_content)


print("Results 1:", results1)


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r'C:\Users\sakpa\Projects\Upgrad\Gen AI\QTS\Data\emails.csv\emails.csv')
df.head()


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [10]:
print(df.iloc[88000].message)

Message-ID: <22104263.1075848301039.JavaMail.evans@thyme>
Date: Thu, 19 Apr 2001 04:51:00 -0700 (PDT)
From: dgagliardi@reliant.com
To: chay@reliant.com, clint.dean@enron.com
Subject: Fw: True Orange E-Mail/Fax #48
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: dgagliardi@reliant.com
X-To: chay@reliant.com, clint.dean@enron.com
X-cc: 
X-bcc: 
X-Folder: \Clint_Dean_Nov2001\Notes Folders\Info
X-Origin: DEAN-C
X-FileName: cdean.nsf

----- Forwarded by David M Gagliardi/TTG/HouInd on 04/19/01 11:50 AM -----

"Michael
Gagliardi"           To:     <dgagliardi@reliantenergy.com>, 
david_ricks@gsdm.com,
<mikegag@msn.        tony.a.rogers@fritolay.com
com>                 cc:
Subject:     Fw: True Orange E-Mail/Fax #48
04/19/01
09:12 AM








----- Original Message -----
From: TruOrange@aol.com
Sent: Wednesday, April 18, 2001 9:39 PM
To: TruOrange@aol.com; hramsey@intellimark-it.com
Subject: True Orange E-Mail/Fax #48

True Orange Fax/E-Mai

In [11]:
short_df = df.iloc[500:800]
short_df.head()

Unnamed: 0,file,message
500,allen-p/_sent_mail/553.,Message-ID: <15982773.1075855728341.JavaMail.e...
501,allen-p/_sent_mail/554.,Message-ID: <19237776.1075855728362.JavaMail.e...
502,allen-p/_sent_mail/555.,Message-ID: <32350375.1075855728384.JavaMail.e...
503,allen-p/_sent_mail/556.,Message-ID: <30301161.1075855728406.JavaMail.e...
504,allen-p/_sent_mail/557.,Message-ID: <16004214.1075855728430.JavaMail.e...


In [12]:
short_df.shape

(300, 2)