In [1]:
#Imports and Models

import re
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")


In [2]:
#Define regex patterns

# Dates (12/05/2021, 2021-05-12,15-Jan-2025 etc.)
# Numeric dates: 12/05/2021, 2021-05-12
numeric_dates = r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b"

# Month-name dates: Dec 1, 2018  or  1 Dec 2018
month_dates = r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s\d{1,2},?\s\d{4}\b" \
                  r"|\b\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*,?\s\d{4}\b"

# Phone numbers (+1-202-555-0173, 9876543210, etc.)
phone_pattern = r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,5}[-.\s]?\d{4}"

# Emails
email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

# Generic IDs (Aadhaar, SSN-like, etc.)
id_pattern = r'\b(?:[A-Z]{2,10}-\d{3,10}|\d{3,10}-[A-Z]{1,10})\b'



#Name pattern
# Regex to catch names after 'Patient:' or 'Name:'
name_pattern = r'(?<=Patient:\s)[A-Z][a-z]+(?:\s[A-Z][a-z]+)?'


# Address pattern: ( india and US friendly)
# - Numbers +street name (e.g., 102 Maple Road)

address_pattern = r'\d{1,5}\s[A-Za-z0-9\s\.]+(?:Street|St|Road|Rd|Avenue|Ave|Lane|Ln|Nagar|Colony|Boulevard|Blvd|Drive|Dr|Court|Ct)?[,\.]?\s?[A-Za-z\s]*[,\.]?\s?(?:[A-Za-z]+)?\s?\d{6}?'


In [3]:
#Regex deidentification function

def regex_deidentify(text):
    text = re.sub(numeric_dates, "[DATE]", text)
    text = re.sub(month_dates, "[DATE]", text, flags=re.IGNORECASE)
    text = re.sub(phone_pattern, "[CONTACT]", text)
    text = re.sub(email_pattern, "[CONTACT]", text)
    text = re.sub(id_pattern, "[ID]", text)
    text = re.sub(name_pattern, "[NAME]", text)
    text = re.sub(address_pattern, "[ADDRESS]", text, flags=re.IGNORECASE)
    return text


In [4]:
def spacy_deidentify(text: str) -> str:
    doc = nlp(text)
    new_text = text
    
    # --- Mask PHI using spaCy ---
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            new_text = new_text.replace(ent.text, "[NAME]")
        elif ent.label_ == "ORG":
            if "hospital" in ent.text.lower() or "clinic" in ent.text.lower() or "insurance" in ent.text.lower():
                new_text = new_text.replace(ent.text, "[ORG]")
        elif ent.label_ in ["GPE", "LOC", "FAC"]:
            if any(keyword in ent.text.lower() for keyword in ["road", "street", "lane", "city", "state"]):
                new_text = new_text.replace(ent.text, "[ADDRESS]")

    # --- Extra manual insurance masking ---
    insurance_orgs = ["Blue Shield", "United Healthcare", "Aetna"]
    for org in insurance_orgs:
        if org in new_text:
            new_text = new_text.replace(org, "[ORG]")

    return new_text



In [5]:
#Combined Pipeline

def deidentify_text(text):
    text = regex_deidentify(text)
    text = spacy_deidentify(text)
    return text


In [6]:
sample_note = """
Patient Name: John Smith
Visited: City Hospital, New York
Date of Admission: 12/05/2021
Phone: +1-202-555-0173
Email: john.smith@example.com
Insurance Provider: Blue Shield
Patient ID: AB12345678
"""

print("Original Note:\n", sample_note)
print("\n--- After Regex ---\n", regex_deidentify(sample_note))
print("\n--- After spaCy ---\n", spacy_deidentify(sample_note))
print("\n--- Final De-identified Note ---\n", deidentify_text(sample_note))


Original Note:
 
Patient Name: John Smith
Visited: City Hospital, New York
Date of Admission: 12/05/2021
Phone: +1-202-555-0173
Email: john.smith@example.com
Insurance Provider: Blue Shield
Patient ID: AB12345678


--- After Regex ---
 
Patient Name: John Smith
Visited: City Hospital, New York
Date of Admission: [DATE]
Phone: [CONTACT]
Email: [CONTACT]
Insurance Provider: Blue Shield
Patient ID: AB[CONTACT]


--- After spaCy ---
 
Patient Name: [NAME]
Visited: [ORG], New York
Date of Admission: 12/05/2021
Phone: +1-202-555-0173
Email: john.smith@example.com
Insurance Provider: [ORG]
Patient ID: AB12345678


--- Final De-identified Note ---
 
Patient Name: [NAME]
Visited: [ORG], New York
Date of Admission: [DATE]
Phone: [CONTACT]
Email: [CONTACT]
Insurance Provider: [ORG]
Patient ID: AB[CONTACT]



In [7]:
# Sample text
sample_text = """
Patient Name: Anil Kumar
Admission Date: 14-Aug-2021
Treated at Fortis Hospital, Bengaluru.
Contact: 080-22446655
License #: DL-29384
"""

# Call your de-identification function
deidentified_text =deidentify_text(sample_text)

# Print the output
print(deidentified_text)



Patient Name: [NAME]
Admission Date: 14-Aug-2021
Treated at [ORG], Bengaluru.
Contact: [CONTACT]
License #: [ID]



In [8]:
# Sample text
sample_text = """
Patient: Amit Sharma
Visited AIIMS Delhi on 10/10/2020.
Record No: AIIMS-88392
Phone: +91-9812345678
"""

# Call your de-identification function
deidentified_text =deidentify_text(sample_text)

# Print the output
print(deidentified_text)


Patient: [NAME]
Visited [NAME] on [DATE].
Record No: [ID]
Phone: [CONTACT]



In [9]:
# Sample text
sample_text = """
Mr. Daniel Lopez presented with chest pain.
Seen at Mercy General Hospital, Los Angeles on 03-25-2022.
Contact: 213-555-2345
Insurance Beneficiary No: 99123-AZ
"""

# Call your de-identification function
deidentified_text =deidentify_text(sample_text)

# Print the output
print(deidentified_text)


Mr. [NAME] presented with chest pain.
Seen at [ORG], Los Angeles on [DATE].
Contact: [CONTACT]
Insurance Beneficiary No: [ID]



In [10]:
# Sample text
sample_text = """
Name: Linda Martinez
Seen at Kaiser Permanente, San Francisco on Dec 1, 2018.
DOB: 02-14-1985
Policy ID: KP-556677
Contact: 415-444-7788
"""

# Call your de-identification function
deidentified_text =deidentify_text(sample_text)

# Print the output
print(deidentified_text)


Name: [NAME]
Seen at Kaiser Permanente, San Francisco on [DATE].
DOB: [DATE]
Policy ID: [ID]
Contact: [CONTACT]



In [11]:
# Sample text
sample_text = """
Patient Jane Williams, DOB: 08/09/1978.
Consulted with Dr. Anthony Wilson at Sunrise Clinic.
Address: 102 Maple Road, Seattle, WA 98101.
Contact: jwilliams78@gmail.com
"""

# Call your de-identification function
deidentified_text =deidentify_text(sample_text)

# Print the output
print(deidentified_text)


Patient [NAME], DOB: [DATE].
Consulted with Dr. [NAME] at [ORG].
Address: 102 Maple Road, Seattle, WA 98101.
Contact: [CONTACT]



In [12]:
# Sample text
sample_text = """
Patient Name: Mary Johnson
Age: 54
Visited: Apollo Hospital, Chennai
Date of Admission: 14/08/2023
Phone: +91-9876543210
Email: mary.johnson@example.com
Patient ID: XY98765432

Chief Complaint: Patient reported Fever, persistent Cough, and severe Chest pain.  
Past Medical History: Diabetes, Hypertension.  
Current Medications: Metformin 500mg daily, Aspirin 75mg.  
Lab Tests Ordered: Blood Glucose, Hemoglobin, CT scan chest.  
Procedure Performed: Appendectomy in 2020.  

Consulted with Dr. David Miller at Apollo Hospital.  
Patient discharged to home at 123 MG Road, Chennai on 20/08/2023.
"""

# Call your de-identification function
deidentified_text =deidentify_text(sample_text)

# Print the output
print(deidentified_text)


Patient Name: [NAME]
Age: 54
Visited: [ORG], Chennai
Date of Admission: [DATE]
Phone: [CONTACT]
Email: [CONTACT]
Patient ID: XY[CONTACT]

Chief Complaint: Patient reported Fever, persistent Cough, and severe Chest pain.  
Past Medical History: Diabetes, Hypertension.  
Current Medications: Metformin 500mg daily, Aspirin 75mg.  
Lab Tests Ordered: [NAME], Hemoglobin, CT scan chest.  
Procedure Performed: Appendectomy in 2020.  

Consulted with Dr. [NAME] at [ORG].  
Patient discharged to home at 123 MG Road, Chennai on [DATE].



In [13]:
import os
os.getcwd()


'C:\\Users\\prana\\Desktop\\Job\\NLP Healthcare Domain\\NLP Healthcare Projects\\Clinical_Notes_Deidentifier_Project'

In [14]:
import os
# Change this to the folder where your notebook + deidentifier.py are
os.chdir(r"C:\Users\\prana\\Desktop\\Job\\NLP Healthcare Domain\\NLP Healthcare Projects\\Clinical_Notes_Deidentifier_Project")


In [15]:
from deid_pipeline import deidentify_text

sample_text = """
Patient Jane Williams, DOB: 08/09/1978.
Consulted with Dr. Anthony Wilson at Sunrise Clinic.
Address: 102 Maple Road, Seattle, WA 98101.
Contact: jwilliams78@gmail.com
"""

deidentified_text = deidentify_text(sample_text)
print(deidentified_text)


ModuleNotFoundError: No module named 'deid_pipeline'

In [None]:
from deid_pipeline import deidentify_text

sample_text = """
Patient Name: Mary Johnson
Age: 54
Visited: Apollo Hospital, Chennai
Date of Admission: 14/08/2023
Phone: +91-9876543210
Email: mary.johnson@example.com
Patient ID: XY98765432

Chief Complaint: Patient reported Fever, persistent Cough, and severe Chest pain.  
Past Medical History: Diabetes, Hypertension.  
Current Medications: Metformin 500mg daily, Aspirin 75mg.  
Lab Tests Ordered: Blood Glucose, Hemoglobin, CT scan chest.  
Procedure Performed: Appendectomy in 2020.  

Consulted with Dr. David Miller at Apollo Hospital.  
Patient discharged to home at 123 MG Road, Chennai on 20/08/2023.

"""

deidentified_text = deidentify_text(sample_text)
print(deidentified_text)

In [None]:
import importlib
import deid_pipeline   # loads your py file

# Force reload in case you edit deid_pipeline.py
importlib.reload(deid_pipeline)


In [None]:
sample_note = """
Patient Name: Mary Johnson
Age: 54
Visited: Apollo Hospital, Chennai
Date of Admission: 14/08/2023
Phone: +91-9876543210
Email: mary.johnson@example.com
Patient ID: XY98765432

Chief Complaint: Patient reported Fever, persistent Cough, and severe Chest pain.  
Past Medical History: Diabetes, Hypertension.  
Current Medications: Metformin 500mg daily, Aspirin 75mg.  
Lab Tests Ordered: Blood Glucose, Hemoglobin, CT scan chest.  
Procedure Performed: Appendectomy in 2020.  

Consulted with Dr. David Miller at Apollo Hospital.  
Patient discharged to home at 123 MG Road, Chennai on 20/08/2023
"""

cleaned = deid_pipeline.deidentify_text(sample_note)
print(cleaned)


In [None]:
import deid_pipeline
print(deid_pipeline.__file__)


In [16]:
import importlib
import deid_pipeline_fixed   # 1️⃣ import the module first
importlib.reload(deid_pipeline_fixed)   # 2️⃣ then reload

# Now test
sample_note = """
Patient Name: Mary Johnson
Age: 54
Visited: Apollo Hospital, Chennai
Date of Admission: 14/08/2023
Phone: +91-9876543210
Email: mary.johnson@example.com
Patient ID: XY98765432

Chief Complaint: Patient reported Fever, persistent Cough, and severe Chest pain.  
Past Medical History: Diabetes, Hypertension.  
Current Medications: Metformin 500mg daily, Aspirin 75mg.  
Lab Tests Ordered: Blood Glucose, Hemoglobin, CT scan chest.  
Procedure Performed: Appendectomy in 2020.  

Consulted with Dr. David Miller at Apollo Hospital.  
Patient discharged to home at 123 MG Road, Chennai on 20/08/2023
"""

cleaned = deid_pipeline_fixed.deidentify_text(sample_note)
print(cleaned)



Patient Name: [NAME]
Age: [ADDRESS]: [ORG], [ADDRESS]
Date of Admission: [DATE]
Phone: [CONTACT]
Email: [CONTACT]
Patient ID: XY[CONTACT]

Chief Complaint: Patient reported Fever, persistent Cough, and severe Chest pain.  
Past Medical History: Diabetes, Hypertension.  
Current Medications: Metformin 500mg daily, Aspirin 75mg.  
Lab Tests Ordered: [NAME], [ADDRESS], CT scan chest.  
Procedure Performed: Appendectomy in [DATE].  

Consulted with Dr. [NAME] at [ORG].  
Patient discharged to home at [ADDRESS] [DATE]

