In [None]:
import pandas as pd
from faker import Faker
import random
import re
from typing import List, Dict
import string  


In [None]:
# FAKER Library

fake = Faker('en_CA')
fake_us = Faker('en_US')

In [None]:
# SIN CA
def luhn_algorithm(sin_without_check_digit):
    """Calculate and return the valid 9th digit based on the Luhn algorithm"""
    total = 0
    reverse_digits = sin_without_check_digit[::-1]
    
    for i, digit in enumerate(reverse_digits):
        num = int(digit)
        if i % 2 == 1:  # Double every second digit (from the right)
            num *= 2
            if num > 9:
                num -= 9
        total += num
    
    # Compute the check digit so that the total is divisible by 10
    check_digit = (10 - (total % 10)) % 10
    return check_digit

def generate_canadian_sin():
    """Generate a valid Canadian SIN (Social Insurance Number)"""
    # Randomly generate the first 8 digits
    sin_without_check_digit = ''.join([str(random.randint(1, 9)) if i == 0 else str(random.randint(0, 9)) for i in range(8)])
    
    # Calculate the 9th check digit using the Luhn algorithm
    check_digit = luhn_algorithm(sin_without_check_digit)
    
    # Return the complete SIN
    return sin_without_check_digit + str(check_digit)


In [None]:
# SSN US

def generate_us_ssn():
    """Generate a valid U.S. Social Security Number (SSN) following SSA rules."""

    while True:
        # Generate first three digits (area number)
        area = random.randint(1, 899)  # Cannot be 000, 900-999
        if area == 666 or (900 <= area <= 999):
            continue

        # Generate next two digits (group number) - Cannot be 00
        group = random.randint(1, 99)  # Ensures it's between 01-99

        # Generate last four digits (serial number) - Cannot be 0000
        serial = random.randint(1, 9999)  # Ensures it's between 0001-9999

        # Format as XXX-XX-XXXX
        ssn = f"{area:03d}-{group:02d}-{serial:04d}"

        # Ensure the generated SSN is not in the reserved range (987-65-4320 to 987-65-4329)
        ssn_number = int(ssn.replace("-", ""))
        if 987654320 <= ssn_number <= 987654329:
            continue  # Regenerate if in the invalid range
        return ssn  # Return valid SSN


In [None]:
# Canadian Passport

def generate_canadian_passport():
    """Generate a valid Canadian passport number (XX123456)."""
    letters = ''.join(random.choices(string.ascii_uppercase, k=2))  # Two uppercase letters
    digits = ''.join(random.choices(string.digits, k=6))  # Six digits
    return f"{letters}{digits}"



In [None]:
# Custom PII generation functions


def generate_canadian_pr_number():
    """Generate a Canadian Permanent Resident (PR) number"""
    return f"PR-{random.randint(1000, 9999)}-{random.randint(1000, 9999)}"

def generate_client_card_number():
    """Generate a client card number (similar to a credit card number)"""
    return f"CC{random.randint(1000000000000000, 9999999999999999)}"

def generate_account_number():
    """Generate a bank account number"""
    return f"ACC{random.randint(1000000000, 9999999999)}"

def generate_drivers_license():
    """Generate a driver's license number in a custom format"""
    return f"DL{random.randint(1000000, 9999999)}"


# Define PII types and generation rules (focused on financial and banking data only)

PII_TYPES = {
    "time": fake.time,
    "date": fake.date,
    "name": fake.name,
    "email": fake.email,
    "phone_number": fake.phone_number,
    "street_address": fake.address,
    "sin": generate_canadian_sin,
    "ssn": generate_us_ssn,
    "passport_number": generate_canadian_passport,
    "driver_license_number": generate_drivers_license,
    "credit_card_number": fake.credit_card_number,
    # "client_card": generate_client_card_number,
    # "account_number": generate_account_number,
    # "pr_number": generate_canadian_pr_number,
}


In [None]:
def generate_pii_text() -> str:
    """Generate a long text containing random PII (3–5 sentences) with a financial theme"""
    sentences = []
    pii_list = []
    pii_type_list = []  # To store the list of generated PII types
    # Randomly insert 3–5 PII fields into the text
    for _ in range(random.randint(2, 5)):
        pii_type = random.choice(list(PII_TYPES.keys()))
        pii_value = PII_TYPES[pii_type]()  # Call function to generate PII value
        pii_list.append(pii_value)  # Add PII value to the list
        pii_type_list.append(pii_type)
        sentence = f"The client {pii_value} is associated with a {fake.word().capitalize()} account."  # Example template
        sentences.append(sentence)
    # Add random non-PII sentences
    sentences.extend([fake.sentence() for _ in range(random.randint(1, 3))])
    random.shuffle(sentences)  # Shuffle the sentence order
    return " ".join(sentences), pii_list, pii_type_list  # Return the full text and PII info

def generate_dataset(num_rows: int = 100) -> List[Dict]:
    """Generate a synthetic financial dataset with the specified number of rows"""
    data = []
    for _ in range(num_rows):
        text, pii_list, pii_type_list = generate_pii_text()  # Get the generated text and PII
        row = {
            "text": text,
            "pii_values": pii_list,  # Store the PII values as a list
            "pii_types": pii_type_list
        }
        data.append(row)
    return data

# Generate 100 rows of financial data
df = pd.DataFrame(generate_dataset(100))

df


Unnamed: 0,text,pii_values,pii_types
0,The client DL9370521 is associated with a Maxi...,"[Curtis Sanders, DL9370521, 822-53-4189, 1997-...","[name, driver_license_number, ssn, date, time]"
1,The client DL5239404 is associated with a Illu...,"[783-22-7503, DL5239404, hicksmark@example.net]","[ssn, driver_license_number, email]"
2,Eos voluptatum id similique quia vel. The clie...,"[17:07:53, SM760282, Rachel Wilson]","[time, passport_number, name]"
3,Similique ipsum vel veniam ab ipsum. The clien...,"[18:47:52, DL5290732, 575593433, 06:42:55, mar...","[time, driver_license_number, sin, time, email]"
4,The client cassandra20@example.com is associat...,"[1 (758) 570-4109, 7978 Joseph Summit\nNew Bra...","[phone_number, street_address, street_address,..."
...,...,...,...
95,The client DL4411288 is associated with a Volu...,"[1 (600) 165-9801, 104481053, 1-575-276-4504, ...","[phone_number, sin, phone_number, name, driver..."
96,Et nisi facere assumenda soluta quasi. The cli...,"[747922065, ZR912517]","[sin, passport_number]"
97,The client 1972-04-30 is associated with a Nis...,"[DL1729938, DL6511420, 3572903781730640, 1972-...","[driver_license_number, driver_license_number,..."
98,The client amandakane@example.net is associate...,"[amandakane@example.net, 1992-04-03, 787-42-7809]","[email, date, ssn]"


In [22]:
df

Unnamed: 0,text,pii_values,pii_types
0,The client DL9370521 is associated with a Maxi...,"[Curtis Sanders, DL9370521, 822-53-4189, 1997-...","[name, driver_license_number, ssn, date, time]"
1,The client DL5239404 is associated with a Illu...,"[783-22-7503, DL5239404, hicksmark@example.net]","[ssn, driver_license_number, email]"
2,Eos voluptatum id similique quia vel. The clie...,"[17:07:53, SM760282, Rachel Wilson]","[time, passport_number, name]"
3,Similique ipsum vel veniam ab ipsum. The clien...,"[18:47:52, DL5290732, 575593433, 06:42:55, mar...","[time, driver_license_number, sin, time, email]"
4,The client cassandra20@example.com is associat...,"[1 (758) 570-4109, 7978 Joseph Summit\nNew Bra...","[phone_number, street_address, street_address,..."
...,...,...,...
95,The client DL4411288 is associated with a Volu...,"[1 (600) 165-9801, 104481053, 1-575-276-4504, ...","[phone_number, sin, phone_number, name, driver..."
96,Et nisi facere assumenda soluta quasi. The cli...,"[747922065, ZR912517]","[sin, passport_number]"
97,The client 1972-04-30 is associated with a Nis...,"[DL1729938, DL6511420, 3572903781730640, 1972-...","[driver_license_number, driver_license_number,..."
98,The client amandakane@example.net is associate...,"[amandakane@example.net, 1992-04-03, 787-42-7809]","[email, date, ssn]"


In [23]:
df_shuffled = df.sample(frac=1).reset_index(drop=True)

In [24]:
df_shuffled

Unnamed: 0,text,pii_values,pii_types
0,The client Mary Obrien is associated with a Do...,"[Mary Obrien, YI756099, 1971-11-14, 13:42:25]","[name, passport_number, date, time]"
1,Sequi praesentium nostrum quas quod rem. Qui m...,"[Mark Sanchez, cwalter@example.net, patriciacr...","[name, email, email, ssn]"
2,The client 1-635-885-1340 is associated with a...,"[30559655561131, DL7787895, TU901215, 1-635-88...","[credit_card_number, driver_license_number, pa..."
3,The client KU769247 is associated with a Volup...,"[2009-04-10, KU769247, 311-457-2964]","[date, passport_number, phone_number]"
4,Quia velit sint vero quisquam hic sapiente. Th...,"[00:23:06, Michael Ward, 03:00:09, Charles Wil...","[time, name, time, name, email]"
...,...,...,...
95,The client Michelle Sutton is associated with ...,"[684552833, 519-55-8232, Michelle Sutton]","[sin, ssn, name]"
96,The client DL6544493 is associated with a Odio...,"[950201753, 02:55:23, DL6544493]","[sin, time, driver_license_number]"
97,Consequuntur nobis neque velit. Reprehenderit ...,"[EH752641, Stephen Williams, (666) 624-4605, 4...","[passport_number, name, phone_number, credit_c..."
98,Voluptatum sint perspiciatis repudiandae dolor...,"[379-11-0046, (759) 161-7247]","[ssn, phone_number]"


In [25]:
def find_nth_occurrence(text, word, n):
    """Find the nth occurrence of word in text"""
    start = -1
    for i in range(n):
        start = text.find(word, start + 1)
        if start == -1:
            return None  # If word not found n times
    end = start + len(word)
    return start, end

# Function to add 'pii_span' column
def add_pii_span(df):
    pii_spans = []
    for idx, row in df.iterrows():
        text = row['text']
        pii_values = row['pii_values']
        pii_types = row['pii_types']
        
        # List to store the spans for each PII value
        pii_span_list = []
        
        # Dictionary to count occurrences of each word in the pii_values
        word_count = {word: 0 for word in pii_values}
        
        for idx, word in enumerate(pii_values):
            word_count[word] += 1
            nth_occurrence = word_count[word]
            
            # Find nth occurrence of the word in the text
            span = find_nth_occurrence(text, word, nth_occurrence)
            if span:
                start, end = span
                pii_span_list.append({
                    'start': start,
                    'end': end,
                    'label': pii_types[idx]  # Corresponding label from pii_types
                })
            else:
                pii_span_list.append(None)  # If the word is not found enough times
        
        pii_spans.append(pii_span_list)
    
    df['pii_span'] = pii_spans
    return df

In [26]:
df_shuffled = add_pii_span(df_shuffled)
df_shuffled

Unnamed: 0,text,pii_values,pii_types,pii_span
0,The client Mary Obrien is associated with a Do...,"[Mary Obrien, YI756099, 1971-11-14, 13:42:25]","[name, passport_number, date, time]","[{'start': 11, 'end': 22, 'label': 'name'}, {'..."
1,Sequi praesentium nostrum quas quod rem. Qui m...,"[Mark Sanchez, cwalter@example.net, patriciacr...","[name, email, email, ssn]","[{'start': 311, 'end': 323, 'label': 'name'}, ..."
2,The client 1-635-885-1340 is associated with a...,"[30559655561131, DL7787895, TU901215, 1-635-88...","[credit_card_number, driver_license_number, pa...","[{'start': 127, 'end': 141, 'label': 'credit_c..."
3,The client KU769247 is associated with a Volup...,"[2009-04-10, KU769247, 311-457-2964]","[date, passport_number, phone_number]","[{'start': 148, 'end': 158, 'label': 'date'}, ..."
4,Quia velit sint vero quisquam hic sapiente. Th...,"[00:23:06, Michael Ward, 03:00:09, Charles Wil...","[time, name, time, name, email]","[{'start': 55, 'end': 63, 'label': 'time'}, {'..."
...,...,...,...,...
95,The client Michelle Sutton is associated with ...,"[684552833, 519-55-8232, Michelle Sutton]","[sin, ssn, name]","[{'start': 132, 'end': 141, 'label': 'sin'}, {..."
96,The client DL6544493 is associated with a Odio...,"[950201753, 02:55:23, DL6544493]","[sin, time, driver_license_number]","[{'start': 142, 'end': 151, 'label': 'sin'}, {..."
97,Consequuntur nobis neque velit. Reprehenderit ...,"[EH752641, Stephen Williams, (666) 624-4605, 4...","[passport_number, name, phone_number, credit_c...","[{'start': 288, 'end': 296, 'label': 'passport..."
98,Voluptatum sint perspiciatis repudiandae dolor...,"[379-11-0046, (759) 161-7247]","[ssn, phone_number]","[{'start': 125, 'end': 136, 'label': 'ssn'}, {..."


In [None]:
df_shuffled.to_csv("fake_data_simple.csv", index=False)