<a href="https://colab.research.google.com/github/PMattox/fs/blob/main/faker_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the anonymizer

In [None]:
try:
  from faker import Faker
  print("Faker is already installed.")
except ImportError:
  !pip install faker
  from faker import Faker

Collecting faker
  Downloading faker-36.2.2-py3-none-any.whl.metadata (15 kB)
Downloading faker-36.2.2-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.8/1.9 MB[0m [31m25.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m27.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-36.2.2


In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime

def email_to_int(email):
    """Mask an email address as one integer built from its character codes.

    Every character is replaced by the decimal text of ``ord(char)``; the pieces
    are joined in order and the resulting digit string is parsed with ``int``.
    An empty string produces no digits, so ``int`` raises ``ValueError``.

    NOTE(review): the digits are the character codes themselves, so this masks
    the address rather than hashing it.
    """
    digits = "".join(str(ord(character)) for character in email)
    return int(digits)


def _fake_value_for(seed_value, method_name):
    """Seed Faker with ``seed_value`` and return one value from the named Faker method."""
    Faker.seed(seed_value)  # use the cell's own value as the seed: same input -> same fake output
    faker = Faker()  # create the faker instance after seeding
    return getattr(faker, method_name)()


def anonymize_dataframe_cellwise(df):
    """Return an anonymized copy of ``df``, faking each PII cell from its own original value.

    Because every cell is seeded with its own value, the same input value maps to the
    same fake value regardless of which row or file it appears in.

    Args:
        df: DataFrame to anonymize. Must contain a 'uuid' column. The columns
            'firstname', 'lastname', 'streetaddress' and 'phonenumber' are replaced
            with Faker output when present; every other column is copied unchanged.
            ``df`` itself is not modified.

    Returns:
        A new DataFrame in which
        * 'uuid' is replaced by a leading 'uuid_safe' column (ids containing '@' are
          passed through ``email_to_int``; all other ids are kept as they are),
        * the original row labels are kept, so rows of two anonymized frames can still
          be matched on index,
        * one trailing marker row (uuid_safe '999999999', run date/time in the name
          columns) identifies the anonymizer run. For an integer index the marker row
          is labelled ``max(index) + 1``; for any other index type the result falls
          back to a fresh 0..n index.

    Raises:
        KeyError: if ``df`` has no 'uuid' column.
    """
    if 'uuid' not in df.columns:
        raise KeyError("anonymize_dataframe_cellwise requires a 'uuid' column")

    # Source column -> name of the Faker method that replaces its cells.
    # Add more entries for other columns as needed; unlisted columns are copied as-is.
    faker_methods = {
        'firstname': 'first_name',
        'lastname': 'last_name',
        'streetaddress': 'street_address',
        'phonenumber': 'phone_number',  # or 'basic_phone_number'
    }

    anonymized_df = df.copy()

    # Only the columns that are actually faked pay for seeding + Faker construction.
    # Whole columns are assigned positionally, so duplicate index labels cannot
    # overwrite one another the way label-based .loc writes would.
    for col, method_name in faker_methods.items():
        if col not in df.columns:
            continue
        anonymized_df[col] = [_fake_value_for(value, method_name) for value in df[col]]

    # Emails used as identifiers are masked; other ids pass through untouched.
    # str() keeps numeric / missing ids from crashing the '@' check.
    safe_ids = [
        email_to_int(str(value)) if '@' in str(value) else value
        for value in df['uuid']
    ]

    # Drop the original 'uuid' column and put 'uuid_safe' at the front
    anonymized_df = anonymized_df.drop(columns=['uuid', 'uuid_safe'], errors='ignore')
    anonymized_df.insert(
        0, 'uuid_safe',
        pd.Series(safe_ids, index=anonymized_df.index, dtype=object),  # object: ints can exceed int64
    )

    # Create a new row that we can use to identify the source of the anonymizer run.
    # Read the clock once so the date and time parts come from the same instant.
    run_stamp = datetime.now()
    marker_row = pd.DataFrame({'uuid_safe': ['999999999'],
                               'firstname': [str(run_stamp.date()) + '_firstname'],
                               'lastname': [str(run_stamp.time()) + '_lastname'],
                               'streetaddress': ['xxxxxxx'],
                               'phonenumber': ['5555555555']
                               # Add other columns and their 'xxxxxxx' values here
                               })

    # Append the marker row while keeping the original row labels where possible.
    if pd.api.types.is_integer_dtype(anonymized_df.index.dtype):
        marker_label = int(anonymized_df.index.max()) + 1 if len(anonymized_df) else 0
        marker_row.index = [marker_label]
        return pd.concat([anonymized_df, marker_row])

    return pd.concat([anonymized_df, marker_row], ignore_index=True)

In [None]:
# Starting frame for the first demo: three numeric-string ids plus one email used as an id.
data = [
    {'uuid': '101', 'firstname': 'Adam', 'lastname': 'Zuro', 'streetaddress': '42 Wallaby Way', 'phonenumber': '555-111-0000'},
    {'uuid': '102', 'firstname': 'Billy', 'lastname': 'Yondu', 'streetaddress': '123 Main St', 'phonenumber': '555-222-0000'},
    {'uuid': '103', 'firstname': 'Catherine', 'lastname': 'Xero', 'streetaddress': '915 Park Ave', 'phonenumber': '5552569857'},
    {'uuid': 'test@email.com', 'firstname': 'Donald', 'lastname': 'West', 'streetaddress': '16 Baker Trl', 'phonenumber': '(555)359-4848'},
]

df = pd.DataFrame(data)

print("STARTING DF")
df

STARTING DF


Unnamed: 0,uuid,firstname,lastname,streetaddress,phonenumber
0,101,Adam,Zuro,42 Wallaby Way,555-111-0000
1,102,Billy,Yondu,123 Main St,555-222-0000
2,103,Catherine,Xero,915 Park Ave,5552569857
3,test@email.com,Donald,West,16 Baker Trl,(555)359-4848


In [None]:
# Run the anonymizer on the starting frame. The function works on a copy,
# so `df` keeps its original values for the later cells.
anonymized_df = anonymize_dataframe_cellwise(df)

print("ANONYMIZED DF #1")
anonymized_df

ANONYMIZED DF #1


Unnamed: 0,uuid_safe,firstname,lastname,streetaddress,phonenumber
0,101,Mallory,Lucas,839 Beth Ridge Apt. 064,+1-986-574-7444x374
1,102,David,Dickson,338 Simmons Forks,477.724.9151x3404
2,103,Emma,Smith,6898 Jones Creek,001-283-287-0202
3,11610111511664101109971051084699111109,Laura,Yates,4415 William Park Apt. 815,382.499.3940
4,999999999,2025-03-06_firstname,17:20:56.189967_lastname,xxxxxxx,5555555555


In [None]:
# Second input: the same people, but reordered, with one row removed, one edited and one added.
# Running the anonymizer again shows whether identical input cells still map to identical outputs.

data2 = [
    {'uuid': '101', 'firstname': 'Adam', 'lastname': 'Zuro', 'streetaddress': '42 Wallaby Way', 'phonenumber': '555-111-0000'},
    # uuid '102' (Billy Yondu) is intentionally removed from this run
    # this moves up in the order, but with same uuid
    {'uuid': '103', 'firstname': 'Catherine', 'lastname': 'Xero', 'streetaddress': '915 Park Ave', 'phonenumber': '5552569857'},
    # this now has first name and last name flipped
    {'uuid': 'test@email.com', 'firstname': 'West', 'lastname': 'Donald', 'streetaddress': '16 Baker Trl', 'phonenumber': '(555)359-4848'},
    # this is new
    {'uuid': 'another@example.com', 'firstname': 'Donald', 'lastname': 'Valentine', 'streetaddress': '16 Baker Trl', 'phonenumber': '(555)359-4848'},
]
df2 = pd.DataFrame(data2)

anonymized_df2 = anonymize_dataframe_cellwise(df2)

print("ANONYMIZED DF #2")
anonymized_df2

ANONYMIZED DF #2


Unnamed: 0,uuid_safe,firstname,lastname,streetaddress,phonenumber
0,101,Mallory,Lucas,839 Beth Ridge Apt. 064,+1-986-574-7444x374
1,103,Emma,Smith,6898 Jones Creek,001-283-287-0202
2,11610111511664101109971051084699111109,Lauren,Leon,4415 William Park Apt. 815,382.499.3940
3,9711011111610410111464101120971091121081014699...,Laura,Graham,4415 William Park Apt. 815,382.499.3940
4,999999999,2025-03-06_firstname,17:20:56.678034_lastname,xxxxxxx,5555555555




---



# TODO
1. Jumble up the values of individual columns and confirm the transformations are the same
2. ✅ Confirm the results are the same across runs i.e. kill the session, close the notebook
3. Connect to real data from the [sftp notebook](https://colab.research.google.com/drive/1UZ5_jZtXahvSH3pl4P-E4aBRIeyNj6hk#scrollTo=FqzTmEACaibe)
4. Add a line to the end of each anonymized file that gives us a way to identify the run e.g. UUID is `9999999`, name is datetime of the anonymization or maybe source file creation `202503050115`
  - e.g. Parent file UUID is `999999`, lastname is `202503050115_parent`
  - e.g. Student file UUID is `888888`, lastname is `202503050115_student`
  - e.g. Faculty file UUID is `777777`, lastname is `202503050115_faculty`



# Trying it out
Generate a fake dataset and test it by running the anonymizer on subsets, that can then be reconstructed

In [None]:
# generate a "person" df for testing

import numpy as np

df_len = 25 # set the length of this test dataframe
uuid_email_per = 0.1 # set the random sampling rate for uuids to emails
person_seed = 20250306 # fixed seed so Restart & Run All regenerates the same test people

fake = Faker('en_US') # Specify US locale
fake.seed_instance(person_seed) # seed this generator so the fake people are reproducible
rng = np.random.default_rng(person_seed) # seeded source for picking which uuids become emails

# Generate df_len rows of fake data (one dict per person)
data = [
    {
        'uuid' : str(fake.unique.random_number(digits=4)),
        'firstname': fake.first_name(),
        'lastname': fake.last_name(),
        'streetaddress': fake.street_address(),
        'city': fake.city(),
        'state': fake.state(),
        'zip code': fake.zipcode(),
        'phonenumber': fake.phone_number(),
        'job title': fake.job()
    }
    for _ in range(df_len)
]

### Create a Pandas DataFrame from the generated data
person_df = pd.DataFrame(data)

# Replace a random sampling of the uuids with email addresses
num_rows = len(person_df)
num_to_replace = int(uuid_email_per * num_rows)

# Randomly select row positions to replace (without repeats)
indices_to_replace = rng.choice(num_rows, size=num_to_replace, replace=False)

# Replace UUIDs with fake email addresses at the selected indices
for index in indices_to_replace:
    person_df.loc[index, 'uuid'] = fake.email()

# Display the modified DataFrame
person_df


Unnamed: 0,uuid,firstname,lastname,streetaddress,city,state,zip code,phonenumber,job title
0,2604,Jennifer,Lewis,237 Jose Run,Garyfort,Oklahoma,58079,731-992-8602x088,Further education lecturer
1,1536,Stephen,Burns,685 Kenneth Gardens,East Katherinechester,Tennessee,76248,(637)794-2770x939,Public house manager
2,606,Adam,Deleon,3663 Lewis Stravenue,North Heather,Maryland,35740,+1-544-736-7696x276,Conference centre manager
3,4916,Alexis,Johnson,50430 Taylor Divide,South Erica,Mississippi,40856,(587)391-8785x872,Equality and diversity officer
4,8104,Robert,Moore,786 Gibson Mill,Rothmouth,Delaware,80497,(625)322-8259x445,Public librarian
5,4047,Timothy,Smith,3412 Jones Stravenue Suite 930,North Cindyview,Alaska,75279,592.755.3874,Call centre manager
6,3001,William,Willis,91853 Gregory Fort Suite 208,Stephaniechester,South Dakota,52276,965-241-8039,Printmaker
7,9974,James,Dougherty,28315 Hall Place,Port Scotthaven,Virginia,25623,(723)980-2445,Microbiologist
8,5922,Eric,Fry,593 Rachel Stream,New Kimberlyshire,Mississippi,87553,880.204.2166x39221,Primary school teacher
9,9377,Brandon,Gray,915 Hardy Squares Apt. 344,South Johnny,Connecticut,31057,001-662-985-9689x25092,"Exhibitions officer, museum/gallery"


In [None]:
# subsample the above df into two separate test_dfs of size 15

# Different random_state values give two different, repeatable samples of person_df.
# person_df has df_len = 25 rows, so two 15-row samples always share some rows;
# those shared rows are what the comparison further down relies on.
test_df1 = person_df.sample(n=15, random_state=42)  # First test DataFrame
test_df2 = person_df.sample(n=15, random_state=123) # Second test DataFrame

# test_df1

In [None]:
# test_df2

In [None]:
# anonymize both test dfs and compare the results

# Each call anonymizes one of the overlapping samples independently.
anonymized_test_df1 = anonymize_dataframe_cellwise(test_df1)
anonymized_test_df2 = anonymize_dataframe_cellwise(test_df2)

print("ANONYMIZED TEST DF #1")
# sort_index() returns a sorted copy for display; anonymized_test_df1 itself is not reordered.
anonymized_test_df1.sort_index()

ANONYMIZED TEST DF #1


Unnamed: 0,uuid_safe,firstname,lastname,streetaddress,city,state,zip code,phonenumber,job title
0,2604,William,Wyatt,615 Matthew Motorway Apt. 338,Garyfort,Oklahoma,58079,001-304-595-7738,Further education lecturer
1,1536,Michael,Walton,7101 Simpson Views,East Katherinechester,Tennessee,76248,379-912-3915,Public house manager
2,606,Mallory,Thomas,992 Bradley Field Apt. 064,North Heather,Maryland,35740,861.857.3098,Conference centre manager
3,4916,Jeremy,Kennedy,4156 Shannon Port,South Erica,Mississippi,40856,4899600667,Equality and diversity officer
4,8104,Michael,Garcia,79892 Montgomery Stravenue,Rothmouth,Delaware,80497,001-729-581-6141x09813,Public librarian
5,4047,Dakota,Bass,352 Jennifer Mission,North Cindyview,Alaska,75279,+1-777-230-2177x78683,Call centre manager
8,5922,Robert,Wagner,092 Melody Mountains Apt. 909,New Kimberlyshire,Mississippi,87553,765.248.0121,Primary school teacher
9,9377,Crystal,Austin,625 James Run Apt. 380,South Johnny,Connecticut,31057,4617647432,"Exhibitions officer, museum/gallery"
11,1129711010011410111911564101120971091121081014...,Caleb,Murphy,722 James Roads,New Kayla,Arkansas,63635,+1-704-983-5916x035,Web designer
12,7384,Jenny,Bailey,983 Byrd Cliffs Suite 990,West Debbiestad,South Carolina,69318,311-699-0178x85916,Facilities manager


In [None]:
print("ANONYMIZED TEST DF #2")
# sort_index() returns a sorted copy for display; anonymized_test_df2 itself is not reordered.
anonymized_test_df2.sort_index()

ANONYMIZED TEST DF #2


Unnamed: 0,uuid_safe,firstname,lastname,streetaddress,city,state,zip code,phonenumber,job title
3,4916,Jeremy,Kennedy,4156 Shannon Port,South Erica,Mississippi,40856,4899600667,Equality and diversity officer
4,8104,Michael,Garcia,79892 Montgomery Stravenue,Rothmouth,Delaware,80497,001-729-581-6141x09813,Public librarian
5,4047,Dakota,Bass,352 Jennifer Mission,North Cindyview,Alaska,75279,+1-777-230-2177x78683,Call centre manager
7,9974,Jenny,Bailey,0721 Erika Causeway Suite 228,Port Scotthaven,Virginia,25623,254.784.8818,Microbiologist
8,5922,Robert,Wagner,092 Melody Mountains Apt. 909,New Kimberlyshire,Mississippi,87553,765.248.0121,Primary school teacher
9,9377,Crystal,Austin,625 James Run Apt. 380,South Johnny,Connecticut,31057,4617647432,"Exhibitions officer, museum/gallery"
11,1129711010011410111911564101120971091121081014...,Caleb,Murphy,722 James Roads,New Kayla,Arkansas,63635,+1-704-983-5916x035,Web designer
12,7384,Jenny,Bailey,983 Byrd Cliffs Suite 990,West Debbiestad,South Carolina,69318,311-699-0178x85916,Facilities manager
14,7827,Amanda,Evans,87525 Jasmine Crest Suite 363,Chapmanbury,Connecticut,99371,4493373525,Camera operator
15,4426,Joseph,Miller,301 Trujillo Pines Suite 401,North Lisa,Mississippi,78759,9067907644,"Engineer, chemical"


In [None]:
# Match rows of the two anonymized frames on shared index labels and compare their full contents.

# Index labels that appear in both frames
common_indices = anonymized_test_df1.index.intersection(anonymized_test_df2.index)

# Walk the shared labels and report whether the two rows hold identical values
for index in common_indices:
    row1 = anonymized_test_df1.loc[index]
    row2 = anonymized_test_df2.loc[index]

    if not row1.equals(row2):
        # Show both versions plus only the fields that disagree
        print(f"Rows with index {index} are NOT equal:")
        print("Row from anonymized_test_df1:\n", row1)
        print("Row from anonymized_test_df2:\n", row2)
        print("Difference:\n", row1[row1 != row2])
        continue

    print(f"Rows with index {index} are equal.")


Rows with index 8 are equal.
Rows with index 16 are equal.
Rows with index 11 are equal.
Rows with index 9 are equal.
Rows with index 22 are equal.
Rows with index 5 are equal.
Rows with index 12 are equal.
Rows with index 15 are equal.
Rows with index 3 are equal.
Rows with index 4 are equal.




---



# Exploration of valid faker seeds

Faker's `Faker.seed()` can accept strings as arguments, so I think a lot of the conversion work is unnecessary.

HOWEVER, we still need a way to convert email addresses to something obfuscated *and consistent*, as some clients use email addresses as their unique identifiers.

Want to test this across runs. `hash()` may not be the right answer.   
REF: https://www.33rdsquare.com/comprehensive-guide-on-python-hash-method/   
>Interestingly, the hash value for a given string will always be the same within a Python session, but it may change between different runs of the program or across Python versions/implementations. This is why it‘s generally not a good idea to rely on specific hash values in your code.

In [None]:


# Function to convert email address to a seed
def email_to_seed(email):
    """Return the sum of the code points of all characters in ``email``.

    NOTE(review): a sum ignores character order, so addresses that are
    rearrangements of each other produce the same seed.
    """
    total = 0
    for character in email:
        total += ord(character)
    return total

# Example email address
email_address = "example@example.com"

# Convert email address to seed
seed = email_to_seed(email_address)

# Initialize Faker with the seed
fake = Faker()
# seed_instance() seeds this one Faker object, as opposed to the class-level
# Faker.seed() used elsewhere in this notebook (see Faker's seeding docs).
fake.seed_instance(seed)

# Generate some fake data
print("Name:", fake.name())
print("Address:", fake.address())
print("Phone Number:", fake.phone_number())


Name: Bradley Neal
Address: 695 Franklin Islands
North Bryce, AL 79324
Phone Number: 593.383.4275


In [None]:
# Show the integer seed that email_to_seed produced for email_address
seed

1925

In [None]:
# Inspect the per-character ordinals that email_to_seed adds up.
# (The previous form of this line was not a valid expression and raised SyntaxError.)
[ord(char) for char in email_address]

SyntaxError: invalid syntax (<ipython-input-43-2c9ee0001ffe>, line 1)

In [None]:
# prompt: convert an email address into an int by replacing each character with its ordinal value. do not use sum

def email_to_int(email):
    """Turn an email address into an integer made of its characters' ordinal values.

    The decimal ordinal of each character is written out in sequence and the
    combined digit string is parsed as a single integer. Unlike a sum, this keeps
    the order of the characters.

    Args:
        email: The email address to convert.

    Returns:
        An integer representation of the email address.
    """
    return int("".join(map(str, map(ord, email))))

# Example usage: the bare last line displays the integer produced for this address
email = "test@email.com"
int_representation = email_to_int(email)
int_representation


11610111511664101109971051084699111109

In [None]:

# Use a string as a seed
seed_string = "my_custom@_seed"
# `fake` was switched to its own generator by fake.seed_instance(seed) in an earlier
# cell, so calling Faker.seed() alone would not change what that object produces.
# Seed the shared generator and then build a fresh instance, so the values printed
# below are really driven by seed_string.
# (Alternative with the same intent: fake.seed_instance(seed_string))
Faker.seed(seed_string)
fake = Faker()

# Generate some fake data
print(fake.name())
print(fake.address())
print(fake.email())

Tara Murray
45356 Mindy Passage
Coxbury, FL 56131
bernardjoseph@example.net


In [None]:
# prompt: create uuid_safe column by copying the uuids. detect instances where 'uuid_safe' contains an "@", and convert those strings to integer expressions

import pandas as pd
from faker import Faker
import numpy as np

# ... (rest of your existing code)

def anonymize_dataframe(df):
    """Anonymizes a Pandas DataFrame row by row, using each row's UUID as the Faker seed.

    Args:
        df: The DataFrame to anonymize. Must contain a 'uuid' column; ``df`` itself
            is not modified.

    Returns:
        A new DataFrame without 'uuid', with a leading 'uuid_safe' column and with
        'firstname', 'lastname', 'streetaddress' and 'phonenumber' replaced by Faker
        output. Rows sharing a uuid get the same fake values.
    """
    import hashlib  # local import: only needed for the stable non-numeric seed below

    def stable_seed(raw_uuid):
        """Return an integer seed that is identical across interpreter runs."""
        try:
            return int(raw_uuid)
        except (TypeError, ValueError):
            # hash() of a string changes between Python processes, which would give
            # a different fake identity on every run; a SHA-256 digest does not.
            digest = hashlib.sha256(str(raw_uuid).encode('utf-8')).digest()
            return int.from_bytes(digest[:8], 'big')

    def anonymize_row(row):
        seed = stable_seed(row['uuid'])
        faker = Faker()
        Faker.seed(seed)
        row['uuid_safe'] = faker.unique.random_number(digits=15)
        row['firstname'] = faker.first_name()
        row['lastname'] = faker.last_name()
        row['streetaddress'] = faker.street_address()
        row['phonenumber'] = faker.phone_number()
        return row

    anonymized_df = df.copy().apply(anonymize_row, axis=1)

    # Drop the original 'uuid' column and move 'uuid_safe' to the front
    anonymized_df = anonymized_df.drop(columns=['uuid'])
    first_column = anonymized_df.pop('uuid_safe')
    anonymized_df.insert(0, 'uuid_safe', first_column)

    # Convert 'uuid_safe' values containing "@" to integer representations.
    # NOTE(review): uuid_safe is filled from random_number(...) above, so this loop
    # looks like a no-op as written — confirm whether uuid_safe was meant to start
    # as a copy of 'uuid'.
    for index, value in anonymized_df['uuid_safe'].items():
        if "@" in str(value):  # Ensure value is treated as a string
            anonymized_df.loc[index, 'uuid_safe'] = email_to_int(str(value))

    return anonymized_df

def email_to_int(email):
    """Join the decimal ``ord`` of every character of ``email`` and parse the digits as an int."""
    pieces = [str(ord(symbol)) for symbol in email]
    return int("".join(pieces))




---


# Graveyard

In [None]:
# using python's faker library, anonymize data from a dataframe containing columns for uuid, firstname, lastname, streetaddress, and phonenumber

import pandas as pd
from faker import Faker

fake = Faker('en_US') # Specify US locale

def anonymize_dataframe(df):
    """Anonymizes a Pandas DataFrame using UUID as Faker seed.

    Each row is seeded with its own uuid, so rows that share a uuid receive the
    same fake values.

    Args:
      df: The DataFrame to anonymize. Must contain a 'uuid' column. It is copied,
          not modified.

    Returns:
        A new DataFrame with anonymized data: 'uuid' is dropped, 'uuid_safe' is the
        first column, and 'firstname', 'lastname', 'streetaddress' and 'phonenumber'
        hold Faker output.
    """
    import hashlib  # local import: only used to derive a run-stable seed

    anonymized_df = df.copy()

    def anonymize_row(row):
        try:
            seed = int(row['uuid'])
        except (TypeError, ValueError):
            # Handle cases where 'uuid' is not purely numeric. The built-in hash()
            # of a string differs between Python processes, so use a SHA-256 digest
            # to keep the conversion deterministic across runs.
            digest = hashlib.sha256(str(row['uuid']).encode('utf-8')).digest()
            seed = int.from_bytes(digest[:8], 'big')
        faker = Faker()
        Faker.seed(seed)
        row['uuid_safe'] = faker.unique.random_number(digits=15)
        row['firstname'] = faker.first_name()
        row['lastname'] = faker.last_name()
        row['streetaddress'] = faker.street_address()
        row['phonenumber'] = faker.phone_number()
        return row

    anonymized_df = anonymized_df.apply(anonymize_row, axis=1)

    # drop the 'uuid' column, since we've seen clients put PII in there
    anonymized_df = anonymized_df.drop(columns=['uuid'])

    # shift column 'uuid_safe' to first position
    first_column = anonymized_df.pop('uuid_safe')

    # insert column using insert(position,column_name,
    # first_column) function
    anonymized_df.insert(0, 'uuid_safe', first_column)

    return anonymized_df


**Cell-wise deterministic transformations**


The above generates consistent results given the same primary key per file. But what happens when we run into foreign keys (which exist as primary keys in separate, but related, files)?  
This strategy would likely fail to maintain those relationships. So we need to explore the same approach as above, but scoped to individual **cells**, rather than individual **lines**.

Can faker accept a seed as argument?
* https://stackoverflow.com/questions/77891777/how-to-generate-a-fake-name-using-faker-passing-existing-name-as-the-seed-inst

In [None]:
def anonymize_dataframe_cellwise(df):
    """Build an anonymized copy of ``df`` in which every cell is faked from its own value.

    Faker is re-seeded with the cell's original value before each replacement, so
    equal input values give equal outputs. 'uuid' is replaced by a generated
    'uuid_safe' column placed first; columns without a rule are copied unchanged.
    """
    # source column -> (column that receives the result, how to produce it from a seeded Faker)
    # Add more entries for other columns as needed
    replacements = {
        'uuid': ('uuid_safe', lambda seeded: seeded.unique.random_number(digits=15)),
        'firstname': ('firstname', lambda seeded: seeded.first_name()),
        'lastname': ('lastname', lambda seeded: seeded.last_name()),
        'streetaddress': ('streetaddress', lambda seeded: seeded.street_address()),
        'phonenumber': ('phonenumber', lambda seeded: seeded.phone_number()),
    }

    result = df.copy()

    for source_col in df.columns:
        target_col, make_fake = replacements.get(source_col, (source_col, None))
        for row_label, cell in df[source_col].items():
            Faker.seed(cell)
            seeded_faker = Faker()  # create the faker instance after seeding

            # columns without a rule simply keep their original value
            replacement = cell if make_fake is None else make_fake(seeded_faker)
            result.loc[row_label, target_col] = replacement

    # Remove the raw 'uuid' column and bring 'uuid_safe' to the first position
    result = result.drop(columns=['uuid'])
    result.insert(0, 'uuid_safe', result.pop('uuid_safe'))

    return result

In [None]:
# Example usage with the original dataframes
# NOTE(review): in top-to-bottom run order, anonymize_dataframe_cellwise here is the
# Graveyard redefinition from the cell above, not the version defined near the top.
anonymized_df1_cellwise = anonymize_dataframe_cellwise(df)
print("ANONYMIZED DF CELLWISE #1")
anonymized_df1_cellwise.sort_index()


ANONYMIZED DF CELLWISE #1


Unnamed: 0,uuid_safe,firstname,lastname,streetaddress,phonenumber
0,813009300000000.0,Mallory,Lucas,839 Beth Ridge Apt. 064,+1-986-574-7444x374
1,322270100000000.0,David,Dickson,338 Simmons Forks,477.724.9151x3404
2,451912900000000.0,Emma,Smith,6898 Jones Creek,001-283-287-0202


In [None]:
# Same anonymizer applied to df2 (the rearranged frame) to compare against run #1
anonymized_df2_cellwise = anonymize_dataframe_cellwise(df2)
print("ANONYMIZED DF CELLWISE #2")
anonymized_df2_cellwise.sort_index()

ANONYMIZED DF CELLWISE #2


Unnamed: 0,uuid_safe,firstname,lastname,streetaddress,phonenumber
0,813009300000000.0,Mallory,Lucas,839 Beth Ridge Apt. 064,+1-986-574-7444x374
1,451912900000000.0,Emma,Smith,6898 Jones Creek,001-283-287-0202
2,521355300000000.0,Laura,Yates,4415 William Park Apt. 815,382.499.3940


To actually show that this worked, we need to jumble up individual cell values

In [None]:
# Frame with individual cell values shuffled relative to the first demo frame.
# The original three rows (Adam / Billy / Catherine with their own addresses and
# phone numbers) are replaced by the variants below.
data = [
    # address is NEW, anon result should be different
    {'uuid': '101', 'firstname': 'Adam', 'lastname': 'Zuro', 'streetaddress': '343 Monitor Road', 'phonenumber': '555-111-0000'},
    # phone is copied from uuid 101; anon result should match
    {'uuid': '102', 'firstname': 'Billy', 'lastname': 'Yondu', 'streetaddress': '123 Main St', 'phonenumber': '555-111-0000'},
    # all non-PK data is different; results should be different
    {'uuid': '103', 'firstname': 'Edgar', 'lastname': 'Valentine', 'streetaddress': '20254 Sunset Way', 'phonenumber': '5559991254'},
]

df_jumbled_cells = pd.DataFrame(data)

print("STARTING DF")
df_jumbled_cells

STARTING DF


Unnamed: 0,uuid,firstname,lastname,streetaddress,phonenumber
0,101,Adam,Zuro,343 Monitor Road,555-111-0000
1,102,Billy,Yondu,123 Main St,555-111-0000
2,103,Edgar,Valentine,20254 Sunset Way,5559991254


In [None]:
# Anonymize the jumbled frame; the label is #3 so this output is not confused with
# the df2 run, which already printed "#2".
anonymized_df3_cellwise = anonymize_dataframe_cellwise(df_jumbled_cells)
print("ANONYMIZED DF CELLWISE #3")
anonymized_df3_cellwise.sort_index()

ANONYMIZED DF CELLWISE #2


Unnamed: 0,uuid_safe,firstname,lastname,streetaddress,phonenumber
0,813009300000000.0,Mallory,Lucas,51436 Lewis Views Apt. 059,+1-986-574-7444x374
1,322270100000000.0,David,Dickson,338 Simmons Forks,+1-986-574-7444x374
2,451912900000000.0,Heidi,Graham,32934 Noah Valley Apt. 123,(290)279-7215x938


In [None]:
# test a case where the `uuid` is an email address

