# Anonymizing data with Faker

https://faker.readthedocs.io/en/master/

This is from a real example, so there is some wrangling and merging at the top which isn't relevant. I'll do a clean gist on this someday.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from faker import Faker

# date and time handling
from datetime import datetime, date
from dateutil.relativedelta import relativedelta

In [2]:
# Read the three source CSV files into dataframes.
# Only the first file is given an explicit encoding: ISO-8859-1 gets around an
# error thrown when it is read with the default utf-8. The other two files are
# read with pandas' default encoding.
df = pd.read_csv('ssw data1-22-19.csv', encoding='ISO-8859-1')
users = pd.read_csv('users data1-22-19.csv')
roles = pd.read_csv('userRole update.csv')

In [3]:
# Left join: every row of df is kept, with the users row whose ID equals user_id attached.
df = df.merge(users, how='left', left_on='user_id', right_on='ID')

In [4]:
# Left join: every row of df is kept, with the roles row whose NetID equals user_login attached.
df = df.merge(roles, how='left', left_on='user_login', right_on='NetID')

In [5]:
# Keep only the columns listed below. plugins_group, new_users and
# new_users_role are left out because they were never implemented in SSW.
# NOTE(review): the original note also listed `template` as removed, but it is
# still selected here -- confirm which is intended.
# user_login is kept from the users df.
# The current primary role is kept from the roles df.
keep_columns = [
    'ssw_id', 'user_id', 'user_login', 'u_role', 'u_division', 'title',
    'privacy', 'template', 'theme', 'plugins_list',
    'next_stage', 'site_type', 'blog_id',
    'ssw_main_meta', 'site_created', 'wizard_completed', 'starttime',
    'endtime',
]
df = df[keep_columns]

In [7]:
# Array of the distinct values in the user_login column.
uni_login = df['user_login'].unique()

In [8]:
# Wrap the array of distinct logins in a one-column dataframe of string dtype.
uni_login = pd.DataFrame(data=uni_login, dtype='string')

In [9]:
# The frame was built from a bare array, so its only column has the default
# label 0; give it the name used as the merge key later on.
uni_login.rename(columns={0: 'user_login'}, inplace=True)

I used this code (probably from StackExchange) for reference.

```
import unicodecsv as csv
from faker import Faker
from collections import defaultdict
def anonymize_rows(rows):
    """
    Yield each row with its name and email replaced by fake values.

    Rows is an iterable of dictionaries that contain 'name' and 'email'
    fields that need to be anonymized. Each distinct original value is
    mapped to one fake value the first time it is seen and that same fake
    value is reused afterwards, so repeated names/emails stay consistent.
    Each row dictionary is modified in place and then yielded.
    """
    # Load the faker and its providers. The snippet only imports Faker
    # (Factory is never imported), so instantiate Faker directly.
    faker = Faker()
    # Create mappings of names & emails to faked names & emails. A missing
    # key triggers the factory, which generates and stores a new fake value.
    names = defaultdict(faker.name)
    emails = defaultdict(faker.email)
    # Iterate over the rows and yield anonymized rows.
    for row in rows:
        # Replace the name and email fields with faked fields.
        row['name'] = names[row['name']]
        row['email'] = emails[row['email']]
        # Yield the row back to the caller
        yield row

def anonymize(source, target):
    """
    Copy a CSV file, anonymizing its name and email fields on the way.

    The source argument is a path to a CSV file containing data to anonymize,
    while target is a path to write the anonymized CSV data to.
    """
    # csv here is unicodecsv (see the import above), which decodes and
    # encodes byte streams itself, so both files are opened in binary mode.
    # The previous 'rU' mode is also rejected outright by Python 3.11+.
    with open(source, 'rb') as f, open(target, 'wb') as o:
        # Use the DictReader to easily extract fields
        reader = csv.DictReader(f)
        writer = csv.DictWriter(o, reader.fieldnames)
        # NOTE(review): no header row is written to target, only data rows.
        # Call writer.writeheader() here if the output should be labelled.
        # Read and anonymize data, writing to target file.
        for row in anonymize_rows(reader):
            writer.writerow(row)
```

The way I actually implemented it was to write a function that creates an anonymized value for each row.  In this first case, I just fed it a dataframe with a single column.

In [11]:
def anonymize_login(x):
    """Add an ``anon_login`` column of fake user names to ``x`` in place.

    ``x`` is a dataframe with a ``user_login`` column. One fake user name is
    generated per row, and no fake name is used twice within a call, so two
    different rows can never end up with the same anonymized login.

    Returns None; the result is the new column on ``x``.
    """
    # Load the faker
    fake = Faker()
    # Fake names already handed out, used to reject repeats.
    used = set()
    # Create an empty array where we can store the fake names.
    fake_login = []
    # for each row, create a fake user name
    for _ in x['user_login']:
        candidate = fake.user_name()
        # Faker does not promise distinct values; redraw until the name is new
        # so distinct real logins are not merged into one anonymized login.
        while candidate in used:
            candidate = fake.user_name()
        used.add(candidate)
        fake_login.append(candidate)

    # create a new column and assign the array
    x['anon_login'] = fake_login

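I run the function on my single-column df. It returns nothing; instead it adds an `anon_login` column of anonymized values to that df in place. I then merge this onto the main df using the true login as the key, and drop the true login, leaving only the anonymized field.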

In [12]:
# Mutates uni_login in place by adding the anon_login column; the call itself returns None.
anonymize_login(uni_login)

In [14]:
# Left join on the true login so each row of df picks up its anon_login value.
df = df.merge(uni_login, how='left', on='user_login')

In [16]:
# Remove the true login column from df in place, leaving only anon_login.
del df['user_login']

In [17]:
# Now to change site titles. 
# Same concept as above, I just don't bother with a separate df.
# I overwrite the original data as I go.
# In the future, this is the way to do it.
def anonymize_title(x):
    """Overwrite the ``title`` column of ``x`` in place with generated text.

    One piece of fake text is generated per existing row (requested with
    ``max_nb_chars=70``), and the whole column is replaced with the results.
    Returns None.
    """
    generator = Faker()
    # Only the row count matters here; the original titles are never read.
    x['title'] = [
        generator.text(max_nb_chars=70, ext_word_list=None)
        for _ in x['title']
    ]

In [18]:
# Replaces df['title'] in place with generated text; the call itself returns None.
anonymize_title(df)

In [20]:
# Write the anonymized frame to disk without the index column.
# NOTE(review): the output name has no .csv extension -- confirm that is intended.
# NOTE(review): user_id, ssw_id and blog_id are written unchanged -- confirm
# they cannot be used to re-identify users.
df.to_csv('anon_data', index=False)