# Introduction
Generating realistic fake or pseudo-random data can be complicated and time-consuming. A good alternative is to automatically and heavily randomize real client data. This module provides several functions for doing so.

In [1]:
import pandas as pd
import numpy as np
import random
import re
from pandas.api.types import is_string_dtype
from faker import Faker
fake = Faker('de_DE')
from collections import OrderedDict

# Functions
## Functions for general purpose

In [2]:
def random_replacement(series,func):
    '''Replaces every distinct non-null value of a Series with the output of a function.

    The function is applied to the distinct values only, so equal inputs always
    receive the same replacement. Null entries stay null.

        Args:
            series: pd.Series
            func: callable taking one original value and returning its replacement

        Usage:
            # Random replacement of company names
            from faker import Faker
            fake = Faker('de_DE')
            df['Firm'] = random_replacement(df['Firm'],lambda x: fake.company())

        Returns:
            pd.Series with the same index as the input

    '''
    # Array of the distinct non-null values found in the Series
    originals = series.dropna().unique()

    # One replacement per distinct value
    replacements = pd.Series(originals).apply(func)

    # Lookup table {original: replacement}; anything without an entry
    # (i.e. the nulls) comes out of ``map`` as NaN
    lookup = dict(zip(originals, replacements))
    return series.map(lookup)

def shuffle_series(series, stays_same=None):
    """Randomly permutes the distinct values of a Series.

    Every distinct non-null value is consistently replaced by one of the distinct
    values of the Series (possibly itself): equal inputs always yield equal outputs
    and each shuffled value shows up again somewhere in the result. Null values stay
    null, and no non-null value is ever turned into a null.

    Values listed in ``stays_same`` are left untouched and are not handed out as a
    replacement for any other value.

        Args:
            series: pd.Series with hashable values
            stays_same: list-like of values that must not be shuffled (default: none)

        Returns:
            pd.Series with the same index as the input

    """
    # ``None`` replaces the former mutable ``[]`` default; building a set also
    # avoids the ``!= []`` comparison, which is ambiguous for array-likes.
    pinned = set() if stays_same is None else set(stays_same)

    # Distinct non-null values; nulls are kept out so they can neither be a
    # mapping key nor a replacement value.
    uniques = list(series.dropna().unique())
    movable = [value for value in uniques if value not in pinned]

    # Shuffle a copy so ``movable`` keeps the original order for the zip below.
    shuffled = list(movable)
    random.shuffle(shuffled)

    # Pinned values map to themselves, all others to their shuffled counterpart.
    # Zipping the two equally long lists yields a true permutation (the previous
    # version zipped the keys with the raw Series values and silently truncated).
    mapper = {value: value for value in uniques if value in pinned}
    mapper.update(zip(movable, shuffled))

    # Nulls have no entry in the mapper, so ``map`` leaves them null.
    return series.map(mapper)
    

def switch_words(series):
    """Randomly switches the words (str separated by a single space) in a Series.

    The strings are split into one column per word position, every position is
    shuffled across the rows with ``shuffle_series`` and the words are joined back
    together with single spaces.

        Args:
            series: pd.Series, dtype str (object)

        Returns:
            pd.Series with the same index; rows that end up empty are NaN

        Raises:
            TypeError: if the Series is not of a string dtype
    """

    if not is_string_dtype(series):
        raise TypeError('Series should be of dtype string (object)!')

    # Matrix of words: one column per word position. ``str.split`` returns a new
    # object, so the input is not modified. Missing cells (rows with fewer words)
    # are normalised to NaN before shuffling.
    words = series.str.split(' ',expand = True).fillna(np.nan)

    # Shuffles every word position independently.
    # (The former ``display`` debugging calls were removed: ``display`` only exists
    # inside IPython, so the function raised a NameError everywhere else.)
    for position in words.columns:
        words[position] = shuffle_series(words[position])

    # Concatenates the words of each row with a trailing space, then strips the
    # padding left behind by missing cells
    joined = words.fillna('').add(' ').sum(axis = 1).str.strip()

    # Rows without any word become true nulls
    return joined.replace('',np.nan)

## Functions for specific data

In [3]:
def scramble_mail_components(mail_series,fake, shuffle_series = shuffle_series):
    """Switches and changes email components (local part and domain) in a Series of mail addresses.

    The function searches the local part for "-", "." and "_" and replaces everything from
    the first of those characters onwards with "-" plus a random last name. Many people write
    firstname.lastname@... (or firstname_lastname@... etc.); in those cases the first name
    remains unaffected by the function.

    For the case firstname@ the function adds a random last name: firstname-lastname@.

    The domains get swapped around randomly among the addresses that have one. Missing
    addresses stay missing and never receive a domain.

    Args:
        mail_series: pd.Series, dtype str (object)
        fake: an instance of Faker class (only ``fake.last_name()`` is used)
        shuffle_series: callable taking a pd.Series and returning the shuffled pd.Series

    Returns:
        tuple of pd.Series: new_mail_series,local_part_series,domain_series
        (all lower-cased, nulls where the component is empty)

    Raises:
        TypeError: if mail_series is not of a string dtype

    """

    # Validate arguments
    if not is_string_dtype(mail_series):
        raise TypeError('mail_series should be of dtype string (object)!')

    # First separator and everything after it. Raw string and no capture group, so
    # neither Python nor pandas emits a warning and no global warning filter is needed.
    separator_and_rest = re.compile(r'[._-].*')

    def scramble_local_part(local_part):
        # Missing addresses were filled with '' and have to stay empty
        if local_part == '':
            return local_part
        suffix = '-' + fake.last_name()
        if separator_and_rest.search(local_part):
            # toto-test -> toto-something (callable replacement: the name is inserted
            # literally, without backslash/group-reference interpretation)
            return separator_and_rest.sub(lambda _match: suffix, local_part)
        # toto -> toto-something
        return local_part + suffix

    # Splits every address once into [local part, domain]
    parts = mail_series.fillna('').str.split('@')

    # Scrambles each local part in a single pass
    local_part_series = parts.str[0].apply(scramble_local_part).str.lower()

    # Shuffles the domains, but only among the rows that actually have one; the ''
    # placeholder of missing addresses must not take part in the shuffle.
    domain_series = parts.str[1].fillna('')
    has_domain = domain_series != ''
    if has_domain.any():
        # ``.values`` -> positional assignment, independent of index alignment
        domain_series.loc[has_domain] = shuffle_series(domain_series[has_domain]).values
    domain_series = domain_series.str.lower()

    # Concatenating new local part and domain
    new_mail_series = local_part_series +'@'+ domain_series

    # Sets true nulls
    ## NaNs were filled with '' everywhere, so a missing address is now ''+'@'+'' -> '@'
    new_mail_series = new_mail_series.replace(['@',''],np.nan)

    ## replaces '' with np.nan in the domain and local part Series
    domain_series = domain_series.replace('',np.nan)
    local_part_series = local_part_series.replace('',np.nan)

    return (new_mail_series,local_part_series,domain_series)


def scramble_phone_numbers(phone_number, max_changes = 4):
    """Randomly changes digits in a phone number.

    The first ``max_changes`` digits are each replaced by an independently drawn
    random digit; every other character (spaces, "+", "-", later digits) is kept.
    If the value is null, contains no digit, or ``max_changes`` is not positive,
    the original value is returned unchanged.

        Args:
            phone_number: str (other non-null values are converted with ``str``)
            max_changes: integer, how many digits to replace at most

        Usage :
            df['Phone number'].apply(scramble_phone_numbers)

        Returns:
            str, or the original value when nothing was changed

    """
    # Null? return null. ``re.sub`` treats ``count=0`` as "replace everything",
    # so a non-positive limit has to be handled here as "change nothing".
    if pd.isna(phone_number) or max_changes <= 0:
        return phone_number

    text = str(phone_number)

    # No digit -> nothing to scramble, hand back the original object
    if not re.search(r'\d', text):
        return phone_number

    # A callable replacement draws a fresh random digit for every match; a plain
    # replacement string would reuse one single digit for all positions.
    return re.sub(r'\d', lambda _match: str(random.randint(0, 9)), text, count = max_changes)

# Usage
Change each of the cells below to a code cell to try out the functions.

## Random replacement

# Demo frame: company names with repeats and missing entries, plus a random revenue column
company_names = ['Mustermann AG', 'Beispiel GmbH', 'Beispiel GmbH',
                 'Beispiel GmbH', 'Mustermann AG', np.nan]
data = {
    'Firma': company_names * 2,
    'Umsatz': [random.randint(0, 1000) for _ in range(12)],
}
df_test = pd.DataFrame(data)
display(df_test)

# Every distinct company name gets one fake replacement; missing entries stay missing
df_test['Firma (randomized)'] = random_replacement(df_test['Firma'], lambda _name: fake.company())
df_test.head()

## Shuffle series

# Shuffle the company names while keeping 'Mustermann AG' mapped to itself
kept_as_is = ['Mustermann AG']
df_test['Firma (shuffled)'] = shuffle_series(df_test['Firma'], stays_same=kept_as_is)
df_test

## Switch words

# Seed this Faker instance so the generated sentences are reproducible.
# ``fake.seed(...)`` on an instance raises a TypeError in current Faker releases;
# ``seed_instance`` is the per-instance way to seed.
fake.seed_instance(123)
df_test = pd.DataFrame({'fake_sentences':fake.sentences(nb=10, ext_word_list=None)})

# Words are swapped between the sentences, position by position
df_test['fake_sentences_(switched_words)'] = switch_words(df_test['fake_sentences'])
df_test

## Scramble mail components

# Five generated addresses plus hand-written ones covering the '_', '.' and '-' separators
mails_list = [fake.free_email() for _ in range(5)]
mails_list.extend(['toto_test@gmail.com', 'robert.mustermann@yahoo.de', 'frank-beispiel@test.de'])

df_test = pd.DataFrame({'eMail': mails_list})

# The function returns (full address, local part, domain), in that order
scrambled_mail, scrambled_local, scrambled_domain = scramble_mail_components(df_test['eMail'], fake)
df_test['eMail_scrambled'] = scrambled_mail
df_test['Local_part_scrambled'] = scrambled_local
df_test['Domain_scrambled'] = scrambled_domain
df_test

## Scramble phone numbers

# Ten generated numbers plus a digit-free string and a missing value as edge cases
phone_numbers = [fake.phone_number() for _ in range(10)]
phone_numbers.extend(['test', np.nan])

df_test = pd.DataFrame({'phone_number': phone_numbers})
df_test['phone_number_scrambled'] = df_test['phone_number'].map(scramble_phone_numbers)
df_test