In [1]:
# Total number of synthetic person records to generate; later cells use it as
# the sample size for surnames and cities and split it 80/20 for first names.
num_elements = 100

In [2]:
import pandas as pd
from faker import Faker
fake = Faker()

In [3]:
import numpy as np
# Draw first names with a skewed popularity distribution: the i-th name of each
# list below is picked with the i-th probability in this array.
probabilites = np.array([0.4,0.2,0.1,0.1,0.1,0.05,0.05])

# Roughly 80% of the records are male; the female count is the remainder so the
# two groups always add up to exactly num_elements (int() truncation of the two
# fractions separately could otherwise drop a record).
num_males = int(num_elements*0.8)
num_females = num_elements - num_males

elements = np.array(["John", "James", "Alex", "Oliver", "Matt", "Tom", "Luke"])
# The weights must be passed as p=...: the third positional argument of
# np.random.choice is `replace`, so passing the list positionally leaves the
# draw uniform and silently ignores the probabilities.
male_first_names = np.random.choice(elements, num_males, p=probabilites)
males = np.full(num_males, "Male")

elements = np.array(["Emily", "Emma", "Sophie", "Hannah", "Kate", "Rebecca", "Chloe"])
female_first_names = np.random.choice(elements, num_females, p=probabilites)
females = np.full(num_females, "Female")

# Males come first, females last; genders is aligned element-by-element with first_names.
first_names = np.concatenate([male_first_names, female_first_names])

genders = np.concatenate([males,females])

In [4]:
# Pool of surnames; every record draws one from it with np.random.choice
# (default settings, i.e. no explicit weights are supplied).
surname_pool = [
    "Smith", "Jones", "Williams", "Brown", "Taylor", "Davies",
    "Wilson", "Evans", "Thomas", "Johnson", "Roberts", "Walker",
    "Wright", "Thompson", "Robinson", "White", "Hughes", "Edwards",
    "Hall", "Green", "Martin", "Wood",
]
elements = np.array(surname_pool)
surnames = np.random.choice(elements, size=num_elements)

In [5]:
# Pool of place names; every record draws one from it with np.random.choice
# (default settings, i.e. no explicit weights are supplied).
city_pool = [
    "London", "Birmingham", "Leeds", "Glasgow", "Sheffield", "Bradford",
    "Liverpool", "Edinburgh", "Manchester", "Bristol", "Kirklees", "Fife",
]
elements = np.array(city_pool)
cities = np.random.choice(elements, size=num_elements)

In [6]:
# Stack the four generated arrays as labelled rows, then transpose so that each
# label becomes a column and each record becomes a row.
target_dataset = pd.DataFrame(
    [genders, first_names, surnames, cities],
    index=["gender", "first_name", "surname", "city"],
).T

In [7]:
# Give every record a random date of birth from the last 30 years (formatted as
# year-abbreviated month name-day, e.g. 2003-Jun-11) and a random md5 hex string.
# The values are collected in plain lists and assigned as whole columns once,
# instead of writing cell-by-cell through .loc inside an iterrows() loop.
# One dob and one code are generated per row, in that order, so the sequence of
# faker calls is the same as a row-by-row loop.
dobs = []
codes = []
for _ in range(len(target_dataset)):
    dobs.append(fake.date_time_between(start_date="-30y", end_date="now", tzinfo=None).strftime("%Y-%b-%d"))
    codes.append(fake.md5())

target_dataset["dob"] = dobs
target_dataset["code"] = codes

In [8]:
# Preview the first rows, which now include the generated dob and code columns.
target_dataset.head()

Unnamed: 0,gender,first_name,surname,city,dob,code
0,Male,Matt,Hughes,Kirklees,2003-Jun-11,a0f7d1e3def8a22d2708479d3e14a1e8
1,Male,Tom,Brown,Birmingham,2006-Jan-19,0bd7766ac3568d300d6c11e1369021e4
2,Male,Tom,Hughes,Bristol,1997-Jun-21,12693dc197b8e29185c8c8fb847599fa
3,Male,Alex,Wilson,Birmingham,2012-Jan-20,2ec6b298ee37161204f187e32a1981b0
4,Male,John,Walker,Bradford,1993-Sep-29,8be9a41303d2f52784d4c55a1561064a


In [9]:
# Break the dash-separated dob string into its three parts and store them as
# separate year / month / day columns; the combined dob column is then removed.
dob_parts = target_dataset["dob"].str.split("-", expand=True)
dob_parts = dob_parts.rename(columns={0: "year", 1: "month", 2: "day"})
target_dataset = target_dataset.drop(columns="dob").join(dob_parts)
target_dataset.head()

Unnamed: 0,gender,first_name,surname,city,code,year,month,day
0,Male,Matt,Hughes,Kirklees,a0f7d1e3def8a22d2708479d3e14a1e8,2003,Jun,11
1,Male,Tom,Brown,Birmingham,0bd7766ac3568d300d6c11e1369021e4,2006,Jan,19
2,Male,Tom,Hughes,Bristol,12693dc197b8e29185c8c8fb847599fa,1997,Jun,21
3,Male,Alex,Wilson,Birmingham,2ec6b298ee37161204f187e32a1981b0,2012,Jan,20
4,Male,John,Walker,Bradford,8be9a41303d2f52784d4c55a1561064a,1993,Sep,29


In [10]:
# Starting point for the corrupted dataset used in matching: a random 80% of
# the target rows, drawn without replacement and copied so that later edits do
# not touch target_dataset.
sampled_rows = target_dataset.sample(replace=False, frac=0.8)
candidate_dataset = sampled_rows.copy()

In [11]:
# Preview the sampled rows; they keep the index labels of their source rows in target_dataset.
candidate_dataset.head()

Unnamed: 0,gender,first_name,surname,city,code,year,month,day
51,Male,Luke,Evans,Sheffield,c179075a2e6680af55310c9cb0ad4c89,1990,Nov,25
42,Male,James,Wood,Bradford,448b52cdef08706822011387a9e3946c,2002,Nov,26
59,Male,Matt,Martin,Kirklees,83f539d8cd04e0a4da72a54c8da19a70,2009,Apr,18
98,Female,Chloe,Williams,Glasgow,bc453ca0d852fd6c58fe739c5edc1c4e,2008,Jan,18
40,Male,John,Robinson,Fife,30734a59ef861c6d465213fe020ca8bb,2008,May,13


In [12]:
import random
import string


def switch(my_string):
    """Swap the characters at two randomly chosen positions of a string.

    The two positions are drawn independently, so they may coincide; in that
    case the string comes back unchanged.

    Args:
        my_string: The string to modify.

    Returns:
        A new string with the two chosen characters exchanged. An empty string
        is returned as-is, since there is no position to draw from.
    """
    if not my_string:
        # random.randrange(0, 0) would raise ValueError on an empty range.
        return my_string

    pos1 = random.randrange(0, len(my_string))
    pos2 = random.randrange(0, len(my_string))
    chars = list(my_string)
    chars[pos1], chars[pos2] = chars[pos2], chars[pos1]
    return "".join(chars)

def new_letter(my_string):
    """Insert one random lowercase ASCII letter into a string.

    The letter is inserted in front of a randomly chosen existing character,
    so it is never appended after the last character.

    Args:
        my_string: The string to modify.

    Returns:
        A new string one character longer than the input. For an empty input
        the result is just the inserted letter.
    """
    # An empty string has no position to draw from (randrange(0, 0) raises
    # ValueError), so the only possible insertion point is 0.
    pos1 = random.randrange(0, len(my_string)) if my_string else 0
    letter = random.choice(string.ascii_lowercase)
    return my_string[:pos1] + letter + my_string[pos1:]

def delete_letter(my_string):
    """Remove one randomly chosen character from a string, never the first one.

    Args:
        my_string: The string to modify.

    Returns:
        A new string one character shorter than the input. Strings with fewer
        than two characters are returned unchanged, because there is no
        character after the first to remove.
    """
    if len(my_string) < 2:
        # random.randrange(1, len(my_string)) would raise ValueError here.
        return my_string

    pos1 = random.randrange(1, len(my_string))
    return my_string[:pos1] + my_string[pos1+1:]

def corrupt_string(my_string, num_switches=1,num_new_letters=0,num_deletes=0):
    """Apply a sequence of random edits to a string.

    The edits run in a fixed order: first every character swap, then every
    letter insertion, then every letter deletion.

    Args:
        my_string: The string to corrupt.
        num_switches: How many times to call switch().
        num_new_letters: How many times to call new_letter().
        num_deletes: How many times to call delete_letter().

    Returns:
        The string after all requested edits have been applied.
    """
    corrupted = my_string

    for _ in range(num_switches):
        corrupted = switch(corrupted)

    for _ in range(num_new_letters):
        corrupted = new_letter(corrupted)

    for _ in range(num_deletes):
        corrupted = delete_letter(corrupted)

    return corrupted

In [13]:
# Inject noise into the candidate rows so they no longer match the target
# exactly. Every corruption below is decided independently per row with its own
# random draw: the typo / swap / date changes fire when random.random() > 0.9,
# the blanking of a field fires when random.random() > 0.8.
#
# Values are always read from candidate_dataset itself rather than from the row
# object yielded by iterrows(): that object is a copy taken before any edit, so
# using it for the name swap would overwrite a typo that was just introduced
# with the original clean value.
for index in candidate_dataset.index:

    # Typo in the first name: swap two characters.
    if (random.random()>0.9):
        candidate_dataset.loc[index, "first_name"] = corrupt_string(candidate_dataset.loc[index, "first_name"])

    # Typo in the surname: drop one character.
    if (random.random()>0.9):
        candidate_dataset.loc[index, "surname"] = corrupt_string(candidate_dataset.loc[index, "surname"], num_switches=0, num_deletes=1)

    # Typo in the city: drop one character.
    if (random.random()>0.9):
        candidate_dataset.loc[index, "city"] = corrupt_string(candidate_dataset.loc[index, "city"], num_switches=0, num_deletes=1)

    # First name and surname entered in each other's field; uses the current
    # (possibly already corrupted) values so earlier typos are kept.
    if (random.random()>0.9):
        first_name = candidate_dataset.loc[index, "first_name"]
        surname = candidate_dataset.loc[index, "surname"]
        candidate_dataset.loc[index, "first_name"] = surname
        candidate_dataset.loc[index, "surname"] = first_name

    # Replace the year with the year of a fresh random date.
    if (random.random()>0.9):
        candidate_dataset.loc[index,"year"] = fake.date_time_between(start_date="-30y", end_date="now", tzinfo=None).strftime("%Y")

    # Replace the day with the day of a fresh random date.
    if (random.random()>0.9):
        candidate_dataset.loc[index,"day"] = fake.date_time_between(start_date="-30y", end_date="now", tzinfo=None).strftime("%d")

    # Missing values: each of these fields is blanked independently, in this order.
    for column in ("gender", "first_name", "surname", "year", "month", "day"):
        if (random.random()>0.8):
            candidate_dataset.loc[index, column] = None
        

        

In [14]:
# Write the clean target records to disk as UTF-8 CSV; index=False leaves the row index out of the file.
target_dataset.to_csv("target_dataset.csv", encoding="utf-8", index=False)

In [15]:
# Write the corrupted candidate records to disk as UTF-8 CSV; index=False leaves
# the row index out of the file, so the original row labels are not saved.
candidate_dataset.to_csv("candidate_dataset.csv", encoding="utf-8", index=False)