In [23]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import math
import pandasql as ps
import time
import string
import random
from faker import Faker

from lib.masking_functions import mask, bucketize_age, blur_zip, generalize_diagnosis, add_relative_noise, blur_phone

In [24]:
def plot_probabilities(dist):
    dist_copy = dist.copy().sort_index()
    # labels = dist.index.to_series().apply(lambda x: '{0}-{1}'.format(*x))
    # labels = dist.index

    fig = plt.figure()
    ax = fig.add_axes([0,0,1,1])
    ax.bar(range(len(dist_copy.values)),dist_copy.values)
    plt.show()

## Data Generation

In [25]:
fake = Faker('de_DE')

from faker.providers import BaseProvider

# create new provider class for diagnosis following the ICD_10_GM medical code
class ICD_10_GM_Provider(BaseProvider):
    def diagnosis(self) -> str:
        letter = random.choice(string.ascii_uppercase.replace("U", "").replace("W", "").replace("X", ""))
        number = np.trunc(np.random.uniform(0.0, 99.9)*10)/(10)
        if number < 10:
            return letter+"0"+str(number)
        else:
            return letter+str(number)
        
fake.add_provider(ICD_10_GM_Provider)

In [32]:
nRows = 50

pid = np.arange(1,nRows+1)
age = np.random.normal(60, 20, nRows).astype(int)

# Remove patients with age lower than 18 or higher than 100
invalid = age[np.logical_or(age < 18, age > 100)].size
while invalid > 0:
    age[np.logical_or(age < 18, age > 100)] = np.random.normal(75, 10, invalid).astype(int)
    invalid = age[np.logical_or(age < 18, age > 100)].size

diagnosis = np.asarray([fake.diagnosis() for i in range(nRows)])
address = np.asarray([fake.street_address() for i in range(nRows)])
zip_code = np.asarray([fake.postcode() for i in range(nRows)])
city = np.asarray([fake.city_name() for i in range(nRows)])
phone_number = np.asarray([fake.unique.phone_number() for i in range(nRows)])

In [33]:
name = []
gender = []
weight = np.empty(nRows).astype(int)
height = np.empty(nRows).astype(int)

for i in range(nRows):
    rand = np.random.rand()
    if rand < 0.48:
        name.append(fake.unique.name_male())
        gender.append("Male")
        height[i] = int(np.random.normal(180, 7.5))
        weight[i] = int(np.random.normal(80, 10))
    elif rand < 0.98:
        name.append(fake.unique.name_female())
        gender.append("Female")
        height[i] = int(np.random.normal(167, 5))
        weight[i] = int(np.random.normal(60, 7.5))
    else:
        name.append(fake.unique.name_nonbinary())
        gender.append("Non-Binary")
        height[i] = int(np.random.normal(175, 6))
        weight[i] = int(np.random.normal(70, 10))
        
name = np.asarray(name)
gender = np.asarray(gender)

### Single Dataset

In [34]:
data = {'id': pid,
    'name': name, 
    'gender': gender,
    'age': age,
    'height': height,
    'weight': weight,
    'diagnosis': diagnosis,
    'address': address,
    'zip': zip_code,
    'phone': phone_number
}
data

{'id': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
        170, 171, 172, 173, 174, 175, 176, 17

In [35]:
df = pd.DataFrame.from_dict(data=data)
#df['phone'] = df['phone'].str.replace(r'[()]',"", regex=True).str.replace(r' ',"", regex=True).str.replace("+49","", regex=False)
df

Unnamed: 0,id,name,gender,age,height,weight,diagnosis,address,zip,phone
0,1,Karl-Dieter Drubin-Barkholz,Male,50,188,70,T04.4,Truppplatz 94,82307,+49(0)8217 245871
1,2,Hans-Wolfgang Drewes,Male,75,186,60,L70.5,Niklas-van der Dussen-Platz 3/9,60258,+49(0) 403461405
2,3,Sigfried Otto,Male,40,194,68,A83.6,Kuschplatz 93,76834,+49(0) 351544562
3,4,Hermine Köster MBA.,Female,56,171,57,F90.9,Eckehard-Schleich-Allee 7,97348,+49(0)7709 65146
4,5,Inken Misicher,Female,47,165,65,P29.7,Nettestraße 8/0,39805,+49(0) 916667258
...,...,...,...,...,...,...,...,...,...,...
220,221,Marit Hentschel B.Sc.,Female,63,173,64,Y28.1,Kochstraße 1/0,48726,+49 (0) 5218 948794
221,222,Alwine Kramer-Lindau,Female,48,164,48,P67.6,Alena-Steinberg-Weg 4/1,86282,02134 28909
222,223,Kuno Scheel,Male,46,198,76,Y41.3,Neuschäfergasse 71,61524,+49(0)9891 681419
223,224,Paulina Lehmann B.Eng.,Female,63,174,62,S58.7,Weitzelstraße 0,03295,05771 83613


In [37]:
df.to_csv("syntheticData.csv", index=False)

### Multiple Dataset

In [22]:
data_patient = {
    'id' : pid,
    'name': name, 
    'diagnosis': diagnosis,
    'phone': phone_number
}

In [23]:
df = pd.DataFrame.from_dict(data=data_patient)
#df['phone'] = df['phone'].str.replace(r'[()]',"", regex=True).str.replace(r' ',"", regex=True).str.replace("+49","", regex=False)
df

Unnamed: 0,id,name,diagnosis,phone
0,1,Suzanne Finke,D56.3,626246164
1,2,Hubertine Hornich,N20.2,3133778787
2,3,Friedbert Binner MBA.,C98.5,505712231
3,4,Eckhardt Tlustek-Köster,J00.5,502369915
4,5,Univ.Prof. Babett Nohlmans B.Eng.,Y51.1,4645003315
5,6,Aysel Ritter-Flantz,Z76.6,927071282
6,7,Prof. Matthäus Söding B.A.,M84.9,9969442001
7,8,Helena Klingelhöfer B.Eng.,O85.5,378308679
8,9,Marius Schmidt,T92.4,9128361998
9,10,Halina Schweitzer,P15.7,4278789177


In [45]:
df.to_csv("datasets/multiple/patient.csv", index=False)

In [24]:
data_physical = {
    'id': pid,
    'gender': gender,
    'age': age,
    'height': height,
    'weight': weight,
}

In [25]:
df = pd.DataFrame(data=data_physical)
df

Unnamed: 0,id,gender,age,height,weight
0,1,Male,49,175,88
1,2,Male,66,185,93
2,3,Male,55,176,86
3,4,Female,58,166,62
4,5,Female,75,173,50
...,...,...,...,...,...
149995,149996,Male,23,184,78
149996,149997,Female,41,172,74
149997,149998,Female,88,170,57
149998,149999,Male,61,196,104


In [26]:
df.to_csv("datasets/multiple/physical.csv")

In [37]:
data_address = {
    'id': pid,
    'address': address,
    'zip': zip_code,
    'city' : city
}

In [38]:
df = pd.DataFrame(data=data_address)
df

Unnamed: 0,id,address,zip,city
0,1,Blochstr. 26,88973,Genthin
1,2,Kargestraße 2,40172,Regensburg
2,3,Krokerstraße 23,72092,Husum
3,4,Annelene-Krein-Gasse 4,50000,Eisenhüttenstadt
4,5,Annelene-Bonbach-Straße 6/4,67513,Donaueschingen
...,...,...,...,...
149995,149996,Dursun-Werner-Platz 04,33212,Kleve
149996,149997,Kostolzinstr. 9,17342,Görlitz
149997,149998,Koch IIplatz 1,66630,Kronach
149998,149999,Lotti-Ehlert-Weg 8/4,38848,Ebern


In [39]:
np.unique(address).size

143628

In [29]:
df.to_csv("datasets/multiple/contact.csv")

## Generate Masked Tables

In [267]:
b_size = 5
nFields = 2
level = 1
relNoise = 0.05
nFields_phone = 3

masked_low = mask(df, bucketize_age, 'age', b_size)
masked_low = mask(masked_low, blur_zip, 'zip', nFields)
masked_low = mask(masked_low, generalize_diagnosis, 'diagnosis', level)
masked_low = mask(masked_low, add_relative_noise, 'height', relNoise)
masked_low = mask(masked_low, blur_phone, 'phone', nFields_phone)

In [269]:
masked_low.to_csv("masked_low.csv")

In [270]:
b_size = 10
nFields = 3
level = 2
relNoise = 0.10
nFields_phone = 5

masked_medium = mask(df, bucketize_age, 'age', b_size)
masked_medium = mask(masked_medium, blur_zip, 'zip', nFields)
masked_medium = mask(masked_medium, generalize_diagnosis, 'diagnosis', level)
masked_medium = mask(masked_medium, add_relative_noise, 'height', relNoise)
masked_medium = mask(masked_medium, blur_phone, 'phone', nFields_phone)

In [271]:
masked_medium.to_csv("masked_medium.csv")

In [272]:
b_size = 20
nFields = 4
level = 2
relNoise = 0.20
nFields_phone = 7

masked_high = mask(df, bucketize_age, 'age', b_size)
masked_high = mask(masked_high, blur_zip, 'zip', nFields)
masked_high = mask(masked_high, generalize_diagnosis, 'diagnosis', level)
masked_high = mask(masked_high, add_relative_noise, 'height', relNoise)
masked_high = mask(masked_high, blur_phone, 'phone', nFields_phone)

In [273]:
masked_high.to_csv("masked_high.csv")

In [274]:
masked_high

Unnamed: 0,id,name,gender,age,height,weight,diagnosis,zip,phone
0,1,Beata Holzapfel MBA.,Female,[40.0 - 59.0],151,60,PXX.X,8XXXX,0044XXXXXXX
1,2,Hans-Otto Reinhardt,Male,[60.0 - 79.0],166,86,TXX.X,0XXXX,001XXXXXXX
2,3,Prof. Harald Hölzenbecher,Male,[60.0 - 79.0],174,72,SXX.X,3XXXX,0962XXXXXXX
3,4,Maritta Bloch B.Eng.,Female,[60.0 - 79.0],138,71,DXX.X,8XXXX,0929XXXXXXX
4,5,Carl-Heinz Löwer-Wohlgemut,Male,[40.0 - 59.0],184,70,JXX.X,9XXXX,095XXXXXXX
...,...,...,...,...,...,...,...,...,...
149995,149996,Wojciech Reinhardt MBA.,Male,[40.0 - 59.0],228,52,HXX.X,4XXXX,016XXXXXXX
149996,149997,Univ.Prof. Michael Jüttner MBA.,Male,[100.0 - 119.0],176,89,FXX.X,3XXXX,062XXXXXXX
149997,149998,Denis Schäfer,Male,[20.0 - 39.0],140,78,BXX.X,9XXXX,0005XXXXXXX
149998,149999,Antonietta Bähr-Hahn,Female,[40.0 - 59.0],192,57,JXX.X,0XXXX,0901XXXXXXX


In [249]:
df['zip']

0         80061
1         02512
2         38089
3         89900
4         95357
          ...  
149995    49306
149996    32496
149997    99452
149998    04136
149999    96694
Name: zip, Length: 150000, dtype: object

In [2]:
a = 2
b = 3
(a,b)

(2, 3)