In [11]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pandasql as ps
import string
import random
from faker import Faker

from lib.masking_functions import mask, bucketize_age, blur_zip, generalize_diagnosis, add_relative_noise, blur_phone

In [12]:
def plot_probabilities(dist):
    """Show a bar chart of ``dist`` with its entries ordered by index.

    Parameters
    ----------
    dist
        Object providing ``copy()``, ``sort_index()`` and ``values``
        (presumably a pandas Series of probabilities; confirm against callers).
        It is copied before sorting, so the argument itself is not reordered.

    The bars are placed at positions 0..n-1; the index values are not used
    as tick labels. Nothing is returned; the figure is shown via ``plt.show()``.
    """
    ordered = dist.copy().sort_index()
    heights = ordered.values
    positions = range(len(heights))

    figure = plt.figure()
    # Axes rectangle is [left, bottom, width, height] in figure fractions,
    # i.e. the axes span the whole figure.
    axes = figure.add_axes([0, 0, 1, 1])
    axes.bar(positions, heights)
    plt.show()

## Data Generation

In [13]:
# Number of synthetic patients to generate and the seed used for every random source.
nRows = 100_000
seed = 0

# Seed the stdlib and NumPy global generators.
random.seed(seed)
np.random.seed(seed)

# German-locale Faker instance, seeded through its own seed_instance() call.
fake = Faker('de_DE')
fake.seed_instance(seed)

from faker.providers import BaseProvider

# Faker provider for random diagnosis codes shaped like ICD-10-GM entries
class ICD_10_GM_Provider(BaseProvider):
    """Faker provider that produces random ICD-10-GM-style diagnosis codes.

    Codes have the form ``<letter><two digits>.<one digit>``, e.g. ``"E05.3"``.
    The letters "U", "W" and "X" are never used.
    """

    # Candidate leading letters. Kept as a list because np.random.choice needs
    # a 1-D sequence; the leading underscore keeps it out of Faker's formatters.
    _LETTERS = [c for c in string.ascii_uppercase if c not in "UWX"]

    def diagnosis(self) -> str:
        """Return one random diagnosis code as a string.

        Randomness comes from NumPy's global generator (``np.random``), so the
        result is controlled by ``np.random.seed`` rather than by Faker's seed.
        """
        # Passing the letters as a plain str would be converted to a 0-d array
        # and rejected with ValueError, so choose from the list instead.
        letter = np.random.choice(self._LETTERS)
        # Uniform draw truncated (not rounded) to one decimal place.
        number = np.trunc(np.random.uniform(0.0, 99.9) * 10) / 10
        # Width 4 with zero fill pads values below 10 to two integer digits.
        return f"{letter}{number:04.1f}"
        
# Register the custom ICD-10-GM provider class with the Faker instance.
fake.add_provider(ICD_10_GM_Provider)

In [14]:
# Sequential patient ids 1..nRows.
pid = np.arange(1, nRows + 1)

# Initial ages: normal(60, 20) draws converted to integers.
age = np.random.normal(60, 20, nRows).astype(int)

# Redraw every age outside [18, 100] from normal(75, 10) until none remain.
out_of_range = (age < 18) | (age > 100)
invalid = np.count_nonzero(out_of_range)
while invalid > 0:
    age[out_of_range] = np.random.normal(75, 10, invalid).astype(int)
    out_of_range = (age < 18) | (age > 100)
    invalid = np.count_nonzero(out_of_range)

# Contact details: one Faker call per patient. Phone numbers go through
# fake.unique; the other fields use the plain generator.
address = np.asarray([fake.street_address() for _ in range(nRows)])
zip_code = np.asarray([fake.postcode() for _ in range(nRows)])
city = np.asarray([fake.city_name() for _ in range(nRows)])
phone_number = np.asarray([fake.unique.phone_number() for _ in range(nRows)])

In [15]:
# Generation profile per gender:
# (upper bound for the uniform draw, label, Faker name method,
#  height (mean, sd), weight (mean, sd))
gender_profiles = (
    (0.48, "Male", "name_male", (180, 7.5), (80, 10)),
    (0.98, "Female", "name_female", (167, 5), (60, 7.5)),
    (1.00, "Non-Binary", "name_nonbinary", (175, 6), (70, 10)),
)

name, gender = [], []
weight = np.empty(nRows).astype(int)
height = np.empty(nRows).astype(int)

for row in range(nRows):
    draw = np.random.rand()
    # Take the first profile whose bound exceeds the draw; rand() is always
    # below 1.0, so the last profile acts as the fallback.
    bound, label, name_method, height_params, weight_params = next(
        profile for profile in gender_profiles if draw < profile[0]
    )
    # Names are requested through fake.unique, then height and weight are drawn.
    name.append(getattr(fake.unique, name_method)())
    gender.append(label)
    height[row] = int(np.random.normal(*height_params))
    weight[row] = int(np.random.normal(*weight_params))

name = np.asarray(name)
gender = np.asarray(gender)

In [16]:
diagnosis, medication = [], []

for row in range(nRows):
    draw = np.random.rand()
    # draw < 0.09          -> E10 / Insulin
    # 0.09 <= draw < 0.98  -> E11 / Metformin
    # draw >= 0.98         -> E13 / Insulin
    if draw >= 0.98:
        code, drug = "E13", "Insulin"
    elif draw >= 0.09:
        code, drug = "E11", "Metformin"
    else:
        code, drug = "E10", "Insulin"
    diagnosis.append(code)
    medication.append(drug)

diagnosis = np.asarray(diagnosis)
medication = np.asarray(medication)

# Lab values are drawn independently of the diagnosis: integer glucose from
# randint(60, 450) and HbA1C from uniform(4, 12) rounded to two decimals.
glucose = np.asarray([np.random.randint(60, 450) for _ in range(nRows)])
HbA1C = np.asarray([round(np.random.uniform(4, 12), 2) for _ in range(nRows)])

In [17]:
# Insurance company identifiers, grouped by fund type.
ersatz_kassen = [104940005, 103306961, 104450915, 109519176, 103508742, 101002659, 101575519]
ikks = [109888001, 108888888, 109500787, 109500044, 109500490, 109500398]
aok = [108918320, 108814099, 108928697, 108811072, 108815718]
bkk = [108313123, 108918428, 108817930, 108811215, 108334056, 108815217, 108312586]
lkk = [109000051]

# Each group's total share is split evenly across its members.
kassen_gruppen = [
    (ersatz_kassen, 0.4),
    (ikks, 0.07),
    (aok, 0.37),
    (bkk, 0.15),
    (lkk, 0.01),
]
alle_kassen = [kasse for gruppe, anteil in kassen_gruppen for kasse in gruppe]
kassen_gewichte = [anteil / len(gruppe) for gruppe, anteil in kassen_gruppen for kasse in gruppe]

# One weighted draw per patient, using the stdlib generator.
insurance_company = np.asarray(random.choices(alle_kassen, weights=kassen_gewichte, k=nRows))

# Insurance number pattern: one uppercase letter followed by nine digits.
insurance_number = np.asarray([
    fake.bothify(text='?#########', letters='ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    for _ in range(nRows)
])

### Single Dataset

In [18]:
# Overwrite the last generated row with a fixed, hand-specified patient.
# id, city and insurance_number keep their generated values for that row.
fixed_patient = (
    (name, "Henri Allgöwer"),
    (address, "Einsteinufer 17"),
    (zip_code, 10587),
    (phone_number, "01765 123456"),
    (gender, "Male"),
    (height, 178),
    (weight, 68),
    (age, 27),
    (insurance_company, 101575519),
    (diagnosis, "E10"),
    (glucose, 453),
    (HbA1C, 10.13),
    (medication, "Insulin"),
)
for column, value in fixed_patient:
    column[nRows - 1] = value

# Column name -> value array for the single-table dataset.
data = {
    'id': pid,
    'name': name,
    'address': address,
    'zip': zip_code,
    'phone': phone_number,
    'gender': gender,
    'height': height,
    'weight': weight,
    'age': age,
    'insurance_company': insurance_company,
    'insurance_number': insurance_number,
    'diagnosis': diagnosis,
    'glucose': glucose,
    'HbA1C': HbA1C,
    'medication': medication,
}
data

{'id': array([     1,      2,      3, ...,  99998,  99999, 100000]),
 'name': array(['Prof. Zbigniew Schwital B.A.', 'Bernadette Tlustek',
        'Friederike Kaul', ..., 'Herr Zeki Haering B.Sc.', 'Franca Bloch',
        'Henri Allgöwer'], dtype='<U45'),
 'address': array(['Benthinring 6/4', 'Sauerplatz 42', 'Reinhardtweg 41', ...,
        'Hanny-Haering-Ring 4/0', 'Antonino-Ruppersberger-Ring 7',
        'Einsteinufer 17'], dtype='<U40'),
 'zip': array(['17608', '25938', '53888', ..., '01648', '30033', '10587'],
       dtype='<U5'),
 'phone': array(['(03667) 87527', '(06194) 75793', '09579188787', ...,
        '+49 (0) 7727 090324', '06865 52922', '01765 123456'], dtype='<U19'),
 'gender': array(['Male', 'Female', 'Female', ..., 'Male', 'Female', 'Male'],
       dtype='<U10'),
 'height': array([171, 170, 174, ..., 170, 167, 178]),
 'weight': array([80, 60, 71, ..., 71, 50, 68]),
 'age': array([95, 68, 79, ..., 74, 36, 27]),
 'insurance_company': array([108313123, 108811072, 108888888

In [19]:
# Assemble the single-table dataset; every key of `data` becomes a column.
df_synthetic_data = pd.DataFrame(data)
df_synthetic_data

Unnamed: 0,id,name,address,zip,phone,gender,height,weight,age,insurance_company,insurance_number,diagnosis,glucose,HbA1C,medication
0,1,Prof. Zbigniew Schwital B.A.,Benthinring 6/4,17608,(03667) 87527,Male,171,80,95,108313123,N450437602,E11,348,10.24,Metformin
1,2,Bernadette Tlustek,Sauerplatz 42,25938,(06194) 75793,Female,170,60,68,108811072,X836726599,E11,393,4.23,Metformin
2,3,Friederike Kaul,Reinhardtweg 41,53888,09579188787,Female,174,71,79,108888888,I148417629,E11,352,9.25,Metformin
3,4,Steve Schulz B.Sc.,Mühlestr. 565,04957,+49(0)5988 14596,Male,175,80,70,103508742,D135937139,E11,404,5.71,Metformin
4,5,Gabriela Lübs,Martinallee 0,98744,03483 477430,Female,165,66,97,108918320,E602685810,E11,442,6.02,Metformin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,Anita Cichorius B.Sc.,Kranzstr. 610,17148,+49 (0) 7048 302904,Female,167,60,53,108928697,Y747695593,E11,143,6.52,Metformin
99996,99997,Sibylle Kallert,Bohlanderstr. 5/7,30086,05831 007896,Female,172,56,19,101575519,S611987093,E11,369,10.83,Metformin
99997,99998,Herr Zeki Haering B.Sc.,Hanny-Haering-Ring 4/0,01648,+49 (0) 7727 090324,Male,170,71,74,109500044,M262647672,E11,430,11.37,Metformin
99998,99999,Franca Bloch,Antonino-Ruppersberger-Ring 7,30033,06865 52922,Female,167,50,36,104450915,Y635440966,E11,449,10.46,Metformin


In [20]:
# Base directory for all exports. `dir` shadows the builtin of the same name,
# but the later export cells read it, so the name is kept.
dir = os.getcwd()
df_synthetic_data.to_csv(f"{dir}/datasets/single/syntheticData.csv", index=False)

### Multiple Dataset

In [21]:
# Columns of the patient table in the multi-table variant of the dataset.
data_patient = dict(id=pid, name=name, diagnosis=diagnosis, phone=phone_number)

In [22]:
# Patient table; `df` is rebound to a different table further down.
df = pd.DataFrame(data_patient)
df

Unnamed: 0,id,name,diagnosis,phone
0,1,Prof. Zbigniew Schwital B.A.,E11,(03667) 87527
1,2,Bernadette Tlustek,E11,(06194) 75793
2,3,Friederike Kaul,E11,09579188787
3,4,Steve Schulz B.Sc.,E11,+49(0)5988 14596
4,5,Gabriela Lübs,E11,03483 477430
...,...,...,...,...
99995,99996,Anita Cichorius B.Sc.,E11,+49 (0) 7048 302904
99996,99997,Sibylle Kallert,E11,05831 007896
99997,99998,Herr Zeki Haering B.Sc.,E11,+49 (0) 7727 090324
99998,99999,Franca Bloch,E11,06865 52922


In [23]:
# Write the patient table without the DataFrame index.
df.to_csv(f"{dir}/datasets/multiple/patient.csv", index=False)

In [24]:
# Columns of the physical-attributes table, keyed by the same patient id.
data_physical = dict(id=pid, gender=gender, age=age, height=height, weight=weight)

In [25]:
# Physical-attributes table; this rebinds `df`.
df = pd.DataFrame(data_physical)
df

Unnamed: 0,id,gender,age,height,weight
0,1,Male,95,171,80
1,2,Female,68,170,60
2,3,Female,79,174,71
3,4,Male,70,175,80
4,5,Female,97,165,66
...,...,...,...,...,...
99995,99996,Female,53,167,60
99996,99997,Female,19,172,56
99997,99998,Male,74,170,71
99998,99999,Female,36,167,50


In [26]:
# Write the physical-attributes table. index=False matches patient.csv and
# avoids an extra unnamed index column that only duplicates the row order.
df.to_csv(dir + "/datasets/multiple/physical.csv", index=False)

In [27]:
# Columns of the address/contact table, keyed by the same patient id.
data_address = dict(id=pid, address=address, zip=zip_code, city=city)

In [28]:
# Address table; this rebinds `df` again.
df = pd.DataFrame(data_address)
df

Unnamed: 0,id,address,zip,city
0,1,Benthinring 6/4,17608,Uelzen
1,2,Sauerplatz 42,25938,Spremberg
2,3,Reinhardtweg 41,53888,Lemgo
3,4,Mühlestr. 565,04957,Rockenhausen
4,5,Martinallee 0,98744,Karlsruhe
...,...,...,...,...
99995,99996,Kranzstr. 610,17148,Scheinfeld
99996,99997,Bohlanderstr. 5/7,30086,Rudolstadt
99997,99998,Hanny-Haering-Ring 4/0,01648,Cottbus
99998,99999,Antonino-Ruppersberger-Ring 7,30033,Viersen


In [29]:
# Number of distinct street addresses among the generated rows.
len(np.unique(address))

97072

In [30]:
# Write the address table. index=False matches patient.csv and avoids an
# extra unnamed index column that only duplicates the row order.
df.to_csv(dir + "/datasets/multiple/contact.csv", index=False)

## Generate Masked Tables

In [31]:
# Low masking level: one parameter per masking function.
b_size = 5
nFields = 2
level = 1
relNoise = 0.05
nFields_phone = 3

# (masking function, column, parameter), applied in this order.
masking_steps = (
    (bucketize_age, 'age', b_size),
    (blur_zip, 'zip', nFields),
    (generalize_diagnosis, 'diagnosis', level),
    (add_relative_noise, 'height', relNoise),
    (blur_phone, 'phone', nFields_phone),
)

# NOTE(review): assumes mask() returns a new frame and leaves
# df_synthetic_data unchanged; confirm in lib.masking_functions.
current = df_synthetic_data
for masking_function, column, parameter in masking_steps:
    current = mask(current, masking_function, column, parameter)
masked_low = current

In [32]:
# NOTE(review): the DataFrame index is written as an extra first column here,
# unlike syntheticData.csv (index=False); confirm this is intended.
masked_low.to_csv(f"{dir}/datasets/masked/masked_low.csv")

In [33]:
# Medium masking level: one parameter per masking function.
b_size = 10
nFields = 3
level = 2
relNoise = 0.10
nFields_phone = 5

# (masking function, column, parameter), applied in this order.
masking_steps = (
    (bucketize_age, 'age', b_size),
    (blur_zip, 'zip', nFields),
    (generalize_diagnosis, 'diagnosis', level),
    (add_relative_noise, 'height', relNoise),
    (blur_phone, 'phone', nFields_phone),
)

# NOTE(review): assumes mask() returns a new frame and leaves
# df_synthetic_data unchanged; confirm in lib.masking_functions.
current = df_synthetic_data
for masking_function, column, parameter in masking_steps:
    current = mask(current, masking_function, column, parameter)
masked_medium = current

In [34]:
# NOTE(review): the DataFrame index is written as an extra first column here,
# unlike syntheticData.csv (index=False); confirm this is intended.
masked_medium.to_csv(f"{dir}/datasets/masked/masked_medium.csv")

In [35]:
# High masking level: one parameter per masking function.
b_size = 20
nFields = 4
level = 2
relNoise = 0.20
nFields_phone = 7

# (masking function, column, parameter), applied in this order.
masking_steps = (
    (bucketize_age, 'age', b_size),
    (blur_zip, 'zip', nFields),
    (generalize_diagnosis, 'diagnosis', level),
    (add_relative_noise, 'height', relNoise),
    (blur_phone, 'phone', nFields_phone),
)

# NOTE(review): assumes mask() returns a new frame and leaves
# df_synthetic_data unchanged; confirm in lib.masking_functions.
current = df_synthetic_data
for masking_function, column, parameter in masking_steps:
    current = mask(current, masking_function, column, parameter)
masked_high = current

In [36]:
# NOTE(review): the DataFrame index is written as an extra first column here,
# unlike syntheticData.csv (index=False); confirm this is intended.
masked_high.to_csv(f"{dir}/datasets/masked/masked_high.csv")

In [37]:
# Display the high-masking result for a visual check of the masked columns.
masked_high

Unnamed: 0,id,name,address,zip,phone,gender,height,weight,age,insurance_company,insurance_number,diagnosis,glucose,HbA1C,medication
0,1,Prof. Zbigniew Schwital B.A.,Benthinring 6/4,1XXXX,(03667XXXXXXX,Male,164,80,[80.0 - 99.0],108313123,N450437602,XX.X,348,10.24,Metformin
1,2,Bernadette Tlustek,Sauerplatz 42,2XXXX,(06194XXXXXXX,Female,199,60,[60.0 - 79.0],108811072,X836726599,XX.X,393,4.23,Metformin
2,3,Friederike Kaul,Reinhardtweg 41,5XXXX,0957XXXXXXX,Female,174,71,[60.0 - 79.0],108888888,I148417629,XX.X,352,9.25,Metformin
3,4,Steve Schulz B.Sc.,Mühlestr. 565,0XXXX,+49(0)598XXXXXXX,Male,155,80,[60.0 - 79.0],103508742,D135937139,XX.X,404,5.71,Metformin
4,5,Gabriela Lübs,Martinallee 0,9XXXX,03483XXXXXXX,Female,165,66,[80.0 - 99.0],108918320,E602685810,XX.X,442,6.02,Metformin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,Anita Cichorius B.Sc.,Kranzstr. 610,1XXXX,+49 (0) 7048XXXXXXX,Female,146,60,[40.0 - 59.0],108928697,Y747695593,XX.X,143,6.52,Metformin
99996,99997,Sibylle Kallert,Bohlanderstr. 5/7,3XXXX,05831XXXXXXX,Female,165,56,[0.0 - 19.0],101575519,S611987093,XX.X,369,10.83,Metformin
99997,99998,Herr Zeki Haering B.Sc.,Hanny-Haering-Ring 4/0,0XXXX,+49 (0) 7727XXXXXXX,Male,199,71,[60.0 - 79.0],109500044,M262647672,XX.X,430,11.37,Metformin
99998,99999,Franca Bloch,Antonino-Ruppersberger-Ring 7,3XXXX,0686XXXXXXX,Female,151,50,[20.0 - 39.0],104450915,Y635440966,XX.X,449,10.46,Metformin


In [38]:
# Show the 'zip' column of `df`. With the cells run in order, `df` is the
# address table built from data_address, so these are the unmasked zip codes.
df['zip']

0        17608
1        25938
2        53888
3        04957
4        98744
         ...  
99995    17148
99996    30086
99997    01648
99998    30033
99999    10587
Name: zip, Length: 100000, dtype: object

In [39]:
# Scratch cell: bind two integers and display them as a tuple.
a, b = 2, 3
(a, b)

(2, 3)