In [34]:
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import math
import pandasql as ps
import time
import string
import random
from faker import Faker

from lib.masking_functions import mask, bucketize_age, blur_zip, generalize_diagnosis, add_relative_noise, blur_phone

In [35]:
def plot_probabilities(dist):
    """Show a bar chart of the values of ``dist``, ordered by its sorted index.

    Parameters
    ----------
    dist
        Object providing ``copy()``, ``sort_index()`` and ``values``
        (presumably a pandas Series -- confirm with callers). It is copied
        before sorting, so the argument itself is left untouched.

    Notes
    -----
    Bars are placed at positions ``0 .. n-1``; the index labels themselves are
    not drawn. The axes are created with the rectangle ``[0, 0, 1, 1]``, i.e.
    they span the whole figure. The figure is shown via ``plt.show()`` and
    nothing is returned.
    """
    ordered = dist.copy().sort_index()
    heights = ordered.values
    positions = range(len(heights))

    figure = plt.figure()
    axes = figure.add_axes([0, 0, 1, 1])
    axes.bar(positions, heights)
    plt.show()

## Data Generation

In [36]:
# Seed every random source used by this notebook (stdlib `random`, NumPy's
# global RNG and Faker's shared generator) so that "Restart & Run All"
# regenerates the same synthetic data set instead of a new one on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# German-locale Faker instance used for names, addresses, phone numbers, ...
fake = Faker('de_DE')
Faker.seed(SEED)

from faker.providers import BaseProvider

# create new provider class for diagnosis following the ICD_10_GM medical code
class ICD_10_GM_Provider(BaseProvider):
    """Faker provider that emits random diagnosis codes shaped like ``A05.3``."""

    def diagnosis(self) -> str:
        """Return one random code: a letter followed by a one-decimal number.

        The letter is drawn with ``random.choice`` from the upper-case ASCII
        alphabet without U, W and X. The number is drawn uniformly from
        [0.0, 99.9), truncated to one decimal place, and prefixed with "0"
        when it is below 10 so that the integer part has two digits.
        """
        allowed_letters = "".join(
            ch for ch in string.ascii_uppercase if ch not in "UWX"
        )
        prefix = random.choice(allowed_letters)
        value = np.trunc(np.random.uniform(0.0, 99.9) * 10) / 10
        padding = "0" if value < 10 else ""
        return prefix + padding + str(value)


# Register the provider on the shared Faker instance.
fake.add_provider(ICD_10_GM_Provider)

In [37]:
# Number of synthetic patients to generate.
nRows = 1500

# Patient ids run from 1 to nRows inclusive.
pid = np.arange(1, nRows + 1)

# Ages: N(60, 20) truncated to integers.
age = np.random.normal(60, 20, nRows).astype(int)

# Every age below 18 or above 100 is redrawn from N(75, 10); repeat until all
# ages lie within [18, 100].
out_of_range = (age < 18) | (age > 100)
while out_of_range.any():
    n_redraw = np.count_nonzero(out_of_range)
    age[out_of_range] = np.random.normal(75, 10, n_redraw).astype(int)
    out_of_range = (age < 18) | (age > 100)

# Contact details, one Faker call per row; only phone numbers are forced to be
# unique.
address = np.asarray([fake.street_address() for _ in range(nRows)])
zip_code = np.asarray([fake.postcode() for _ in range(nRows)])
city = np.asarray([fake.city_name() for _ in range(nRows)])
phone_number = np.asarray([fake.unique.phone_number() for _ in range(nRows)])

In [38]:
name = []
gender = []
# Allocate the integer buffers directly. `np.empty(n).astype(int)` would cast
# uninitialised float memory to int, which yields undefined values and can
# emit "invalid value encountered in cast" warnings. Every slot is overwritten
# in the loop below, so the initial content is irrelevant.
weight = np.empty(nRows, dtype=int)
height = np.empty(nRows, dtype=int)

# Draw a gender per row (48 % male, 50 % female, 2 % non-binary) and sample a
# matching unique name plus height/weight from gender-specific normal
# distributions; the sampled values are truncated to integers.
for i in range(nRows):
    rand = np.random.rand()
    if rand < 0.48:
        name.append(fake.unique.name_male())
        gender.append("Male")
        height[i] = int(np.random.normal(180, 7.5))
        weight[i] = int(np.random.normal(80, 10))
    elif rand < 0.98:
        name.append(fake.unique.name_female())
        gender.append("Female")
        height[i] = int(np.random.normal(167, 5))
        weight[i] = int(np.random.normal(60, 7.5))
    else:
        name.append(fake.unique.name_nonbinary())
        gender.append("Non-Binary")
        height[i] = int(np.random.normal(175, 6))
        weight[i] = int(np.random.normal(70, 10))

# Convert the Python lists to arrays so all columns share the same container type.
name = np.asarray(name)
gender = np.asarray(gender)

In [51]:
diagnosis = []
medication = []

# One uniform draw per patient decides the diagnosis/medication pair:
#   [0.00, 0.09) -> E10 / Insulin
#   [0.09, 0.98) -> E11 / Metformin
#   [0.98, 1.00) -> E13 / Insulin
for _ in range(nRows):
    draw = np.random.rand()
    if draw >= 0.98:
        code, drug = "E13", "Insulin"
    elif draw >= 0.09:
        code, drug = "E11", "Metformin"
    else:
        code, drug = "E10", "Insulin"
    diagnosis.append(code)
    medication.append(drug)

diagnosis = np.asarray(diagnosis)
medication = np.asarray(medication)

# Lab values: integer glucose in [60, 450) and HbA1C in [4, 12] rounded to two
# decimals, each drawn independently per patient.
glucose_values = [np.random.randint(60, 450) for _ in range(nRows)]
hba1c_values = [round(random.uniform(4, 12), 2) for _ in range(nRows)]
glucose = np.asarray(glucose_values)
HbA1C = np.asarray(hba1c_values)

# Insurance identifiers built from bothify patterns: nine digits for the
# company, one upper-case letter followed by nine digits for the member number.
company_ids = [fake.bothify(text='#########') for _ in range(nRows)]
member_ids = [
    fake.bothify(text='?#########', letters='ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    for _ in range(nRows)
]
insurance_company = np.asarray(company_ids)
insurance_number = np.asarray(member_ids)

### Single Dataset

In [40]:
# Replace the last generated record with a fixed, hand-written patient.
# `insurance_number` is not overridden and keeps its generated value.
last_row = nRows - 1

# Identity and contact details.
name[last_row] = "Henri Allgöwer"
address[last_row] = "Einsteinufer 17"
zip_code[last_row] = 10587  # integer literal assigned into the postcode array
phone_number[last_row] = "01765 123456"

# Physical attributes.
gender[last_row] = "Male"
height[last_row] = 178
weight[last_row] = 68
age[last_row] = 27

# Insurance and medical record.
insurance_company[last_row] = 101575519
diagnosis[last_row] = "E10"
glucose[last_row] = 453
HbA1C[last_row] = 10.13
medication[last_row] = "Insulin"

# Column name -> value array for the single, denormalised data set.
data = {
    'id': pid,
    'name': name,
    'address': address,
    'zip': zip_code,
    'phone': phone_number,
    'gender': gender,
    'height': height,
    'weight': weight,
    'age': age,
    'insurance_company': insurance_company,
    'insurance_number': insurance_number,
    'diagnosis': diagnosis,
    'glucose': glucose,
    'HbA1C': HbA1C,
    'medication': medication,
}
data

{'id': array([   1,    2,    3, ..., 1498, 1499, 1500]),
 'name': array(['Ewa Bolnbach B.Eng.', 'Prof. Kamil Killer',
        'Zdravko Junck-Ritter', ..., 'Grzegorz Striebitz-Steinberg',
        'Mohamed Speer-Trapp', 'Henri Allgöwer'], dtype='<U41'),
 'address': array(['Raphael-Mitschke-Ring 54', 'Sybille-Gehringer-Platz 5/1',
        'Urte-Buchholz-Weg 05', ..., 'Bekir-Hübel-Straße 9',
        'Häringgasse 463', 'Einsteinufer 17'], dtype='<U34'),
 'zip': array(['44323', '62668', '02863', ..., '92770', '73153', '10587'],
       dtype='<U5'),
 'phone': array(['+49 (0) 8587 907438', '(06798) 192770', '+49(0) 228387024', ...,
        '+49(0)5554 89976', '+49(0)0221 378223', '+49(0)9475 88536'],
       dtype='<U19'),
 'gender': array(['Female', 'Male', 'Male', ..., 'Male', 'Male', 'Male'],
       dtype='<U10'),
 'height': array([167, 186, 182, ..., 192, 181, 178]),
 'weight': array([49, 72, 65, ..., 77, 75, 68]),
 'age': array([33, 49, 60, ..., 50, 29, 27]),
 'insurance_company': array(['

In [41]:
# Build the single data set: one DataFrame column per entry of `data`.
df_synthetic_data = pd.DataFrame(data)
df_synthetic_data

Unnamed: 0,id,name,address,zip,phone,gender,height,weight,age,insurance_company,insurance_number,diagnosis,glucose,HbA1C,medication
0,1,Ewa Bolnbach B.Eng.,Raphael-Mitschke-Ring 54,44323,+49 (0) 8587 907438,Female,167,49,33,386481362,X366630042,E11,299,10.17,Metformin
1,2,Prof. Kamil Killer,Sybille-Gehringer-Platz 5/1,62668,(06798) 192770,Male,186,72,49,597941858,Z382985482,E11,148,10.96,Metformin
2,3,Zdravko Junck-Ritter,Urte-Buchholz-Weg 05,02863,+49(0) 228387024,Male,182,65,60,621170846,B567964048,E11,153,5.48,Metformin
3,4,Wieland auch Schlauchin,Annaliese-Ritter-Straße 8,53291,04614385395,Male,176,78,64,487441118,K129212042,E11,266,6.34,Metformin
4,5,Dipl.-Ing. Cengiz Peukert,Metzallee 5,62747,05808 937242,Male,195,73,62,873298780,O671592897,E11,394,4.30,Metformin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1496,Luka Seifert,Volker-Beier-Gasse 337,48065,+49(0)4885 931840,Female,171,64,66,726051066,I021399442,E11,105,8.07,Metformin
1496,1497,Prof. Rosalinde Mosemann B.A.,Kostolzinallee 025,28890,(01314) 003357,Female,169,51,47,500314300,L120238073,E11,244,6.93,Metformin
1497,1498,Grzegorz Striebitz-Steinberg,Bekir-Hübel-Straße 9,92770,+49(0)5554 89976,Male,192,77,50,713021715,Z005101642,E11,300,8.34,Metformin
1498,1499,Mohamed Speer-Trapp,Häringgasse 463,73153,+49(0)0221 378223,Male,181,75,29,413137099,S101016474,E11,287,9.84,Metformin


In [42]:
# Base directory for all exports. NOTE: `dir` shadows the built-in of the same
# name; it is kept because the later export cells build their paths from it.
dir = os.getcwd()

# Write the single data set without the DataFrame index column.
single_csv_path = dir + "/datasets/single/syntheticData.csv"
df_synthetic_data.to_csv(single_csv_path, index=False)

### Multiple Datasets

In [45]:
# Columns of the patient table of the multi-table data set.
data_patient = dict(
    id=pid,
    name=name,
    diagnosis=diagnosis,
    phone=phone_number,
)

In [46]:
# Patient table: one DataFrame column per entry of `data_patient`.
df = pd.DataFrame(data_patient)
df

Unnamed: 0,id,name,diagnosis,phone
0,1,Dr. Magrit Bonbach MBA.,P89.1,00419757814
1,2,Hanspeter Beyer B.A.,Z35.9,07300014205
2,3,Anatolij Roskoth B.A.,L85.7,+49 (0) 3924 478029
3,4,Dipl.-Ing. Teresa Karz B.Sc.,V54.1,02894 343064
4,5,Björn Misicher,N73.9,(01843) 61458
5,6,Herrmann Tröst B.Sc.,G38.8,+49(0)4200 21613
6,7,Henner Putz B.Eng.,R09.8,+49 (0) 0058 889198
7,8,Traude Hövel-Aumann,K70.7,04948 083163
8,9,Gertraud Langern,V80.4,+49(0)9389 06593
9,10,Univ.Prof. Liselotte Hecker B.A.,V45.4,+49(0) 441083097


In [62]:
# Export the patient table without the DataFrame index column.
patient_csv_path = dir + "/datasets/multiple/patient.csv"
df.to_csv(patient_csv_path, index=False)

In [63]:
# Columns of the physical-attributes table of the multi-table data set.
data_physical = dict(
    id=pid,
    gender=gender,
    age=age,
    height=height,
    weight=weight,
)

In [64]:
# Physical-attributes table; note that this rebinds `df`.
df = pd.DataFrame(data_physical)
df

Unnamed: 0,id,gender,age,height,weight
0,1,Female,47,165,53
1,2,Male,88,180,84
2,3,Male,32,180,91
3,4,Female,62,162,57
4,5,Male,78,180,79
5,6,Male,65,183,70
6,7,Male,53,180,83
7,8,Female,26,168,54
8,9,Female,58,162,63
9,10,Female,43,167,68


In [65]:
# Export the physical-attributes table.
# NOTE(review): unlike the patient export, `index=False` is not passed here, so
# the DataFrame index is written as an extra leading column -- confirm whether
# that is intended.
physical_csv_path = dir + "/datasets/multiple/physical.csv"
df.to_csv(physical_csv_path)

In [66]:
# Columns of the address table of the multi-table data set.
data_address = dict(
    id=pid,
    address=address,
    zip=zip_code,
    city=city,
)

In [67]:
# Address table; note that this rebinds `df` once more.
df = pd.DataFrame(data_address)
df

Unnamed: 0,id,address,zip,city
0,1,Olaf-Zobel-Allee 9,24455,Eichstätt
1,2,Schomberplatz 7,16661,Osterode am Harz
2,3,Römerplatz 1,23765,Burg
3,4,Stiffelstr. 9,60825,Rottweil
4,5,Paffrathallee 9/5,78781,Pritzwalk
5,6,Mendegasse 84,53313,Hoyerswerda
6,7,Neuschäferstraße 20,16128,Mallersdorf
7,8,Jolanda-Wilms-Allee 9,44109,Guben
8,9,Schülerallee 0,47725,Neubrandenburg
9,10,Gerta-Krause-Gasse 843,26285,Pasewalk


In [68]:
# Number of distinct street addresses among the generated rows.
len(np.unique(address))

50

In [69]:
# Export the address table as contact.csv.
# NOTE(review): unlike the patient export, `index=False` is not passed here, so
# the DataFrame index is written as an extra leading column -- confirm whether
# that is intended.
contact_csv_path = dir + "/datasets/multiple/contact.csv"
df.to_csv(contact_csv_path)

## Generate Masked Tables

In [70]:
# "Low" masking preset: one parameter per masking function used below.
b_size = 5
nFields = 2
level = 1
relNoise = 0.05
nFields_phone = 3

# Chain the masking steps: each call receives the result of the previous one.
# (`mask` and the masking functions come from lib.masking_functions; presumably
# each call returns a new masked frame -- verify there.)
masked_low = df_synthetic_data
for masking_fn, column, setting in (
    (bucketize_age, 'age', b_size),
    (blur_zip, 'zip', nFields),
    (generalize_diagnosis, 'diagnosis', level),
    (add_relative_noise, 'height', relNoise),
    (blur_phone, 'phone', nFields_phone),
):
    masked_low = mask(masked_low, masking_fn, column, setting)

In [71]:
# Export the low-masking table; the DataFrame index is written as well because
# `index=False` is not passed.
masked_low_csv_path = dir + "/datasets/masked/masked_low.csv"
masked_low.to_csv(masked_low_csv_path)

In [72]:
# "Medium" masking preset: one parameter per masking function used below.
b_size = 10
nFields = 3
level = 2
relNoise = 0.10
nFields_phone = 5

# Chain the masking steps: each call receives the result of the previous one.
# (`mask` and the masking functions come from lib.masking_functions; presumably
# each call returns a new masked frame -- verify there.)
masked_medium = df_synthetic_data
for masking_fn, column, setting in (
    (bucketize_age, 'age', b_size),
    (blur_zip, 'zip', nFields),
    (generalize_diagnosis, 'diagnosis', level),
    (add_relative_noise, 'height', relNoise),
    (blur_phone, 'phone', nFields_phone),
):
    masked_medium = mask(masked_medium, masking_fn, column, setting)

In [73]:
# Export the medium-masking table; the DataFrame index is written as well
# because `index=False` is not passed.
masked_medium_csv_path = dir + "/datasets/masked/masked_medium.csv"
masked_medium.to_csv(masked_medium_csv_path)

In [74]:
# "High" masking preset: one parameter per masking function used below.
# NOTE(review): `level` is 2 here, the same value as in the medium preset,
# while every other parameter increases -- confirm this is intended.
b_size = 20
nFields = 4
level = 2
relNoise = 0.20
nFields_phone = 7

# Chain the masking steps: each call receives the result of the previous one.
# (`mask` and the masking functions come from lib.masking_functions; presumably
# each call returns a new masked frame -- verify there.)
masked_high = df_synthetic_data
for masking_fn, column, setting in (
    (bucketize_age, 'age', b_size),
    (blur_zip, 'zip', nFields),
    (generalize_diagnosis, 'diagnosis', level),
    (add_relative_noise, 'height', relNoise),
    (blur_phone, 'phone', nFields_phone),
):
    masked_high = mask(masked_high, masking_fn, column, setting)

In [75]:
# Export the high-masking table; the DataFrame index is written as well because
# `index=False` is not passed.
masked_high_csv_path = dir + "/datasets/masked/masked_high.csv"
masked_high.to_csv(masked_high_csv_path)

In [76]:
# Display the table produced with the "high" masking parameters for a visual check.
masked_high

Unnamed: 0,id,name,gender,age,height,weight,diagnosis,address,zip,phone
0,1,Dr. Magrit Bonbach MBA.,Female,[40.0 - 59.0],191,53,PXX.X,Olaf-Zobel-Allee 9,2XXXX,0041XXXXXXX
1,2,Hanspeter Beyer B.A.,Male,[80.0 - 99.0],167,84,ZXX.X,Schomberplatz 7,1XXXX,0730XXXXXXX
2,3,Anatolij Roskoth B.A.,Male,[20.0 - 39.0],146,91,LXX.X,Römerplatz 1,2XXXX,+49 (0) 3924XXXXXXX
3,4,Dipl.-Ing. Teresa Karz B.Sc.,Female,[60.0 - 79.0],188,57,VXX.X,Stiffelstr. 9,6XXXX,02894XXXXXXX
4,5,Björn Misicher,Male,[60.0 - 79.0],215,79,NXX.X,Paffrathallee 9/5,7XXXX,(01843XXXXXXX
5,6,Herrmann Tröst B.Sc.,Male,[60.0 - 79.0],158,70,GXX.X,Mendegasse 84,5XXXX,+49(0)420XXXXXXX
6,7,Henner Putz B.Eng.,Male,[40.0 - 59.0],158,83,RXX.X,Neuschäferstraße 20,1XXXX,+49 (0) 0058XXXXXXX
7,8,Traude Hövel-Aumann,Female,[20.0 - 39.0],173,54,KXX.X,Jolanda-Wilms-Allee 9,4XXXX,04948XXXXXXX
8,9,Gertraud Langern,Female,[40.0 - 59.0],148,63,VXX.X,Schülerallee 0,4XXXX,+49(0)938XXXXXXX
9,10,Univ.Prof. Liselotte Hecker B.A.,Female,[40.0 - 59.0],200,68,VXX.X,Gerta-Krause-Gasse 843,2XXXX,+49(0) 44XXXXXXX


In [77]:
# Show the unmasked zip column; `df` was last bound to the frame built from
# `data_address`, so this cell depends on that cell having run most recently.
df['zip']

0     24455
1     16661
2     23765
3     60825
4     78781
5     53313
6     16128
7     44109
8     47725
9     26285
10    84413
11    99364
12    21885
13    61019
14    50037
15    39327
16    38001
17    39626
18    53465
19    95348
20    57178
21    13615
22    49513
23    62506
24    68521
25    01897
26    39478
27    25116
28    66383
29    81076
30    03280
31    83084
32    42513
33    70517
34    22558
35    23293
36    06187
37    97154
38    01856
39    84565
40    91153
41    14542
42    71417
43    69164
44    96801
45    81922
46    01468
47    23497
48    03000
49    78621
Name: zip, dtype: object

In [78]:
# Scratch cell: bind two integers and display them as a tuple.
a, b = 2, 3
(a, b)

(2, 3)