In [57]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pandasql as ps
import string
from faker import Faker

from lib.masking_functions import mask, bucketize_age, blur_zip, generalize_diagnosis, add_relative_noise, blur_phone

In [58]:
def plot_probabilities(dist):
    """Show a bar chart of a distribution, with bars ordered by index.

    Parameters
    ----------
    dist : pandas.Series
        Values to plot, one bar per entry. The input is not modified.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    # sort_index() already returns a new object, so no explicit copy is needed.
    dist_sorted = dist.sort_index()
    positions = list(range(len(dist_sorted)))

    fig, ax = plt.subplots()
    ax.bar(positions, dist_sorted.values)
    # Label each bar with its index entry so the chart can be read on its own.
    ax.set_xticks(positions)
    ax.set_xticklabels([str(label) for label in dist_sorted.index], rotation=90)
    plt.show()

## Data Generation

In [85]:
# One seed value used for both random sources below so reruns produce the same data.
seed = 0

# Faker instance using the German ('de_DE') locale.
fake = Faker('de_DE')

# Faker and NumPy are seeded separately; the cells below draw from both.
fake.seed_instance(seed)
np.random.seed(seed)

from faker.providers import BaseProvider

# create new provider class for diagnosis following the ICD_10_GM medical code
class ICD_10_GM_Provider(BaseProvider):
    """Faker provider that generates random ICD-10-GM-style diagnosis codes."""

    # Letters a code may start with; "U", "W" and "X" are excluded.
    _LETTERS = [c for c in string.ascii_uppercase if c not in "UWX"]

    def diagnosis(self) -> str:
        """Return a random code shaped like ``A01.2``.

        The code is one letter followed by a zero-padded two-digit number and
        one decimal digit. Randomness comes from NumPy's global generator, so
        it follows ``np.random.seed`` rather than the Faker seed.
        """
        # np.random.choice needs a 1-D sequence: a plain str is turned into a
        # 0-d array and rejected with ValueError, hence the list of letters.
        letter = np.random.choice(self._LETTERS)
        # Truncate (not round) to one decimal place, giving 0.0 .. 99.8.
        number = np.trunc(np.random.uniform(0.0, 99.9) * 10) / 10
        # Width 4 with zero fill pads values below 10, e.g. 5.3 -> "05.3".
        return f"{letter}{number:04.1f}"


fake.add_provider(ICD_10_GM_Provider)

In [60]:
nRows = 5000

pid = np.arange(1, nRows + 1)
age = np.random.normal(60, 20, nRows).astype(int)

# Redraw (rather than drop) every age below 18 or above 100, repeating until
# no out-of-range value is left. Replacements come from N(75, 10).
out_of_range = (age < 18) | (age > 100)
invalid = np.count_nonzero(out_of_range)
while invalid > 0:
    age[out_of_range] = np.random.normal(75, 10, invalid).astype(int)
    out_of_range = (age < 18) | (age > 100)
    invalid = np.count_nonzero(out_of_range)

# One Faker call per row and column; only phone numbers go through `fake.unique`.
address = np.asarray([fake.street_address() for _ in range(nRows)])
zip_code = np.asarray([fake.postcode() for _ in range(nRows)])
city = np.asarray([fake.city_name() for _ in range(nRows)])
phone_number = np.asarray([fake.unique.phone_number() for _ in range(nRows)])

In [61]:
# Each profile: (exclusive upper bound of the uniform draw, Faker name method,
# gender label, (mean, sd) of height, (mean, sd) of weight).
gender_profiles = (
    (0.48, "name_male", "Male", (180, 7.5), (80, 10)),
    (0.98, "name_female", "Female", (167, 5), (60, 7.5)),
    (1.00, "name_nonbinary", "Non-Binary", (175, 6), (70, 10)),
)

name = []
gender = []
weight = np.empty(nRows, dtype=int)
height = np.empty(nRows, dtype=int)

for i in range(nRows):
    rand = np.random.rand()
    # rand lies in [0, 1), so the last profile always matches as the fallback.
    _bound, name_method, label, height_dist, weight_dist = next(
        profile for profile in gender_profiles if rand < profile[0]
    )
    name.append(getattr(fake.unique, name_method)())
    gender.append(label)
    # Height is drawn before weight to keep the order of the random stream.
    height[i] = int(np.random.normal(*height_dist))
    weight[i] = int(np.random.normal(*weight_dist))

name = np.asarray(name)
gender = np.asarray(gender)

In [86]:
# (exclusive upper bound of the uniform draw, diagnosis code, medication)
therapy_table = (
    (0.09, "E10", "Insulin"),
    (0.98, "E11", "Metformin"),
    (1.00, "E13", "Insulin"),
)

diagnosis = []
medication = []

for i in range(nRows):
    rand = np.random.rand()
    # rand lies in [0, 1), so the last entry always matches as the fallback.
    code, drug = next(
        (entry_code, entry_drug)
        for bound, entry_code, entry_drug in therapy_table
        if rand < bound
    )
    diagnosis.append(code)
    medication.append(drug)

diagnosis = np.asarray(diagnosis)
medication = np.asarray(medication)

# Lab values are drawn independently of the diagnosis, one draw per row.
glucose = np.asarray([np.random.randint(60, 450) for _ in range(nRows)])
HbA1C = np.asarray([round(np.random.uniform(4, 12), 2) for _ in range(nRows)])

# Insurer identifiers to sample from.
possible_insurance_companies = [
    "104940005", "103306961", "104450915", "109519176", "103508742",
    "101002659", "101575519", "109888001", "108888888", "109500787",
    "109500044", "109500490", "109500398", "108918320", "108814099",
    "108928697", "108811072", "108815718", "108313123", "108918428",
    "108817930", "108811215", "108334056", "108815217", "108312586",
    "109000051",
]
insurance_company = np.asarray(
    [np.random.choice(possible_insurance_companies) for _ in range(nRows)]
)
# Insurance number pattern: one uppercase letter followed by nine digits.
insurance_number = np.asarray(
    [
        fake.bothify(text='?#########', letters='ABCDEFGHIJKLMNOPQRSTUVWXYZ')
        for _ in range(nRows)
    ]
)

### Single Dataset

In [87]:
# Replace the final generated row with one fixed, hand-written patient record.
last = nRows - 1
name[last] = "Henri Allgöwer"
address[last] = "Einsteinufer 17"
# Assigned as an int; the rendered `data` output shows it stored as text ('10587').
zip_code[last] = 10587
phone_number[last] = "01765 123456"
gender[last] = "Male"
height[last] = 178
weight[last] = 68
age[last] = 27
# Also assigned as an int into an array built from string identifiers.
insurance_company[last] = 101575519
diagnosis[last] = "E10"
glucose[last] = 453
HbA1C[last] = 10.13
medication[last] = "Insulin"

# Column name -> array, in the column order of the single-table dataset.
data = dict(
    id=pid,
    name=name,
    address=address,
    zip=zip_code,
    phone=phone_number,
    gender=gender,
    height=height,
    weight=weight,
    age=age,
    insurance_company=insurance_company,
    insurance_number=insurance_number,
    diagnosis=diagnosis,
    glucose=glucose,
    HbA1C=HbA1C,
    medication=medication,
)
data

{'id': array([   1,    2,    3, ..., 1498, 1499, 1500]),
 'name': array(['Sarah Kreusel', 'Ronny Kohl', 'Prof. Heinz-Peter Ruppersberger',
        ..., 'Univ.Prof. Hans-Hinrich Lindner', 'Christiana Noack-Gertz',
        'Henri Allgöwer'], dtype='<U40'),
 'address': array(['Benthinring 6/4', 'Sauerplatz 42', 'Reinhardtweg 41', ...,
        'Xenia-Geisel-Ring 5', 'Kevin-Gerlach-Weg 1/3', 'Einsteinufer 17'],
       dtype='<U33'),
 'zip': array(['34597', '93801', '04500', ..., '84374', '05874', '10587'],
       dtype='<U5'),
 'phone': array(['+49(0)2298 81331', '+49(0) 144074876', '+49(0) 699112940', ...,
        '+49(0)2133 736170', '+49(0)5137 632349', '01765 123456'],
       dtype='<U19'),
 'gender': array(['Female', 'Male', 'Male', ..., 'Male', 'Female', 'Male'],
       dtype='<U10'),
 'height': array([169, 180, 181, ..., 179, 173, 178]),
 'weight': array([62, 73, 70, ..., 93, 65, 68]),
 'age': array([95, 68, 79, ..., 55, 38, 27]),
 'insurance_company': array(['108312586', '103508742'

In [88]:
# Build the single-table dataset; every key of `data` becomes one column.
df_synthetic_data = pd.DataFrame(data)
df_synthetic_data

Unnamed: 0,id,name,address,zip,phone,gender,height,weight,age,insurance_company,insurance_number,diagnosis,glucose,HbA1C,medication
0,1,Sarah Kreusel,Benthinring 6/4,34597,+49(0)2298 81331,Female,169,62,95,108312586,L660487647,E11,65,9.36,Metformin
1,2,Ronny Kohl,Sauerplatz 42,93801,+49(0) 144074876,Male,180,73,68,103508742,R938242194,E11,157,7.77,Metformin
2,3,Prof. Heinz-Peter Ruppersberger,Reinhardtweg 41,04500,+49(0) 699112940,Male,181,70,79,108918428,L924115781,E11,115,9.62,Metformin
3,4,Lilly Geißler,Mühlestr. 565,66369,03525914827,Female,170,51,75,108928697,B659387784,E11,98,9.79,Metformin
4,5,Prof. Inga Trapp B.A.,Martinallee 0,26455,(06188) 949633,Female,171,66,97,104450915,X801609753,E11,109,5.53,Metformin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1496,Traugott Haase,Boucseinallee 2/6,31488,(05386) 644284,Male,179,81,86,101002659,Z130053070,E10,427,4.49,Insulin
1496,1497,Resi Steckel,Irmtraud-Rädel-Platz 19,66857,+49 (0) 1917 774886,Female,170,63,55,104450915,A834254370,E11,64,7.08,Metformin
1497,1498,Univ.Prof. Hans-Hinrich Lindner,Xenia-Geisel-Ring 5,84374,+49(0)2133 736170,Male,179,93,55,103306961,B994756654,E11,60,5.93,Metformin
1498,1499,Christiana Noack-Gertz,Kevin-Gerlach-Weg 1/3,05874,+49(0)5137 632349,Female,173,65,38,108312586,E105772311,E10,263,5.50,Insulin


In [89]:
# NOTE: `dir` shadows the built-in of the same name; it is kept because later
# cells build their output paths from it.
dir = os.getcwd()

# Create the target folder when it is missing so the export also works on a
# fresh checkout, and build the path with os.path.join instead of string
# concatenation.
single_dir = os.path.join(dir, "datasets", "single")
os.makedirs(single_dir, exist_ok=True)
df_synthetic_data.to_csv(os.path.join(single_dir, "syntheticData.csv"), index=False)

### Multiple Dataset

In [66]:
# Columns of the patient table, keyed by the shared `id`.
data_patient = dict(id=pid, name=name, diagnosis=diagnosis, phone=phone_number)

In [67]:
# Patient table as a DataFrame; `df` is reused for each table of the split dataset.
df = pd.DataFrame(data_patient)
df

Unnamed: 0,id,name,diagnosis,phone
0,1,Sarah Kreusel,E11,+49(0)2298 81331
1,2,Ronny Kohl,E11,+49(0) 144074876
2,3,Prof. Heinz-Peter Ruppersberger,E11,+49(0) 699112940
3,4,Lilly Geißler,E11,03525914827
4,5,Prof. Inga Trapp B.A.,E11,(06188) 949633
...,...,...,...,...
1495,1496,Traugott Haase,E11,(05386) 644284
1496,1497,Resi Steckel,E11,+49 (0) 1917 774886
1497,1498,Univ.Prof. Hans-Hinrich Lindner,E11,+49(0)2133 736170
1498,1499,Christiana Noack-Gertz,E11,+49(0)5137 632349


In [68]:
# Export the patient table; index=False keeps the DataFrame row index out of the CSV.
df.to_csv(dir + "/datasets/multiple/patient.csv", index=False)

In [69]:
# Columns of the physical-attributes table, keyed by the shared `id`.
data_physical = dict(id=pid, gender=gender, age=age, height=height, weight=weight)

In [70]:
# Physical-attributes table; this rebinds `df`, replacing the patient frame.
df = pd.DataFrame(data_physical)
df

Unnamed: 0,id,gender,age,height,weight
0,1,Female,95,169,62
1,2,Male,68,180,73
2,3,Male,79,181,70
3,4,Female,75,170,51
4,5,Female,97,171,66
...,...,...,...,...,...
1495,1496,Male,86,179,81
1496,1497,Female,55,170,63
1497,1498,Male,55,179,93
1498,1499,Female,38,173,65


In [71]:
# index=False: the frame already carries an explicit `id` column, and
# patient.csv in the same folder is written without the row index too.
# Without it the CSV gets an extra unnamed first column.
df.to_csv(dir + "/datasets/multiple/physical.csv", index=False)

In [72]:
# Columns of the contact/address table, keyed by the shared `id`.
data_address = dict(id=pid, address=address, zip=zip_code, city=city)

In [73]:
# Address table; this rebinds `df`, replacing the physical-attributes frame.
df = pd.DataFrame(data_address)
df

Unnamed: 0,id,address,zip,city
0,1,Benthinring 6/4,34597,Finsterwalde
1,2,Sauerplatz 42,93801,Büsingen am Hochrhein
2,3,Reinhardtweg 41,04500,Waren
3,4,Mühlestr. 565,66369,Erbisdorf
4,5,Martinallee 0,26455,Roding
...,...,...,...,...
1495,1496,Boucseinallee 2/6,31488,Karlsruhe
1496,1497,Irmtraud-Rädel-Platz 19,66857,Zeulenroda
1497,1498,Xenia-Geisel-Ring 5,84374,Plauen
1498,1499,Kevin-Gerlach-Weg 1/3,05874,Scheinfeld


In [74]:
# Number of distinct street addresses (addresses are generated without `fake.unique`).
np.unique(address).size

1499

In [75]:
# index=False: the frame already carries an explicit `id` column, and
# patient.csv in the same folder is written without the row index too.
# Without it the CSV gets an extra unnamed first column.
df.to_csv(dir + "/datasets/multiple/contact.csv", index=False)

## Generate Masked Tables

In [76]:
# Parameters handed to the masking helpers for the `masked_low` table.
b_size = 5
nFields = 2
level = 1
relNoise = 0.05
nFields_phone = 3

# (masking function, column, parameter), applied in this order.
low_steps = (
    (bucketize_age, 'age', b_size),
    (blur_zip, 'zip', nFields),
    (generalize_diagnosis, 'diagnosis', level),
    (add_relative_noise, 'height', relNoise),
    (blur_phone, 'phone', nFields_phone),
)

# Start from the unmasked frame and feed each result into the next step.
masked_low = df_synthetic_data
for masking_fn, column, parameter in low_steps:
    masked_low = mask(masked_low, masking_fn, column, parameter)

In [77]:
# NOTE(review): written with to_csv's default index=True, so the file gets an
# extra unnamed index column, unlike syntheticData.csv — confirm this is intended.
masked_low.to_csv(dir + "/datasets/masked/masked_low.csv")

In [78]:
# Parameters handed to the masking helpers for the `masked_medium` table.
b_size = 10
nFields = 3
level = 2
relNoise = 0.10
nFields_phone = 5

# (masking function, column, parameter), applied in this order.
medium_steps = (
    (bucketize_age, 'age', b_size),
    (blur_zip, 'zip', nFields),
    (generalize_diagnosis, 'diagnosis', level),
    (add_relative_noise, 'height', relNoise),
    (blur_phone, 'phone', nFields_phone),
)

# Start again from the unmasked frame and feed each result into the next step.
masked_medium = df_synthetic_data
for masking_fn, column, parameter in medium_steps:
    masked_medium = mask(masked_medium, masking_fn, column, parameter)

In [79]:
# NOTE(review): written with to_csv's default index=True, so the file gets an
# extra unnamed index column, unlike syntheticData.csv — confirm this is intended.
masked_medium.to_csv(dir + "/datasets/masked/masked_medium.csv")

In [80]:
# Parameters handed to the masking helpers for the `masked_high` table.
# `level` is 2 here, the same value the masked_medium cell uses.
b_size = 20
nFields = 4
level = 2
relNoise = 0.20
nFields_phone = 7

# (masking function, column, parameter), applied in this order.
high_steps = (
    (bucketize_age, 'age', b_size),
    (blur_zip, 'zip', nFields),
    (generalize_diagnosis, 'diagnosis', level),
    (add_relative_noise, 'height', relNoise),
    (blur_phone, 'phone', nFields_phone),
)

# Start again from the unmasked frame and feed each result into the next step.
masked_high = df_synthetic_data
for masking_fn, column, parameter in high_steps:
    masked_high = mask(masked_high, masking_fn, column, parameter)

In [81]:
# NOTE(review): written with to_csv's default index=True, so the file gets an
# extra unnamed index column, unlike syntheticData.csv — confirm this is intended.
masked_high.to_csv(dir + "/datasets/masked/masked_high.csv")

In [82]:
# Display the most strongly masked table for a visual check.
masked_high

Unnamed: 0,id,name,address,zip,phone,gender,height,weight,age,insurance_company,insurance_number,diagnosis,glucose,HbA1C,medication
0,1,Sarah Kreusel,Benthinring 6/4,3XXXX,+49(0)229XXXXXXX,Female,164,62,[80.0 - 99.0],119942031,X027553380,XX.X,287,6.98,Metformin
1,2,Ronny Kohl,Sauerplatz 42,9XXXX,+49(0) 14XXXXXXX,Male,193,73,[60.0 - 79.0],963370630,I256862433,XX.X,77,9.05,Metformin
2,3,Prof. Heinz-Peter Ruppersberger,Reinhardtweg 41,0XXXX,+49(0) 69XXXXXXX,Male,204,70,[60.0 - 79.0],840222262,T183854926,XX.X,168,7.52,Metformin
3,4,Lilly Geißler,Mühlestr. 565,6XXXX,0352XXXXXXX,Female,142,51,[60.0 - 79.0],808834671,I884097152,XX.X,440,6.76,Metformin
4,5,Prof. Inga Trapp B.A.,Martinallee 0,2XXXX,(06188)XXXXXXX,Female,153,66,[80.0 - 99.0],230984865,N910779846,XX.X,349,11.44,Metformin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1496,Traugott Haase,Boucseinallee 2/6,3XXXX,(05386)XXXXXXX,Male,196,81,[80.0 - 99.0],997183858,P554949604,XX.X,416,10.75,Metformin
1496,1497,Resi Steckel,Irmtraud-Rädel-Platz 19,6XXXX,+49 (0) 1917XXXXXXX,Female,136,63,[40.0 - 59.0],693667453,S840469597,XX.X,336,4.33,Metformin
1497,1498,Univ.Prof. Hans-Hinrich Lindner,Xenia-Geisel-Ring 5,8XXXX,+49(0)2133XXXXXXX,Male,184,93,[40.0 - 59.0],294696901,T540010784,XX.X,449,10.04,Metformin
1498,1499,Christiana Noack-Gertz,Kevin-Gerlach-Weg 1/3,0XXXX,+49(0)5137XXXXXXX,Female,143,65,[20.0 - 39.0],722205014,D514123827,XX.X,442,8.93,Metformin


In [83]:
# `df` was last bound to the address table, so this shows the unmasked zip codes.
df['zip']

0       34597
1       93801
2       04500
3       66369
4       26455
        ...  
1495    31488
1496    66857
1497    84374
1498    05874
1499    10587
Name: zip, Length: 1500, dtype: object

In [84]:
# Scratch cell: builds and displays a throwaway tuple. `a` and `b` are not
# referenced anywhere else in the visible notebook.
a = 2
b = 3
(a,b)

(2, 3)