## Anonymize data (Patient_id, Discharge_id)

Author: Lin Lee Cheong <br>
Last Update: 11/23/2020

In [1]:
import csv
import json

In [2]:
# Load the lookup table that maps each original patient id to its
# anonymized replacement; it is passed to every anonymize() call below.
map_fp = './patient_id_mappings.json'
with open(map_fp) as f:
    mapping = json.loads(f.read())

In [3]:
def anonymize(org_fp, new_fp, mapping):
    """Copy a CSV file, replacing patient ids with their anonymized ids.

    The input must contain ``patient_id`` and ``discharge_id`` header
    columns. For every data row the patient id is looked up in ``mapping``;
    ids written in decimal form (containing a ``.``, e.g. ``"123.0"``) are
    first normalised to their integer string (``"123"``). Every occurrence
    of the original patient id text inside the discharge id is replaced by
    the new id, and the patient id column is overwritten with the new id.
    All other columns are copied unchanged.

    Args:
        org_fp: Path of the original CSV file to read.
        new_fp: Path of the anonymized CSV file to write (overwritten).
        mapping: Dict from original patient id (str) to anonymized id (str).

    Raises:
        ValueError: If the input file has no header row, or the header lacks
            a ``patient_id`` / ``discharge_id`` column.
        Exception: ``'Something wrong in mapping'`` when a patient id cannot
            be normalised or is absent from ``mapping``. The offending
            values and row index are printed first, and the underlying
            error is attached as ``__cause__``.
    """
    print(f'Anonymizing: {org_fp}')
    # newline='' hands line-ending handling to the csv module, as its docs
    # require: it avoids '\r\r\n' on Windows and keeps newlines embedded in
    # quoted fields intact.
    with open(org_fp, 'r', newline='') as in_fh, \
            open(new_fp, 'w', newline='') as out_fh:

        reader = csv.reader(in_fh)
        writer = csv.writer(out_fh, delimiter=',')

        header = next(reader, None)
        if header is None:
            # An empty file would otherwise leak a bare StopIteration.
            raise ValueError(f'No header row found in: {org_fp}')
        pid_idx = header.index('patient_id')
        did_idx = header.index('discharge_id')
        writer.writerow(header)

        for idx, row in enumerate(reader):
            old_pid = row[pid_idx]
            try:
                # "123.0" style ids are keyed in the mapping as "123".
                key = str(int(float(old_pid))) if '.' in old_pid else old_pid
                new_pid = mapping[key]
            except (KeyError, ValueError, OverflowError) as err:
                # Dump the offending record so it can be located in the
                # source file, then fail loudly rather than emit a row that
                # still carries the real id.
                print(old_pid)
                print(row[did_idx])
                print(row)
                print(idx)
                raise Exception('Something wrong in mapping') from err

            # The replacement uses the id text exactly as it appears in the
            # patient_id column (not the normalised key).
            row[did_idx] = row[did_idx].replace(old_pid, new_pid)
            row[pid_idx] = new_pid

            writer.writerow(row)
    print(f'Finished, write to: {new_fp}')

**ANONYMIZE RAW_DATA**

In [4]:
# Source file and the destination for its anonymized copy.
raw_org_fp, raw_new_fp = (
    './raw_data/readmission_input_targets_365_v2.csv',
    './raw_data/readmission_input_targets_365_v2_anony.csv',
)

anonymize(org_fp=raw_org_fp, new_fp=raw_new_fp, mapping=mapping)

Anonymizing: ./raw_data/readmission_input_targets_365_v2.csv
Finished, write to: ./raw_data/readmission_input_targets_365_v2_anony.csv


**ANONYMIZE RAW_DATA IN FOLDS**

In [6]:
# Raw train/test CSVs for each of the 5 folds, and their anonymized targets.
org_train_fps = [
    f'./fold_{idx}/train/raw_train_data.csv'
    for idx in range(5)
]
org_test_fps = [
    f'./fold_{idx}/test/raw_test_data.csv'
    for idx in range(5)
]

new_train_fps = [
    f'./fold_{idx}/train/raw_train_data_anony.csv'
    for idx in range(5)
]
new_test_fps = [
    f'./fold_{idx}/test/raw_test_data_anony.csv'
    for idx in range(5)
]

# All train files first, then all test files.
for org_fp, new_fp in zip(org_train_fps + org_test_fps,
                          new_train_fps + new_test_fps):
    anonymize(org_fp, new_fp, mapping)

Anonymizing: ./fold_0/train/raw_train_data.csv
Finished, write to: ./fold_0/train/raw_train_data_anony.csv
Anonymizing: ./fold_1/train/raw_train_data.csv
Finished, write to: ./fold_1/train/raw_train_data_anony.csv
Anonymizing: ./fold_2/train/raw_train_data.csv
Finished, write to: ./fold_2/train/raw_train_data_anony.csv
Anonymizing: ./fold_3/train/raw_train_data.csv
Finished, write to: ./fold_3/train/raw_train_data_anony.csv
Anonymizing: ./fold_4/train/raw_train_data.csv
Finished, write to: ./fold_4/train/raw_train_data_anony.csv
Anonymizing: ./fold_0/test/raw_test_data.csv
Finished, write to: ./fold_0/test/raw_test_data_anony.csv
Anonymizing: ./fold_1/test/raw_test_data.csv
Finished, write to: ./fold_1/test/raw_test_data_anony.csv
Anonymizing: ./fold_2/test/raw_test_data.csv
Finished, write to: ./fold_2/test/raw_test_data_anony.csv
Anonymizing: ./fold_3/test/raw_test_data.csv
Finished, write to: ./fold_3/test/raw_test_data_anony.csv
Anonymizing: ./fold_4/test/raw_test_data.csv
Finished, write to: ./fold_4/test/raw_test_data_anony.csv

**ANONYMIZE FLATTENED 365 DAYS DATA**

In [49]:
# Flattened 365-day test files for each of the 5 folds.
org_test_fps = [
    f'./fold_{idx}/test/raw_test_data_1000_365days.csv'
    for idx in range(5)
]
new_test_fps = [
    f'./fold_{idx}/test/raw_test_data_1000_365days_anony.csv'
    for idx in range(5)
]

for org_fp, new_fp in zip(org_test_fps, new_test_fps):
    anonymize(org_fp=org_fp, new_fp=new_fp, mapping=mapping)

Anonymizing: ./fold_0/test/raw_test_data_1000_365days.csv
Finished, write to: ./fold_0/test/raw_test_data_1000_365days_anony.csv
Anonymizing: ./fold_1/test/raw_test_data_1000_365days.csv
Finished, write to: ./fold_1/test/raw_test_data_1000_365days_anony.csv
Anonymizing: ./fold_2/test/raw_test_data_1000_365days.csv
Finished, write to: ./fold_2/test/raw_test_data_1000_365days_anony.csv
Anonymizing: ./fold_3/test/raw_test_data_1000_365days.csv
Finished, write to: ./fold_3/test/raw_test_data_1000_365days_anony.csv
Anonymizing: ./fold_4/test/raw_test_data_1000_365days.csv
Finished, write to: ./fold_4/test/raw_test_data_1000_365days_anony.csv


In [4]:
# Flattened 365-day train files for each of the 5 folds.
org_train_fps = [
    f'./fold_{idx}/train/raw_train_data_1000_365days.csv'
    for idx in range(5)
]
new_train_fps = [
    f'./fold_{idx}/train/raw_train_data_1000_365days_anony.csv'
    for idx in range(5)
]

for org_fp, new_fp in zip(org_train_fps, new_train_fps):
    anonymize(org_fp=org_fp, new_fp=new_fp, mapping=mapping)


Anonymizing: ./fold_0/train/raw_train_data_1000_365days.csv
Finished, write to: ./fold_0/train/raw_train_data_1000_365days_anony.csv
Anonymizing: ./fold_1/train/raw_train_data_1000_365days.csv
Finished, write to: ./fold_1/train/raw_train_data_1000_365days_anony.csv
Anonymizing: ./fold_2/train/raw_train_data_1000_365days.csv
Finished, write to: ./fold_2/train/raw_train_data_1000_365days_anony.csv
Anonymizing: ./fold_3/train/raw_train_data_1000_365days.csv
Finished, write to: ./fold_3/train/raw_train_data_1000_365days_anony.csv
Anonymizing: ./fold_4/train/raw_train_data_1000_365days.csv
Finished, write to: ./fold_4/train/raw_train_data_1000_365days_anony.csv


**ANONYMIZE FLATTENED 30 DAYS DATA**

In [50]:
# Flattened 30-day test files for each of the 5 folds.
org_test_fps = [
    f'./fold_{idx}/test/raw_test_data_1000_30days.csv'
    for idx in range(5)
]
new_test_fps = [
    f'./fold_{idx}/test/raw_test_data_1000_30days_anony.csv'
    for idx in range(5)
]

for org_fp, new_fp in zip(org_test_fps, new_test_fps):
    anonymize(org_fp=org_fp, new_fp=new_fp, mapping=mapping)

Anonymizing: ./fold_0/test/raw_test_data_1000_30days.csv
Finished, write to: ./fold_0/test/raw_test_data_1000_30days_anony.csv
Anonymizing: ./fold_1/test/raw_test_data_1000_30days.csv
Finished, write to: ./fold_1/test/raw_test_data_1000_30days_anony.csv
Anonymizing: ./fold_2/test/raw_test_data_1000_30days.csv
Finished, write to: ./fold_2/test/raw_test_data_1000_30days_anony.csv
Anonymizing: ./fold_3/test/raw_test_data_1000_30days.csv
Finished, write to: ./fold_3/test/raw_test_data_1000_30days_anony.csv
Anonymizing: ./fold_4/test/raw_test_data_1000_30days.csv
Finished, write to: ./fold_4/test/raw_test_data_1000_30days_anony.csv


In [None]:
# Flattened 30-day train files for each of the 5 folds.
org_train_fps = [
    f'./fold_{idx}/train/raw_train_data_1000_30days.csv'
    for idx in range(5)
]
new_train_fps = [
    f'./fold_{idx}/train/raw_train_data_1000_30days_anony.csv'
    for idx in range(5)
]

for org_fp, new_fp in zip(org_train_fps, new_train_fps):
    anonymize(org_fp=org_fp, new_fp=new_fp, mapping=mapping)


Anonymizing: ./fold_0/train/raw_train_data_1000_30days.csv
Finished, write to: ./fold_0/train/raw_train_data_1000_30days_anony.csv
Anonymizing: ./fold_1/train/raw_train_data_1000_30days.csv
Finished, write to: ./fold_1/train/raw_train_data_1000_30days_anony.csv
Anonymizing: ./fold_2/train/raw_train_data_1000_30days.csv
Finished, write to: ./fold_2/train/raw_train_data_1000_30days_anony.csv
Anonymizing: ./fold_3/train/raw_train_data_1000_30days.csv
Finished, write to: ./fold_3/train/raw_train_data_1000_30days_anony.csv
Anonymizing: ./fold_4/train/raw_train_data_1000_30days.csv
Finished, write to: ./fold_4/train/raw_train_data_1000_30days_anony.csv


**ANONYMIZE TEST DATA**

In [6]:
# Source file and the destination for its anonymized copy.
org_fp, new_fp = (
    './raw_data/readmissions_data_new_365.csv',
    './raw_data/readmissions_data_new_365_anony.csv',
)

anonymize(org_fp=org_fp, new_fp=new_fp, mapping=mapping)

Anonymizing: ./raw_data/readmissions_data_new_365.csv
Finished, write to: ./raw_data/readmissions_data_new_365_anony.csv


In [7]:
# Flattened 30-day test-phase file and the destination for its anonymized copy.
org_fp, new_fp = (
    './raw_data/readmissions_data_testphase_1000_30days.csv',
    './raw_data/readmissions_data_testphase_1000_30days_anony.csv',
)

anonymize(org_fp=org_fp, new_fp=new_fp, mapping=mapping)

Anonymizing: ./raw_data/readmissions_data_testphase_1000_30days.csv
Finished, write to: ./raw_data/readmissions_data_testphase_1000_30days_anony.csv


In [8]:
# Flattened 365-day test-phase file and the destination for its anonymized copy.
org_fp, new_fp = (
    './raw_data/readmissions_data_testphase_1000_365days.csv',
    './raw_data/readmissions_data_testphase_1000_365days_anony.csv',
)

anonymize(org_fp=org_fp, new_fp=new_fp, mapping=mapping)

Anonymizing: ./raw_data/readmissions_data_testphase_1000_365days.csv
Finished, write to: ./raw_data/readmissions_data_testphase_1000_365days_anony.csv
