In [12]:
import pandas as pd
import numpy as np
import networkx as nx
import random

In [13]:
###################
### PIMA INDIAN ###
###################

# pre-generated random graph structure
#
# Tags every training record with a 1-based "id", saves the tagged table, and
# writes a random Erdos-Renyi graph over those ids as an adjacency list
# (one line per node: "<id>,<neighbor>,<neighbor>,...").

PID_train = pd.read_csv("data/PID_train.csv")
PID_train.insert(0, "id", range(1, len(PID_train) + 1))
PID_train.to_csv("data/PID_train_new.csv", index=False)

num_nodes = len(PID_train)
probability = 0.1
# Fixed seed: without it every run writes a different graph file, so results
# that depend on data/PID_graph.csv could not be reproduced.
G = nx.erdos_renyi_graph(n=num_nodes, p=probability, seed=42)

# The generator labels nodes 0..n-1; relabel each positional node with the
# "id" of the record at that row. Reading the column once avoids building a
# row Series per node via .iloc[i].
node_mapping = dict(enumerate(PID_train["id"].tolist()))
G = nx.relabel_nodes(G, node_mapping)

adj_list = {node: [int(neighbor) for neighbor in G.neighbors(node)] for node in G.nodes()}

with open("data/PID_graph.csv", "w") as f:
    for node, neighbors in adj_list.items():
        # A node without neighbors is written as "<id>," (trailing comma kept).
        line = f"{int(node)}," + ",".join(map(str, neighbors)) + "\n"
        f.write(line)

In [49]:
# Load the source text of the SaNGreeA PID script and execute it inside this
# notebook session.
with open('SaNGreeA-anonymisation_PID/src/SaNGreeA_PID.py') as file:
    code = file.read()

# NOTE(review): exec() without explicit namespaces runs the script in the
# notebook's own globals, so names the script defines persist in (and may
# overwrite names in) this kernel. Only run a trusted script this way.
# Presumably the script reads the data/PID_* files and writes
# output/PID_anonymized.csv -- confirm against the script itself.
exec(code)

Starting SaNGreeA algorithm...
Found age range of: [21:72]
Found blood pressure range of: [24.0:122.0]
Found insulin range of: [0.0:846.0]
Successfully built 67 clusters.


In [12]:
# Save local transformed dataset
#
# Replaces the Age / BloodPressure / Insulin columns of the id-tagged training
# table with the values found in output/PID_anonymized.csv, drops the helper
# "id" column, and writes the result to data/PID_train_LT.csv.

PID_train_LT = pd.read_csv("output/PID_anonymized.csv")
PID_train = pd.read_csv("data/PID_train_new.csv")

# Column assignment aligns on the index: if the two files had different row
# counts, pandas would silently pad with NaN or discard rows. Fail loudly.
if len(PID_train_LT) != len(PID_train):
    raise ValueError(
        f"output/PID_anonymized.csv has {len(PID_train_LT)} rows but "
        f"data/PID_train_new.csv has {len(PID_train)}; cannot pair records by position"
    )

# NOTE(review): rows are paired purely by position. This assumes the
# anonymised file lists records in the same order as data/PID_train_new.csv
# -- confirm against the anonymiser's output.
# The header names after the first one carry a leading space in the
# anonymised file, hence the ' BloodPressure' / ' Insulin' keys.
PID_train['Age'] = PID_train_LT['Age']
PID_train['BloodPressure'] = PID_train_LT[' BloodPressure']
PID_train['Insulin'] = PID_train_LT[' Insulin']
# Non-inplace drop so the statement reads as an explicit rebinding.
PID_train = PID_train.drop(columns=['id'])

PID_train.to_csv("data/PID_train_LT.csv", index=False)

In [6]:
###########################
### CREDIT CARD DEFAULT ###
###########################

# Tags every training record with a 1-based "id", recodes EDUCATION to text
# labels, keeps a seeded 50% sample, and writes a sparse random graph over the
# sampled ids as an adjacency list ("<id>,<neighbor>,<neighbor>,...").

CCD_train = pd.read_csv("data/CCD_train.csv")
CCD_train.insert(0, "id", range(1, len(CCD_train) + 1))

# Codes 0, 5 and 6 are all folded into 'unknown'.
education_mapping = {
    0: 'unknown',
    1: 'graduate school',
    2: 'university',
    3: 'high school',
    4: 'others',
    5: 'unknown',
    6: 'unknown'
}

CCD_train['EDUCATION'] = CCD_train['EDUCATION'].replace(education_mapping)

## pre-generated random graph structure
subset_size = int(0.5 * len(CCD_train))
CCD_train_subset = CCD_train.sample(n=subset_size, random_state=42).reset_index(drop=True)
CCD_train_subset.to_csv("data/CCD_train_subset.csv", index=False)
num_nodes = len(CCD_train_subset)

probability = 0.0001
# Same G(n, p) model as erdos_renyi_graph, but erdos_renyi_graph tests every
# node pair (O(n^2)) whereas fast_gnp_random_graph runs in O(n + m), which is
# what a graph this sparse needs. The fixed seed makes the written graph
# reproducible, matching the seeded sample above.
G = nx.fast_gnp_random_graph(n=num_nodes, p=probability, seed=42)

# The generator labels nodes 0..n-1; relabel each positional node with the
# "id" of the sampled record at that row. Reading the column once avoids
# building a row Series per node via .iloc[i].
node_mapping = dict(enumerate(CCD_train_subset["id"].tolist()))
G = nx.relabel_nodes(G, node_mapping)

adj_list = {node: [int(neighbor) for neighbor in G.neighbors(node)] for node in G.nodes()}

with open("data/CCD_graph.csv", "w") as f:
    for node, neighbors in adj_list.items():
        # A node without neighbors is written as "<id>," (trailing comma kept).
        line = f"{int(node)}," + ",".join(map(str, neighbors)) + "\n"
        f.write(line)

In [10]:
# Save local transformed dataset
#
# Replaces the EDUCATION / LIMIT_BAL / PAY_AMT1 / BILL_AMT1 columns of the
# sampled training table with the values found in output/CCD_anonymized.csv,
# drops the helper "id" column, and writes data/CCD_train_LT.csv.

CCD_train_LT = pd.read_csv("output/CCD_anonymized.csv")
CCD_train_subset = pd.read_csv("data/CCD_train_subset.csv")

# Column assignment aligns on the index: if the two files had different row
# counts, pandas would silently pad with NaN or discard rows. Fail loudly.
if len(CCD_train_LT) != len(CCD_train_subset):
    raise ValueError(
        f"output/CCD_anonymized.csv has {len(CCD_train_LT)} rows but "
        f"data/CCD_train_subset.csv has {len(CCD_train_subset)}; cannot pair records by position"
    )

# NOTE(review): rows are paired purely by position. This assumes the
# anonymised file lists records in the same order as
# data/CCD_train_subset.csv -- confirm against the anonymiser's output.
# The header names after the first one carry a leading space in the
# anonymised file, hence the ' LIMIT_BAL' style keys.
CCD_train_subset['EDUCATION'] = CCD_train_LT['EDUCATION']
CCD_train_subset['LIMIT_BAL'] = CCD_train_LT[' LIMIT_BAL']
CCD_train_subset['PAY_AMT1'] = CCD_train_LT[' PAY_AMT1']
CCD_train_subset['BILL_AMT1'] = CCD_train_LT[' BILL_AMT1']
# Non-inplace drop so the statement reads as an explicit rebinding.
CCD_train_subset = CCD_train_subset.drop(columns=['id'])

CCD_train_subset.to_csv("data/CCD_train_LT.csv", index=False)

In [7]:
#####################
### CENSUS INCOME ###
#####################

# Tags every training record with a 1-based "id", keeps a seeded 20% sample,
# strips surrounding whitespace from two categorical columns, and writes a
# sparse random graph over the sampled ids as an adjacency list
# ("<id>,<neighbor>,<neighbor>,...").

KDD_train = pd.read_csv("data/KDD_train.csv")
KDD_train.insert(0, "id", range(1, len(KDD_train) + 1))

## pre-generated random graph structure
subset_size = int(0.2 * len(KDD_train))
KDD_train_subset = KDD_train.sample(n=subset_size, random_state=42).reset_index(drop=True)
KDD_train_subset['birth_country_mother'] = KDD_train_subset['birth_country_mother'].str.strip()
KDD_train_subset['marital_stat'] = KDD_train_subset['marital_stat'].str.strip()
KDD_train_subset.to_csv("data/KDD_train_subset.csv", index=False)
num_nodes = len(KDD_train_subset)

probability = 0.0001
# Same G(n, p) model as erdos_renyi_graph, but erdos_renyi_graph tests every
# node pair (O(n^2)) whereas fast_gnp_random_graph runs in O(n + m), which is
# what a graph this sparse needs. The fixed seed makes the written graph
# reproducible, matching the seeded sample above.
G = nx.fast_gnp_random_graph(n=num_nodes, p=probability, seed=42)

# The generator labels nodes 0..n-1; relabel each positional node with the
# "id" of the sampled record at that row. Reading the column once avoids
# building a row Series per node via .iloc[i].
node_mapping = dict(enumerate(KDD_train_subset["id"].tolist()))
G = nx.relabel_nodes(G, node_mapping)

adj_list = {node: [int(neighbor) for neighbor in G.neighbors(node)] for node in G.nodes()}

with open("data/KDD_graph.csv", "w") as f:
    for node, neighbors in adj_list.items():
        # A node without neighbors is written as "<id>," (trailing comma kept).
        line = f"{int(node)}," + ",".join(map(str, neighbors)) + "\n"
        f.write(line)

In [13]:
# Save local transformed dataset
#
# Replaces the age / birth_country_mother / marital_stat columns of the
# sampled training table with the values found in output/KDD_anonymized.csv,
# drops the helper "id" column, and writes data/KDD_train_LT.csv.

KDD_train_LT = pd.read_csv("output/KDD_anonymized.csv")
KDD_train_subset = pd.read_csv("data/KDD_train_subset.csv")

# Column assignment aligns on the index: if the two files had different row
# counts, pandas would silently pad with NaN or discard rows. Fail loudly.
if len(KDD_train_LT) != len(KDD_train_subset):
    raise ValueError(
        f"output/KDD_anonymized.csv has {len(KDD_train_LT)} rows but "
        f"data/KDD_train_subset.csv has {len(KDD_train_subset)}; cannot pair records by position"
    )

# NOTE(review): rows are paired purely by position. This assumes the
# anonymised file lists records in the same order as
# data/KDD_train_subset.csv -- confirm against the anonymiser's output.
# The header names after the first one carry a leading space in the
# anonymised file, hence the ' birth_country_mother' style keys.
KDD_train_subset['age'] = KDD_train_LT['age']
KDD_train_subset['birth_country_mother'] = KDD_train_LT[' birth_country_mother']
KDD_train_subset['marital_stat'] = KDD_train_LT[' marital_stat']
# Non-inplace drop so the statement reads as an explicit rebinding.
KDD_train_subset = KDD_train_subset.drop(columns=['id'])

KDD_train_subset.to_csv("data/KDD_train_LT.csv", index=False)