In [7]:
import os
import csv
import math
import itertools
import pickle

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from scipy import sparse
import networkx as nx

from mbi import (
    Dataset,
    FactoredInference,
    Domain,
    LocalInference,
    MixtureInference,
    PublicInference,
)

In [8]:
# Load the processed diabetes table together with its companion JSON file
# (presumably the attribute domain specification — confirm against mbi docs).
data = Dataset.load(
    "./data/diabetes_processed.csv",
    "./data/diabetes_processed.json",
)

# Keep the domain and the number of records handy for the inference step.
domain, total = data.domain, len(data.df)

In [9]:
# adapted from https://github.com/ryan112358/private-pgm/blob/master/examples/adult_example.py

# Attribute pairs whose 2-way marginals are measured in addition to every
# 1-way marginal.
cliques = [
    ('bmi', 'HbA1c_level'),
    ('HbA1c_level', 'diabetes'),
    ('age', 'bmi'),
    ('bmi', 'diabetes'),
    ('blood_glucose_level', 'diabetes'),
    ('smoking_history', 'bmi'),
    ('bmi', 'blood_glucose_level'),
    ('gender', 'bmi'),
    ('age', 'hypertension'),
    ('age', 'heart_disease'),
]

# Seed the global NumPy generator so the noise draws below are reproducible.
np.random.seed(0)

# The privacy budget is split evenly across ALL measurements (one per
# attribute plus one per clique); every marginal gets the same share.
epsilon = 1.0
epsilon_split = epsilon / (len(data.domain) + len(cliques))
# Laplace scale shared by every measurement. The 2.0 numerator is presumably
# the L1 sensitivity of a marginal under record replacement — TODO confirm.
sigma = 2.0 / epsilon_split

# (projection key, clique label) for each marginal to measure: all 1-way
# marginals first, then the selected 2-way marginals. The order is kept
# because the noise is drawn sequentially from the seeded global generator.
targets = [(col, (col,)) for col in data.domain] + [(cl, cl) for cl in cliques]

measurements = []
for key, clique in targets:
    # Exact marginal counts as a flat vector.
    x = data.project(key).datavector()
    # Noisy answer: independent Laplace noise added to every cell.
    y = x + np.random.laplace(loc=0, scale=sigma, size=x.size)
    # Identity query matrix: each cell of the marginal is measured directly.
    I = sparse.eye(x.size)
    measurements.append((I, y, sigma, clique))

In [None]:
# Estimate a model from the noisy marginal measurements built above.
# This takes under 2 minutes; with log=True, progress is printed while fitting.
engine = FactoredInference(domain, log=True, iters=500)
model = engine.estimate(measurements, total=total)

Total clique size: 777609
iteration		time		l1_loss		l2_loss		feasibility
0.00		0.00		669939.12		6683571.77		0.00
50.00		6.52		639931.08		643470.77		0.00
100.00		15.77		637909.02		637436.20		0.00
150.00		25.42		636661.74		634798.90		0.00
200.00		35.28		635696.56		632701.01		0.00
250.00		46.59		634913.81		630883.89		0.00
300.00		58.40		634224.00		629164.12		0.00
350.00		72.07		633651.51		627562.73		0.00
400.00		83.45		633184.12		626100.14		0.00
450.00		94.41		632752.92		624661.55		0.00


In [None]:
# Save the model to a file
# this is about 13MB
# Create the output directory first so open() does not fail with
# FileNotFoundError when ./model does not exist yet.
os.makedirs('./model', exist_ok=True)
with open('./model/diabetes_synth.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Draw 100,000 synthetic rows from the fitted model; this takes around 10 seconds.
synth = model.synthetic_data(rows=100000)
# Underlying table of the synthetic dataset.
sdf = synth.df
# Trailing expression so the notebook renders the synthetic table.
sdf

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,72,0,0,4,824,2,0,0
1,1,92,0,0,1,601,0,5,0
2,1,55,0,0,0,1547,5,5,0
3,1,48,0,0,1,1493,3,4,0
4,1,59,0,0,1,1868,1,3,0
...,...,...,...,...,...,...,...,...,...
99995,1,21,0,0,1,939,7,6,0
99996,1,40,0,0,0,754,4,6,0
99997,0,24,0,0,0,3117,7,3,0
99998,1,101,0,0,4,2123,1,0,0
