In [1]:
import os
import csv
import math
import itertools
import pickle

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from scipy import sparse
import networkx as nx

from mbi import (
    Dataset,
    FactoredInference,
    Domain,
    LocalInference,
    MixtureInference,
    PublicInference,
)

In [2]:
# Load the processed COMPAS CSV together with its companion JSON file, then keep
# the dataset's domain and its number of rows for the inference steps below.
data = Dataset.load(
    "./data/compas_processed.csv",
    "./data/compas_processed.json",
)
domain, total = data.domain, len(data.df)

In [3]:
# adapted from https://github.com/ryan112358/private-pgm/blob/master/examples/adult_example.py

# Attribute pairs whose 2-way marginals are measured on top of the 1-way ones.
cliques = [
    ('days_in_jail', 'c_days_from_compas'),
    ('age', 'days_in_jail'),
    ('is_violent_recid', 'is_recid'),
    ('sex', 'c_days_from_compas'),
    ('age', 'v_decile_score'),
    ('decile_score', 'v_decile_score'),
    ('age', 'c_days_from_compas'),
    ('priors_count', 'days_in_jail'),
    ('race', 'c_days_from_compas'),
    ('decile_score', 'is_recid'),
]

# Seed NumPy's global RNG so the Laplace noise drawn below is reproducible.
np.random.seed(0)

# The privacy budget is divided evenly: every measured marginal (one per
# attribute in the domain plus one per clique) gets the same share of epsilon.
epsilon = 1.0
epsilon_split = epsilon / (len(data.domain) + len(cliques))
# Scale of the Laplace noise added to every marginal.
# NOTE(review): the factor 2.0 is presumably the assumed L1 sensitivity of a
# marginal -- confirm against the privacy definition in use.
sigma = 2.0 / epsilon_split

# Each target pairs the argument handed to data.project() with the clique tuple
# recorded in the measurement: all 1-way marginals first, then the 2-way cliques
# (this order also fixes the order in which random noise is drawn).
targets = [(col, (col,)) for col in data.domain] + [(cl, cl) for cl in cliques]

measurements = []
for projection, clique in targets:
    counts = data.project(projection).datavector()
    noisy_counts = counts + np.random.laplace(loc=0, scale=sigma, size=counts.size)
    # Measurement tuple: (identity query matrix, noisy answer, noise scale, clique).
    measurements.append((sparse.eye(counts.size), noisy_counts, sigma, clique))

In [None]:
# THIS TAKES 27m33s
# Build a FactoredInference engine over `domain` (progress logging on, iters=500)
# and estimate a model from the noisy `measurements`.
# `total` is the row count of the loaded dataset, passed to estimate() as `total`.
engine = FactoredInference(domain, log=True, iters=500)
model = engine.estimate(measurements, total=total)

Total clique size: 16505315
iteration		time		l1_loss		l2_loss		feasibility
0.00		0.00		351113.04		617043.60		0.00
50.00		126.54		344623.30		345062.99		0.00
100.00		297.56		343839.05		343373.05		0.00
150.00		467.22		343545.42		342562.66		0.00
200.00		641.80		343421.08		342054.91		0.00
250.00		813.30		343364.93		341783.07		0.00
300.00		982.96		343338.83		341641.03		0.00
350.00		1147.24		343325.36		341560.94		0.00
400.00		1316.19		343316.93		341513.66		0.00
450.00		1487.56		343310.98		341483.94		0.00


In [5]:
# Save the model to a file
# open(..., 'wb') does not create missing parent directories, so make sure
# ./model exists first; otherwise the save raises FileNotFoundError after the
# long-running fit. exist_ok=True makes re-running this cell harmless.
os.makedirs('./model', exist_ok=True)
with open('./model/compas_synth.pkl', 'wb') as f:
    pickle.dump(model, f)

In [6]:
# this takes around <1 seconds
# Draw 7000 synthetic rows from the fitted model and display the resulting
# DataFrame (the bare `sdf` on the last line is the cell's output).
synth = model.synthetic_data(rows=7000)
sdf = synth.df
sdf

Unnamed: 0,age,sex,decile_score,priors_count,race,days_in_jail,c_days_from_compas,is_violent_recid,v_decile_score,is_recid
0,3,1,10,0,2,336,464,0,8,0
1,19,1,3,1,0,0,0,0,5,1
2,31,1,1,7,3,0,1,0,1,0
3,56,0,6,26,0,67,0,0,8,0
4,2,1,9,0,0,0,1,0,7,0
...,...,...,...,...,...,...,...,...,...,...
6995,13,1,8,0,0,1,0,0,8,1
6996,15,1,1,18,2,354,1,0,1,0
6997,8,1,7,11,2,312,237,0,5,1
6998,5,1,7,0,0,0,1,0,5,0


# Without heuristic

In [7]:
# Attribute pairs whose 2-way marginals are measured on top of the 1-way ones.
cliques = [
    ('age', 'v_decile_score'),
    ('age', 'days_in_jail'),
    ('sex', 'c_days_from_compas'),
    ('decile_score', 'v_decile_score'),
    ('decile_score', 'is_recid'),
    ('priors_count', 'days_in_jail'),
    ('race', 'c_days_from_compas'),
    ('days_in_jail', 'c_days_from_compas'),
    ('is_violent_recid', 'is_recid'),
]

# Seed NumPy's global RNG so the Laplace noise drawn below is reproducible.
np.random.seed(0)

# The privacy budget is divided evenly per measurement: every marginal (one per
# attribute in the domain plus one per clique) gets the same share of epsilon.
epsilon = 1.0
epsilon_split = epsilon / (len(data.domain) + len(cliques))
# Scale of the Laplace noise added to every marginal.
# NOTE(review): the factor 2.0 is presumably the assumed L1 sensitivity of a
# marginal -- confirm against the privacy definition in use.
sigma = 2.0 / epsilon_split

# Each target pairs the argument handed to data.project() with the clique tuple
# recorded in the measurement: all 1-way marginals first, then the 2-way cliques
# (this order also fixes the order in which random noise is drawn).
targets = [(col, (col,)) for col in data.domain] + [(cl, cl) for cl in cliques]

measurements = []
for projection, clique in targets:
    counts = data.project(projection).datavector()
    noisy_counts = counts + np.random.laplace(loc=0, scale=sigma, size=counts.size)
    # Measurement tuple: (identity query matrix, noisy answer, noise scale, clique).
    measurements.append((sparse.eye(counts.size), noisy_counts, sigma, clique))

In [None]:
# THIS TAKES 33s
# Build a FactoredInference engine over `domain` (progress logging on, iters=500)
# and estimate a model from the noisy `measurements`.
# `total` is the row count of the loaded dataset, passed to estimate() as `total`.
engine = FactoredInference(domain, log=True, iters=500)
model = engine.estimate(measurements, total=total)

Total clique size: 299877
iteration		time		l1_loss		l2_loss		feasibility
0.00		0.00		308281.72		601422.45		0.00
50.00		2.46		301875.87		302515.89		0.00
100.00		5.77		301148.78		300873.14		0.00
150.00		9.21		300855.12		300210.33		0.00
200.00		12.68		300709.32		299737.13		0.00
250.00		16.22		300632.63		299410.48		0.00
300.00		19.63		300589.73		299192.56		0.00
350.00		23.03		300566.58		299054.17		0.00
400.00		26.58		300552.50		298963.48		0.00
450.00		30.11		300545.10		298903.01		0.00


In [9]:
# Save the model to a file
# open(..., 'wb') does not create missing parent directories, so make sure
# ./model exists first; otherwise the save raises FileNotFoundError.
# exist_ok=True makes re-running this cell harmless.
os.makedirs('./model', exist_ok=True)
with open('./model/compas_synth_mst.pkl', 'wb') as f:
    pickle.dump(model, f)

In [10]:
# this takes around <1 seconds
# Draw 7000 synthetic rows from the fitted model and display the resulting
# DataFrame (the bare `sdf` on the last line is the cell's output).
synth = model.synthetic_data(rows=7000)
sdf = synth.df
sdf

Unnamed: 0,age,sex,decile_score,priors_count,race,days_in_jail,c_days_from_compas,is_violent_recid,v_decile_score,is_recid
0,17,1,6,38,0,0,0,0,2,0
1,23,1,1,7,2,348,380,0,1,0
2,13,1,6,0,0,0,0,0,4,0
3,52,0,7,4,2,70,203,0,3,0
4,18,1,4,5,2,1,1,0,2,0
...,...,...,...,...,...,...,...,...,...,...
6995,12,1,4,2,2,1,574,0,2,0
6996,25,1,10,0,0,1,1,0,1,1
6997,7,1,7,1,2,0,3,0,4,0
6998,8,1,4,2,0,1,1,0,6,0
