In [1]:
import os
import csv
import math
import itertools
import pickle

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from scipy import sparse
import networkx as nx

from mbi import (
    Dataset,
    FactoredInference,
    Domain,
    LocalInference,
    MixtureInference,
    PublicInference,
)

In [2]:
# Load the pre-processed Adult table together with its accompanying JSON file
# (presumably the attribute-domain specification expected by mbi -- confirm).
data = Dataset.load(
    "./data/adult_processed.csv",
    "./data/adult_processed.json",
)

# Keep the domain and the record count handy; both are passed to the
# inference step later on.
domain = data.domain
total = len(data.df)

In [3]:
# adapted from https://github.com/ryan112358/private-pgm/blob/master/examples/adult_example.py

# Pairwise (2-way) marginals that are measured in addition to every 1-way marginal.
# NOTE(review): many of these pairs involve 'fnlwgt'; if that attribute has a
# large domain it presumably dominates the model size and fitting time -- confirm.
cliques = [
    ('workclass', 'occupation'),
    ('workclass', 'fnlwgt'),
    ('marital.status', 'income'),
    ('fnlwgt', 'occupation'),
    ('fnlwgt', 'race'),
    ('fnlwgt', 'income'),
    ('education', 'education.num'),
    ('fnlwgt', 'capital.loss'),
    ('race', 'native.country'),
    ('relationship', 'sex'),
    ('marital.status', 'relationship'),
    ('occupation', 'sex'),
    ('fnlwgt', 'relationship'),
    ('fnlwgt', 'marital.status'),
    ('fnlwgt', 'sex'),
    ('fnlwgt', 'education'),
    ('age', 'capital.gain'),
    ('capital.gain', 'income'),
    ('relationship', 'income'),
    ('fnlwgt', 'native.country'),
    ('fnlwgt', 'education.num'),
    ('fnlwgt', 'hours.per.week'),
    ('marital.status', 'sex'),
    ('age', 'fnlwgt'),
    ('age', 'capital.loss'),
    ('fnlwgt', 'capital.gain'),
]

# Seed NumPy's global RNG so the Laplace noise drawn below is reproducible.
np.random.seed(0)

# The privacy budget is divided evenly over ALL measurements: one per attribute
# (1-way marginal) plus one per clique. Each measurement therefore receives
# epsilon / (#attributes + #cliques).
epsilon = 1.0
epsilon_split = epsilon / (len(data.domain) + len(cliques))
# Laplace scale used for every measurement.
# NOTE(review): the 2.0 is presumably the L1 sensitivity of a marginal query -- confirm.
sigma = 2.0 / epsilon_split

# Each entry pairs the argument handed to data.project() with the clique tuple
# recorded in the measurement. The 1-way marginals come first and the pairwise
# cliques second, so noise is drawn from the RNG in the same order as before.
marginal_queries = [(attr, (attr,)) for attr in data.domain]
marginal_queries += [(pair, pair) for pair in cliques]

measurements = []
for projection, clique in marginal_queries:
    true_counts = data.project(projection).datavector()
    noisy_counts = true_counts + np.random.laplace(loc=0, scale=sigma, size=true_counts.size)
    # Identity query matrix: every cell of the marginal is measured directly.
    identity = sparse.eye(true_counts.size)
    measurements.append((identity, noisy_counts, sigma, clique))

In [6]:
# WARNING: this cell is very slow -- the original run took roughly 1000 minutes.
# With log=True the run printed a progress table (iteration, time, l1_loss,
# l2_loss, feasibility) while fitting.
engine = FactoredInference(
    domain,
    log=True,
    iters=500,
)

# Fit the model to the noisy marginal measurements, using the real record count
# as the total.
model = engine.estimate(measurements, total=total)

Total clique size: 355979712
iteration		time		l1_loss		l2_loss		feasibility
0.00		0.01		10720602.35		11210028.55		0.00
50.00		4831.63		10711458.22		10714903.93		0.00
100.00		10694.40		10710113.53		10710655.33		0.00
150.00		16660.96		10709301.19		10708655.66		0.00
200.00		22787.77		10708738.82		10707145.16		0.00
250.00		29145.64		10708346.61		10705960.43		0.00
300.00		36203.70		10708067.44		10704995.58		0.00
350.00		42070.43		10707870.26		10704198.27		0.00
400.00		47620.23		10707734.69		10703559.30		0.00
450.00		53178.74		10707623.52		10703052.92		0.00


In [7]:
# Save the fitted model to a file so the expensive inference step does not
# have to be repeated. The resulting pickle is about 5GB.
#
# Create the output directory first: open(..., 'wb') does not create missing
# parent directories, and a FileNotFoundError here would come right after the
# very long inference run.
os.makedirs('./model', exist_ok=True)
with open('./model/adult_synth.pkl', 'wb') as f:
    pickle.dump(model, f)

In [9]:
# Draw 30,000 synthetic records from the fitted model (takes around 45 seconds).
synth = model.synthetic_data(rows=30000)

# Keep a handle on the generated table and show it as the cell's output.
sdf = synth.df
sdf

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,6,4,12445,15,9,0,1,3,4,1,0,0,9,39,0
1,12,4,19268,11,8,0,10,1,4,1,0,0,44,39,0
2,3,4,14421,7,11,0,7,4,1,1,0,0,39,30,0
3,13,6,9822,11,8,0,12,1,4,0,0,0,39,39,0
4,13,4,6249,11,8,0,10,4,4,0,0,0,76,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,25,7,7321,12,13,1,1,5,4,1,0,0,49,39,1
29996,13,2,9056,15,9,1,3,0,4,0,0,0,54,39,0
29997,18,4,7533,11,8,0,3,1,4,0,0,0,39,39,1
29998,36,4,7284,12,13,1,1,0,4,0,0,0,0,39,1
