In [1]:
import os
import csv
import math
import itertools
import pickle

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from scipy import sparse
import networkx as nx

from mbi import (
    Dataset,
    FactoredInference,
    Domain,
    LocalInference,
    MixtureInference,
    PublicInference,
)

In [2]:
# Load the preprocessed loans table (CSV) together with its JSON domain file,
# then keep the domain and the record count handy for the cells below.
data = Dataset.load(
    "./data/loans_processed.csv",
    "./data/loans_processed.json",
)
domain = data.domain
total = len(data.df)  # number of rows in the loaded table

In [3]:
# adapted from https://github.com/ryan112358/private-pgm/blob/master/examples/adult_example.py

# Multi-attribute marginals to measure on top of the 1-way marginals.
# A previously used set of 2-way cliques is kept below for reference (disabled):
# cliques = [('LIMIT_BAL', 'BILL_AMT1'), ('SEX', 'BILL_AMT1'), ('EDUCATION', 'BILL_AMT1'), ('MARRIAGE', 'BILL_AMT1'), ('AGE', 'BILL_AMT1'), ('PAY_0', 'BILL_AMT1'), ('PAY_2', 'BILL_AMT1'), ('PAY_3', 'BILL_AMT2'), ('PAY_4', 'BILL_AMT3'), ('PAY_5', 'BILL_AMT4'), ('PAY_6', 'BILL_AMT5'), ('BILL_AMT1', 'BILL_AMT2'), ('BILL_AMT1', 'PAY_AMT6'), ('BILL_AMT1', 'default.payment.next.month'), ('BILL_AMT2', 'BILL_AMT3'), ('BILL_AMT2', 'PAY_AMT1'), ('BILL_AMT3', 'BILL_AMT4'), ('BILL_AMT3', 'PAY_AMT2'), ('BILL_AMT4', 'BILL_AMT5'), ('BILL_AMT4', 'PAY_AMT3'), ('BILL_AMT5', 'BILL_AMT6'), ('BILL_AMT5', 'PAY_AMT4'), ('BILL_AMT6', 'PAY_AMT5')]
cliques = []

# Seed NumPy's global RNG so the noise drawn below is repeatable.
np.random.seed(0)

# The budget is divided evenly over every measurement taken: one per
# attribute in the domain plus one per clique. (With `cliques` empty, the
# whole budget goes to the 1-way marginals.) Each measurement is perturbed
# with Laplace noise of scale 2 / (its share of epsilon).
epsilon = 1.0
epsilon_split = epsilon / (len(data.domain) + len(cliques))
sigma = 2.0 / epsilon_split


def _noisy_marginal(attrs, label):
    """Measure one marginal with Laplace noise.

    `attrs` is handed to `data.project` (a single column name or a tuple of
    names); `label` is the clique tuple stored alongside the measurement.
    Returns the `(query matrix, noisy answer, noise scale, clique)` tuple
    that is appended to `measurements`.
    """
    true_counts = data.project(attrs).datavector()
    noisy_counts = true_counts + np.random.laplace(
        loc=0, scale=sigma, size=true_counts.size
    )
    # Identity query: the marginal is measured cell by cell.
    return (sparse.eye(true_counts.size), noisy_counts, sigma, label)


# 1-way marginals first, then the cliques, so the sequence of random draws
# is the same as measuring them in two consecutive loops.
measurements = [_noisy_marginal(col, (col,)) for col in data.domain]
measurements += [_noisy_marginal(cl, cl) for cl in cliques]

In [4]:
# Fit the graphical model to the noisy measurements.
# `log=True` prints the progress table shown below; in the recorded run the
# time column reads ~21 at iteration 450 (presumably seconds - TODO confirm).
# NOTE(review): the original note here said "dont have enough memory!" -
# presumably that refers to runs with the 2-way cliques enabled; confirm.
engine = FactoredInference(
    domain,
    iters=500,
    log=True,
)
model = engine.estimate(measurements, total=total)

Total clique size: 174606
iteration		time		l1_loss		l2_loss		feasibility


0.00		0.00		184115.98		683482.61		0.00
50.00		1.57		174834.19		178387.36		0.00
100.00		3.63		172260.84		168264.42		0.00
150.00		5.70		170663.31		161767.20		0.00
200.00		7.77		169857.18		157855.69		0.00
250.00		9.90		169532.42		155926.57		0.00
300.00		12.10		169408.17		154993.86		0.00
350.00		14.33		169360.16		154521.11		0.00
400.00		16.77		169340.85		154266.15		0.00
450.00		20.77		169332.81		154119.75		0.00


In [5]:
# Save the fitted model to disk with pickle.
# TODO: record the size of the resulting file.
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when ./model does not exist (e.g. on a fresh checkout).
os.makedirs('./model', exist_ok=True)
with open('./model/loans_synth.pkl', 'wb') as f:
    pickle.dump(model, f)
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this file from a trusted location.

In [6]:
# Sample 30000 synthetic rows from the fitted model and show the resulting
# DataFrame as the cell output.
# TODO: record how long sampling takes.
synth = model.synthetic_data(
    rows=30000,
)
sdf = synth.df
sdf

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,5,0,2,1,25,1,4,1,2,2,2,6015,11424,21928,8334,266,15470,1056,2463,1918,3131,3967,0,0
1,26,0,3,2,12,2,2,2,4,1,2,2152,4551,18397,17627,8506,8375,2107,3695,0,2196,0,928,1
2,8,1,1,3,2,2,2,2,1,3,3,21743,3803,12081,629,192,414,5162,3705,5346,2571,0,6335,0
3,7,1,2,1,16,3,2,0,1,0,3,6626,7532,10168,11201,384,10437,1926,6466,758,171,1795,1617,0
4,3,1,3,2,46,0,2,2,1,2,2,19556,1110,6930,377,7289,12726,0,4608,611,171,1795,1215,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,23,1,3,1,49,3,0,2,2,3,2,3149,347,3207,13336,4376,1848,4942,3217,567,0,6851,2169,0
29996,70,1,1,1,11,2,2,2,2,2,2,4871,4474,14870,10244,4034,5592,5600,0,6470,0,5118,0,0
29997,8,1,1,1,14,2,4,2,4,3,2,22067,347,916,377,4917,1394,6017,4011,6667,1362,0,0,0
29998,66,0,3,2,13,2,0,2,4,2,2,16084,347,2525,17412,11310,19769,1767,6154,6070,2530,949,728,0
