In [1]:
import sys
import numpy as np
sys.path.append("LogisticCircuit")
sys.path.append("pypsdd")
from structure.Vtree import Vtree as LC_Vtree

from collections import defaultdict

import pdb

from vtree import Vtree as PSDD_Vtree
from manager import PSddManager
import psdd_io
from data import Inst, InstMap


import itertools
from algo.LogisticCircuit import LogisticCircuit

import circuit_expect
from sympy import *
from utils import *

from scipy.special import logit
from scipy.special import expit

import gzip, pickle

# Prefer the high-resolution timer; fall back to wall-clock time() only when
# perf_counter really cannot be imported (it was added in Python 3.3).
try:
    from time import perf_counter
except ImportError:
    # Catch ImportError only, so that KeyboardInterrupt / SystemExit and
    # unrelated failures are not silently swallowed by the fallback.
    from time import time
    perf_counter = time




In [2]:
def print_summary(obs, psdd, lgc, title = "title"):
    """Print P(x), the expected prediction and its standard deviation for one observation.

    Parameters
    ----------
    obs : list
        One entry per feature. The analysis cells in this notebook pass ``-1``
        for features they leave unobserved and ``0``/``1`` otherwise.
    psdd
        Object whose ``value(inst_map)`` result is reported as ``P(x)``.
    lgc
        Circuit handed, together with ``psdd``, to ``circuit_expect``.
    title : str
        Label printed on the first line of the summary.

    Returns
    -------
    tuple
        ``(P, expect)``: the result of ``psdd.value`` and the result of
        ``circuit_expect.Expectation`` (printed as ``Exp f(x)``).
    """
    ins = InstMap.from_list(obs)
    # A single cache instance is handed to both circuit_expect calls below.
    cache = EVCache()
    P = psdd.value(ins)
    expect = circuit_expect.Expectation(psdd, lgc, cache, np.array([obs]))
    mom2 = circuit_expect.moment(psdd, lgc, 2, cache, np.array([obs]))
    # E[f^2] - E[f]^2 can come out marginally negative through floating-point
    # cancellation; clamp at zero so the square root never yields NaN.
    variance = np.maximum(mom2 - expect * expect, 0.0)
    std = np.sqrt(variance)
    print(title, ": ")
    print("P(x) = {}".format(P), end = "")
    print(" Exp f(x) = {} , std = {}".format(expect, std))
    return P, expect



def filter_data_indexes(x_data, observation):
    """Return the row indexes of ``x_data`` that agree with a partial observation.

    Parameters
    ----------
    x_data : numpy.ndarray
        2-D array with one sample per row.
    observation : sequence
        Required value per column; ``-1`` means "any value". Only the first
        ``len(observation)`` columns are inspected.

    Returns
    -------
    numpy.ndarray
        1-D integer array of matching row indexes in ascending order (empty
        when nothing matches), suitable for fancy-indexing the labels.
    """
    data = np.asarray(x_data)
    obs = np.asarray(observation)

    # Columns that actually carry a constraint (everything that is not -1).
    constrained = np.flatnonzero(obs != -1)
    if constrained.size == 0:
        # No constraint at all: every row matches.
        return np.arange(data.shape[0])

    # Vectorised comparison replaces the per-row / per-column Python loops.
    matches = np.all(data[:, constrained] == obs[constrained], axis=1)
    return np.flatnonzero(matches)

In [3]:
# All model artefacts come from the same training-run directory.
_RUN_DIR = "exp/new-reg-circuit-grid/insurance/insurance_20190520-184809"

VTREE_FILE = _RUN_DIR + "/insurance.vtree"
GLC_FILE = _RUN_DIR + "/best/insurance.glc"
PSDD_FILE = _RUN_DIR + "/best/insurance.psdd"

# Gzipped pickle holding the train / valid / test splits.
DATASET = "data/insurance/insurance.pklz"

# Passed as the class count when the logistic circuit is constructed.
CLASSES = 1

In [4]:
# Load the dataset splits, the logistic circuit and the PSDD used by every
# analysis cell below. This cell defines x_*/y_*, lgc and psdd.
print("Loading Data")

# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file;
# only point DATASET at files from a trusted source.
with gzip.open(DATASET, 'rb') as f:
    data_splits = pickle.load(f)

# The pickle is unpacked as three (features, targets) pairs.
(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = data_splits


# The same vtree file is parsed twice: once by the LogisticCircuit library's
# Vtree class here, and once by the pypsdd Vtree class further down.
print("Loading Vtree..")
lc_vtree = LC_Vtree.read(VTREE_FILE)

print("Loading Logistic Circuit...")
with open(GLC_FILE) as circuit_file:
    lgc = LogisticCircuit(lc_vtree, CLASSES, circuit_file=circuit_file)

print("Loading PSDD..")
psdd_vtree = PSDD_Vtree.read(VTREE_FILE)
manager = PSddManager(psdd_vtree)
psdd = psdd_io.psdd_yitao_read(PSDD_FILE, manager)

Loading Data
Loading Vtree..
Loading Logistic Circuit...
Loading PSDD..


In [5]:
# Observation vectors have 36 entries; -1 leaves a feature unobserved.
# Columns 6-7 are the smoking indicator pair. Other cells annotate [0, 1] as
# "Smoking = Yes", so A is presumably the non-smoker case -- TODO confirm.
obsA = [-1] * 36
obsA[6:8] = [1, 0]

obsB = [-1] * 36
obsB[6:8] = [0, 1]

PA, expA = print_summary(obsA, psdd, lgc, title="Smoker A")
PB, expB = print_summary(obsB, psdd, lgc, title="Smoker B")

print("Difference in smoke exp = {}".format(expA - expB))
print("-----------")

# Columns 12-13 are the gender indicator pair ([1, 0] is annotated as Female
# in a later cell).
obsC = [-1] * 36
obsC[12:14] = [1, 0]

obsD = [-1] * 36
obsD[12:14] = [0, 1]

PC, expC = print_summary(obsC, psdd, lgc, title="Gender C")
PD, expD = print_summary(obsD, psdd, lgc, title="Gender D")
print("Difference in gender exp = {}".format(expC - expD))

Smoker A : 
P(x) = 0.7299238540320626 Exp f(x) = [[8741.74725831]] , std = [[4780.46254591]]
Smoker B : 
P(x) = 0.18403566812744454 Exp f(x) = [[31355.32630489]] , std = [[10772.20981438]]
Difference in smoke exp = [[-22613.57904658]]
-----------
Gender C : 
P(x) = 0.43929081570173034 Exp f(x) = [[13196.54892638]] , std = [[10412.04797653]]
Gender D : 
P(x) = 0.4717830801484342 Exp f(x) = [[14170.12546934]] , std = [[11592.53850812]]
Difference in gender exp = [[-973.57654295]]


In [6]:
# Every (smoking, gender) combination, laid out as
# [col 6, col 7, col 12, col 13].
cases = [
    [0, 1, 0, 1],
    [0, 1, 1, 0],
    [1, 0, 0, 1],
    [1, 0, 1, 0],
]

for case in cases:
    obs = [-1] * 36
    obs[6:8] = case[:2]      # smoking pair
    obs[12:14] = case[2:]    # gender pair

    # Matching-row counts in the train / valid / test splits, in that order.
    A, B, C = (
        filter_data_indexes(split, obs).shape[0]
        for split in (x_train, x_valid, x_test)
    )
    print("number of samples: {},{},{}".format(A, B, C))
    P, EXP = print_summary(obs, psdd, lgc, title="(Smoke, Gender) {}".format(case))

number of samples: 115,19,25
(Smoke, Gender) [0, 1, 0, 1] : 
P(x) = 0.09623782748058203 Exp f(x) = [[31529.64537063]] , std = [[10768.56804479]]
number of samples: 77,18,20
(Smoke, Gender) [0, 1, 1, 0] : 
P(x) = 0.06345906743820999 Exp f(x) = [[31496.91407013]] , std = [[10626.50061444]]
number of samples: 368,67,82
(Smoke, Gender) [1, 0, 0, 1] : 
P(x) = 0.33779927958641853 Exp f(x) = [[8622.0919159]] , std = [[4767.37701358]]
number of samples: 376,83,88
(Smoke, Gender) [1, 0, 1, 0] : 
P(x) = 0.34144646362220654 Exp f(x) = [[9041.18946706]] , std = [[4717.10035995]]


In [7]:
# This prints expC - expD, i.e. the gender gap computed in an earlier cell
# (no per-region difference is calculated here), so it is labelled as such.
print("Difference in gender exp = {}".format(expC - expD))

# One-hot region indicator over columns 8-11; each case selects one region.
cases = [
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1]
]

for case in cases:
    obs = [-1 for i in range(36)]
    obs[8:12] = case
    # A, B, C are the index arrays of matching rows in each split.
    A = filter_data_indexes(x_train, obs)
    B = filter_data_indexes(x_valid, obs)
    C = filter_data_indexes(x_test,  obs)
    print("number of samples: {},{},{}".format(A.shape[0],B.shape[0],C.shape[0]))
    # Empirical mean [std] of the training targets for this region, printed
    # next to the circuit-based expectation below for comparison.
    print("====> {} [{}]".format(np.mean(y_train[A]), np.std(y_train[A])))
    P, EXP = print_summary(obs, psdd, lgc, "(Location) {}".format(case))

Difference regions = [[-973.57654295]]
number of samples: 217,49,58
====> 13634.793071382488 [11300.283828493622]
(Location) [1, 0, 0, 0] : 
P(x) = 0.17312534052078693 Exp f(x) = [[12828.35686092]] , std = [[9633.78506226]]
number of samples: 225,55,45
====> 12184.409216088889 [10195.820168255574]
(Location) [0, 1, 0, 0] : 
P(x) = 0.17410529263006871 Exp f(x) = [[11294.21609901]] , std = [[8478.89313213]]
number of samples: 254,49,61
====> 15009.115852480314 [14220.295149318237]
(Location) [0, 0, 1, 0] : 
P(x) = 0.22958593119440648 Exp f(x) = [[15501.69365602]] , std = [[12776.28918214]]
number of samples: 240,34,51
====> 12561.456521875 [11779.305256540987]
(Location) [0, 0, 0, 1] : 
P(x) = 0.19078206349276028 Exp f(x) = [[11081.53682305]] , std = [[9514.35456259]]


In [8]:
# Condition on region, gender and smoking at the same time.
combObs = [-1 for i in range(36)]
combObs[8:12] = [0, 0, 1, 0] # region
combObs[12:14] = [0, 1] # gender
combObs[6:8] = [0,1] # Smoking

# Training-row indexes are computed once and reused for the count and for the
# empirical statistics.
idx = filter_data_indexes(x_train, combObs)
A = idx.shape[0]
B = filter_data_indexes(x_valid, combObs).shape[0]
C = filter_data_indexes(x_test,  combObs).shape[0]
print("number of samples: {},{},{}".format(A,B,C))
print(np.shape(idx)[0], np.mean(y_train[idx]), np.std(y_train[idx]))
# Label the summary with combObs itself: `case` is only a leftover loop
# variable from an earlier cell and does not describe this observation.
P, EXP = print_summary(combObs, psdd, lgc, "Combo {}".format(combObs))

number of samples: 38,6,11
38 34950.69068578947 11417.378152507816
Combo [0, 0, 0, 1] : 
P(x) = 0.027755383690319903 Exp f(x) = [[32852.29320981]] , std = [[10946.98523972]]


In [9]:
# Constrain the region columns only; the other constraints stay disabled.
combObs = [-1] * 36
# combObs[6:8] = [0, 1] # Smoking = Yes
# combObs[0:6] = [0,1,0,0,0,0] # Children = 1
# combObs[12:14] = [1, 0] # Gender = Female
# NOTE(review): annotated "Region = southeast" here, but another cell labels
# [0, 0, 1, 0] as southeast -- confirm the one-hot encoding.
combObs[8:12] = [0, 1, 0, 0]

# Matching-row counts per split; only their total is reported.
A, B, C = (
    filter_data_indexes(split, combObs).shape[0]
    for split in (x_train, x_valid, x_test)
)
print("number of samples: {}".format(A + B + C))

# Empirical target statistics over the matching training rows.
idx = filter_data_indexes(x_train, combObs)
print(len(idx), np.mean(y_train[idx]), np.std(y_train[idx]))

number of samples: 325
225 12184.409216088889 10195.820168255574


In [10]:
# Condition on children, smoking, region and gender together.
combObs = [-1] * 36
combObs[0:6] = [0, 1, 0, 0, 0, 0]   # Children = 1
combObs[6:8] = [0, 1]               # Smoking = Yes
# NOTE(review): region label unconfirmed -- the annotation here wavered between
# southeast and southwest, and another cell labels [0, 0, 1, 0] as southeast.
combObs[8:12] = [0, 1, 0, 0]
combObs[12:14] = [1, 0]             # Gender = Female

# Matching-row counts in the train / valid / test splits.
A, B, C = (
    filter_data_indexes(split, combObs).shape[0]
    for split in (x_train, x_valid, x_test)
)
print("number of samples: {},{},{}".format(A, B, C))

# Empirical target statistics over the matching training rows.
idx = filter_data_indexes(x_train, combObs)
print(len(idx), np.mean(y_train[idx]), np.std(y_train[idx]))

number of samples: 3,1,0
3 35115.5592 9657.089882888411


In [11]:
# Circuit-based summary for the combObs defined in the preceding cell; the
# observation vector itself is used as the label.
P, EXP = print_summary(
    combObs, psdd, lgc, title="Combo {}".format(combObs)
)

Combo [0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] : 
P(x) = 0.000735320694302092 Exp f(x) = [[27250.65854862]] , std = [[7717.09623874]]


In [12]:
# A very specific profile spread over five feature groups.
combObs = [-1] * 36
# Presumably "Children = 0": another cell annotates [0,1,0,0,0,0] as Children = 1.
combObs[0:6] = [1] + [0] * 5
combObs[6:8] = [0, 1]            # Smoking
combObs[8:12] = [1, 0, 0, 0]     # region
combObs[12:14] = [1, 0]          # gender
# First slot of the 11-wide group in columns 15-25.
# NOTE(review): the feature this group encodes is not named in the notebook.
combObs[15:26] = [1] + [0] * 10

# Matching-row counts in the train / valid / test splits.
A, B, C = (
    filter_data_indexes(split, combObs).shape[0]
    for split in (x_train, x_valid, x_test)
)
print("number of samples: {},{},{}".format(A, B, C))

# Empirical target statistics over the matching training rows.
idx = filter_data_indexes(x_train, combObs)
print(len(idx), np.mean(y_train[idx]), np.std(y_train[idx]))

number of samples: 1,0,0
1 14571.8908 0.0


In [13]:
# Circuit-based summary for the combObs defined in the preceding cell, to set
# against the empirical statistics printed there.
P, EXP = print_summary(
    combObs, psdd, lgc, title="Combo {}".format(combObs)
)

Combo [1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] : 
P(x) = 8.471170844516204e-05 Exp f(x) = [[26569.69205926]] , std = [[4003.07591662]]


In [14]:
##########

In [15]:
# Add one constraint at a time to the same observation and summarise after
# each step, so the effect of every added feature can be read off in sequence.
combObs = [-1 for i in range(36)]

# Unconditioned baseline. It gets its own names so that the next step, which
# assigns P1/exp1, does not immediately overwrite it.
P0, exp0 = print_summary(combObs, psdd, lgc, "")

combObs[6:8] = [0, 1] # Smoking = Yes

P1, exp1 = print_summary(combObs, psdd, lgc, "Smoke")

combObs[0:6] = [0,1,0,0,0,0] # Children = 1

P2, exp2 = print_summary(combObs, psdd, lgc, "Children")

combObs[12:14] = [1, 0] # Gender = Female

P3, exp3 = print_summary(combObs, psdd, lgc, "Gender")

# NOTE(review): other cells annotate [0, 1, 0, 0] as southeast -- confirm which
# one-hot slot really is southeast.
combObs[8:12] = [0, 0, 1, 0] # Region = southeast

P4, exp4 = print_summary(combObs, psdd, lgc, "Region")

 : 
P(x) = 1.0000000000000002 Exp f(x) = [[13856.18299956]] , std = [[11224.28324704]]
Smoke : 
P(x) = 0.18403566812744454 Exp f(x) = [[31355.32630489]] , std = [[10772.20981438]]
Children : 
P(x) = 0.014010355852028696 Exp f(x) = [[31578.81577859]] , std = [[11141.94772446]]
Gender : 
P(x) = 0.004900729526751024 Exp f(x) = [[31027.99574661]] , std = [[10855.36547545]]
Region : 
P(x) = 0.0009639268505233895 Exp f(x) = [[30974.77283495]] , std = [[11229.0358695]]


In [16]:
# Empirical check for the combObs left behind by the preceding cell:
# matching-row counts in the train / valid / test splits.
A, B, C = (
    filter_data_indexes(split, combObs).shape[0]
    for split in (x_train, x_valid, x_test)
)
print("number of samples: {},{},{}".format(A, B, C))

# Count, mean and standard deviation of the matching training targets.
idx = filter_data_indexes(x_train, combObs)
print(len(idx), np.mean(y_train[idx]), np.std(y_train[idx]))

number of samples: 4,1,1
4 24476.33165 10711.943335737955


In [17]:
# Condition on smoking and gender only.
combObs = [-1] * 36
combObs[6:8] = [0, 1]      # Smoking
combObs[12:14] = [1, 0]    # gender

# Matching-row counts in the train / valid / test splits.
A, B, C = (
    filter_data_indexes(split, combObs).shape[0]
    for split in (x_train, x_valid, x_test)
)
print("number of samples: {},{},{}".format(A, B, C))

# Empirical statistics of the matching training targets, followed by the
# circuit-based summary; the bare call stays last so its (P, expect) return
# value is still displayed as the cell output.
idx = filter_data_indexes(x_train, combObs)
print(len(idx), np.mean(y_train[idx]), np.std(y_train[idx]))
print_summary(combObs, psdd, lgc, title="Combo {}".format(combObs))

number of samples: 77,18,20
77 31686.910155584414 12080.390157445532
Combo [-1, -1, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] : 
P(x) = 0.06345906743820999 Exp f(x) = [[31496.91407013]] , std = [[10626.50061444]]


(0.06345906743820999, array([[31496.91407013]]))