In [179]:
class Parser:
    """Parse probability-query strings into structured specifications.

    A query has the form ``<type>(<target terms>[ | <condition terms>])``
    where ``<type>`` is ``P`` (probability) or ``E`` (expectation) and the
    terms are comma separated.  Each term is a bare variable name or a
    variable followed by one of ``=``, ``<``, ``<=``, ``>``, ``>=``,
    ``in [v1, v2, ...]`` or ``between [low, high]``.

    All methods are static; the class only serves as a namespace, so they are
    called as ``Parser.parse(...)`` and so on.
    """

    @staticmethod
    def fixupBrak(s):
        """Replace commas inside square brackets with semicolons.

        This lets the caller split the term list on ',' without breaking up
        bracketed value lists.

        Raises:
            AssertionError: if a '[' has no matching ']'.
        """
        orig = s
        pos2 = 0
        while True:
            pos1 = s.find('[', pos2)
            if pos1 < 0:
                return s
            pos2 = s.find(']', pos1)
            if pos2 < 0:
                # Raised explicitly (rather than via ``assert``) so that the
                # validation is not stripped when Python runs with -O.
                raise AssertionError(
                    'Error parsing probability query.  Mismatched brackets in:' + orig)
            # The replacement keeps the length unchanged, so pos2 still
            # indexes the closing bracket for the next search.
            inner = s[pos1 + 1:pos2].replace(',', ';')
            s = s[:pos1 + 1] + inner + s[pos2:]

    @staticmethod
    def fixupType(item):
        """Return ``item`` as a float if it is a finite number, else unchanged.

        Handles signs, decimals and exponents (e.g. '-.5', '70', '1e3').
        Text such as 'nan' or 'inf' is left as a string because it is far
        more likely to be a categorical label than a bound.
        """
        try:
            value = float(item)
        except ValueError:
            return item
        if value != value or value in (float('inf'), float('-inf')):
            return item
        return value

    @staticmethod
    def fixupTypes(l):
        """Apply :meth:`fixupType` to every element of ``l``; return a new list."""
        return [Parser.fixupType(item) for item in l]

    @staticmethod
    def parseTerms(s):
        """Split a comma separated term string into ``(case, var, val)`` tuples.

        ``case`` is '' for a bare variable (``val`` is then None) or one of
        '>=', '>', '<=', '<', '=', 'in', 'between'.  For 'in' and 'between'
        ``val`` is a list of values, otherwise a single value.  Numeric text
        is converted to float.  Bracketed lists must already use ';' as the
        separator (see :meth:`fixupBrak`).

        Raises:
            AssertionError: if 'in' / 'between' is not followed by a
                bracketed list.
        """
        import re  # local import keeps the class self-contained

        terms = []
        if not s:
            return terms
        # Convert => to >= and =< to <=
        s = s.replace('=>', '>=').replace('=<', '<=')
        # The leftmost operator wins.  'in' and 'between' must be whole words
        # so that names such as 'drinks' or 'insurance' are not split.
        opPattern = re.compile(r'>=|<=|[<>=]|\b(?:between|in)\b')
        for tok in s.split(','):
            tok = tok.strip()
            match = opPattern.search(tok)
            if match is None:
                terms.append(('', tok, None))
                continue
            tcase = match.group()
            var = tok[:match.start()].strip()
            val = tok[match.end():].strip()
            if tcase in ('in', 'between'):
                if not (val.startswith('[') and val.endswith(']')):
                    raise AssertionError(
                        'Error parsing probability query.  Expected a bracketed list after "'
                        + tcase + '" in:' + tok)
                # break up the list
                val = Parser.fixupTypes([v.strip() for v in val[1:-1].split(';')])
            else:
                val = Parser.fixupType(val)
            terms.append((tcase, var, val))
        return terms

    @staticmethod
    def convertFinalTerms(inTerms):
        """Convert ``(case, var, val)`` terms into final query tuples.

        Output shapes:
            bare variable   -> ``(var,)``
            ``=``           -> ``(var, val)``
            ``in``          -> ``(var, v1, v2, ...)``
            ``between``     -> ``(var, low, high)``
            ``>`` / ``>=``  -> ``(var, low, inf)``
            ``<`` / ``<=``  -> ``(var, -inf, high)``
            anything else   -> ``('ERROR', var, val)``

        ``inf`` is the sentinel 999999999.  Strict '>' and inclusive '<='
        nudge the bound up by ``1/inf``.

        Raises:
            TypeError: if a '>' or '<=' bound is not numeric.
        """
        inf = 999999999
        outTerms = []
        for term in inTerms:
            ttype, var = term[0], term[1]
            if ttype == '':
                outTerms.append((var,))
                continue
            val = term[2]
            if ttype == '>':
                outTerm = (var, val + 1 / inf, inf)
            elif ttype == '>=':
                outTerm = (var, val, inf)
            elif ttype == '<':
                outTerm = (var, -inf, val)
            elif ttype == '<=':
                outTerm = (var, -inf, val + 1 / inf)
            elif ttype == '=':
                outTerm = (var, val)
            elif ttype == 'between':
                outTerm = (var, val[0], val[1])
            elif ttype == 'in':
                outTerm = (var,) + tuple(val)
            else:
                outTerm = ('ERROR', var, val)
            outTerms.append(outTerm)
        return outTerms

    @staticmethod
    def parse(strList):
        """Parse one query string or a list of them.

        Returns:
            A list with one ``(type, targetTerms, conditionTerms)`` tuple per
            query.  ``type`` is 'P', 'E', or 'D'; 'D' (distribution) replaces
            'P' when the target is a single bare variable.

        Raises:
            AssertionError: on missing parentheses, mismatched brackets, a
                malformed value list, or a result type other than P / E.
        """
        if isinstance(strList, str):
            strList = [strList]
        outList = []
        for orig in strList:
            pos = orig.find('(')
            if pos < 0:
                raise AssertionError(
                    'Error parsing probability query.  No opening parentheses in:' + orig)
            outtype = orig[:pos].strip()
            body = orig[pos + 1:]
            pos = body.find(')')
            if pos < 0:
                raise AssertionError(
                    'Error parsing probability query.  No closing parentheses in:' + orig)
            # Target terms come before '|', condition terms (if any) after.
            tok = body[:pos].split('|')
            targ = Parser.fixupBrak(tok[0].strip())
            cond = Parser.fixupBrak(tok[1].strip() if len(tok) > 1 else '')
            tterms = Parser.convertFinalTerms(Parser.parseTerms(targ))
            cterms = Parser.convertFinalTerms(Parser.parseTerms(cond))
            if outtype not in ('P', 'E'):
                raise AssertionError(
                    'Error parsing probability query.  Invalid result type:"' +
                    outtype + '". Valid types are "P" for probability or "E" for expectation.')
            if outtype == 'P' and len(tterms) == 1 and len(tterms[0]) == 1:
                # It is probability with an unbound target. The answer is
                # a distribution
                outtype = 'D'
            outList.append((outtype, tterms, cterms))
        return outList

def query(ps, inList):
    """Evaluate one or more probability-query strings against ``ps``.

    Args:
        ps: object providing ``getVarNames()``, ``getValues(var)``,
            ``P(targ, cond)``, ``E(targ, cond)`` and ``distr(targ, cond)``
            (presumably a ProbSpace -- confirm against callers).
        inList: a single query string or a list of query strings, in the
            syntax accepted by ``Parser.parse``.

    Returns:
        A list with one result per query, in input order.

    Raises:
        AssertionError: if a query references a variable that is not in
            ``ps.getVarNames()`` or a string value that is not in
            ``ps.getValues(var)``, plus anything ``Parser.parse`` raises.
    """
    # Normalise up front so that the error messages below quote the whole
    # offending query instead of indexing into a bare string.
    if isinstance(inList, str):
        inList = [inList]
    varNames = ps.getVarNames()
    specs = Parser.parse(inList)
    results = []
    for queryStr, (cmd, targ, cond) in zip(inList, specs):
        for term in targ + cond:
            var = term[0]
            # Explicit raises keep the AssertionError contract while still
            # validating when Python runs with -O.
            if var not in varNames:
                raise AssertionError(
                    'Error parsing probability query.  Variable name: "' + repr(var) +
                    '" is not in dataset.  In:' + queryStr + '.  Valid variables are:' +
                    str(varNames) + '.')
            valid = None
            for val in term[1:]:
                # Only string (categorical) values are checked; numeric
                # bounds are passed through unchecked.
                if isinstance(val, str):
                    if valid is None:
                        # Looked up at most once per term, on first need.
                        valid = ps.getValues(var)
                    if val not in valid:
                        raise AssertionError(
                            'Error parsing probability query. Invalid value:' +
                            repr(val) + ' for variable:' + var + '. Valid values are:' +
                            str(valid) + '.')
        if cmd == 'P':
            result = ps.P(targ, cond)
        elif cmd == 'E':
            result = ps.E(targ, cond)
        elif cmd == 'D':
            result = ps.distr(targ, cond)
        else:
            # Parser.parse only emits P / E / D; guard against silently
            # reusing a stale result if that ever changes.
            raise AssertionError(
                'Error parsing probability query.  Invalid result type:"' + str(cmd) + '".')
        results.append(result)
    return results

def query1(ps, s):
    """Run the single query string ``s`` against ``ps`` and return its result.

    The query is wrapped in a one-element list, handed to ``Calc``, and the
    first (only) element of the returned sequence is unwrapped.

    NOTE(review): ``Calc`` is not defined in the visible part of this file;
    presumably it is the batch entry point (possibly an earlier name for
    ``query``) -- confirm.
    """
    return Calc(ps, [s])[0]

In [106]:
# Parser smoke test: three single-string queries, then one list of two queries.
# NOTE(review): ProbParse is not defined in the visible source; presumably it
# is the parser entry point under an earlier name -- confirm.
for _demoInput in (
    'P(ASDF=1)',
    'P(A=1 | Bob=1)',
    'P(A=1 | Bob=1, C > 0, D<0,E between[1,3], F in [fred, george, charlie], G)',
    ['E(A)', 'P(A>=1|B <= 2, C=yes)'],
):
    print(ProbParse(_demoInput))

[('P', [('ASDF', 1.0)], [])]
[('P', [('A', 1.0)], [('Bob', 1.0)])]
[('P', [('A', 1.0)], [('Bob', 1.0), ('C', 1.000000001e-09, 999999999), ('D', -999999999, 0.0), ('E', 1.0, 3.0), ('F', 'fred', 'george', 'charlie'), ('G',)])]
[('E', [('A',)], []), ('P', [('A', 1.0, 999999999)], [('B', -999999999, 2.000000001), ('C', 'yes')])]


In [5]:
%matplotlib widget
from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
import warnings
warnings.filterwarnings("ignore")
from because.synth import gen_data
from because.probability.prob import ProbSpace
from because.visualization import cmodel
from because.visualization import viz
from because.synth import read_data
from because.causality import cdisc
#from because.probability.probquery import query, queryList
from because.probability import probquery
# Read the dataset from disk.
r = read_data.Reader('/home/roger/Datasets/llcp.csv')
ds = r.read()

# Full probability space over the data, with smokertype marked categorical.
ps0 = ProbSpace(ds, power=5, categorical=['smokertype'])

# Filter specification for the working subspace (semantics of each tuple are
# defined by ProbSpace.SubSpace -- not visible here).
_yesNoVars = ('othercancer', 'skincancer', 'diabetes', 'asthma', 'copd',
              'arthritis', 'kidneydis', 'depression', 'veteran')
_filters = [
    ('income', 0, 9),
    ('education', [1, 2, 3, 4, 5, 6]),
    ('smokertype', [1, 2, 3, 4]),
] + [(name, 'yes', 'no') for name in _yesNoVars]
ps = ps0.SubSpace(_filters, power=5)

# print() with two arguments inserts a separating space, hence the double
# space in the f-strings to keep the output identical.
print(f'N =  {ps.N}')
print(f'Variables =  {ps.getVarNames()}')

getData:  290759 records read.
N =  285103
Variables =  ['age', 'gender', 'weight', 'height', 'ageGroup', 'genhealth', 'asthma_ever', 'asthma', 'skincancer', 'othercancer', 'copd', 'arthritis', 'depression', 'kidneydis', 'diabetes', 'maritaldetail', 'married', 'education', 'veteran', 'income', 'state', 'childcnt', 'sleephours', 'employment', 'smokertype', 'physicalactivity', 'insurance', 'checkup', 'nohospitalcost', 'bmi', 'bmicat', 'drinks']


In [6]:
# Run two sample queries through probquery.query and print each result.
for _sampleQuery in (
    'P(age > 60, gender=male | smokertype in [4-everyday, 3-former], height between [-.5,70])',
    'E(smokertype)',
):
    print(probquery.query(ps, _sampleQuery))

0.17264069744842714
1-never


In [3]:
# Evaluate a batch of queries in a single call.  The call is left as the
# cell's final bare expression so the notebook displays its return value.
probquery.queryList(ps, [
    'P(copd=yes)',
    'P(copd=yes | age > 60, gender=male)',
    'E(age | copd=yes)',
    'P(copd=yes | age >= 80, smokertype in [4-everyday,3-former])',
    'P(copd=yes | age between [70,90], smokertype = 1-never)',
    'E(age | income <= 6)',
    'P(diabetes=yes | weight >= 400)',
])

[0.07728785737084493,
 0.11721036089736643,
 63.62659405491264,
 0.2075801749271137,
 0.059257022047719723,
 55.647580862793475,
 0.34934497816593885]

In [170]:
# Inspect the probability space: the variable names, then the values
# reported for two individual variables.
print(ps.getVarNames())
print(ps.getValues('smokertype'))
print(ps.getValues('age'))
# NOTE(review): isStringVar is referenced without being called, so this
# prints the attribute itself; presumably a call with a variable name was
# intended -- confirm.
print(ps.isStringVar)


['age', 'gender', 'weight', 'height', 'ageGroup', 'genhealth', 'asthma_ever', 'asthma', 'skincancer', 'othercancer', 'copd', 'arthritis', 'depression', 'kidneydis', 'diabetes', 'maritaldetail', 'married', 'education', 'veteran', 'income', 'state', 'childcnt', 'sleephours', 'employment', 'smokertype', 'physicalactivity', 'insurance', 'checkup', 'nohospitalcost', 'bmi', 'bmicat', 'drinks']
['1-never', '2-occassional', '3-former', '4-everyday']
[18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0]


In [137]:
# Request the height distribution conditioned on each gender value.
dists = Calc(ps, [f'P(height | gender={_g})' for _g in ('female', 'male')])
for dist in dists:
    # One line per distribution: mean, std dev, skew, kurtosis, modality.
    _summary = [dist.mean(), dist.stDev(), dist.skew(), dist.kurtosis(), dist.modality()]
    print(*_summary)

64.35367667942495 2.8609842137566384 0.11747007860161898 1.0305287684462598 1
70.22651120422661 3.0363745771046586 -0.1434307639652124 0.8861909990024075 1
