## Maximum Likelihood for PGM

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Class definitions - RV, Distribution and Category

In [2]:
class RV:
    """Discrete random variable that keeps an occurrence counter per category.

    Attributes:
        name: Label of the random variable.
        categories: The iterable of category values passed in as ``cattup``.
        catcountdict: Maps each category value to its ``Category`` counter.
    """

    def __init__(self, cattup,name):
        """Create one zero-initialised ``Category`` counter per entry of ``cattup``."""
        self.name = name
        self.categories = cattup
        self.catcountdict = {cat: Category(cat) for cat in self.categories}

    def __str__(self):
        """Return a header, one ``category<TAB><TAB>count`` line each, and the total."""
        pieces = [" \t < RV: " + self.name + " > \n"]
        running_total = 0
        for cat, counter in self.catcountdict.items():
            pieces.append(str(cat) + "\t\t" + str(counter.count) + "\n")
            running_total = running_total + counter.count
        pieces.append("total count: " + str(running_total))
        return "".join(pieces)

    def clear_count(self):
        """Reset the count of every category to zero."""
        for counter in self.catcountdict.values():
            counter.count = 0
        
class Distribution:
    """Relative-frequency distribution over the categories of a random variable.

    Attributes:
        name: Name copied from the random variable.
        categories: Category values copied from the random variable.
        distdict: Maps each category to ``count / total``.
        total: Sum of all category counts.
    """

    def __init__(self,randomvar):
        """Normalise the category counts of ``randomvar``.

        ``randomvar`` must expose ``name``, ``categories`` and ``catcountdict``
        (category -> object with a ``count`` attribute).

        Raises:
            ZeroDivisionError: if there are categories but all counts sum to zero.
        """
        self.name = randomvar.name
        self.categories = randomvar.categories
        counters = randomvar.catcountdict

        self.total = sum(counters[category].count for category in self.categories)
        # Multiplying by 1.0 keeps the division a float division.
        denominator = 1.0 * self.total
        self.distdict = {
            category: counters[category].count / denominator
            for category in self.categories
        }

    def __str__(self):
        """Return a header followed by one ``category<TAB><TAB>probability`` line each."""
        header = " \t < Distribution: " + self.name + " > \n"
        lines = [
            str(category) + "\t\t" + str(prob) + "\n"
            for category, prob in self.distdict.items()
        ]
        return header + "".join(lines)

    def to_csv(self):
        """Write ``prior_<name>.csv`` with ``category`` and ``prob`` columns."""
        rows = [[category, prob] for category, prob in self.distdict.items()]
        frame = pd.DataFrame(data=rows, columns=['category','prob'])
        frame.to_csv("prior_" + self.name + ".csv", sep=',')

    def to_emdw_csv(self):
        """Write ``prior_<name>.csv`` as header-less ``position,probability`` rows.

        Categories are replaced by their 0-based position in ``distdict``.
        The file name is the same one ``to_csv`` writes to.
        """
        rows = [[position, prob] for position, prob in enumerate(self.distdict.values())]
        frame = pd.DataFrame(data=rows)
        frame.to_csv("prior_" + self.name + ".csv", sep=',', index=False, header=False)

class Category:
    """Counter for a single category value.

    Attributes:
        name: The category value this counter belongs to.
        count: Number of occurrences recorded so far (starts at ``catcount``).
    """

    def __init__(self,category,catcount=0):
        self.name, self.count = category, catcount

    def increment(self):
        """Record one more occurrence."""
        self.count = self.count + 1
        




### Read in the graph

In [39]:
# Load the data set; later cells iterate over x.columns and treat each column
# as one random variable.
x = pd.read_csv('car2.csv')
# Alternative input file, kept for reference:
# x = pd.read_csv('g.csv')


### Store the possible categories per RV

In [40]:
# Collect, for every column (random variable), the distinct values it takes.
# The order inside each tuple is whatever order the set yields.
rvcategorydict = {}
for rv in x.columns:
    distinct_values = {row[0] for row in x[[rv]].values}
    rvcategorydict[rv] = tuple(distinct_values)
for rv, distinct_values in rvcategorydict.items():
    print(rv, distinct_values)


verdict ('unacc', 'good', 'acc', 'vgood')
doors ('5more', '4', '3', '2')
buying ('med', 'low', 'vhigh', 'high')
persons ('4', 'more', '2')
safety ('med', 'low', 'high')
lug_boot ('med', 'small', 'big')
maint ('med', 'low', 'vhigh', 'high')


### Set up the RV dictionary

In [41]:
# One RV object (with zeroed category counters) per random variable, keyed by name.
rvdictionary = {}

for rv, rv_categories in rvcategorydict.items():
    rvdictionary[rv] = RV(rv_categories, name=rv)
    

## Calculate the prior probabilities

In [42]:
# Tally how often each category occurs in its column.
for rv in x.columns:
    counters = rvdictionary[rv].catcountdict
    for row in x[[rv]].values:
        counters[row[0]].increment()

# Normalise the tallies into one relative-frequency distribution per variable.
distributiondict = {}

for rv, randomvar in rvdictionary.items():
    distributiondict[rv] = Distribution(randomvar)

# for rv in rvdictionary:
#     print(rvdictionary[rv])

# Print every distribution and write it to prior_<name>.csv
# (header-less position,probability rows).
for rv, prior in distributiondict.items():
    print(prior)
    prior.to_emdw_csv()

 	 < Distribution: verdict > 
unacc		0.7155260469867212
good		0.035240040858018386
acc		0.2160367722165475
vgood		0.03319713993871297

 	 < Distribution: doors > 
5more		0.236976506639428
4		0.23442288049029622
3		0.24974463738508682
2		0.278855975485189

 	 < Distribution: buying > 
med		0.22063329928498468
low		0.22063329928498468
vhigh		0.27425944841675176
high		0.2844739530132789

 	 < Distribution: persons > 
4		0.33146067415730335
more		0.3289070480081716
2		0.33963227783452504

 	 < Distribution: safety > 
med		0.33299284984678246
low		0.33401430030643514
high		0.33299284984678246

 	 < Distribution: lug_boot > 
med		0.33350357507660877
small		0.33401430030643514
big		0.3324821246169561

 	 < Distribution: maint > 
med		0.22676200204290092
low		0.22063329928498468
vhigh		0.27681307456588355
high		0.27579162410623087



## Maximum likelihood

### Define the graph structure

In [43]:
# Graph structure: maps each child node to the list of its parent nodes.
# The commented-out assignments are alternative structures.
# family = {'C':['A','B'], 'D':['C'],'F':['D','B','C']}
family = {'verdict':['buying','maint', 'doors', 'persons', 'lug_boot', 'safety']}
# family = {'verdict':['buying', 'doors', 'persons', 'safety'],'buying':['maint','lug_boot']}

In [44]:
# Echo the graph structure: every child followed by its list of parents.
for child, child_parents in family.items():
    print("child: ",child,"\nparents: ", child_parents)

child:  verdict 
parents:  ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']


### Compute the maximum likelihood counts

In [45]:
class ChildContainer:
    """Bundles a child's random-variable object with its per-category data.

    Attributes:
        rvobj: The object passed as ``rvobj``.
        catdict: The object passed as ``childcategorydict``.

    ``childrvdict`` is accepted for the call signature but is neither stored
    nor used.
    """

    def __init__(self,rvobj, childcategorydict,childrvdict):
        self.catdict = childcategorydict
        self.rvobj = rvobj

In [46]:
# Binary maximum-likelihood estimate: for every parent assignment observed in
# the data, estimate P(child is "true" | parents), where the child counts as
# "true" when its value is 'vgood' or 'acc'.
# Result: mledict maps (parent names, parent values) -> (probability, child).

# Child values that are treated as the "true" outcome.
true_values = ('vgood', 'acc')

# Per child: one (parent names, parent values) tuple per data row ...
childict = {}
# ... and the same restricted to the rows where the child is "true".
childict_true = {}

for child in family:
    # list of all the parents for a specific child node
    parlist = list(family[child])
    parnames = tuple(parlist)
    partuplist = []
    partuplist_true = []
    # child value of every row, aligned with the parent rows below
    checklist = list(x[child].values)
    for (check, item) in zip(checklist, x[[*parlist]].values):
        partup = (parnames, tuple(item))
        partuplist.append(partup)
        # Membership test on purpose: `check == 'vgood' or 'acc'` is always
        # truthy because the bare string 'acc' is truthy, which made every
        # row count as "true" and every estimate come out as 1.0.
        if check in true_values:
            partuplist_true.append(partup)

    # add the lists to the dictionaries
    childict[child] = partuplist
    childict_true[child] = partuplist_true

# distinct parent assignments seen for each child; assignments that never
# occur in the data get no entry at all
childsetdict = {}
for child in childict:
    childsetdict[child] = set(childict[child])

# counts are stored as (count, child), keyed by the parent assignment
countdict = {}
countdict_true = {}
# initialise both counts to 0 for every observed assignment
for child in childsetdict:
    for occurrence in childsetdict[child]:
        countdict[occurrence] = (0, child)
        countdict_true[occurrence] = (0, child)
# number of rows per parent assignment
for child in childict:
    for entry in childict[child]:
        countdict[entry] = (countdict[entry][0] + 1,child)
# number of rows per parent assignment where the child is "true"
for child in childict_true:
    for entry in childict_true[child]:
        countdict_true[entry] = (countdict_true[entry][0] + 1,child)

# MLE = true count / total count; the total is at least 1 because every key
# comes from at least one data row
mledict = {}
for entry in countdict:
    mledict[entry] = (countdict_true[entry][0]/countdict[entry][0],countdict_true[entry][1])


In [69]:
# Per-category maximum-likelihood estimate: for every child in `family`,
# every category of that child and every parent assignment observed together
# with that category, estimate P(child = category | parents).
# Result: mledict maps ((child, *parent names), (category, *parent values))
# to the estimated conditional probability.
childrvdict = {}
childtotals = {}
childsettotals = {}
graphdict = {}
graphsetdict = {}

# reset the per-category tallies held by the RV objects
for rv, item in rvdictionary.items():
    item.clear_count()

for child, parents in family.items():
    childcategorydict = {}
    childsetcategorydict = {}
    print("child: ",child)

    parnames = tuple(parents)
    # Extract the columns once instead of re-slicing the frame for every row.
    parentrows = x[[*parents]].values
    childvalues = x[child].values

    # one (parent names, parent values) tuple per data row
    partuplist = [(parnames, tuple(item)) for item in parentrows]
    childtotals[child] = partuplist
    childsettotals[child] = set(partuplist)
    print(len(set(partuplist)),len(partuplist))

    for category in rvcategorydict[child]:
        print("category: ",category)
        # parent assignments of the rows whose child value equals this category
        partuplistcat = [
            partup
            for (value, partup) in zip(childvalues, partuplist)
            if value == category
        ]
        childcategorydict[category] = partuplistcat
        childsetcategorydict[category] = set(partuplistcat)
        print(len(set(childcategorydict[category])),len(childcategorydict[category]))

    graphdict[child] = childcategorydict
    # the distinct assignments per category (the list dict was stored here before)
    graphsetdict[child] = childsetcategorydict

# countdicttotals: parent assignment -> (number of rows, child)
countdicttotals = {}
# countcategorydict: (child, category, parent assignment) -> (number of rows, child, category)
# The category is part of the key because the same parent assignment can occur
# under several categories; keyed by assignment alone, those counts overwrote
# each other and produced wrong probabilities.
countcategorydict = {}

# initialise the totals to 0
for child, partuplist in childsettotals.items():
    for partup in partuplist:
        countdicttotals[partup] = (0, child)

# number of rows per parent assignment
for child, partuplist in childtotals.items():
    print("tottuplength:",len(partuplist))
    for partup in partuplist:
        countdicttotals[partup] = (countdicttotals[partup][0] + 1,child)

# number of rows per (child, category, parent assignment)
for child, childcategorydict in graphdict.items():
    for category, partuplist in childcategorydict.items():
        print("partuplength:",len(partuplist))
        for partup in partuplist:
            key = (child, category, partup)
            seen = countcategorydict.get(key, (0, child, category))[0]
            countcategorydict[key] = (seen + 1, child, category)

# MLE = rows with this category and assignment / rows with this assignment;
# the denominator is at least 1 because the assignment was observed
mledict = {}
for child, childcategorydict in graphdict.items():
    for category, partuplist in childcategorydict.items():
        for partup in set(partuplist):
            newtup = ((child,) + partup[0], (category,) + partup[1])
            mledict[newtup] = countcategorydict[(child, category, partup)][0]/countdicttotals[partup][0]

# for entry,condprob in mledict.items():
#     print(entry,condprob)

child:  verdict
1728 1958
category:  unacc
1246 1401
category:  good
69 69
category:  acc
418 423
category:  vgood
65 65
tottuplength: 1958
partuplength: 1401
partuplength: 69
partuplength: 423
partuplength: 65


In [23]:
class DT:
    """Ordered list of variable/value assignments.

    ``self.list`` holds one ``DT_item`` per entry of ``catlist``, paired with
    the value at the same position in ``vallist``.
    """

    def __init__(self, catlist,vallist):
        self.list = [DT_item(cat, vallist[pos]) for pos, cat in enumerate(catlist)]

    def __str__(self):
        """Return ``"assignments: "`` followed by ``cat = val, `` for every item."""
        pieces = [entry.cat + " = " + entry.val + ", " for entry in self.list]
        return "assignments: " + "".join(pieces)
                
class DT_item:
    """A single assignment: ``cat`` is the variable, ``val`` its value."""

    def __init__(self,cat,val):
        self.cat, self.val = cat, val

In [34]:
# Binary maximum-likelihood estimate: for every parent assignment observed in
# the data, estimate P(child == 'acc' | parents).
# Result: mledict maps (parent names, parent values) -> (probability, child).
childict = {}
childict_true = {}

for child, child_parents in family.items():
    # list of all the parents for a specific child node
    parlist = list(child_parents)
    parnames = tuple(parlist)
    # child value of every row, aligned with the parent rows
    checklist = list(x[child].values)
    dtlist = []
    # one (parent names, parent values) tuple per data row
    partuplist = [(parnames, tuple(item)) for item in x[[*parlist]].values]
    # the subset of rows whose child value is 'acc'
    partuplist_true = [
        partup for (check, partup) in zip(checklist, partuplist) if check == 'acc'
    ]

    childict[child] = partuplist
    childict_true[child] = partuplist_true

# distinct parent assignments seen for each child; assignments that never
# occur in the data get no entry at all
childsetdict = {child: set(rows) for child, rows in childict.items()}

# counts are stored as (count, child), keyed by the parent assignment
countdict = {}
countdict_true = {}
# start both counts at 0 for every observed assignment
for child, combos in childsetdict.items():
    for parents in combos:
        countdict[parents] = (0, child)
        countdict_true[parents] = (0, child)
# number of rows per parent assignment
for child, rows in childict.items():
    for entry in rows:
        countdict[entry] = (countdict[entry][0] + 1,child)
# number of rows per parent assignment where the child is 'acc'
for child, rows in childict_true.items():
    for entry in rows:
        countdict_true[entry] = (countdict_true[entry][0] + 1,child)

# MLE = 'acc' count / total count for each observed assignment
mledict = {}
for entry, (seen, _owner) in countdict.items():
    hits, owner = countdict_true[entry]
    mledict[entry] = (hits*1.0/seen*1.0, owner)
#     print(countdict_true[entry][0]*1.0/countdict[entry][0]*1.0)
#     print(mledict[entry])


### Print the MLE conditional probabilities for each child

In [30]:
# Build one table row per parent assignment and print its MLE conditional
# probability.
# Expects mledict in the binary form: key = (parent names, parent values),
# value = (probability, child).
# Row layout: the parent values, then 1 for the child's "true" outcome, then
# P(child = 1 | parents).
table = []

for entry in mledict:
    row = list(entry[1])
    # The child column holds 1 because the stored probability is the one for
    # child = 1 (matching the label printed below). The last parent value used
    # to be appended a second time here, which put a parent's value in the
    # child column and failed when there were no parents at all.
    row.append(1)
    row.append(mledict[entry][0])
    table.append(row)

    print("P(",mledict[entry][1]," = 1 |",entry,") = ", mledict[entry][0])
#     print("P(",mledict[entry][1]," = 0 |",entry,") = ", 1-mledict[entry][0])
    

P( verdict  = 1 | (('buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'), ('low', 'vhigh', '3', '4', 'big', 'high')) ) =  1.0
P( verdict  = 1 | (('buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'), ('vhigh', 'low', '4', '4', 'big', 'high')) ) =  1.0
P( verdict  = 1 | (('buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'), ('low', 'low', '5more', 'more', 'med', 'low')) ) =  0.0
P( verdict  = 1 | (('buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'), ('vhigh', 'vhigh', '5more', 'more', 'big', 'low')) ) =  0.0
P( verdict  = 1 | (('buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'), ('low', 'high', '4', '4', 'small', 'low')) ) =  0.0
P( verdict  = 1 | (('buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'), ('high', 'high', '4', '4', 'small', 'high')) ) =  0.5
P( verdict  = 1 | (('buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'), ('vhigh', 'low', '5more', 'more', 'med', 'high')) ) =  1.0
P( verdict  = 1 | (('buying', 'maint', 'd

In [None]:
# Label the table with the data's column names plus 'condprob' and save it.
column_labels = list(x.columns) + ['condprob']
df = pd.DataFrame(data=table, columns=column_labels)
df.to_csv('out.csv',sep=',')

In [None]:
# print(table[1000:1100,:])