## Maximum Likelyhood for PGM

In [43]:
import tensorflow as tf
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Class definitions - RV, Distribution and Category

In [44]:
class RV:
    def __init__(self, cattup,name):
        self.name = name
        self.categories = cattup
        self.catcountdict = {}
        for category in self.categories:
            self.catcountdict[category] = Category(category)
    def __str__(self):
        return_string = " \t < RV: " + self.name + " > \n"
        count = 0
        for category in self.catcountdict:
            return_string = return_string + category+ "\t\t"+ str(self.catcountdict[category].count) + "\n"
            count = count + self.catcountdict[category].count
        return_string = return_string + "total count: " + str(count)
        return return_string
    def clear_count(self):
        for cat in self.catcountdict:
            self.catcountdict[cat].count = 0
        
class Distribution:
    def __init__(self,randomvar):
        self.name = randomvar.name
        categories = randomvar.categories
        catcountdict = randomvar.catcountdict
        distdict = {}
        total = 0
        for category in categories:
            total = total + catcountdict[category].count
        for category in categories:
            distdict[category] = catcountdict[category].count/(1.0*total)
        
        self.categories = categories
        self.distdict = distdict
        self.total = total
    def __str__(self):
        return_string = " \t < Distribution: " + self.name + " > \n"
        for category in self.distdict:
            return_string = return_string + category+ "\t\t"+ str(self.distdict[category]) + "\n"
            
        return return_string
    def to_csv(self):
        table = []
        for category in self.distdict:
            table.append([category,self.distdict[category]])
        df = pd.DataFrame(data=table, columns=['category','prob'])
        name = "prior_" + self.name + ".csv"
        df.to_csv(name,sep=',')
    def to_emdw_csv(self):
        table = []
        ind = 0
        for category in self.distdict:
            table.append([ind,self.distdict[category]])
            ind = ind +1
        df = pd.DataFrame(data=table)
        name = "prior_" + self.name + ".csv"
        df.to_csv(name,sep=',',index=False,header=False)

class Category:
    def __init__(self,category,catcount=0):
        self.name = category
        self.count = catcount
    def increment(self):
        self.count = self.count + 1
        



### Read in the graph

In [45]:
x = pd.read_csv('car.csv')


### Store the possible categories per RV

In [46]:
# find out how many categories each RV can take
rvcategorydict = {}
for rv in x.columns:
    rvcategorydict[rv] = tuple(set([item[0] for item in x[[rv]].values]))
for rv in rvcategorydict:
    print(rv,rvcategorydict[rv])

maint ('high', 'low', 'med', 'vhigh')
doors ('5more', '3', '4', '2')
lug_boot ('small', 'big', 'med')
persons ('4', '2', 'more')
verdict ('good', 'unacc', 'acc', 'vgood')
safety ('high', 'low', 'med')
buying ('high', 'low', 'med', 'vhigh')


### Set up the RV dictionary

In [47]:
rvdictionary = {}

for rv in rvcategorydict:
    rvdictionary[rv] = RV(rvcategorydict[rv],name=rv)
    

## Calculate the prior probabilities

In [48]:
for rv in x.columns:
    for item in x[[rv]].values:
        rvdictionary[rv].catcountdict[item[0]].increment()

distributiondict = {}

for rv in rvdictionary:
    distributiondict[rv] = Distribution(rvdictionary[rv])

for rv in rvdictionary:
    print(rvdictionary[rv])
    
for rv in distributiondict:
    print(distributiondict[rv])
    Distribution.to_emdw_csv(distributiondict[rv])

 	 < RV: maint > 
high		432
low		432
vhigh		432
med		432
total count: 1728
 	 < RV: doors > 
5more		432
3		432
4		432
2		432
total count: 1728
 	 < RV: lug_boot > 
small		576
med		576
big		576
total count: 1728
 	 < RV: persons > 
4		576
more		576
2		576
total count: 1728
 	 < RV: verdict > 
unacc		1210
good		69
vgood		65
acc		384
total count: 1728
 	 < RV: safety > 
high		576
low		576
med		576
total count: 1728
 	 < RV: buying > 
high		432
low		432
vhigh		432
med		432
total count: 1728
 	 < Distribution: maint > 
high		0.25
low		0.25
vhigh		0.25
med		0.25

 	 < Distribution: doors > 
5more		0.25
3		0.25
4		0.25
2		0.25

 	 < Distribution: lug_boot > 
small		0.3333333333333333
med		0.3333333333333333
big		0.3333333333333333

 	 < Distribution: persons > 
4		0.3333333333333333
more		0.3333333333333333
2		0.3333333333333333

 	 < Distribution: verdict > 
unacc		0.7002314814814815
good		0.03993055555555555
vgood		0.03761574074074074
acc		0.2222222222222222

 	 < Distribution: safety > 
hi

## Maximum likelyhood

### Define the graph structure

In [49]:
# family = {'C':['A','B'], 'D':['C'],'F':['D','B','C']}
family = {'verdict':['buying','maint', 'doors', 'persons', 'lug_boot', 'safety']}

In [50]:
for child in family:
    print("child: ",child,"\nparents: ", family[child])

child:  verdict 
parents:  ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']


### Compute the maximum likelyhood counts

In [52]:
class ChildContainer:
    def __init__(self,rvobj, childcategorydict,childrvdict):
        self.rvobj = rvobj
        self.catdict = childcategorydict

In [53]:


# dictionaries that will contain the data to calculate MLE for the specific graph structure   
childrvdict = {}
graphdict = {}
# print(childrvdict,childcategorydict)

for rv, item in rvdictionary.items():
    item.clear_count()

for child, parents in family.items():
    childcategorydict = {}
    print(child)
   
    for category in rvcategorydict[child]:
        print(category)
        childcategorydict[category] = {}
    
        for rv, rvobj in rvdictionary.items():
#             print(category,rv)
            if rv != child:
                childcategorydict[category][rv] = rvobj
            
        for rv in parents:
            for (i,item) in enumerate(x[[rv]].values):
#                 print(x[[child]].values[i][0],category)
                if x[[child]].values[i][0] == category:
                    childcategorydict[category][rv].catcountdict[x[[rv]].values[i][0]].increment()
       
    graphdict[child] = childcategorydict
#     
#     print(childcategorydict['unacc']['buying'])
               
    

    #     for category in childcategorydict:
    #         for rv in childcategorydict[category]:
    #             print("thisone:,",rv, childcategorydict[category][rv])

#     for rv, rvobj in rvdictionary.items():
#         if rv != child:
#             childcategorydict[category][rv] = rvobj

#         print(childcategorydict['unacc']['buying'])
#         for rv in parents:
#             for category in childcategorydict:
#                 for (i,item) in enumerate(x[[rv]].values):
#     #                 print(x[[child]].values[i][0],category)
#                     if x[[child]].values[i][0] == category:
#                         print("yes")
    #                     print("cat:-------",category)
    #                     print(rv)
    #                     print("this:",childcategorydict[category])
    #                     childcategorydict[category][rv].catcountdict[x[[rv]].values[i]].increment()
#     graphdict[child] = childcategorydict
#     graphdict[child] = childcontainer(rvdictionary[child],childcategorydict )
                                                    



verdict
good
unacc
acc
vgood
child: verdict
category:  unacc
 	 < RV: safety > 
high		576
low		576
med		576
total count: 1728
category:  good
 	 < RV: safety > 
high		576
low		576
med		576
total count: 1728
category:  vgood
 	 < RV: safety > 
high		576
low		576
med		576
total count: 1728
category:  acc
 	 < RV: safety > 
high		576
low		576
med		576
total count: 1728


In [54]:
for child,parents in family.items():
    print("child:",child)
    for category in graphdict[child]:
        print("-------------------------------category:---------------",category)
        for rv in parents:
#         print("category: ",category)
#         tempdict = graphdict[child] 
# #         print(category, tempdict[category] )
            print(tempdict[category][rv])


child: verdict
-------------------------------category:--------------- unacc
 	 < RV: buying > 
high		432
low		432
vhigh		432
med		432
total count: 1728
 	 < RV: maint > 
high		432
low		432
vhigh		432
med		432
total count: 1728
 	 < RV: doors > 
5more		432
3		432
4		432
2		432
total count: 1728
 	 < RV: persons > 
4		576
more		576
2		576
total count: 1728
 	 < RV: lug_boot > 
small		576
med		576
big		576
total count: 1728
 	 < RV: safety > 
high		576
low		576
med		576
total count: 1728
-------------------------------category:--------------- good
 	 < RV: buying > 
high		432
low		432
vhigh		432
med		432
total count: 1728
 	 < RV: maint > 
high		432
low		432
vhigh		432
med		432
total count: 1728
 	 < RV: doors > 
5more		432
3		432
4		432
2		432
total count: 1728
 	 < RV: persons > 
4		576
more		576
2		576
total count: 1728
 	 < RV: lug_boot > 
small		576
med		576
big		576
total count: 1728
 	 < RV: safety > 
high		576
low		576
med		576
total count: 1728
-------------------------------cat

In [345]:
# dictionaries that will contain the data to calculate MLE for the specific graph structure   
childict = {}
childict_true = {}

for child, parents in family:
    print(child,parents)
    
#     # list of all the parents for a specific child node
#     parlist = [parent for parent in family[child]]
#     # lists containing entries for all parent combinations as well as ones only for when the child is True
#     partuplist = []
#     partuplist_true = []
#     # iterate through all entries and store the data in the lists
    
#     for (check,item) in zip(checklist, x[[*parlist]].values):
        
#     checklist = []
#     for item in x[child].values:
#         checklist.append(item)
#     for (check,item) in zip(checklist, x[[*parlist]].values):
#         partuplist.append((tuple(parlist),tuple(item)))
#         if check == 'vgood' or 'acc':
#             partuplist_true.append((tuple(parlist),tuple(item)))
    
#     # add the lists to the dictionaries
#     childict[child] =partuplist
#     childict_true[child] =partuplist_true
    
# get the set of all the parent combinations for each child (all other combinations will have probability 0 or Ebs)
childsetdict = {}
for child in childict:
    childsetdict[child] = set(childict[child])
# for child in childsetdict:
#     print("child: ",child,", Parents: ", childsetdict[child])
    
# dictionaries to hold the prelim counts
countdict = {}
countdict_true = {}
# initializing counts to 0
for child in childsetdict:
    for parents in childsetdict[child]:
        countdict[parents] = (0, child)
        countdict_true[parents] = (0, child)
# sum all occurances of each parent combination for each child   
for child in childict:
    for entry in childict[child]:
        countdict[entry] = (countdict[entry][0] + 1,child)
# sum all occurances of each parent combination for each child when the child is True
for child in childict_true:
    for entry in childict_true[child]:
        countdict_true[entry] = (countdict_true[entry][0] + 1,child)
# calculate MLE from the two counts above
mledict = {}       
for entry in countdict:
    mledict[entry] = (countdict_true[entry][0]/countdict[entry][0],countdict_true[entry][1])


ValueError: too many values to unpack (expected 2)

In [55]:
# dictionaries that will contain the data to calculate MLE for the specific graph structure   
childict = {}
childict_true = {}

for child in family:
    # list of all the parents for a specific child node
    parlist = [parent for parent in family[child]]
    # lists containing entries for all parent combinations as well as ones only for when the child is True
    partuplist = []
    partuplist_true = []
    # iterate through all entries and store the data in the lists
    checklist = []
    for item in x[child].values:
        checklist.append(item)
    for (check,item) in zip(checklist, x[[*parlist]].values):
        partuplist.append((tuple(parlist),tuple(item)))
        if check == 'vgood' or 'acc':
            partuplist_true.append((tuple(parlist),tuple(item)))
    
    # add the lists to the dictionaries
    childict[child] =partuplist
    childict_true[child] =partuplist_true
    
# get the set of all the parent combinations for each child (all other combinations will have probability 0 or Ebs)
childsetdict = {}
for child in childict:
    childsetdict[child] = set(childict[child])
# for child in childsetdict:
#     print("child: ",child,", Parents: ", childsetdict[child])
    
# dictionaries to hold the prelim counts
countdict = {}
countdict_true = {}
# initializing counts to 0
for child in childsetdict:
    for parents in childsetdict[child]:
        countdict[parents] = (0, child)
        countdict_true[parents] = (0, child)
# sum all occurances of each parent combination for each child   
for child in childict:
    for entry in childict[child]:
        countdict[entry] = (countdict[entry][0] + 1,child)
# sum all occurances of each parent combination for each child when the child is True
for child in childict_true:
    for entry in childict_true[child]:
        countdict_true[entry] = (countdict_true[entry][0] + 1,child)
# calculate MLE from the two counts above
mledict = {}       
for entry in countdict:
    mledict[entry] = (countdict_true[entry][0]/countdict[entry][0],countdict_true[entry][1])


### Print the MLE conditional probabilities for each child

In [56]:
# print the MLE conditional probabilities for each child
# table = np.empty([len(mledict)*2,len(x.columns)+1],dtype=str)
table = []

# i = 0
# for entry in mledict:
#     j = 0
#     for val in entry[1]:
#         table[i][j] = val 
#         print(table[i][j])
#         j = j + 1
#     table[i][j] =  1
#     table[i][1+j] =  mledict[entry][0]
#     i = i + 1
#     table[i][j] = 0
#     table[i][j+1] = 1 -  mledict[entry][0]
#     i = i + 1
    
for entry in mledict:
    row = []
    for val in entry[1]:
        row.append(val)
    row.append(val)
#     print(mledict[entry][0])
    row.append(mledict[entry][0])
    table.append(row)
    
# print(table[0:100,:])
#     print("P(",mledict[entry][1]," = 1 |",entry,") = ", mledict[entry][0])
#     print("P(",mledict[entry][1]," = 0 |",entry,") = ", 1-mledict[entry][0])
    

In [57]:
df = pd.DataFrame(data=table, columns=[c for c in x.columns] + ['condprob'])
df.to_csv('out.csv',sep=',')

In [349]:
# print(table[1000:1100,:])