In [1]:
pip install pomegranate

Note: you may need to restart the kernel to use updated packages.


#### Import all required libraries

In [2]:

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn; seaborn.set_style('whitegrid')
import numpy as np
import pandas as pd
from pomegranate import *

### Reading data from FIle and pre-processing

In [3]:
stats = pd.read_csv('India_Test_Stats.csv')
stats.head(6)
datap = stats.drop(stats.loc[:,stats.columns.str.match("Unnamed")], axis=1)
datap = datap[datap.Ashwin.notnull()]
datap.shape

(85, 8)

In [4]:
#Preprocessing - dropping Start Date as it doesn't impact any results or not used on any calculations
dataf = datap.drop(columns=['Start Date'], axis=1)
dataf.head()

Unnamed: 0,Ashwin,Result,Margin,Toss,Bat,Opposition,Location
0,Y,won,5 wickets,lost,2nd,v West Indies,Home
1,Y,won,inns & 15 runs,won,1st,v West Indies,Home
2,Y,draw,-,lost,2nd,v West Indies,Home
3,Y,lost,122 runs,lost,2nd,v Australia,Away
4,Y,lost,inns & 68 runs,won,1st,v Australia,Away


####  1) Function to calculate Prior Probability

In [5]:
def getProbability(col,domain):
    dict = {"testk":"testvalue"}
    for prop in domain:
        d = dataf[col] == prop
        dict[prop] = len(dataf[d])/len(d)
    dict.pop("testk")
    return dict

In [6]:
# The test location domain distribution
location = getProbability('Location',{'Home','Away'})

result = getProbability('Result',{'won','draw','lost'})
#Define a domain - Ashwin Playing
ashwin = getProbability('Ashwin',{'Y','N'})

# Define a domain - Toss
toss = getProbability('Toss',{'lost','won'})

# Define a domain - Batting
bat = getProbability('Bat',{'1st','2nd'})

print(str(location))
print(str(result))
print(str(ashwin))
print(str(toss))
print(str(bat))

{'Home': 0.5058823529411764, 'Away': 0.49411764705882355}
{'won': 0.5529411764705883, 'draw': 0.18823529411764706, 'lost': 0.25882352941176473}
{'N': 0.17647058823529413, 'Y': 0.8235294117647058}
{'won': 0.47058823529411764, 'lost': 0.5294117647058824}
{'1st': 0.5411764705882353, '2nd': 0.4588235294117647}


In [7]:
dataf["Count"] = 1
dataf.head()

Unnamed: 0,Ashwin,Result,Margin,Toss,Bat,Opposition,Location,Count
0,Y,won,5 wickets,lost,2nd,v West Indies,Home,1
1,Y,won,inns & 15 runs,won,1st,v West Indies,Home,1
2,Y,draw,-,lost,2nd,v West Indies,Home,1
3,Y,lost,122 runs,lost,2nd,v Australia,Away,1
4,Y,lost,inns & 68 runs,won,1st,v Australia,Away,1


In [8]:
df = dataf[["Location","Ashwin","Count"]]
df.head()

Unnamed: 0,Location,Ashwin,Count
0,Home,Y,1
1,Home,Y,1
2,Home,Y,1
3,Away,Y,1
4,Away,Y,1


In [9]:
al = pd.pivot_table(
    df, 
    values='Count', 
    index=['Location'], 
    columns=['Ashwin'], 
    aggfunc=np.size, 
    fill_value=0
)
print(al)

Ashwin     N   Y
Location        
Away      15  27
Home       0  43


In [10]:
# p(a|b) = p(a I b)/p(b)
#p(a I b) = p (a and b)/p(all)
list(al.query('Location == ["Home"]').get('Y'))[0]


pall = list(al.query('Location == ["Home"]').get('Y'))[0] + list(al.query('Location == ["Home"]').get('N'))[0] + list(al.query('Location == ["Away"]').get('Y'))[0]+list(al.query('Location == ["Away"]').get('N'))[0]
print(pall)

#p(location=home)
h = list(al.query('Location == ["Home"]').get('Y'))[0] + list(al.query('Location == ["Home"]').get('N'))[0]
print(h)
away=list(al.query('Location == ["Away"]').get('Y'))[0] + list(al.query('Location == ["Away"]').get('N'))[0]
print(away)
aY=list(al.query('Location == ["Home"]').get('Y'))[0] + list(al.query('Location == ["Away"]').get('Y'))[0]
print(aY)
aN = list(al.query('Location == ["Home"]').get('N'))[0] + list(al.query('Location == ["Away"]').get('N'))[0]
print(aN)

#probability of playing at home
ph = h/pall
print(ph)

paway = away/pall

#probability of playing Ashwin = Y
paY = aY/pall
print(paY)

paN = aN/pall

#Ph ^ paY
hIaY = list(al.query('Location == ["Home"]').get('Y'))[0]/pall
print(hIaY)

#P away ^ paY
awayIaY = list(al.query('Location == ["Away"]').get('Y'))[0]/pall

hIaN = list(al.query('Location == ["Home"]').get('N'))[0]/pall
awayIaN = list(al.query('Location == ["Away"]').get('N'))[0]/pall

#P(home/ashwin=Y) = hIaY/paY

phaY = hIaY/paY
print(phaY)

#P(home/ashwin=N) = hIaN/paN
phaN = hIaN/paN
print(phaN)

pawayaY =  awayIaY/paY
print(pawayaY)

pawayaN = awayIaN/paN
print(pawayaN)


85
43
42
70
15
0.5058823529411764
0.8235294117647058
0.5058823529411764
0.6142857142857143
0.0
0.38571428571428573
1.0


#### Question 2 - Create a function to calculate conditional probability

In [11]:
def getPostProbability(cola,domaina, colb, domainb):
    dict = {"testk":"testvalue"}
    #p(a|b) = p(a and b)/p(b)
    
    df = dataf[[cola,colb,"Count"]]
 
    pivot = pd.pivot_table(
    df, 
    values='Count', 
    index=[cola], 
    columns=[colb], 
    aggfunc=np.size, 
    fill_value=0
)
 
    print(pivot)
    print(" ")
    pall = 0
    for idx, row in pivot.iterrows():
        for valb in domainb:
            pall = pall+row[valb]
    print("pall: "+str(pall))
    
    for idx, row in pivot.iterrows():
        idxsum = 0
        for valb in domainb:
            idxsum =  (idxsum + row[valb])   
        idxsum = idxsum/pall
        dict[idx] = idxsum 
        
    for valb in domainb:
        valsum = 0
        for idx, row in pivot.iterrows():
            valsum = (valsum + row[valb])
        valsum = valsum/pall
        dict[valb] = valsum
    
    intersect = {"dummyk":"dummyvalue"}           
    for idx, row in pivot.iterrows():
        for valb in domainb:
            intersect[idx+valb] = row[valb]/pall

    finaldict = {"testk":"testvalue"}
    for idx, row in pivot.iterrows():
        for valb in domainb:
            finaldict[idx+"-"+valb]=intersect.get(idx+valb)/dict.get(valb)
            

    finaldict.pop("testk")

    lst=[]
    for key in finaldict:
        entry = key.split("-")
        entry.append(finaldict[key])
        lst.append(entry)
    
    return lst

In [12]:
resultAshwin = getPostProbability('Result',{'won','draw','lost'},'Ashwin',{'Y','N'})

print(resultAshwin)


for item in resultAshwin:
    item.insert(2,'2nd')
finalarray = []
finalarray.append(resultAshwin)

Ashwin  N   Y
Result       
draw    3  13
lost    7  15
won     5  42
 
pall: 85
[['draw', 'N', 0.19999999999999998], ['draw', 'Y', 0.18571428571428575], ['lost', 'N', 0.4666666666666666], ['lost', 'Y', 0.2142857142857143], ['won', 'N', 0.3333333333333333], ['won', 'Y', 0.6000000000000001]]


#### 3) pomegranate library to define conditionality between variables to create Bayesian Network 

In [13]:
#Question3
matchresult = DiscreteDistribution({'W':0.33,'L':0.33,'D':0.33})
ashwinplaying = DiscreteDistribution({'Y':0.5,'N':0.5})
secondbat = DiscreteDistribution({'Y':0.5,'N':0.5})
BaysNetwork = ConditionalProbabilityTable([['W', 'Y', 'Y', 0.410256], 
                                           ['W', 'Y', 'N', 0.565217],
                                           ['W', 'N', 'Y', 0.0], 
                                           ['W', 'N', 'N', 0.108696],
                                           ['L', 'Y', 'Y', 0.205128], 
                                           ['L', 'Y', 'N', 0.152174],
                                           ['L', 'N', 'Y', 0.128205], 
                                           ['L', 'N', 'N', 0.043478],
                                           ['D', 'Y', 'Y', 0.230769], 
                                           ['D', 'Y', 'N', 0.086957],
                                           ['D', 'N', 'Y', 0.025647], 
                                           ['D', 'N', 'N', 0.043478],
                                          ],[matchresult, ashwinplaying])

In [14]:
# State objects hold both the distribution, and a high level name
bat = State(secondbat, name="bat")
ashwin = State(ashwinplaying, name="ashwin")
result = State(matchresult, name="result")

In [15]:
# Create the Bayesian network object with a useful name
bayesianmodel = BayesianNetwork("Assignment 2 - Spinning the Bayes Net")

In [16]:
# Add the three states to the network 
#model.add_states(result, ashwin, bat)
bayesianmodel.add_nodes(result, ashwin, bat)

In [17]:
# Add edges which represent conditional dependencies, where the second node is 
# conditionally dependent on the first node (Monty is dependent on both guest and prize)
bayesianmodel.add_edge(bat, result)
bayesianmodel.add_edge(ashwin, result)

In [18]:
bayesianmodel.bake()

####  4) Use the Bayesian Network model created in (3) to calculate the probability of:

In [19]:
# a. India winning, Ashwin playing, batting 2ndv
bayesianmodel.probability([['W', 'Y', 'Y']])

0.08249999999999999

In [20]:
#b. India winning,  Ashwin not playing, batting 2nd
bayesianmodel.probability([['W', 'N', 'Y']])

0.08249999999999999

In [21]:
#c. India losing,  Ashwin playing, batting 2nd
bayesianmodel.probability([['L', 'Y', 'Y']])

0.08249999999999999

In [22]:
#d. India losing,  Ashwin not playing, batting 2nd
bayesianmodel.probability([['L', 'N', 'Y']])

0.08249999999999999