### Representing independencies using pgmpy
To represent independencies, pgmpy has two classes, namely IndependenceAssertion and Independencies.

In [2]:
# Represent a single independence assertion with the IndependenceAssertion class.
from pgmpy.independencies import IndependenceAssertion
# Two arguments: X is asserted to be independent of Y, with no conditioning variable.
assertion1 = IndependenceAssertion('X', 'Y')
# A bare name as the last expression makes the notebook display the object.
assertion1

(X _|_ Y)

In [3]:
# assertion1 states that the variable X is independent of the variable Y.
# To express a conditional independence, pass the conditioning variable as a
# third argument to IndependenceAssertion: here, X is independent of Y given Z.
assertion2 = IndependenceAssertion('X', 'Y', 'Z')
assertion2

(X _|_ Y | Z)

In [9]:
# The Independencies class represents a set of independence assertions.
from pgmpy.independencies import Independencies
independencies = Independencies()
# Nothing has been added yet, so this prints an empty list.
print(independencies.get_assertions())
# add_assertions accepts several assertions in a single call.
independencies.add_assertions(assertion1, assertion2)
# Printing again shows both assertions now stored in the collection.
print(independencies.get_assertions())

[]
[(X _|_ Y), (X _|_ Y | Z)]


In [10]:
# The same set can be built directly by passing the assertions to the constructor.
independencies2 = Independencies(assertion1, assertion2)
print(independencies2.get_assertions())

# Plain lists are accepted as well: ['X', 'Y'] yields (X _|_ Y) and
# ['X', 'Y', 'Z'] yields (X _|_ Y | Z), as the printed output shows.
independencies3 = Independencies(['X', 'Y'], ['X', 'Y', 'Z'])
print(independencies3.get_assertions())

[(X _|_ Y), (X _|_ Y | Z)]
[(X _|_ Y), (X _|_ Y | Z)]


### Representing joint probability distributions using pgmpy

In [11]:
from pgmpy.factors.discrete import JointProbabilityDistribution as Joint
# Arguments in order: the variable names, the number of states of each variable,
# and the probability of each joint state. Here: two two-state coins whose four
# joint outcomes each have probability 0.25.
dist = Joint(['coin1', 'coin2'], [2,2], [0.25, 0.25, 0.25, 0.25])
print(dist)

+---------+---------+------------------+
| coin1   | coin2   |   P(coin1,coin2) |
| coin1_0 | coin2_0 |           0.2500 |
+---------+---------+------------------+
| coin1_0 | coin2_1 |           0.2500 |
+---------+---------+------------------+
| coin1_1 | coin2_0 |           0.2500 |
+---------+---------+------------------+
| coin1_1 | coin2_1 |           0.2500 |
+---------+---------+------------------+


In [14]:
# Ask whether coin1 and coin2 are independent under `dist`
# (the recorded run returned True).
dist.check_independence(['coin1'], ['coin2'])

  phi.values = phi.values[slice_]
  phi1.values = phi1.values[slice_]


True

### Conditional Probability Distribution - CPD
A CPD is represented here as a tabular CPD: a table that lists every possible combination of the states of the random variables together with the probability corresponding to each combination.

In [15]:
from pgmpy.factors.discrete.CPD import TabularCPD


In [19]:
# Marginal distribution over 'Quality', which has three states; `values` holds
# one single-element row per state (0.3, 0.5, 0.2).
quality = TabularCPD(variable='Quality', variable_card=3, values=[[0.3], [0.5], [0.2]])
print(quality)

+-----------+-----+
| Quality_0 | 0.3 |
+-----------+-----+
| Quality_1 | 0.5 |
+-----------+-----+
| Quality_2 | 0.2 |
+-----------+-----+
['Quality']


In [28]:
# Inspect the attributes of the `quality` CPD: the variables it involves,
# their cardinalities, and the stored probability values.
print('random variables involved in the CPD --> ', quality.variables)
print('cardinality --> ', quality.cardinality)
print('values of the CPD --> ', quality.values)

random variables involved in the CPD -->  ['Quality']
cardinality -->  [3]
values of the CPD -->  [0.3 0.5 0.2]


In [29]:
# Marginal distribution over 'Location', which has two states (0.6 and 0.4).
location = TabularCPD(variable='Location', variable_card=2, values=[[0.6], [0.4]])
print(location)

+------------+-----+
| Location_0 | 0.6 |
+------------+-----+
| Location_1 | 0.4 |
+------------+-----+


In [30]:
# The CPDs above were marginal distributions; this one is conditional.
# Cost has 2 states and is conditioned on Q (3 states) and L (2 states), so
# `values` has 2 rows and 3 * 2 = 6 columns, one per (Q, L) combination; each
# column sums to 1.
# NOTE(review): the evidence variables are named 'Q' and 'L', not 'Quality' and
# 'Location' as in the marginal CPDs above -- confirm whether that is intended.
cost = TabularCPD(variable='Cost', variable_card=2, 
                  values=[[0.8, 0.6, 0.1, 0.6, 0.6, 0.05], [0.2, 0.4, 0.9, 0.4, 0.4, 0.95]], 
                  evidence=['Q', 'L'], evidence_card = [3,2])

In [31]:
# Print the conditional table; in the recorded output Q varies slowest and
# L fastest across the columns.
print(cost)

+--------+-----+-----+-----+-----+-----+------+
| Q      | Q_0 | Q_0 | Q_1 | Q_1 | Q_2 | Q_2  |
+--------+-----+-----+-----+-----+-----+------+
| L      | L_0 | L_1 | L_0 | L_1 | L_0 | L_1  |
+--------+-----+-----+-----+-----+-----+------+
| Cost_0 | 0.8 | 0.6 | 0.1 | 0.6 | 0.6 | 0.05 |
+--------+-----+-----+-----+-----+-----+------+
| Cost_1 | 0.2 | 0.4 | 0.9 | 0.4 | 0.4 | 0.95 |
+--------+-----+-----+-----+-----+-----+------+


### Graph theory
A graph G = (V, E).
Set V = nodes or vertices of the graph
Set E = edges or arcs of the graph
No. of nodes in G= Cardinality of G = Order of G = |V| 
No. of edges in G = Size of G = |E|

Two vertices u, v ∈ V are adjacent if (u, v) ∈ E.
The neighbor set of a vertex v is defined as { u | ( u , v ) ∈ E }.
An edge is a self loop if the start vertex and the end vertex of the edge are the same.

For a vertex v ∈ V, we define its outdegree as the number of edges originating from the vertex v, that is, |{ u | ( v , u ) ∈ E }|.
Similarly, the indegree is defined as the number of edges that end at the vertex v, that is, |{ u | ( u , v ) ∈ E }|.

For a graph G = (V, E) and u, v ∈ V, we define a u - v walk as an alternating sequence of vertices and edges, starting with u and ending with v.

A walk with no repeated edges is known as a trail.

A walk with no repeated vertices, except possibly the first and the last, is known as a path.

### Bayesian Networks
![title](images/BN1.png)

In [7]:
from pgmpy.models import BayesianModel
# Start with an empty model; nodes, edges and CPDs are added in the following cells.
model = BayesianModel()

In [8]:
# Add nodes and edges to the Bayesian network.
model.add_nodes_from(['rain', 'traffic_jam'])
model.add_edge('rain', 'traffic_jam')

# 'accident' was never added explicitly; it still appears in model.nodes()
# in the recorded output, so add_edge creates nodes that are not yet in the model.
model.add_edge('accident', 'traffic_jam')
print("Nodes -> ", model.nodes())
print("Edges -> ", model.edges())

Nodes ->  ['rain', 'accident', 'traffic_jam']
Edges ->  [('rain', 'traffic_jam'), ('accident', 'traffic_jam')]


In [10]:
# Create the CPDs for the three nodes added to the model so far.
from pgmpy.factors.discrete import TabularCPD

# 'rain' and 'accident' have no incoming edges, so each CPD is a single
# column of probabilities, one row per state.
cpd_rain = TabularCPD(variable='rain', variable_card=2, values=[[0.4], [0.6]])
cpd_accident = TabularCPD(variable='accident', variable_card=2, values=[[0.2], [0.8]])

# 'traffic_jam' is conditioned on both 'rain' and 'accident': 2 rows (its own
# states) by 2 * 2 = 4 columns (evidence combinations), each column summing to 1.
cpd_traffic_jam = TabularCPD(
    variable='traffic_jam',
    variable_card=2,
    values=[[0.9, 0.6, 0.7, 0.1],
            [0.1, 0.4, 0.3, 0.9]],
    evidence=['rain', 'accident'],
    evidence_card=[2, 2],
)

In [11]:
# Attach the three CPDs to the model; get_cpds() as the last expression
# displays the list of CPDs the model now holds.
model.add_cpds(cpd_rain, cpd_accident,cpd_traffic_jam)
model.get_cpds()

[<TabularCPD representing P(rain:2) at 0xa0a1e8ec>,
 <TabularCPD representing P(accident:2) at 0xa0a1e8ac>,
 <TabularCPD representing P(traffic_jam:2 | rain:2, accident:2) at 0xa0a1e54c>]

In [16]:
# Extend the network with the remaining nodes, edges and CPDs.
model.add_nodes_from(['long_queues', 'getting_up_late', 'late_for_school'])

model.add_edges_from([
    ('traffic_jam', 'long_queues'),
    ('getting_up_late', 'late_for_school'),
    ('traffic_jam', 'late_for_school'),
])

# 'long_queues' has the single parent 'traffic_jam': one column per
# traffic_jam state.
cpd_long_queues = TabularCPD(
    variable='long_queues',
    variable_card=2,
    values=[[0.9, 0.2],
            [0.1, 0.8]],
    evidence=['traffic_jam'],
    evidence_card=[2],
)

# 'getting_up_late' has no incoming edges: a single column of probabilities.
cpd_getting_up_late = TabularCPD(
    variable='getting_up_late',
    variable_card=2,
    values=[[0.6], [0.4]],
)

# 'late_for_school' has two parents: one column per
# (getting_up_late, traffic_jam) combination, each column summing to 1.
cpd_late_for_school = TabularCPD(
    variable='late_for_school',
    variable_card=2,
    values=[[0.9, 0.45, 0.8, 0.1],
            [0.1, 0.55, 0.2, 0.9]],
    evidence=['getting_up_late', 'traffic_jam'],
    evidence_card=[2, 2],
)

model.add_cpds(cpd_long_queues, cpd_getting_up_late, cpd_late_for_school)
print(model.get_cpds())

# Last expression of the cell, so the result of check_model() is displayed
# (True in the recorded run).
model.check_model()



[<TabularCPD representing P(rain:2) at 0xa0a1e8ec>, <TabularCPD representing P(accident:2) at 0xa0a1e8ac>, <TabularCPD representing P(traffic_jam:2 | rain:2, accident:2) at 0xa0a1e54c>, <TabularCPD representing P(getting_up_late:2) at 0xa0a4b28c>, <TabularCPD representing P(late_for_school:2 | getting_up_late:2, traffic_jam:2) at 0xa0a30d6c>, <TabularCPD representing P(long_queues:2 | traffic_jam:2) at 0xa0a4b1cc>]


True

In [17]:
# The remove_cpds() method removes a CPD that was previously added to the model, e.g.:
# model.remove_cpds('late_for_school')