In [1]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel
from pgmpy.estimators import ConstraintBasedEstimator, HillClimbSearch, K2Score, BicScore, BdeuScore
from scipy import stats

In [2]:
# Load the forest-fires table from the CSV file in the working directory.
csv_path = "forestfires.csv"
raw = pd.read_csv(csv_path)

# Downstream cells read from `data`; `raw` keeps the frame exactly as parsed.
data = pd.DataFrame(data=raw)

# Preview the first five rows.
data.head(5)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [3]:
# Preprocessing: turn every column into a small set of discrete categories so
# that the structure-learning estimators below work on categorical data only.
# The source frame `data` is left untouched; all changes go into `data_binned`.
data_binned = data.copy()

# X and Y: three fixed, equal-width intervals (0, 3], (3, 6], (6, 9].
for col in ("X", "Y"):
    data_binned[col] = pd.cut(data[col], [0, 3, 6, 9])

# Numeric indices and weather readings: three quantile-based bins (terciles),
# so each bin holds roughly a third of the rows.
for col in ("FFMC", "DMC", "ISI", "temp", "RH", "wind"):
    data_binned[col] = pd.qcut(data[col], 3)

# area: fixed bins (-1, 0], (0, 50], (50, 1100]. The first bin starts at -1 so
# that rows with area == 0 are included; values outside (-1, 1100] become NaN.
data_binned["area"] = pd.cut(data["area"], [-1, 0, 50, 1100])

# month -> season and day -> workday/weekend to reduce category counts.
SEASON_BY_MONTH = {
    "jan": "winter", "feb": "winter", "dec": "winter",
    "mar": "spring", "apr": "spring", "may": "spring",
    "jun": "summer", "jul": "summer", "aug": "summer",
    "sep": "autumn", "oct": "autumn", "nov": "autumn",
}
DAY_TYPE_BY_DAY = {
    "mon": "workday", "tue": "workday", "wed": "workday",
    "thu": "workday", "fri": "workday",
    "sat": "weekend", "sun": "weekend",
}
data_binned["month"] = data["month"].map(SEASON_BY_MONTH)
data_binned["day"] = data["day"].map(DAY_TYPE_BY_DAY)

# rain and DC are left out of the structure-learning input.
# `columns=` is used instead of a positional axis argument: `drop(label, 1)`
# was deprecated in pandas 1.3 and raises TypeError in pandas 2.x.
data_binned = data_binned.drop(columns=["rain", "DC"])
data_binned.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,ISI,temp,RH,wind,area
0,"(6, 9]","(3, 6]",spring,workday,"(18.699, 91.0]","(1.099, 88.0]","(-0.001, 7.1]","(2.199, 17.1]","(48.0, 100.0]","(4.9, 9.4]","(-1, 0]"
1,"(6, 9]","(3, 6]",autumn,workday,"(18.699, 91.0]","(1.099, 88.0]","(-0.001, 7.1]","(17.1, 21.4]","(14.999, 35.0]","(0.399, 3.1]","(-1, 0]"
2,"(6, 9]","(3, 6]",autumn,weekend,"(18.699, 91.0]","(1.099, 88.0]","(-0.001, 7.1]","(2.199, 17.1]","(14.999, 35.0]","(0.399, 3.1]","(-1, 0]"
3,"(6, 9]","(3, 6]",spring,workday,"(91.0, 92.4]","(1.099, 88.0]","(7.1, 9.6]","(2.199, 17.1]","(48.0, 100.0]","(3.1, 4.9]","(-1, 0]"
4,"(6, 9]","(3, 6]",spring,weekend,"(18.699, 91.0]","(1.099, 88.0]","(7.1, 9.6]","(2.199, 17.1]","(48.0, 100.0]","(0.399, 3.1]","(-1, 0]"


In [4]:
# Constraint-based structure learning on the discretised data.
est = ConstraintBasedEstimator(data_binned)

# estimate_skeleton returns a pair that is unpacked into the skeleton graph
# (`skel`) and the separating sets (`sep_sets`); both are reused by the
# PDAG-orientation step in a later cell. 0.1 is the significance level passed
# to the estimator's independence tests.
skeleton_result = est.estimate_skeleton(significance_level=0.1)
skel, sep_sets = skeleton_result

In [5]:
# Display the edges of the skeleton returned by est.estimate_skeleton().
skel.edges()

[('X', 'Y'),
 ('month', 'DMC'),
 ('month', 'ISI'),
 ('month', 'temp'),
 ('FFMC', 'DMC'),
 ('FFMC', 'ISI'),
 ('DMC', 'ISI'),
 ('temp', 'RH')]

In [6]:
# Convert the skeleton into a PDAG using the separating sets from estimate_skeleton().
pdag = est.skeleton_to_pdag(skel, sep_sets)
# Display the PDAG edges. In the saved output some pairs appear in both
# directions (e.g. X/Y) -- presumably those edges were left unoriented.
pdag.edges()

[('X', 'Y'),
 ('Y', 'X'),
 ('month', 'DMC'),
 ('month', 'ISI'),
 ('month', 'temp'),
 ('FFMC', 'DMC'),
 ('FFMC', 'ISI'),
 ('DMC', 'ISI'),
 ('ISI', 'month'),
 ('ISI', 'FFMC'),
 ('ISI', 'DMC'),
 ('RH', 'temp')]

In [36]:
# Score-based structure learning: hill-climb search over structures on the
# discretised data, using a BDeu score computed on that same data.
bdeu_scorer = BdeuScore(data_binned)
est2 = HillClimbSearch(data_binned, scoring_method=bdeu_scorer)

# `model` holds the structure returned by the search; later cells inspect its
# nodes and edges.
model = est2.estimate()

In [37]:
# Display the node names of the hill-climb result in sorted order.
sorted(model.nodes())

['DMC', 'FFMC', 'ISI', 'RH', 'X', 'Y', 'area', 'day', 'month', 'temp', 'wind']

In [38]:
# Display the edges of the structure returned by the hill-climb search.
model.edges()

[('X', 'Y'),
 ('month', 'temp'),
 ('month', 'X'),
 ('month', 'wind'),
 ('day', 'FFMC'),
 ('day', 'area'),
 ('FFMC', 'DMC'),
 ('FFMC', 'RH'),
 ('DMC', 'month'),
 ('ISI', 'FFMC'),
 ('ISI', 'month'),
 ('ISI', 'day'),
 ('ISI', 'DMC'),
 ('RH', 'temp')]

In [47]:
# Hand-specified network structure, scored with BDeu on the discretised data.
# Edges touching `rain` are not listed because that column is dropped during
# preprocessing; wind->area and RH->area are likewise left out of this variant.
manual_edges = [
    ("month", "temp"),
    ("month", "wind"),
    ("month", "RH"),
    ("month", "area"),
    ("wind", "ISI"),
    ("wind", "FFMC"),
    ("temp", "FFMC"),
    ("temp", "DMC"),
    ("temp", "area"),
    ("RH", "FFMC"),
    ("RH", "DMC"),
    ("FFMC", "ISI"),
    ("FFMC", "area"),
    ("DMC", "area"),
    ("ISI", "area"),
    ("X", "area"),
    ("Y", "area"),
    ("day", "area"),
]
manual_model = BayesianModel(manual_edges)

# The score is the cell's last expression so the notebook displays it.
BdeuScore(data_binned).score(manual_model)

-5841.224468344891