In [31]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator

In [7]:
# loading data
raw = pd.read_csv("forestfires.csv")
data = pd.DataFrame(raw)
data.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [8]:
# prepocessing data
#make dummy variables from X,Y,Month,Day
data_with_dummies = pd.get_dummies(data, columns=["X","Y"], prefix=["X","Y"])
data_with_dummies = pd.get_dummies(data_with_dummies, columns=["month","day"])
data_with_dummies.head()

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,X_1,...,month_nov,month_oct,month_sep,day_fri,day_mon,day_sat,day_sun,day_thu,day_tue,day_wed
0,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,0
1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,0,...,0,1,0,0,0,0,0,0,1,0
2,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,0,...,0,1,0,0,0,1,0,0,0,0
3,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,0,...,0,0,0,1,0,0,0,0,0,0
4,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,0,...,0,0,0,0,0,0,1,0,0,0


In [4]:
# create graph for the model
G = BayesianModel()
G.add_nodes_from(data_with_dummies)
G.add_edges_from([("temp","area"),
                  ("wind","area"),
                  ("rain","area"),
                  ("RH","area"),
                  ("FFMC","area"),
                  ("DMC","area"),
                  ("ISI","area"),
                  ("RH","FFMC"),
                  ("wind","FFMC"),
                  ("rain","FFMC"),
                  ("temp","FFMC"),
                  ("RH","DMC"),
                  ("rain","DMC"),
                  ("temp","DMC"),
                  ("FFMC","ISI"),
                  ("wind","ISI")])
G.add_edges_from([(col,"area") for col in data_with_dummies if col.startswith('X')])
G.add_edges_from([(col,"rain") for col in data_with_dummies if col.startswith('Y')])
G.add_edges_from([(col,"RH") for col in data_with_dummies if col.startswith('day')])
G.add_edges_from([(col,"temp") for col in data_with_dummies if col.startswith('month')])
G.add_edges_from([(col,"wind") for col in data_with_dummies if col.startswith('month')])

In [19]:
# fit model on data
#Duurt lang, in 15 min nog niet klaar (of error msg) - laptop wordt wel erg warm.
G.fit(data_with_dummies) # only used for discrete bayesnets?

Hier komt de code die het als LGBN runt zodat het continue variabelen kunnen zijn. Is trial and error dus no flame pls.

To instantiate an object of this class, one needs to provide a variable name, the value of the beta_0 term, the variance, a list of the parent variable names and a list of the coefficient values of the linear equation (beta_vector), where the list of parent variable names and beta_vector list is optional and defaults to None.

In [2]:
from pgmpy.models import LinearGaussianBayesianNetwork

In [49]:
#calculations here
#calculate variance of each column
variances = np.var(data,0)
print(variances)
#variances.get('X')

X           5.343213
Y           1.509729
FFMC       30.412684
DMC      4094.017746
DC      61417.808706
ISI        20.748622
temp       33.651682
RH        265.744793
wind        3.203810
rain        0.087422
area     4044.225578
dtype: float64


5.343212777181253

In [18]:
model = LinearGaussianBayesianNetwork()

#Instantiation of data points
#cpd = LinearGaussianCPD(name, beta_0 term, variance, list_parent_names, list_coeff_val_lin_eq(beta vector)) last 2 are optional

#IMPORTANT: month should be replaced with the dummy variables of month, like below
#likewise for X, Y, day in cpd_area
cpd_temp = LinearGaussianCPD('temp', beta_0 term, variances.get('temp'), ['month'])
cpd_wind = LinearGaussianCPD('wind', beta_0 term, variances.get('wind'), ['month'])
cpd_rain = LinearGaussianCPD('rain', beta_0 term, variances.get('rain'), ['month'])
cpd_RH = LinearGaussianCPD('RH', beta_0 term, variances.get('RH'), ['month'])
cpd_FFMC = LinearGaussianCPD('FFMC', beta_0 term, variances.get('FFMC'), ['rain','RH','temp','wind'])
cpd_DMC = LinearGaussianCPD('DMC', beta_0 term, variances.get('DMC'), ['rain','RH','temp'])
cpd_ISI = LinearGaussianCPD('ISI', beta_0 term, variances.get('ISI'), ['FFMC','wind'])
cpd_area = LinearGaussianCPD('area', beta_0 term, variances.get('area'), ['rain','RH','temp','wind','month','FFMC','DMC','ISI','day','X','Y'])
#should every dummy var have a data point? think so but not sure, 44 in total (incl above)


model.add_nodes_from(data_with_dummies)
model.add_edges_from([("temp","area"),
                  ("wind","area"),
                  ("rain","area"),
                  ("RH","area"),
                  ("FFMC","area"),
                  ("DMC","area"),
                  ("ISI","area"),
                  ("RH","FFMC"),
                  ("wind","FFMC"),
                  ("rain","FFMC"),
                  ("temp","FFMC"),
                  ("RH","DMC"),
                  ("rain","DMC"),
                  ("temp","DMC"),
                  ("FFMC","ISI"),
                  ("wind","ISI")])
model.add_edges_from([(col,"area") for col in data_with_dummies if col.startswith('X')])
model.add_edges_from([(col,"rain") for col in data_with_dummies if col.startswith('Y')])
model.add_edges_from([(col,"RH") for col in data_with_dummies if col.startswith('day')])
model.add_edges_from([(col,"temp") for col in data_with_dummies if col.startswith('month')])
model.add_edges_from([(col,"wind") for col in data_with_dummies if col.startswith('month')])
#jgd = model.to_joint_gaussian()

#model.fit(data_with_dummies)


NotImplementedError: fit method has not been implemented for LinearGaussianBayesianNetwork.

In [None]:
[('x1', 'x2'), ('x2', 'x3')]
cpd1 = LinearGaussianCPD('x1', 1, 4)
cpd2 = LinearGaussianCPD('x2', -5, 4, ['x1'], [0.5])
cpd3 = LinearGaussianCPD('x3', 4, 3, ['x2'], [-1])
model.add_cpds(cpd1, cpd2, cpd3)
jgd = model.to_joint_gaussian()
jgd.variables