In [1]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from scipy import stats

In [2]:
# loading data
raw = pd.read_csv("forestfires.csv")
data = pd.DataFrame(raw)
data.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [3]:
# prepocessing data
data_binned = data.copy()
data_binned["FFMC"] = pd.qcut(data["FFMC"], 4)#[0,20,40,60,80,100])
data_binned["DMC"] = pd.qcut(data["DMC"], 4)#[0,100,200,300])
data_binned["ISI"] = pd.qcut(data["ISI"], 4)#[0,20,40,60])
data_binned["temp"] = pd.qcut(data["temp"], 4)#[0,20,40])
data_binned["RH"] = pd.qcut(data["RH"], 4)#[0,20,40,60,80,100])
data_binned["wind"] = pd.cut(data["wind"], 5)
data_binned["rain"] = pd.cut(data["rain"], 5)
data_binned["area"] = pd.qcut(data["area"], 2)#[-2,0,1100])
data_binned = data_binned.drop('DC', 1)
data_binned.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,"(18.699, 90.2]","(1.099, 68.6]","(-0.001, 6.5]","(2.199, 15.5]","(42.0, 53.0]","(5.8, 7.6]","(-0.0064, 1.28]","(-0.001, 0.52]"
1,7,4,oct,tue,"(90.2, 91.6]","(1.099, 68.6]","(6.5, 8.4]","(15.5, 19.3]","(14.999, 33.0]","(0.391, 2.2]","(-0.0064, 1.28]","(-0.001, 0.52]"
2,7,4,oct,sat,"(90.2, 91.6]","(1.099, 68.6]","(6.5, 8.4]","(2.199, 15.5]","(14.999, 33.0]","(0.391, 2.2]","(-0.0064, 1.28]","(-0.001, 0.52]"
3,8,6,mar,fri,"(91.6, 92.9]","(1.099, 68.6]","(8.4, 10.8]","(2.199, 15.5]","(53.0, 100.0]","(2.2, 4.0]","(-0.0064, 1.28]","(-0.001, 0.52]"
4,8,6,mar,sun,"(18.699, 90.2]","(1.099, 68.6]","(8.4, 10.8]","(2.199, 15.5]","(53.0, 100.0]","(0.391, 2.2]","(-0.0064, 1.28]","(-0.001, 0.52]"


In [4]:
# split into train and test set
data_binned = data_binned.sample(frac=1) #shuffle
split = int(len(data_binned)/5)
data_binned_train = data_binned[:split] #4/5
data_binned_test = data_binned[split:] #1/5
test_labels = data_binned_test["area"]
data_binned_test = data_binned_test.drop("area", axis=1)

In [5]:
# create graph for the model
G = BayesianModel()
G.add_nodes_from(data_binned)
G.add_edges_from([("month","temp"),
                  ("month","wind"),
                  ("month","RH"),
                  ("month", "rain"),
                  #("month", "area"), 
                  ("wind","ISI"),
                  ("wind","FFMC"),
                  #("wind","area"),
                  ("temp","FFMC"),
                  ("temp","DMC"),
                  #("temp","area"),
                  ("RH","FFMC"),
                  ("RH","DMC"),
                  #("RH","area"),
                  ("rain","DMC"),
                  ("rain","FFMC"),
                  #("rain","area"),
                  ("FFMC","ISI"),
                  ("FFMC","area"),
                  ("DMC","area"),
                  ("ISI","area"),
                  ("X","area"),
                  ("Y","area"),
                  ("day","area")
                 ])

In [10]:
(bins, freq) = ([],[])
(variables, bf) = ([],(bins, freq))
for col in data_binned:
    #print(col)
    vals, counts = np.unique(data_binned.get(col), return_counts=True)
    perc = counts/np.sum(counts)*100
    threshold = np.sum(counts)*0.1 
    results = dict(zip(vals, counts)) #replace perc with counts for count
    #print(pd.Series(results))
    variables.append(col)
    bins.append(vals)
    freq.append(counts)
    #print('sum = ', np.sum(perc))
    #print("perc" , perc)

#This stores the bins and frequencies of the following variables: wind, month, rain, RH, temp
(bins_wind, freq_wind) = ([],[])
(bins_month,freq_month) = ([],[])
(bins_rain,freq_rain) = ([],[])
(bins_RH,freq_RH) = ([],[])
(bins_temp,freq_temp) = ([],[])

#print bins  
#print len(bins)
#print len(bins[0])
for i in range(len(bins)):
    #print(variables[i])
    #for j in range(len(bins[i])):
    if(variables[i]=="wind"):
        ind = [j for (j, val) in enumerate(freq[i]) if val > threshold]
        (bins_wind,freq_wind) = ([ bins[i][j] for j in ind], [x for x in freq[i] if x > threshold])
    if(variables[i]=="rain"): 
        ind = [j for (j, val) in enumerate(freq[i]) if val > threshold]
        (bins_rain,freq_rain) = ([bins[i][j] for j in ind], [x for x in freq[i] if x > threshold])
    if(variables[i]=="RH"): 
        ind = [j for (j, val) in enumerate(freq[i]) if val > threshold]
        (bins_RH,freq_RH) = ([bins[i][j] for j in ind], [x for x in freq[i] if x > threshold])
    if(variables[i]=="temp"): 
        ind = [j for (j, val) in enumerate(freq[i]) if val > threshold]
        (bins_temp,freq_temp) = ([bins[i][j] for j in ind], [x for x in freq[i] if x > threshold])
    if(variables[i]=="month"): 
        ind = [j for (j, val) in enumerate(freq[i]) if val > threshold]
        (bins_month,freq_month) = ([bins[i][j] for j in ind], [x for x in freq[i] if x > threshold])
    if(variables[i]=="area"):
        (bins_area,freq_area) = (bins[i],freq[i])
    #print(bins[i],freq[i])

chi_data = [(bins_wind, freq_wind),(bins_month,freq_month),(bins_rain,freq_rain),(bins_RH,freq_RH),(bins_temp,freq_temp),(bins_area,freq_area)]
#print chi_data


In [13]:
l2 = ["wind","rain","RH","temp"]
#print data_binned.get("wind").cat.categories
for cond in l2:
    cats = data_binned.get(cond).cat.categories
    #print bins
    #print data_binned[cond]
    #print bins.values
    #print range(len(cats))
    for c in range(len(cats)):
        #print c
        #print cond
        #print bins == c 
        data_binned_cond = data_binned.get(data_binned.get(cond).cat.codes == c)
        #print data_binned_cond
        vals_month, counts_month = np.unique(data_binned_cond.get("month"), return_counts=True)
        vals_area, counts_area = np.unique(data_binned_cond.get("area"), return_counts=True)
        #print counts_month
        #print counts_area
        #print counts_month.type
        counts_month = counts_month.tolist()
        counts_area = counts_area.tolist()
        
        table = np.array([counts_month[0:2],
                          counts_area])
        print table
        
        chisq, p = stats.chi2_contingency(table,correction=True)
        #ik krijg hier een 'too many values to unpack' error.. heb geen idee waarom, geef de tabel exact mee zoals in doc
        #https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html

[[ 1 31]
 [56 56]]


ValueError: too many values to unpack

In [8]:
# fit model on data
G.fit(data_binned_train) # only used for discrete bayesnets?

KeyboardInterrupt: 

In [None]:
G.get_cpds()

In [None]:
G.get_independencies()

In [None]:
predictions = G.predict(data_binned_test)
predictions.head()

In [None]:
predictions = predictions["area"].astype('category').cat.codes

In [None]:
test_labels = test_labels.astype('category').cat.codes

In [None]:
y_pred = np.array(predictions)
y_true = np.array(test_labels)

In [None]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_true, y_pred)
acc

Hier komt de code die het als LGBN runt zodat het continue variabelen kunnen zijn. Is trial and error dus no flame pls.

To instantiate an object of this class, one needs to provide a variable name, the value of the beta_0 term, the variance, a list of the parent variable names and a list of the coefficient values of the linear equation (beta_vector), where the list of parent variable names and beta_vector list is optional and defaults to None.

In [None]:
from pgmpy.models import LinearGaussianBayesianNetwork
from pgmpy.factors.continuous import LinearGaussianCPD

In [None]:
# prepocessing data
#make dummy variables from X,Y,Month,Day
data_with_dummies = pd.get_dummies(data, columns=["X","Y"], prefix=["X","Y"])
data_with_dummies = pd.get_dummies(data_with_dummies, columns=["month","day"])
data_with_dummies.head()

In [None]:
#calculations here
#calculate variance of each column
variances = np.var(data,0)
print(variances)
#variances.get('X')

In [None]:
model = LinearGaussianBayesianNetwork()

#Instantiation of data points
#cpd = LinearGaussianCPD(name, beta_0 term, variance, list_parent_names, list_coeff_val_lin_eq(beta vector)) last 2 are optional

model.add_nodes_from(data_with_dummies)
model.add_edges_from([("temp","area"),
                  ("wind","area"),
                  ("rain","area"),
                  ("RH","area"),
                  ("FFMC","area"),
                  ("DMC","area"),
                  ("ISI","area"),
                  ("RH","FFMC"),
                  ("wind","FFMC"),
                  ("rain","FFMC"),
                  ("temp","FFMC"),
                  ("RH","DMC"),
                  ("rain","DMC"),
                  ("temp","DMC"),
                  ("FFMC","ISI"),
                  ("wind","ISI")])
model.add_edges_from([(col,"area") for col in data_with_dummies if col.startswith('X')])
model.add_edges_from([(col,"rain") for col in data_with_dummies if col.startswith('Y')])
model.add_edges_from([(col,"RH") for col in data_with_dummies if col.startswith('day')])
model.add_edges_from([(col,"temp") for col in data_with_dummies if col.startswith('month')])
model.add_edges_from([(col,"wind") for col in data_with_dummies if col.startswith('month')])

estimator = MaximumLikelihoodEstimator(model,data_with_dummies)

#IMPORTANT: month should be replaced with the dummy variables of month, like below
#likewise for X, Y, day in cpd_area
#beta_0 = estimator.estimate_cpd('temp')
cpd_temp = LinearGaussianCPD('temp', estimator.estimate_cpd('temp').get_values(), variances.get('temp'), ['month'])
cpd_wind = LinearGaussianCPD('wind', estimator.estimate_cpd('wind'), variances.get('wind'), ['month'])
cpd_rain = LinearGaussianCPD('rain', estimator.estimate_cpd('rain'), variances.get('rain'), ['month'])
cpd_RH = LinearGaussianCPD('RH', estimator.estimate_cpd('RH'), variances.get('RH'), ['month'])
cpd_FFMC = LinearGaussianCPD('FFMC', estimator.estimate_cpd('FFMC'), variances.get('FFMC'), ['rain','RH','temp','wind'])
cpd_DMC = LinearGaussianCPD('DMC', estimator.estimate_cpd('DMC'), variances.get('DMC'), ['rain','RH','temp'])
cpd_ISI = LinearGaussianCPD('ISI', estimator.estimate_cpd('ISI'), variances.get('ISI'), ['FFMC','wind'])
cpd_area = LinearGaussianCPD('area', estimator.estimate_cpd('area'), variances.get('area'), ['rain','RH','temp','wind','month','FFMC','DMC','ISI','day','X','Y'])
#should every dummy var have a data point? think so but not sure, 44 in total (incl above)

#jgd = model.to_joint_gaussian()

#model.fit(data_with_dummies)


In [None]:
tmp = estimator.estimate_cpd('temp').get_values()

In [None]:
print(tmp)
print np.shape(tmp)

In [None]:
[('x1', 'x2'), ('x2', 'x3')]
cpd1 = LinearGaussianCPD('x1', 1, 4)
cpd2 = LinearGaussianCPD('x2', -5, 4, ['x1'], [0.5])
cpd3 = LinearGaussianCPD('x3', 4, 3, ['x2'], [-1])
model.add_cpds(cpd1, cpd2, cpd3)
jgd = model.to_joint_gaussian()
jgd.variables

#structure score for eval 