In [1]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from scipy import stats

In [2]:
# loading data
raw = pd.read_csv("forestfires.csv")
data = pd.DataFrame(raw)
data.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [3]:
# prepocessing data
data_binned = data.copy()
data_binned["X"] = pd.cut(data["X"], [0,3,6,9])
data_binned["Y"] = pd.cut(data["Y"], [0,3,6,9])
data_binned["FFMC"] = pd.qcut(data["FFMC"], 3)#[0,20,40,60,80,100])
data_binned["DMC"] = pd.qcut(data["DMC"], 3)#[0,100,200,300])
data_binned["ISI"] = pd.qcut(data["ISI"], 3)#[0,20,40,60])
data_binned["temp"] = pd.qcut(data["temp"], 3)#[0,20,40])
data_binned["RH"] = pd.qcut(data["RH"], 3)#[0,20,40,60,80,100])
data_binned["wind"] = pd.qcut(data["wind"], 3)
#data_binned["rain"] = pd.cut(data["rain"], 2)
data_binned["area"] = pd.cut(data["area"], [-1,0,50,1100])#[-1,0,600,1100])
data_binned["month"] = data_binned["month"].map({"jan":"winter","feb":"winter","mar":"spring","apr":"spring","may":"spring","jun":"summer","jul":"summer","aug":"summer","sep":"autumn","oct":"autumn","nov":"autumn","dec":"winter"})
data_binned["day"] = data_binned["day"].map({"mon":"workday","tue":"workday","wed":"workday","thu":"workday","fri":"workday","sat":"weekend","sun":"weekend"})
data_binned = data_binned.drop('rain', 1)
data_binned = data_binned.drop('DC', 1)
data_binned.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,ISI,temp,RH,wind,area
0,"(6, 9]","(3, 6]",spring,workday,"(18.699, 91.0]","(1.099, 88.0]","(-0.001, 7.1]","(2.199, 17.1]","(48.0, 100.0]","(4.9, 9.4]","(-1, 0]"
1,"(6, 9]","(3, 6]",autumn,workday,"(18.699, 91.0]","(1.099, 88.0]","(-0.001, 7.1]","(17.1, 21.4]","(14.999, 35.0]","(0.399, 3.1]","(-1, 0]"
2,"(6, 9]","(3, 6]",autumn,weekend,"(18.699, 91.0]","(1.099, 88.0]","(-0.001, 7.1]","(2.199, 17.1]","(14.999, 35.0]","(0.399, 3.1]","(-1, 0]"
3,"(6, 9]","(3, 6]",spring,workday,"(91.0, 92.4]","(1.099, 88.0]","(7.1, 9.6]","(2.199, 17.1]","(48.0, 100.0]","(3.1, 4.9]","(-1, 0]"
4,"(6, 9]","(3, 6]",spring,weekend,"(18.699, 91.0]","(1.099, 88.0]","(7.1, 9.6]","(2.199, 17.1]","(48.0, 100.0]","(0.399, 3.1]","(-1, 0]"


In [35]:
(bins, freq) = ([],[])
(variables, bf) = ([],(bins, freq))
for col in data_binned:
    #print(col)
    vals, counts = np.unique(data_binned.get(col), return_counts=True)
    perc = counts/np.sum(counts)*100
    threshold = np.sum(counts)*0.1 
    results = dict(zip(vals, counts)) #replace perc with counts for count
    #print(pd.Series(results))
    variables.append(col)
    bins.append(vals)
    freq.append(counts)
    #print('sum = ', np.sum(perc))
    #print("perc" , perc)

def freq_bins(bins, threshold):
    """get most frequent bins (>10% of dataset)"""
    #This stores the bins and frequencies of the following variables: wind, month, rain, RH, temp
    (bins_wind, freq_wind) = ([],[])
    (bins_month,freq_month) = ([],[])
    (bins_rain,freq_rain) = ([],[])
    (bins_RH,freq_RH) = ([],[])
    (bins_temp,freq_temp) = ([],[])
    
    #print bins  
    #print len(bins)
    #print len(bins[0])
    for i in range(len(bins)):
        #print(variables[i])
        #for j in range(len(bins[i])):
        if(variables[i]=="wind"):
            ind = [j for (j, val) in enumerate(freq[i]) if val > threshold]
            (bins_wind,freq_wind) = ([ bins[i][j] for j in ind], [x for x in freq[i] if x > threshold])
        if(variables[i]=="rain"): 
            ind = [j for (j, val) in enumerate(freq[i]) if val > threshold]
            (bins_rain,freq_rain) = ([bins[i][j] for j in ind], [x for x in freq[i] if x > threshold])
        if(variables[i]=="RH"): 
            ind = [j for (j, val) in enumerate(freq[i]) if val > threshold]
            (bins_RH,freq_RH) = ([bins[i][j] for j in ind], [x for x in freq[i] if x > threshold])
        if(variables[i]=="temp"): 
            ind = [j for (j, val) in enumerate(freq[i]) if val > threshold]
            (bins_temp,freq_temp) = ([bins[i][j] for j in ind], [x for x in freq[i] if x > threshold])
        if(variables[i]=="month"): 
            ind = [j for (j, val) in enumerate(freq[i]) if val > threshold]
            (bins_month,freq_month) = ([bins[i][j] for j in ind], [x for x in freq[i] if x > threshold])
        if(variables[i]=="area"):
            (bins_area,freq_area) = (bins[i],freq[i])
        #print(bins[i],freq[i])
        
        chi_data = [(bins_wind, freq_wind),(bins_month,freq_month),(bins_rain,freq_rain),(bins_RH,freq_RH),(bins_temp,freq_temp),(bins_area,freq_area)]
        return chi_data
#print chi_data


In [None]:
def chi_cond(c_layer, var):
    """chi-square test for conditional independencies"""
    for cond in c_layer:
        cats = data_binned.get(cond).cat.categories
        print "categories: ", cats
        for c in range(len(cats)):
            print "bin: ", cats[c]
            print "conditional variable: ", cond
            data_binned_cond = data_binned.get(data_binned.get(cond).cat.codes == c)
            vals_month, counts_month = np.unique(data_binned_cond.get(var), return_counts=True)
            vals_area, counts_area = np.unique(data_binned_cond.get("area"), return_counts=True)

            table = pd.crosstab(data_binned_cond.get("area"),data_binned_cond.get(var))

            print table

            #try - except omdat sommige levels van rain niet te conditioneren zijn - geven voornamelijk nul-frequenties
            try:
                chisq, p, _, _ = stats.chi2_contingency(table,correction=True) #leuke error hierzo 
                print "chisquare: ", chisq, "\np-value: ", p
            except ValueError:
                print "error: zero frequenties"
                pass

In [None]:
#chi-square tests of conditional independence month - area 

l2 = ["wind","rain","RH","temp"]
#print data_binned.get("wind").cat.categories

chi_cond(l2,"month")
        
#temp+wind hebben p<0.05 voor sommige levels dus month is conditionally dependent of area given wind/temp. 
#rain+RH hebben p>0.05 voor alle levels dus month is conditionally independent of area given rain/RH; 
#de pijl van month naar area moet dus blijven

In [None]:
#chi-square tests of conditional independence rain - area
l3_1 = ["FFMC","DMC"]
#l3_2 = ["DMC","ISI"] #even kijken hoe dat zit met het feit dat FFMC de ISI beinvloedt
chi_cond(l3_1,"rain")

#resultaten: zeer sterke (p=1.0) conditional INdependence rain - area given FFMC/DMC
#pijl rain -> area mag weg

In [None]:
#chi-square tests of conditional independence temp - area
chi_cond(l3_1,"temp")
#RESULTS: temp conditionally INdependent of area given FFMC/DMC
#pijl temp -> area mag weg

In [None]:
##chi-square tests of conditional independence RH - area
chi_cond(l3_1,"RH")
#RESULTS: RH conditionally INdependent of area given FFMC/DMC
#pijl RH -> area mag weg

In [None]:
l3_2 = ["DMC","ISI"] #even kijken hoe dat zit met het feit dat FFMC de ISI beinvloedt
chi_cond(l3_2,"wind")
#RESULTS: conditionally DEPENDENT of area given FFMC/ISI
#pijl wind -> area behouden

In [4]:
# split into train and test set
data_binned = data_binned.sample(frac=1) #shuffle
split = int(len(data_binned)/5)
data_binned_train = data_binned[split:] #4/5
data_binned_test = data_binned[:split] #1/5
test_labels = data_binned_test["area"]
data_binned_test = data_binned_test.drop("area", axis=1)

In [30]:
# create graph for the model
G = BayesianModel()
G.add_nodes_from(data_binned)
G.add_edges_from([("month","temp"),
                  ("month","wind"),
                  ("month","RH"),
                  #("month", "rain"),
                  ("month", "area"), 
                  ("wind","ISI"),
                  ("wind","FFMC"),
                  #("wind","area"),
                  ("temp","FFMC"),
                  ("temp","DMC"),
                  ("temp","area"),
                  ("RH","FFMC"),
                  ("RH","DMC"),
                  #("RH","area"),
                  #("rain","DMC"),
                  #("rain","FFMC"),
                  #("rain","area"),
                  ("FFMC","ISI"),
                  ("FFMC","area"),
                  ("DMC","area"),
                  ("ISI","area"),
                  ("X","area"),
                  ("Y","area"),
                  ("day","area")
                 ])

In [6]:
# fit model on data
G.fit(data_binned_train) # only used for discrete bayesnets?

In [7]:
G.get_cpds()

[<TabularCPD representing P(DMC:3 | RH:3, temp:3) at 0x7f2bd82e1080>,
 <TabularCPD representing P(FFMC:3 | RH:3, temp:3, wind:3) at 0x7f2bab50bc50>,
 <TabularCPD representing P(ISI:3 | FFMC:3, wind:3) at 0x7f2bab53fbe0>,
 <TabularCPD representing P(RH:3 | month:4) at 0x7f2bab50ba20>,
 <TabularCPD representing P(X:3) at 0x7f2bab515240>,
 <TabularCPD representing P(Y:3) at 0x7f2bab4dfeb8>,
 <TabularCPD representing P(area:3 | DMC:3, FFMC:3, ISI:3, X:3, Y:3, day:2, month:4, wind:3) at 0x7f2bab46eef0>,
 <TabularCPD representing P(day:2) at 0x7f2bab46ecc0>,
 <TabularCPD representing P(month:4) at 0x7f2bab46e828>,
 <TabularCPD representing P(temp:3 | month:4) at 0x7f2bab46e278>,
 <TabularCPD representing P(wind:3 | month:4) at 0x7f2bab46e5f8>]

In [8]:
G.get_independencies()

(X _|_ RH, Y, ISI, day, month, FFMC, DMC, temp, wind)
(X _|_ RH, ISI, day, month, FFMC, DMC, temp, wind | Y)
(X _|_ Y, ISI, day, month, FFMC, DMC, temp, wind | RH)
(X _|_ RH, Y, day, month, FFMC, DMC, temp, wind | ISI)
(X _|_ RH, Y, ISI, month, FFMC, DMC, temp, wind | day)
(X _|_ RH, Y, ISI, day, FFMC, DMC, temp, wind | month)
(X _|_ RH, Y, ISI, day, month, DMC, temp, wind | FFMC)
(X _|_ RH, Y, ISI, day, month, FFMC, temp, wind | DMC)
(X _|_ RH, Y, ISI, day, month, FFMC, DMC, wind | temp)
(X _|_ RH, Y, ISI, day, month, FFMC, DMC, temp | wind)
(X _|_ day, month, FFMC, DMC, temp, wind, ISI | RH, Y)
(X _|_ RH, day, month, FFMC, DMC, temp, wind | Y, ISI)
(X _|_ RH, month, FFMC, DMC, temp, wind, ISI | Y, day)
(X _|_ RH, day, FFMC, DMC, temp, wind, ISI | Y, month)
(X _|_ RH, day, month, DMC, temp, wind, ISI | Y, FFMC)
(X _|_ RH, day, month, FFMC, temp, wind, ISI | Y, DMC)
(X _|_ RH, day, month, FFMC, DMC, wind, ISI | Y, temp)
(X _|_ RH, day, month, FFMC, DMC, temp, ISI | Y, wind)
(X _|_ Y, d

In [9]:
predictions = G.predict(data_binned_test)
predictions.head()

Unnamed: 0,area
198,"(-1, 0]"
502,"(0, 50]"
353,"(-1, 0]"
109,"(-1, 0]"
431,"(-1, 0]"


In [12]:
predictions = predictions["area"].astype('category').cat.codes
test_labels = test_labels.astype('category').cat.codes
y_pred = np.array(predictions)
y_true = np.array(test_labels)

In [34]:
from sklearn.metrics import f1_score
f1_score(y_true, y_pred, average="micro")

0.44660194174757284