In [2]:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import ExpectationMaximization as EM
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Folder that holds the pre-processed input/output files (absolute local path).
output_dir = "D:/Thesis/files_output_dir/output_files/"
# Read the level-1 workday road-history table that feeds the PGM.
workday_csv_path = output_dir + 'PGM_input_discrete_level1_road_history_workday.csv'
level1_road_history_workday = pd.read_csv(workday_csv_path)

In [5]:
# Preview the first rows of the loaded frame.
level1_road_history_workday.head()

Unnamed: 0,length,lanes,max_speed,dir,tor,n_connnections,weighted_POI,time,avg_speed,avg_speed std,no_taxi_car
0,2,1,2,1,1,2,1,2,1,2,2
1,2,1,2,1,1,2,1,3,1,2,2
2,2,1,2,1,1,2,1,3,1,2,3
3,2,1,2,1,1,2,1,3,1,2,2
4,2,1,2,1,1,2,1,3,1,3,2


In [6]:
# List the column names of the loaded frame.
level1_road_history_workday.columns

Index(['length', 'lanes', 'max_speed', 'dir', 'tor', 'n_connnections',
       'weighted_POI', 'time', 'avg_speed', 'avg_speed std', 'no_taxi_car'],
      dtype='object')

In [7]:
# Structure of the Bayesian network. 'theta' and 'Volume' are nodes that have
# no matching column in the data frame, i.e. they are the latent variables.

# Every static road attribute is a parent of 'theta'.
theta_parents = [
    'length', 'lanes', 'max_speed', 'dir', 'tor', 'n_connnections', 'weighted_POI',
]
network_edges = [(parent, 'theta') for parent in theta_parents]

# Remaining edges: drivers of 'Volume', 'avg_speed' and 'avg_speed std'.
network_edges += [
    ('theta', 'Volume'),
    ('theta', 'avg_speed'),
    ('time', 'Volume'),
    ('time', 'avg_speed'),
    ('no_taxi_car', 'Volume'),
    ('no_taxi_car', 'avg_speed std'),
    ('avg_speed', 'avg_speed std'),
]

model = BayesianNetwork(network_edges)

In [8]:
# Flag the two nodes without a data column as latent (unobserved) variables.
model.latents = {'theta', 'Volume'}

In [13]:
# Fit the CPDs with Expectation-Maximization (EM), which is needed because
# 'theta' and 'Volume' are never observed in the data.
# The estimator gets its own copy so the loaded frame is left untouched.
data = level1_road_history_workday.copy()
estimator = EM(model, data)

# EM starts from randomly initialised CPDs, so without a fixed seed every
# re-run of this cell can converge to different parameters. Fixing the seed
# makes the fitted model reproducible under Restart & Run All.
# NOTE(review): 'Volume' has no children in the network, so the observed
# columns presumably carry little or no information about it and its learned
# CPD may depend heavily on the initialisation -- confirm this is intended.
EM_SEED = 42

# latent_card: number of discrete states assumed for each latent variable.
estimated_params = estimator.get_parameters(
    latent_card={'theta': 3, 'Volume': 5},
    seed=EM_SEED,
)


In [17]:
# Attach all CPDs returned by the EM step to the network in a single call.
model.add_cpds(*estimated_params)
# Sanity check: the network structure and its CPDs must be consistent.
assert model.check_model()

In [21]:
# Persist the fitted network with pickle so it can be reloaded later
# without repeating the EM fit.
output_dir = "D:/Thesis/files_output_dir/output_files/"
model_pickle_path = output_dir + 'bayesian_network_model_level1_workday.pkl'
with open(model_pickle_path, 'wb') as pickle_out:
    pickle.dump(model, pickle_out)

## Load Model

In [9]:
# Restore the previously fitted network from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that were produced by this notebook / a trusted source.
model_pickle_path = output_dir + 'bayesian_network_model_level1_workday.pkl'
with open(model_pickle_path, 'rb') as pickle_in:
    model = pickle.load(pickle_in)

In [10]:
# Show every CPD stored in the model: a header line, the CPD table, then
# blank lines as a separator between tables.
for learned_cpd in model.get_cpds():
    print(f"Learned CPD for {learned_cpd.variable}:", learned_cpd, "\n", sep="\n")

Learned CPD for tor:
+--------+-----------+
| tor(1) | 0.974973  |
+--------+-----------+
| tor(2) | 0.0250273 |
+--------+-----------+


Learned CPD for avg_speed std:
+------------------+-----+----------------------+
| avg_speed        | ... | avg_speed(6)         |
+------------------+-----+----------------------+
| no_taxi_car      | ... | no_taxi_car(7)       |
+------------------+-----+----------------------+
| avg_speed std(1) | ... | 0.01099332548095799  |
+------------------+-----+----------------------+
| avg_speed std(2) | ... | 0.4166121362823365   |
+------------------+-----+----------------------+
| avg_speed std(3) | ... | 0.4189242245779348   |
+------------------+-----+----------------------+
| avg_speed std(4) | ... | 0.11730576277101601  |
+------------------+-----+----------------------+
| avg_speed std(5) | ... | 0.027483313702394974 |
+------------------+-----+----------------------+
| avg_speed std(6) | ... | 0.008681237185359682 |
+------------------+-----+-----

In [11]:
from pgmpy.inference import BeliefPropagation

In [12]:
# Infer the most likely state of each latent node for every row of the data.
# BeliefPropagation is the inference engine used here (VariableElimination,
# imported at the top of the notebook, is the alternative engine).
inference = BeliefPropagation(model)

# Observed columns that are handed to the engine as evidence.
evidence_columns = [
    'length', 'lanes', 'max_speed', 'dir', 'tor', 'n_connnections',
    'weighted_POI', 'time', 'no_taxi_car', 'avg_speed', 'avg_speed std',
]
# Latent nodes to infer; each one gets its own MAP query (most likely state of
# that node alone given the evidence), not a joint MAP over both nodes.
latent_nodes = ['theta', 'Volume']

# Take only the evidence columns so each value keeps its column's own dtype.
# Iterating whole rows of a frame that also holds NaN-filled float columns
# would upcast every evidence value to float.
observed_rows = level1_road_history_workday[evidence_columns].to_dict('records')

# The data are discrete, so many rows share exactly the same evidence.
# Cache the MAP result per distinct evidence combination and run inference
# only once for each of them.
map_cache = {}
inferred = {node: [] for node in latent_nodes}

for record in observed_rows:
    # Missing values cannot be used as evidence; drop them for this row.
    evidence = {name: value for name, value in record.items() if pd.notna(value)}
    cache_key = tuple(evidence.items())

    if cache_key not in map_cache:
        map_cache[cache_key] = {
            node: inference.map_query(variables=[node], evidence=evidence).get(node, np.nan)
            for node in latent_nodes
        }

    for node in latent_nodes:
        inferred[node].append(map_cache[cache_key][node])

# Write each inferred column in one assignment. Stored as float (NaN-capable),
# matching the float columns the row-by-row version produced.
for node in latent_nodes:
    level1_road_history_workday[node] = np.asarray(inferred[node], dtype=float)

In [13]:
# Preview the first rows, which now include the inferred 'theta' and 'Volume' columns.
level1_road_history_workday.head()

Unnamed: 0,length,lanes,max_speed,dir,tor,n_connnections,weighted_POI,time,avg_speed,avg_speed std,no_taxi_car,theta,Volume
0,2,1,2,1,1,2,1,2,1,2,2,1.0,4.0
1,2,1,2,1,1,2,1,3,1,2,2,1.0,0.0
2,2,1,2,1,1,2,1,3,1,2,3,1.0,4.0
3,2,1,2,1,1,2,1,3,1,2,2,1.0,0.0
4,2,1,2,1,1,2,1,3,1,3,2,1.0,0.0


In [None]:
# Export the frame, including the inferred latent columns, to CSV.
output_dir = "D:/Thesis/files_output_dir/output_files/"
inference_csv_path = output_dir + 'level1_road_history_workday_inference.csv'
# index=False: the row index is not written as a column.
level1_road_history_workday.to_csv(inference_csv_path, index=False)