In [10]:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import ExpectationMaximization as EM
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
import pandas as pd
import pickle
import multiprocessing
from functools import partial

In [11]:
import numpy as np
from joblib import Parallel, delayed
from pgmpy.inference import BeliefPropagation  # Ensure the correct import based on your setup


In [12]:
# Define the Bayesian Network structure
model = BayesianNetwork([
    ('length', 'theta'),
    ('lanes', 'theta'),
    ('max_speed', 'theta'),
    ('dir', 'theta'),
    ('tor', 'theta'),
    ('n_connnections', 'theta'),
    ('weighted_POI', 'theta'),
    ('theta', 'Volume'),
    ('theta', 'avg_speed'),
    ('time', 'Volume'),
    ('time', 'avg_speed'),
    ('no_taxi_car', 'Volume'),
    ('no_taxi_car', 'avg_speed std'),
    ('avg_speed', 'avg_speed std')
])

# Set latent variables
model.latents = {'theta', 'Volume'}


## data for training

In [13]:
output_dir = "D:/Thesis/files_output_dir/output_files/"
road_data= pd.read_csv(output_dir +'PGM_input_discrete_level3_road_history_holiday.csv')
data = road_data.copy()

In [14]:
data.head()

Unnamed: 0,length,lanes,max_speed,dir,tor,n_connnections,weighted_POI,time,avg_speed,avg_speed std,no_taxi_car
0,2,2,4,2,1,2,3,1,3,2,4
1,2,2,4,2,1,2,3,1,2,2,5
2,2,2,4,2,1,2,3,1,4,1,4
3,2,2,4,2,1,2,3,2,3,1,4
4,2,2,4,2,1,2,3,2,2,2,5


In [6]:
# Initialize the Expectation-Maximization Estimator
estimator = EM(model, data)

# Function to run a part of the EM algorithm in parallel
def run_em_partial(estimator, data_chunk, latent_card):
    # Each process works with a subset (chunk) of the data
    partial_estimator = EM(estimator.model, data_chunk)
    partial_params = partial_estimator.get_parameters(latent_card=latent_card)
    return partial_params

# Split data into chunks for parallel processing
num_chunks = multiprocessing.cpu_count()
data_chunks = np.array_split(data, num_chunks)

# Use multiprocessing Pool to parallelize the EM algorithm
with multiprocessing.Pool(processes=num_chunks) as pool:
    # Partially apply the run_em_partial function with fixed arguments
    em_partial = partial(run_em_partial, estimator, latent_card={'theta': 3, 'Volume': 5})
    
    # Map the function across data chunks in parallel
    results = pool.map(em_partial, data_chunks)

# Combine results from all processes (results is a list of CPD lists)
estimated_params = []
for result in results:
    estimated_params.extend(result)

# Add the learned CPDs back to the model
model.add_cpds(*estimated_params)

# Check if the model is valid with the added CPDs
assert model.check_model(), "The model is not valid with the learned CPDs."

# Save the model using pickle (optional)
with open(output_dir + 'bayesian_network_model_level3_holiday.pkl', 'wb') as f:
    pickle.dump(model, f)

  return bound(*args, **kwds)


## Print CPDs

In [16]:
# Retrieve and print the learned CPDs
for cpd in model.get_cpds():
    print(f"Learned CPD for {cpd.variable}:")
    print(cpd)
    print("\n")

Learned CPD for weighted_POI:
+-----------------+-----------+
| weighted_POI(1) | 0.527377  |
+-----------------+-----------+
| weighted_POI(2) | 0.268126  |
+-----------------+-----------+
| weighted_POI(3) | 0.117407  |
+-----------------+-----------+
| weighted_POI(4) | 0.0614906 |
+-----------------+-----------+
| weighted_POI(5) | 0.0255983 |
+-----------------+-----------+


Learned CPD for tor:
+--------+-----------+
| tor(1) | 0.975091  |
+--------+-----------+
| tor(2) | 0.0249095 |
+--------+-----------+


Learned CPD for no_taxi_car:
+----------------+----------+
| no_taxi_car(4) | 0.259895 |
+----------------+----------+
| no_taxi_car(5) | 0.300038 |
+----------------+----------+
| no_taxi_car(6) | 0.203752 |
+----------------+----------+
| no_taxi_car(7) | 0.236315 |
+----------------+----------+


Learned CPD for max_speed:
+--------------+-------------+
| max_speed(4) | 0.846426    |
+--------------+-------------+
| max_speed(5) | 0.000156763 |
+--------------+----------

## Load Model

In [15]:
# Load the model
with open(output_dir + 'bayesian_network_model_level3_holiday.pkl', 'rb') as file:
    model = pickle.load(file)

## Data Inferencing

In [None]:

inference = VariableElimination(model)

# Initialize new columns for inferred latent variables in the dataset
road_data['theta'] = np.nan
road_data['Volume'] = np.nan

def process_chunk(chunk):
    """Function to process a chunk of rows and perform inference."""
    results = []
    for index, row in chunk.iterrows():
        # Extract observed evidence from the current row as a dictionary
        evidence = {
            'length': row['length'],
            'lanes': row['lanes'],
            'max_speed': row['max_speed'],
            'dir': row['dir'],
            'tor': row['tor'],
            'n_connnections': row['n_connnections'],
            'weighted_POI': row['weighted_POI'],
            'time': row['time'],
            'no_taxi_car': row['no_taxi_car'],
            'avg_speed': row['avg_speed'],
            'avg_speed std': row['avg_speed std']
        }
        
        # Remove NaN values from the evidence dictionary
        evidence = {k: v for k, v in evidence.items() if not pd.isna(v)}

        # Perform MAP query to find the most likely value of latent nodes
        try:
            result_theta = inference.map_query(variables=['theta'], evidence=evidence)
            result_volume = inference.map_query(variables=['Volume'], evidence=evidence)
        except Exception as e:
            # If inference fails, return NaNs
            result_theta = {'theta': np.nan}
            result_volume = {'Volume': np.nan}

        # Collect the results
        results.append((index, result_theta.get('theta', np.nan), result_volume.get('Volume', np.nan)))
    
    return results


# Split data into chunks for parallel processing
num_chunks = multiprocessing.cpu_count()
data_chunks = np.array_split(road_data, num_chunks)

# Use multiprocessing Pool to parallelize the inference process
with multiprocessing.Pool(processes=num_chunks) as pool:
    # Map the process_chunk function across the data chunks in parallel
    chunk_results = pool.map(process_chunk, data_chunks)

# Flatten the list of results
results = [item for sublist in chunk_results for item in sublist]

# Update the DataFrame with results
for index, theta, volume in results:
    road_data.at[index, 'theta'] = theta
    road_data.at[index, 'Volume'] = volume


In [None]:
# Print the first few rows of the updated DataFrame
road_data.head()

In [None]:
file_path = "D:/Thesis/files_output_dir/output_files/level3_road_history_holiday_inference.csv"
road_data.to_csv(file_path, index=False)

# Load Data

In [8]:
file_path = "D:/Thesis/files_output_dir/output_files/level2_road_history_workday_inference.csv"
road_data = pd.read_csv(file_path)

In [9]:
road_data['Volume'].value_counts(dropna=False)

Volume
2.0    1104722
3.0     710955
1.0     561599
4.0     524573
0.0     207230
Name: count, dtype: int64