In [1]:
import pandas as pd
import numpy as np
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb
import tempfile
from sklearn.preprocessing import LabelEncoder

In [2]:
# Step 1: Load the historical data
data = pd.read_csv("data/INVEST_clean.csv", parse_dates=["Date"])

# Step 2: Load the historical exchange rates dataset
historical_exchange_rates = pd.read_csv("data/HistoricalRandPerUSDRateDetail.csv", parse_dates=["Date"])

# Step 3: Load the JSE All Share Index dataset
jse_all_share_index = pd.read_csv("data/FTSE-JSEAllShareIndex.csv", parse_dates=["Date"])

# Step 4: Filter the data for General Industrials and Consumer Services sectors (2015-2018)
general_industrials_companies = [
    "SUPER GROUP", "PPC", "NAMPAK", "MURRAY & ROBERTS", "KAP INDUSTRIAL",
    "INVICTA", "IMPERIAL", "BIDVEST GROUP", "BARLOWORLD", "AFRIMAT"
]
consumer_services_companies = [
    "WOOLWORTHS HDG", "TSOGO SUN", "TRUWORTHS INTL"
]

filtered_data = data[
    (data['Date'].dt.year >= 2015) & 
    (data['Date'].dt.year <= 2018) & 
    (data['Name'].isin(general_industrials_companies + consumer_services_companies))
].copy()

# Step 5: Merge filtered_data with historical_exchange_rates on Date
filtered_data = pd.merge(filtered_data, historical_exchange_rates, on="Date", how="left")
filtered_data.rename(columns={'Value': 'ExchangeRate'}, inplace=True)

# Handle missing exchange rate data
filtered_data['ExchangeRate'].fillna(filtered_data['ExchangeRate'].mean(), inplace=True)

# Step 6: Discretize Exchange Rate into categories (Low/Medium/High)
filtered_data['ExchangeRate'] = pd.qcut(filtered_data['ExchangeRate'], q=3, labels=['Low', 'Medium', 'High'])

# Step 7: Merge with JSE All Share Index data
filtered_data = pd.merge(filtered_data, jse_all_share_index[['Date', 'Close']], on="Date", how="left")
filtered_data.rename(columns={'Close': 'JSE_All_Share_Index'}, inplace=True)

# Handle missing JSE index data
filtered_data['JSE_All_Share_Index'].fillna(filtered_data['JSE_All_Share_Index'].mean(), inplace=True)

In [3]:
# Step 8: Define temporal variables and encode categorical variables
filtered_data['MarketCondition'] = np.where(filtered_data['Price'] > filtered_data['Price'].mean(), 'Good', 'Bad')
filtered_data['SectorPerformance'] = np.where(filtered_data['PE'] > filtered_data['PE'].median(), 'Positive', 'Negative')

def get_share_performance(price):
    if price > filtered_data['Price'].quantile(0.75):
        return 'High'
    elif price > filtered_data['Price'].quantile(0.25):
        return 'Medium'
    else:
        return 'Low'

filtered_data['SharePerformance'] = filtered_data['Price'].apply(get_share_performance)

# Encode categorical variables
le_market = LabelEncoder()
le_sector = LabelEncoder()
le_share = LabelEncoder()
le_exchange_rate = LabelEncoder()

filtered_data['MarketCondition'] = le_market.fit_transform(filtered_data['MarketCondition'])
filtered_data['SectorPerformance'] = le_sector.fit_transform(filtered_data['SectorPerformance'])
filtered_data['SharePerformance'] = le_share.fit_transform(filtered_data['SharePerformance'])
filtered_data['ExchangeRate'] = le_exchange_rate.fit_transform(filtered_data['ExchangeRate'])

# Automatically create time slice columns
variables = ['MarketCondition', 'SectorPerformance', 'SharePerformance', 'ExchangeRate', 'JSE_All_Share_Index']

# Create slice_0 columns (current time slice)
for var in variables:
    filtered_data[f'{var}_0'] = filtered_data[var]

# Create slice_1 columns (next time slice) by shifting
for var in variables:
    filtered_data[f'{var}_1'] = filtered_data[var].shift(-1)

# Drop rows with NaN values in slice_1 columns
filtered_data.dropna(inplace=True)

# Create the Dynamic Bayesian Network (DBN)
def initialize_dbn(include_exchange_rate=True, include_jse_index=True):
    global dbn
    dbn = gum.BayesNet("Dynamic_BDN")

    # Define active variables based on inclusion of exchange rate and JSE index
    active_vars = variables.copy()
    
    if not include_exchange_rate:
        active_vars.remove('ExchangeRate')
    
    if not include_jse_index:
        active_vars.remove('JSE_All_Share_Index')

    # Define variables for time slice 0 and 1 dynamically based on inclusion of exchange rate and JSE index
    for slice_num in [0, 1]:
        for var in active_vars:
            num_states = len(filtered_data[f"{var}_{slice_num}"].unique())  # Adjust number of states dynamically
            
            dbn.add(gum.LabelizedVariable(f"{var}_{slice_num}", f"{var} at time slice {slice_num}", num_states))

    # Define intra-slice arcs for slice_0
    dbn.addArc("MarketCondition_0", "SectorPerformance_0")
    dbn.addArc("SectorPerformance_0", "SharePerformance_0")
    
    if include_exchange_rate:
        dbn.addArc("ExchangeRate_0", "MarketCondition_0")  # External factor influence

    if include_jse_index:
        dbn.addArc("JSE_All_Share_Index_0", "MarketCondition_0")  # External factor influence from JSE index

    # Temporal arcs connecting slice 0 to slice 1
    for var in active_vars:
        dbn.addArc(f"{var}_0", f"{var}_1")

initialize_dbn(include_exchange_rate=True, include_jse_index=True)  # Initialize with both external factors included

# Display the DBN structure
gnb.showBN(dbn)

InvalidArgument: [pyAgrum] Invalid argument: Empty variable MarketCondition_0:Labelized({}) cannot be added in a Potential