Data

In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run of this cell draws the same samples
np.random.seed(42)

# Levels of the three categorical variables A, B, and C
A_categories = [f'A_{i}' for i in range(1, 5)]
B_categories = [f'B_{i}' for i in range(1, 4)]
C_categories = [f'C_{i}' for i in range(1, 3)]

# How many rows the synthetic data set contains
num_samples = 1000

# Draw each categorical column independently (A first, then B, then C)
A_samples, B_samples, C_samples = (
    np.random.choice(levels, num_samples)
    for levels in (A_categories, B_categories, C_categories)
)

# P(Status=True) for selected (A, B, C) combinations; any combination that is
# not listed here falls back to 0.5 when the status is drawn below
status_probs = {
    ('A_1', 'B_1', 'C_1'): 0.8,
    ('A_1', 'B_2', 'C_1'): 0.7,
    ('A_2', 'B_1', 'C_2'): 0.6,
    ('A_3', 'B_3', 'C_1'): 0.4,
    ('A_4', 'B_2', 'C_2'): 0.3,
}

# One uniform draw per row: Status is True when the draw falls below the
# probability assigned to that row's combination
status_values = [
    np.random.rand() < status_probs.get(combo, 0.5)
    for combo in zip(A_samples, B_samples, C_samples)
]

# Assemble the samples and their status into a single DataFrame
df = pd.DataFrame(dict(A=A_samples, B=B_samples, C=C_samples, Status=status_values))


Frequentist

In [2]:
# Calculate the empirical probability of Status being True for each (A, B, C) combination.
# The result column is named directly in reset_index, so no in-place rename is needed.
freq_prob_df = (
    df.groupby(['A', 'B', 'C'])['Status']
    .mean()
    .reset_index(name='P(Status=True) (Frequentist)')
)

# Attach each row's group-level estimate to the main DataFrame.
# transform('mean') returns the group mean aligned to df's own rows, and plain
# column assignment overwrites the column if it already exists, so re-running
# this cell does not add a duplicate, extra-suffixed column the way a repeated
# merge would.
df['P(Status=True) (Frequentist)'] = df.groupby(['A', 'B', 'C'])['Status'].transform('mean')

Bayesian

In [3]:
from sklearn.naive_bayes import CategoricalNB

# Work on a copy of df in which A, B and C are replaced by their integer
# category codes; df itself keeps the original string labels
df_encoded = df.assign(**{
    col: df[col].astype('category').cat.codes
    for col in ['A', 'B', 'C']
})

# Target first, then the three encoded feature columns
y_bayes = df_encoded['Status']
X_bayes = df_encoded[['A', 'B', 'C']]

# Train a categorical Naïve Bayes classifier on the full data set
bayes_model = CategoricalNB()
bayes_model.fit(X_bayes, y_bayes)

# Store the second predict_proba column as the model's P(Status=True)
# for the same rows the model was fitted on
df['P(Status=True) (Bayesian)'] = bayes_model.predict_proba(X_bayes)[:, 1]

In [5]:
# Implementing Naïve Bayes manually
# Conditional probabilities are raw count ratios (no smoothing), so a category
# never observed together with a given Status value gets probability 0 for it.

# Step 1: Compute the prior probability P(Status=True) and P(Status=False)
prior_true = df['Status'].mean()
prior_false = 1 - prior_true

# The class masks and class sizes are identical for every feature and category,
# so compute them once here instead of re-filtering df inside the loops below.
is_true = df['Status'] == True
is_false = df['Status'] == False
n_true = int(is_true.sum())
n_false = int(is_false.sum())

# Step 2: Compute conditional probabilities P(A | Status), P(B | Status), P(C | Status)
# Layout: cond_probs[feature][category] -> {'True': p, 'False': p}
cond_probs = {}

for feature in ['A', 'B', 'C']:
    cond_probs[feature] = {}

    for category in df[feature].unique():
        in_category = df[feature] == category

        cond_probs[feature][category] = {
            # P(feature=category | Status=True)
            'True': int((in_category & is_true).sum()) / n_true,
            # P(feature=category | Status=False)
            'False': int((in_category & is_false).sum()) / n_false,
        }

# Step 3: Compute posterior probabilities P(Status=True | A, B, C) using Bayes' theorem
# Start from the priors and multiply in one feature at a time for all rows at
# once; each row looks up the conditional probability of its own category.
likelihood_true = prior_true
likelihood_false = prior_false

for feature in ['A', 'B', 'C']:
    feature_probs = cond_probs[feature]
    likelihood_true = likelihood_true * df[feature].map(
        lambda category: feature_probs[category]['True']
    )
    likelihood_false = likelihood_false * df[feature].map(
        lambda category: feature_probs[category]['False']
    )

# Normalize so the True/False posteriors of each row sum to 1
posterior_true = likelihood_true / (likelihood_true + likelihood_false)
bayes_probs = posterior_true.tolist()

# Add manually computed Bayesian probabilities to DataFrame
df['P(Status=True) (Manual Bayesian)'] = bayes_probs

Logistic

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Target variable for the logistic regression
y = df['Status']

# One-hot encode A, B and C. drop='first' omits the first category of each
# feature from the encoded columns; handle_unknown='ignore' is passed as well,
# although the encoder is only ever applied to the data it was fitted on here.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
X_encoded = encoder.fit_transform(df[['A', 'B', 'C']])
X_encoded = X_encoded.toarray()  # sparse matrix -> dense array

# Wrap the dense array in a DataFrame labelled with the encoder's feature names
X_encoded_df = pd.DataFrame(
    data=X_encoded,
    columns=encoder.get_feature_names_out(['A', 'B', 'C']),
)

# Fit a logistic regression with default settings on the encoded features
log_reg = LogisticRegression().fit(X_encoded_df, y)

# Store the second predict_proba column as the model's P(Status=True)
# for the same rows the model was fitted on
df['P(Status=True)'] = log_reg.predict_proba(X_encoded_df)[:, 1]

In [7]:
# Show df with the frequentist, Naïve Bayes (sklearn and manual) and logistic
# estimates of P(Status=True) side by side for comparison
df

Unnamed: 0,A,B,C,Status,P(Status=True) (Frequentist),P(Status=True) (Bayesian),P(Status=True) (Manual Bayesian),P(Status=True)
0,A_3,B_2,C_2,True,0.648649,0.563604,0.564091,0.565382
1,A_4,B_3,C_1,True,0.487179,0.422793,0.422070,0.425254
2,A_1,B_1,C_1,False,0.800000,0.591691,0.592091,0.586935
3,A_3,B_1,C_1,True,0.645161,0.620131,0.620797,0.624220
4,A_3,B_1,C_2,False,0.508475,0.553473,0.553893,0.556508
...,...,...,...,...,...,...,...,...
995,A_1,B_2,C_2,False,0.584906,0.534111,0.534313,0.526686
996,A_1,B_2,C_2,False,0.584906,0.534111,0.534313,0.526686
997,A_4,B_2,C_2,False,0.285714,0.442762,0.442291,0.442740
998,A_4,B_1,C_1,True,0.437500,0.501086,0.500820,0.503603


Neural Nets