In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
import numpy as np


In [2]:
# Read the tab-delimited grade records into a DataFrame
data_path = "files/2020_bn_nb_data.txt"  # Update this path to your file location
df = pd.read_csv(data_path, delimiter="\t")

# Preview the leading rows to check the column layout
df.head()


Unnamed: 0,EC100,EC160,IT101,IT161,MA101,PH100,PH160,HS101,QP
0,BC,CC,BB,BC,CC,BC,AA,BB,y
1,CC,BC,BB,BB,CC,BC,AB,BB,y
2,AB,BB,AB,AB,BB,CC,BC,AB,y
3,BC,CC,BB,BB,BB,BB,BC,BB,y
4,BC,AB,CD,BC,BC,BC,BC,CD,y


In [3]:
# Name of the label column; every other column is used as a predictor.
target = "QP"

# Select the predictors by name rather than by position, so the split stays
# correct even if the label is not the last column of the file.
# (Index.drop raises KeyError if the label column is absent.)
features = df.columns.drop(target)

# Predictor matrix and label vector
X = df[features]
y = df[target]


In [4]:
# Encoder: categories it was not fitted on are emitted as NaN
ordinal_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)

# Imputer: every NaN is replaced by the constant -1
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value=-1,
)

# Accumulates one accuracy score per evaluation run
nb_accuracies = []



In [5]:
# Repeated hold-out evaluation: 20 independent 70% / 30% train/test splits.
N_RUNS = 20
BASE_SEED = 42  # run i is split with seed BASE_SEED + i, so results are reproducible

# Start from an empty list every time this cell executes; otherwise re-running
# the cell would keep appending to the scores of the previous execution.
nb_accuracies = []

for run in range(N_RUNS):
    # A different but deterministic split for each run
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, random_state=BASE_SEED + run
    )

    # Learn the category -> integer mapping from the training split only,
    # then apply the same mapping to the test split.
    X_train_enc = ordinal_encoder.fit_transform(X_train)
    X_test_enc = ordinal_encoder.transform(X_test)

    # Values the encoder could not map come back as NaN; the imputer turns them into -1.
    X_train_enc = imputer.fit_transform(X_train_enc)
    X_test_enc = imputer.transform(X_test_enc)

    # Give unmapped values (-1) a dedicated extra code per feature instead of
    # clipping them to 0, which would score them as the first known category.
    # n_known broadcasts across rows: column j receives code n_known[j].
    n_known = np.array([len(cats) for cats in ordinal_encoder.categories_])
    X_train_enc = np.where(X_train_enc < 0, n_known, X_train_enc).astype(int)
    X_test_enc = np.where(X_test_enc < 0, n_known, X_test_enc).astype(int)

    # min_categories reserves room for the extra code even when it never occurs
    # in the training split, so prediction cannot index out of range.
    model = CategoricalNB(min_categories=n_known + 1)
    model.fit(X_train_enc, y_train)

    # Score this split and record the result
    y_pred = model.predict(X_test_enc)
    accuracy = accuracy_score(y_test, y_pred)
    nb_accuracies.append(accuracy)


In [7]:

def generate_cpts(df, target, features, condition_on="feature"):
    """Build one conditional probability table per feature from observed counts.

    Each table has one row per value of ``target`` and one column per value
    of the feature, and is estimated from the raw co-occurrence counts in
    ``df`` (no smoothing).

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``target`` column and every column in ``features``.
    target : str
        Name of the label column.
    features : iterable of str
        Names of the feature columns to tabulate.
    condition_on : {"feature", "target"}, default "feature"
        Which variable is conditioned on.

        * ``"feature"`` (the default, unchanged behaviour): every column
          sums to 1, i.e. the entries are P(target | feature value).
        * ``"target"``: every row sums to 1, i.e. the entries are
          P(feature value | target) - the per-class likelihood form.

    Returns
    -------
    dict
        Maps each feature name to its table (a pandas.DataFrame).

    Raises
    ------
    ValueError
        If ``condition_on`` is not one of the two supported options.
    """
    if condition_on not in ("feature", "target"):
        raise ValueError(
            f"condition_on must be 'feature' or 'target', got {condition_on!r}"
        )

    cpts = {}

    for feature in features:
        # Rows: target values, columns: feature values; combinations that
        # never occur together are filled with a count of 0.
        joint_counts = df.groupby([target, feature]).size().unstack(fill_value=0)

        if condition_on == "feature":
            # Divide each column by its total -> columns sum to 1
            cpt = joint_counts.div(joint_counts.sum(axis=0), axis=1)
        else:
            # Divide each row by its total -> rows sum to 1
            cpt = joint_counts.div(joint_counts.sum(axis=1), axis=0)

        cpts[feature] = cpt

    return cpts


In [6]:
# Summarise the per-run accuracies: average and (population) standard deviation
_accuracy_array = np.asarray(nb_accuracies)
nb_mean_accuracy = _accuracy_array.mean()
nb_std_accuracy = _accuracy_array.std()

print(f"Naive Bayes Mean Accuracy: {nb_mean_accuracy}")
print(f"Naive Bayes Std Deviation: {nb_std_accuracy}")


Naive Bayes Mean Accuracy: 0.9785714285714289
Naive Bayes Std Deviation: 0.01720227796970329


In [8]:
# Build one probability table per feature column
cpts = generate_cpts(df, target, features)

# Print every table under a heading naming its feature
for feature in cpts:
    cpt = cpts[feature]
    print(f"CPT for {feature}:\n{cpt}\n")


CPT for EC100:
EC100   AA   AB   BB   BC        CC        CD    DD    F
QP                                                      
n      0.0  0.0  0.0  0.0  0.138889  0.448276  0.95  1.0
y      1.0  1.0  1.0  1.0  0.861111  0.551724  0.05  0.0

CPT for EC160:
EC160   AA   AB        BB        BC        CC        CD        DD    F
QP                                                                    
n      0.0  0.0  0.032258  0.016949  0.234043  0.864865  0.863636  1.0
y      1.0  1.0  0.967742  0.983051  0.765957  0.135135  0.136364  0.0

CPT for IT101:
IT101   AA   AB        BB        BC        CC        CD   DD    F
QP                                                               
n      0.0  0.0  0.088235  0.040816  0.238095  0.628571  1.0  1.0
y      1.0  1.0  0.911765  0.959184  0.761905  0.371429  0.0  0.0

CPT for IT161:
IT161   AA    AB        BB        BC        CC        CD        DD    F
QP                                                                     
n      0.0  0.04 