<a href="https://colab.research.google.com/github/WilliamPoe/CSCI-290/blob/main/notebooks/Naive_Bayes_Classifier_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Created Naive Bayes Classifier

In [1]:
import pandas as pd
import math
from scipy.stats import norm
import numpy as np

In [2]:
def Naive_Bayes_Classifier(X, y, instance, numerical, categorical):
# X:df, y:target, instance:unseen instance, numerical:numerical attribute, categorical:categorical attributes

  priors = {}
  likelihoods = {}


  for clss in X[y].unique(): # Loops through the unique values in the target feature
    data = X[X[y] == clss] # Create a dataset for the rows where the clss is True
    priors[clss] = len(data) / len(X) # Calculate the priors for the dataset it created
    likelihoods[clss] = {} # Makes a dictionary where the clss is the key for the likelihoods
    for cat in categorical: # Gets likelihoods for categorical features
      likelihoods[clss][cat] = data[cat].value_counts(normalize=True).to_dict() # Gets the probabilites of the value counts in the cat attribute
    for num in numerical: # Gets likelihoods for numerical features
      likelihoods[clss][num] = norm( loc=data[num].mean(), scale = data[num].std()) # Gets the mean and standard deviation for the num attribute


    posterior = {} # Create a dictionary for calculating the posteriors

    for clss in priors:
      post = priors[clss] # Sets post equal to priors for that clss
      for cat in categorical: # Categorical features
        if cat in instance: # Checks if the cat is in the unseen instance
          if instance[cat] in likelihoods[clss][cat]: # Checks if the feature in instance is in the likelihoods
            post *= likelihoods[clss][cat][instance[cat]] # Multiplies post by the likelihoods for the cat features and then sets the post equal to that
          else:
            post *= 1
        else:
          pass
      for num in numerical: # Numerical features
        post *= likelihoods[clss][num].pdf(instance[num]) # Multiplies post by the likelihoods for the num features and then sets the post equal to that

      posterior[clss] = post # Sets the posterior for that clss equal to the post

  return max(posterior, key=posterior.get) # Returns the class with the highest posterior

# Test on Penguins Data

In [3]:
# Dataset
df = pd.read_csv("https://github.com/WilliamPoe/CSCI-290/raw/refs/heads/main/Data/penguins.csv") # Penguins dataset
df.head()
# Tagret column in the dataset
target = 'species'
# Unseen Instance
instance = {'bill_length_mm':40.9, 'bill_depth_mm':13.7, 'flipper_length_mm':214	, 'body_mass_g':4650, 'sex':'female'}
# Numerical features
numerical = []
# Categorical features
categorical = []

## Remove [2:-1] if you want to test on all columns ##
for attrib in df.columns:
  if attrib != target:
    # Checks for numerical data types
    if df[attrib].dtype in ['float64', 'int64']:
      # Considers the column numerical if there are more than 10 unique values
      if len( df[attrib].unique() ) > 10:
        numerical.append(attrib)
    else:
      categorical.append(attrib)

print(numerical)
print(categorical)
# Calls Naive Bayes Classifier function
Naive_Bayes_Classifier(df, target, instance, numerical, categorical)


['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
['island', 'sex']


'Gentoo'

# Test On Synthetic Data

In [4]:
np.random.seed(42)
X = pd.DataFrame( {"x1": np.random.randn( 200 ), "x2": 2*np.random.randn( 200) - 5, "x3": np.random.randint(3, size=200) } )
X["target"] = (X["x1"] > -2)*(X["x2"] > -7 )*(X["x3"]!=1)*1



target = 'target'
# Unseen Instance
instance = {'x1':-3, 'x2':-8, 'x3':1}
# Numerical features
numerical = []
# Categorical features
categorical = []

## Remove [2:-1] if you want to test on all columns ##
for attrib in X.columns:
  if attrib != target:
    # Checks for numerical data types
    if X[attrib].dtype in ['float64', 'int64']:
      # Considers the column numerical if there are more than 10 unique values
      if len( X[attrib].unique() ) > 10:
        numerical.append(attrib)
      elif len(X[attrib].unique()) < 10:
        categorical.append(attrib)
    elif X[attrib].dtype in ['object']:
      categorical.append(attrib)

print(numerical)
print(categorical)
# Calls Naive Bayes Classifier function
Naive_Bayes_Classifier(X, target, instance, numerical, categorical)


['x1', 'x2']
['x3']


0

# sklearn Naive Bayes Classifier

In [5]:
## Add sklearn naive bayes classifier ##

In [6]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB

In [15]:
# Add sklearn naive Bayes classifier
gnb = GaussianNB().fit(np.array( X[["x1","x2"]] ), np.array( X["target"] ).reshape(-1,1))
cnb = CategoricalNB().fit(np.array( X[["x3"]] ).reshape(-1,1), np.array( X["target"] ).reshape(-1,1))

# Unseen instance being predicted
print(gnb.predict(np.array( pd.DataFrame( {"x1": [-3], "x2": [-8] } ) )))
print(cnb.predict(np.array([1] ).reshape(-1,1)))


[0]
[0]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
