In [2]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from scipy.stats import norm

In [3]:
# Loading the dataset
# Import `load_dataset` from seaborn's top-level namespace (the documented entry point)
# rather than reaching into the `seaborn.utils` submodule.
from seaborn import load_dataset
df = load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
# Data Cleaning
# Written as one chain with errors='ignore' so that re-running this cell is a no-op
# instead of a KeyError on the already-dropped columns.
df = (
    df
    # Drop these columns before dropna() so that rows are not discarded merely because
    # 'deck' is missing. NOTE(review): 'alive' appears to mirror the 'survived' target
    # (yes/no vs 1/0 in the preview) — confirm; keeping it would leak the label.
    .drop(columns=['deck', 'alive'], errors='ignore')
    # Remove every remaining row that still has a missing value.
    .dropna()
    # Casting to int32 discards the fractional part of the float ages.
    .astype({'age': 'int32'})
)

In [5]:
# Data Information
# Print a summary of `df`: number of entries, per-column non-null counts and dtypes,
# and memory usage. Useful here to confirm that no missing values remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     712 non-null    int64   
 1   pclass       712 non-null    int64   
 2   sex          712 non-null    object  
 3   age          712 non-null    int32   
 4   sibsp        712 non-null    int64   
 5   parch        712 non-null    int64   
 6   fare         712 non-null    float64 
 7   embarked     712 non-null    object  
 8   class        712 non-null    category
 9   who          712 non-null    object  
 10  adult_male   712 non-null    bool    
 11  embark_town  712 non-null    object  
 12  alone        712 non-null    bool    
dtypes: bool(2), category(1), float64(1), int32(1), int64(4), object(4)
memory usage: 60.6+ KB


In [6]:
# Hold out part of the cleaned data for validation; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['survived']),  # features: every column except the target
    df[['survived']],               # target, kept as a single-column DataFrame
    random_state=42,
)

In [None]:
class MixedNaiveBayes:
    """Naive Bayes classifier for a mix of numerical and categorical features.

    Numerical features are modelled with one Gaussian per (class, feature).
    Categorical features are modelled with Laplace-smoothed per-class frequency
    tables. Features are treated as conditionally independent given the class,
    so per-feature log-likelihoods are added to the log prior.

    Parameters
    ----------
    alpha : float, default=1.0
        Additive (Laplace) smoothing strength applied to categorical counts.

    Attributes set by ``fit``
    -------------------------
    classes : ndarray of shape (n_classes,)
        Sorted unique labels seen in ``y``.
    n_classes : int
    class_priors : ndarray of shape (n_classes,)
        Relative frequency of each class in ``y``.
    num_means, num_stds : ndarray of shape (n_classes, n_num_features)
        Per-class mean and (population) standard deviation of each numerical feature.
    cat_mappings : list of dict
        One dict per categorical feature, mapping category value -> column index.
    cat_probs : list of ndarray of shape (n_classes, n_categories)
        Smoothed P(category | class) for each categorical feature.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Laplace smoothing strength

    @staticmethod
    def _as_numeric_matrix(X_num):
        """Return ``X_num`` as a 2-D float ndarray (accepts DataFrames and array-likes)."""
        values = np.asarray(X_num, dtype=float)
        if values.ndim != 2:
            raise ValueError(
                f"X_num must be 2-dimensional (n_samples x n_num_features), got ndim={values.ndim}"
            )
        return values

    def fit(self, X_num, X_cat, y):
        """Estimate class priors and per-class feature distributions.

        Parameters
        ----------
        X_num : DataFrame or 2-D array-like, shape (n_samples, n_num_features)
            Numerical features.
        X_cat : DataFrame or 2-D array-like, shape (n_samples, n_cat_features)
            Categorical features. Values within one column must be hashable and
            mutually sortable.
        y : Series, single-column DataFrame, ndarray or list, length n_samples
            Target labels. Rows are matched to ``X_num`` / ``X_cat`` by position.

        Raises
        ------
        ValueError
            If the inputs are empty, have different lengths, or ``X_num`` is not 2-D.
        """
        num_values = self._as_numeric_matrix(X_num)
        cat_frame = pd.DataFrame(X_cat)
        # Flatten so Series, (n, 1) frames, arrays and lists are all handled alike.
        labels = np.asarray(y).ravel()

        n_samples = labels.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty training set")
        if num_values.shape[0] != n_samples or len(cat_frame) != n_samples:
            raise ValueError(
                "X_num, X_cat and y must have the same number of rows "
                f"(got {num_values.shape[0]}, {len(cat_frame)} and {n_samples})"
            )

        # === Step 1: classes and prior probabilities P(y) ===
        self.classes = np.unique(labels)
        self.n_classes = len(self.classes)
        class_masks = [labels == c for c in self.classes]
        self.class_priors = np.array([mask.mean() for mask in class_masks])

        # === Step 2: numerical features (one Gaussian per class and feature) ===
        n_num_features = num_values.shape[1]
        self.num_means = np.zeros((self.n_classes, n_num_features))
        self.num_stds = np.zeros((self.n_classes, n_num_features))
        if n_num_features:
            for i, mask in enumerate(class_masks):
                class_rows = num_values[mask]
                # nan-aware reductions: missing values are skipped rather than
                # turning the whole estimate into NaN.
                self.num_means[i, :] = np.nanmean(class_rows, axis=0)
                # Small floor avoids a zero scale for constant features.
                self.num_stds[i, :] = np.nanstd(class_rows, axis=0) + 1e-9

        # === Step 3: categorical features (smoothed frequency tables) ===
        self.cat_probs = []     # per feature: (n_classes, n_categories) probabilities
        self.cat_mappings = []  # per feature: category value -> column index

        for j in range(cat_frame.shape[1]):
            feature_vals = cat_frame.iloc[:, j].to_numpy()
            # `codes[k]` is the index of feature_vals[k] within the sorted unique values.
            unique_vals, codes = np.unique(feature_vals, return_inverse=True)
            codes = codes.ravel()
            n_cats = len(unique_vals)
            self.cat_mappings.append({val: idx for idx, val in enumerate(unique_vals)})

            # Row i holds how often each category occurs among samples of class i.
            counts = np.vstack(
                [np.bincount(codes[mask], minlength=n_cats) for mask in class_masks]
            ).astype(float)

            # Laplace smoothing: every category gets `alpha` pseudo-counts per class.
            class_sizes = counts.sum(axis=1, keepdims=True)
            self.cat_probs.append(
                (counts + self.alpha) / (class_sizes + self.alpha * n_cats)
            )

    def _joint_log_likelihood(self, X_num, X_cat):
        """Return log P(y) + sum of log P(x_j | y), shape (n_samples, n_classes)."""
        num_values = self._as_numeric_matrix(X_num)
        cat_frame = pd.DataFrame(X_cat)
        n_samples = num_values.shape[0]
        if len(cat_frame) != n_samples:
            raise ValueError(
                "X_num and X_cat must have the same number of rows "
                f"(got {n_samples} and {len(cat_frame)})"
            )

        # Start every sample from the log prior of each class.
        log_probs = np.tile(np.log(self.class_priors), (n_samples, 1))

        # === Numerical: Gaussian log PDF, summed over features ===
        if num_values.shape[1]:
            for c_idx in range(self.n_classes):
                log_gauss = norm.logpdf(
                    num_values,
                    loc=self.num_means[c_idx, :],
                    scale=self.num_stds[c_idx, :],
                )
                log_probs[:, c_idx] += log_gauss.sum(axis=1)

        # === Categorical: add log P(x_cat | y) for every feature ===
        for j in range(cat_frame.shape[1]):
            mapping = self.cat_mappings[j]
            # -1 marks categories that were not present during fit.
            codes = np.array(
                [mapping.get(val, -1) for val in cat_frame.iloc[:, j].to_numpy()],
                dtype=int,
            )
            seen = codes >= 0
            log_table = np.log(self.cat_probs[j])  # (n_classes, n_categories)
            log_probs[seen] += log_table[:, codes[seen]].T
            # Unseen category: uniform fallback. The same constant is added for
            # every class, so it does not change which class wins.
            log_probs[~seen] += np.log(1 / len(mapping))

        return log_probs

    def predict(self, X_num, X_cat):
        """Predict the most likely class label for each sample.

        Parameters
        ----------
        X_num : DataFrame or 2-D array-like, shape (n_samples, n_num_features)
        X_cat : DataFrame or 2-D array-like, shape (n_samples, n_cat_features)
            Columns must be in the same order as those passed to ``fit``.

        Returns
        -------
        ndarray of shape (n_samples,)
            For each sample, the entry of ``self.classes`` with the highest
            joint log probability.
        """
        log_probs = self._joint_log_likelihood(X_num, X_cat)
        return self.classes[np.argmax(log_probs, axis=1)]

In [8]:
# Separating numerical and categorical columns
# NOTE(review): bool columns match neither dtype list, and 'int' presumably also selects the
# int32 `age` column (so age would be modelled as categorical) — confirm both are intended.
NUMERIC_DTYPES = ["float"]
CATEGORICAL_DTYPES = ["object", "int", "category"]

X_train_num = X_train.select_dtypes(include=NUMERIC_DTYPES)
X_train_cat = X_train.select_dtypes(include=CATEGORICAL_DTYPES)

X_test_num = X_test.select_dtypes(include=NUMERIC_DTYPES)
X_test_cat = X_test.select_dtypes(include=CATEGORICAL_DTYPES)

In [9]:
# Train the mixed Naive Bayes classifier on the training split
model = MixedNaiveBayes(alpha=1.0)
model.fit(X_num=X_train_num, X_cat=X_train_cat, y=y_train)

In [10]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Predict on the held-out split, then report each metric on its own line
y_pred = model.predict(X_test_num, X_test_cat)
for heading, scorer in (
    ("Accuracy -", accuracy_score),
    ("Precision -", precision_score),
    ("Recall -", recall_score),
    ("F1 Score -", f1_score),
):
    print(heading, scorer(y_test, y_pred))

Accuracy - 0.7584269662921348
Precision - 0.7368421052631579
Recall - 0.7088607594936709
F1 Score - 0.7225806451612903
