In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [2]:
class NaiveBayes:
    
    def __init__(self):
        self.prior_probabilities = {}
        self.conditional_probabilities = {}
        self.classes = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        # calculate mean, variance, and prior for each class
        self.mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self.variance = np.zeros((n_classes, n_features), dtype=np.float64)
        self.priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self.classes):
            X_c = []
            for i in range(X.shape[0]):
                if y[i] == c:
                    X_c.append(X[i])
            X_c = np.array(X_c)

            self.mean[idx, :] = X_c.mean(axis=0)
            self.variance[idx, :] = X_c.var(axis=0)

        # Calculates the prior probabilities for each class.
        for label in self.classes:
            self.prior_probabilities[label] = np.count_nonzero(y == label) / len(y)
        # Calculates the conditional probabilities for each class.
        self.calculate_conditional_probabilities(X, y)


    def calculate_conditional_probabilities(self, X, y):
        n_features = X.shape[1]
        for label in self.classes:
            class_samples = X[y == label]
            class_probabilities = {}
            for i in range(n_features):
                feature_values = class_samples[:, i]
                feature_count = Counter(feature_values) # count the occurrences of each feature value in the list
                feature_probabilities = {}
                for value in feature_count:
                    feature_probabilities[value] = feature_count[value] / len(feature_values)
                class_probabilities[i] = feature_probabilities
            self.conditional_probabilities[label] = class_probabilities


    def predict(self, X):
        y_pred = []
        for x in X:
            posteriors = []
            for idx, c in enumerate(self.classes):
                prior = np.log(self.prior_probabilities[c])
                numerator = np.exp(-((x - self.mean[idx]) ** 2) / (2 * self.variance[idx]))
                denominator = np.sqrt(2 * np.pi * self.variance[idx])
                posterior = np.sum(numerator / denominator)
                posterior = prior + posterior
                posteriors.append(posterior)
            y_pred.append(self.classes[np.argmax(posteriors)])
        return y_pred

In [3]:
class KNN:
    def __init__(self, K):
        self.K = K

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        y_pred = np.zeros(X_test.shape[0])

        for i, x_test in enumerate(X_test):
            dists = np.sqrt(np.sum((self.X_train - x_test)**2, axis=1))
            idx = np.argsort(dists)[:self.K]
            k_labels = self.y_train[idx]
            counts = np.bincount(k_labels)
            y_pred[i] = np.argmax(counts)
        return y_pred

In [5]:
titanic_df = pd.read_csv(r'train.csv')

titanic_df = titanic_df.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)

titanic_df = pd.get_dummies(titanic_df, columns=['Sex', 'Embarked'])

titanic_df = titanic_df.dropna()

X = titanic_df.drop(['Survived'], axis=1).values
y = titanic_df['Survived'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


nb = NaiveBayes()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print('Accuracy for Naive bayes:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Scale the features to have zero mean and unit variance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
K = 7
knn = KNN(K)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy for {K} KNN:{accuracy}')
print(classification_report(y_test, y_pred))

Accuracy for Naive bayes: 0.7342657342657343
              precision    recall  f1-score   support

           0       0.78      0.79      0.78        87
           1       0.67      0.64      0.65        56

    accuracy                           0.73       143
   macro avg       0.72      0.72      0.72       143
weighted avg       0.73      0.73      0.73       143

Accuracy for 7 KNN:0.7902097902097902
              precision    recall  f1-score   support

           0       0.82      0.84      0.83        87
           1       0.74      0.71      0.73        56

    accuracy                           0.79       143
   macro avg       0.78      0.78      0.78       143
weighted avg       0.79      0.79      0.79       143

