# Assignment 6 


### Q1
(Gaussian Naïve Bayes Classifier) Implement a Gaussian Naïve Bayes classifier on the Iris dataset from `sklearn.datasets` using:
(i) Step-by-step implementation 
(ii) In-built function

In [2]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import GaussianNB

# Load the Iris bunch and pull out the feature matrix and class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 25% of the samples for evaluation. Stratifying on y keeps the
# class proportions equal in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

class GaussianNBFromScratch:
    """Gaussian Naive Bayes classifier implemented directly with NumPy.

    Each feature is modelled, per class, as an independent univariate
    Gaussian. After ``fit`` the following attributes are available, all
    keyed by class label except ``classes_``:

    * ``classes_``      -- sorted array of the distinct labels seen in ``y``
    * ``class_count_``  -- number of training samples per class
    * ``class_prior_``  -- empirical prior P(class)
    * ``theta_``        -- per-feature mean for each class
    * ``sigma_``        -- per-feature variance (plus ``eps``) for each class

    Parameters
    ----------
    eps : float, default 1e-9
        Constant added to every variance so that a feature which is constant
        within a class does not cause a division by zero.
    """

    def __init__(self, eps=1e-9):
        self.eps = eps  # tiny value to avoid division by zero

    def fit(self, X, y):
        """Estimate class priors and per-class feature means/variances.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features; anything ``np.asarray`` can turn into a 2-D
            float array (ndarray, nested lists, ...).
        y : array-like of shape (n_samples,)
            Class label of each training sample.

        Returns
        -------
        self
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or ``y`` is not a 1-D sequence
            with one label per row of ``X``.
        """
        # Coerce up front: boolean masking (`y == c`) and `.shape` only work
        # on ndarrays, so plain Python lists would otherwise fail or be
        # silently mis-indexed.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got ndim={X.ndim}"
            )
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError(
                "y must be 1-D with one label per row of X; "
                f"got X.shape={X.shape}, y.shape={y.shape}"
            )

        self.classes_ = np.unique(y)
        self.class_count_ = {}
        self.class_prior_ = {}
        self.theta_ = {}  # mean per class
        self.sigma_ = {}  # variance per class

        n_total = X.shape[0]
        for c in self.classes_:
            X_c = X[y == c]
            self.class_count_[c] = X_c.shape[0]
            # prior = P(class), estimated as the class frequency
            self.class_prior_[c] = X_c.shape[0] / n_total
            # mean (theta) and variance (sigma) of every feature in class c
            self.theta_[c] = np.mean(X_c, axis=0)
            # np.var defaults to ddof=0, i.e. the biased maximum-likelihood
            # (population) variance; eps is added for numerical stability.
            self.sigma_[c] = np.var(X_c, axis=0) + self.eps

        return self

    def _log_gaussian_prob(self, X, mean, var):
        """Return log p(x | class) for every row of ``X``.

        Evaluates the log-density of an independent Gaussian per feature
        (``mean`` and ``var`` broadcast against the columns of ``X``) and
        sums over features, giving an array of shape (n_samples,).
        """
        # term1: -0.5 * log(2*pi*var)   (normalisation constant)
        term1 = -0.5 * np.log(2.0 * np.pi * var)
        # term2: - (x - mean)^2 / (2*var)   (squared-distance penalty)
        term2 = -((X - mean) ** 2) / (2.0 * var)
        # independence assumption: per-feature log-densities simply add
        return np.sum(term1 + term2, axis=1)  # shape (n_samples,)

    def predict_log_proba(self, X):
        """Return log P(class) + log P(x | class) for each sample and class.

        Note these are *unnormalised* joint log-probabilities (the evidence
        term log P(x) is not subtracted), so rows do not exponentiate to
        values summing to one; use ``predict_proba`` for normalised
        posteriors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Columns follow the order of ``classes_``.
        """
        X = np.asarray(X, dtype=float)
        n_samples = X.shape[0]
        log_probs = np.zeros((n_samples, len(self.classes_)))

        for idx, c in enumerate(self.classes_):
            log_prior = np.log(self.class_prior_[c])
            log_likelihood = self._log_gaussian_prob(
                X, self.theta_[c], self.sigma_[c]
            )
            log_probs[:, idx] = log_prior + log_likelihood

        return log_probs

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        log_probs = self.predict_log_proba(X)
        # The argmax of the joint log-probability equals the argmax of the
        # posterior, because the omitted evidence term is constant per row.
        indices = np.argmax(log_probs, axis=1)
        return self.classes_[indices]

    def predict_proba(self, X):
        """Return normalised posterior probabilities P(class | x).

        Applies a row-wise softmax to the joint log-probabilities; the
        result has shape (n_samples, n_classes) and each row sums to 1.
        """
        log_probs = self.predict_log_proba(X)
        # For numerical stability, subtract the row maximum before exp so
        # the largest exponent is exactly 0 and nothing overflows.
        a = log_probs - log_probs.max(axis=1, keepdims=True)
        exp_a = np.exp(a)
        probs = exp_a / exp_a.sum(axis=1, keepdims=True)
        return probs

# --- (i) Step-by-step implementation: train, predict, score ---------------
# fit() returns the estimator itself, so construction and training chain.
gnb_scratch = GaussianNBFromScratch().fit(X_train, y_train)
y_pred_scratch = gnb_scratch.predict(X_test)
acc_scratch = accuracy_score(y_test, y_pred_scratch)

print("Gaussian NB (From Scratch)")
print("Accuracy:", acc_scratch)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_scratch))
print(
    "Classification Report:\n",
    classification_report(
        y_test, y_pred_scratch, target_names=iris.target_names
    ),
)

# --- (ii) In-built function: same pipeline with sklearn's GaussianNB ------
gnb_sklearn = GaussianNB().fit(X_train, y_train)
y_pred_sklearn = gnb_sklearn.predict(X_test)
acc_sklearn = accuracy_score(y_test, y_pred_sklearn)

print("Gaussian NB (sklearn)")
print("Accuracy:", acc_sklearn)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_sklearn))
print(
    "Classification Report:\n",
    classification_report(
        y_test, y_pred_sklearn, target_names=iris.target_names
    ),
)

# --- Side-by-side comparison of the two implementations -------------------
print("Comparison")
print(f"From-scratch accuracy: {acc_scratch:.4f}")
print(f"sklearn accuracy     : {acc_sklearn:.4f}")

# Posterior probabilities for the first five held-out samples, to check
# that the two models agree beyond the hard label.
print("\nSample predicted probabilities (first 5 test samples):")
print("scratch:\n", gnb_scratch.predict_proba(X_test[:5]))
print("sklearn:\n", gnb_sklearn.predict_proba(X_test[:5]))


Gaussian NB (From Scratch)
Accuracy: 0.9210526315789473
Confusion Matrix:
 [[12  0  0]
 [ 0 12  1]
 [ 0  2 11]]
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.86      0.92      0.89        13
   virginica       0.92      0.85      0.88        13

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38

Gaussian NB (sklearn)
Accuracy: 0.9210526315789473
Confusion Matrix:
 [[12  0  0]
 [ 0 12  1]
 [ 0  2 11]]
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.86      0.92      0.89        13
   virginica       0.92      0.85      0.88        13

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38

### Q2
Explore the GridSearchCV tool in scikit-learn, which is often used for tuning the hyperparameters of machine learning models. Use this tool to find the best value of K for a K-NN classifier on any dataset.

In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load Iris and split off a stratified 20% test set that the grid search
# never sees.
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Base estimator; its hyperparameters are overridden by the grid below.
knn = KNeighborsClassifier()

# Candidate settings: every combination of these values is evaluated.
param_grid = {
    "n_neighbors": list(range(1, 31)),  # k = 1 .. 30
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}

# Exhaustive search scored by mean accuracy over 5 cross-validation folds.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best K:", grid_search.best_params_["n_neighbors"])
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# best_estimator_ is the winning configuration as exposed by the search;
# evaluate it once on the untouched test split.
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))


Best K: 6
Best parameters: {'metric': 'euclidean', 'n_neighbors': 6, 'weights': 'uniform'}
Best cross-validation score: 0.9833333333333334
Test accuracy: 0.9666666666666667
