<a href="https://colab.research.google.com/github/PrishaAggarwal/Machine_Learning_Assignments/blob/main/ML_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Assignment_6
#Q1
#From scratch

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ---- Data --------------------------------------------------------------
iris = load_iris()
X, y = iris.data, iris.target   # X: (150, 4) feature matrix, y: labels 0/1/2

# Hold out 25% of the samples; stratify=y keeps the class ratio in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# ---- Fit: per-class prior, feature means and feature variances ----------
classes = np.unique(y_train)
n_classes = len(classes)
n_features = X.shape[1]

# One entry per class label in each dictionary
priors, means, vars_ = {}, {}, {}

n_train = X_train.shape[0]
for cls in classes:
    X_c = X_train[y_train == cls]           # training rows belonging to this class
    priors[cls] = len(X_c) / n_train        # P(y = cls)
    means[cls] = np.mean(X_c, axis=0)       # mean of every feature
    # np.var defaults to ddof=0, i.e. the biased (maximum-likelihood) estimate.
    # The tiny constant keeps the variance strictly positive so the later
    # division and logarithm never see zero.
    vars_[cls] = np.var(X_c, axis=0) + 1e-9

print("Class priors:", priors)
print("Per-class means (example):\n", {label: means[label] for label in classes})

# Helper: gaussian log-likelihood for vector x given class params
def gaussian_log_likelihood(x, mean, var):
    """
    Log-likelihood of a single sample under independent per-feature Gaussians.

    Computes
        log p(x | mean, var)
            = sum_j [ -0.5 * log(2 * pi * var_j) - (x_j - mean_j)^2 / (2 * var_j) ]

    Parameters
    ----------
    x : array-like
        Feature values of one sample.
    mean : array-like
        Per-feature means; flattened to 1-D before use.
    var : array-like
        Per-feature variances; flattened to 1-D before use. Values must be
        positive, otherwise the logarithm / division is undefined.

    Returns
    -------
    numpy scalar
        Sum of the per-feature log densities.
    """
    # Coerce first so plain lists / scalars work as well as ndarrays
    # (calling .reshape directly on a list would raise AttributeError).
    x = np.asarray(x)
    mean = np.asarray(mean).reshape(-1)
    var = np.asarray(var).reshape(-1)

    log_normaliser = -0.5 * np.log(2.0 * np.pi * var)
    squared_distance = -((x - mean) ** 2) / (2.0 * var)
    return np.sum(log_normaliser + squared_distance)

# Prediction function (returns array of predicted labels)
def predict_gnb(X):
    """
    Predict a class label for every row of X with the fitted Gaussian NB model.

    Reads the module-level `classes`, `priors`, `means` and `vars_` and scores
    each sample with log P(y=c) + log p(x | c) for every class c.

    Parameters
    ----------
    X : iterable of samples
        Each item is passed as `x` to `gaussian_log_likelihood`.

    Returns
    -------
    numpy.ndarray
        Predicted labels, one per sample, taken from `classes`.
    """
    y_pred = []
    for x in X:
        class_log_posteriors = [
            np.log(priors[cls]) + gaussian_log_likelihood(x, means[cls], vars_[cls])
            for cls in classes
        ]
        # argmax gives a position within `classes`; map it back to the actual
        # label so the result is right even when labels are not 0..K-1.
        best_index = np.argmax(class_log_posteriors)
        y_pred.append(classes[best_index])
    return np.array(y_pred)

# Score the from-scratch classifier on the held-out test split
y_pred_scratch = predict_gnb(X_test)

scratch_accuracy = accuracy_score(y_test, y_pred_scratch)
scratch_confusion = confusion_matrix(y_test, y_pred_scratch)
scratch_report = classification_report(
    y_test,
    y_pred_scratch,
    target_names=iris.target_names,
)

print("Scratch GNB Accuracy:", scratch_accuracy)
print("\nConfusion Matrix:\n", scratch_confusion)
print("\nClassification Report:\n", scratch_report)



Class priors: {np.int64(0): 0.3392857142857143, np.int64(1): 0.33035714285714285, np.int64(2): 0.33035714285714285}
Per-class means (example):
 {np.int64(0): array([4.99473684, 3.45      , 1.48157895, 0.24736842]), np.int64(1): array([5.9972973 , 2.74324324, 4.26486486, 1.31081081]), np.int64(2): array([6.66486486, 2.99459459, 5.60810811, 2.04864865])}
Scratch GNB Accuracy: 0.9210526315789473

Confusion Matrix:
 [[12  0  0]
 [ 0 12  1]
 [ 0  2 11]]

Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.86      0.92      0.89        13
   virginica       0.92      0.85      0.88        13

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38



In [None]:
#Q1
#Inbuilt

from sklearn.naive_bayes import GaussianNB

# Fit scikit-learn's Gaussian Naive Bayes on the same training split
gnb = GaussianNB().fit(X_train, y_train)

# Predict on the held-out split and compute the same metrics as before
y_pred_sklearn = gnb.predict(X_test)

sklearn_accuracy = accuracy_score(y_test, y_pred_sklearn)
sklearn_confusion = confusion_matrix(y_test, y_pred_sklearn)
sklearn_report = classification_report(
    y_test,
    y_pred_sklearn,
    target_names=iris.target_names,
)

print("sklearn GaussianNB Accuracy:", sklearn_accuracy)
print("\nConfusion Matrix:\n", sklearn_confusion)
print("\nClassification Report:\n", sklearn_report)

# Sanity check: does the hand-written model agree with sklearn on every test sample?
predictions_match = np.array_equal(y_pred_scratch, y_pred_sklearn)
print("\nAre predictions identical between scratch and sklearn?", predictions_match)

sklearn GaussianNB Accuracy: 0.9210526315789473

Confusion Matrix:
 [[12  0  0]
 [ 0 12  1]
 [ 0  2 11]]

Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.86      0.92      0.89        13
   virginica       0.92      0.85      0.88        13

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38


Are predictions identical between scratch and sklearn? True


In [None]:
#Q2

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Reload iris for the KNN experiment.
# NOTE: this rebinds X, y and the train/test variables created in the first
# cell, so re-running the earlier cells afterwards will use this 80/20 split.
data = load_iris()
X, y = data.data, data.target

# 80/20 split with a fixed seed (no stratification requested here)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model whose hyperparameters are tuned
knn = KNeighborsClassifier()

# Candidate settings: K = 1..30 combined with both neighbour-weighting schemes
param_grid = {
    'n_neighbors': [*range(1, 31)],
    'weights': ['uniform', 'distance'],
}

# Exhaustive search over the grid, scored by accuracy with 5-fold CV
grid = GridSearchCV(
    knn,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)
grid.fit(X_train, y_train)

# Winning configuration and its cross-validated score
print("\nBest Parameters: ", grid.best_params_)
print("Best CV Score: ", grid.best_score_)

# Final check of the selected model on the untouched test split
best_knn = grid.best_estimator_
y_pred = best_knn.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("\nTest Accuracy:", test_accuracy)


Fitting 5 folds for each of 60 candidates, totalling 300 fits

Best Parameters:  {'n_neighbors': 3, 'weights': 'uniform'}
Best CV Score:  0.9583333333333334

Test Accuracy: 1.0
