In [None]:
import pandas as pd
import numpy as np
import random
from math import sqrt, exp, pi

In [None]:
# Mount Google Drive so files stored there are reachable from the Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

# header=None: the first line of the file is data, not column names, so pandas
# assigns positional integer labels; readable names are attached in a later cell.
data = pd.read_csv(
    '/content/drive/My Drive/iris.data',
    sep=',',
    header=None,
)

Mounted at /content/drive


In [None]:
# Quick look at the raw frame; pandas abbreviates the middle rows when printing.
print(data)

       0    1    2    3               4
0    5.1  3.5  1.4  0.2     Iris-setosa
1    4.9  3.0  1.4  0.2     Iris-setosa
2    4.7  3.2  1.3  0.2     Iris-setosa
3    4.6  3.1  1.5  0.2     Iris-setosa
4    5.0  3.6  1.4  0.2     Iris-setosa
..   ...  ...  ...  ...             ...
145  6.7  3.0  5.2  2.3  Iris-virginica
146  6.3  2.5  5.0  1.9  Iris-virginica
147  6.5  3.0  5.2  2.0  Iris-virginica
148  6.2  3.4  5.4  2.3  Iris-virginica
149  5.9  3.0  5.1  1.8  Iris-virginica

[150 rows x 5 columns]


In [None]:
# Attach readable names to the five positional columns of the raw frame:
# four measurements followed by the class label.
column_names = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
    'class',
]
data.columns = column_names

In [None]:
# Summary statistics for the four numeric measurement columns.
data.describe()

Unnamed: 0,sepal length,sepal width,petal length,petal width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [None]:
# Split the frame positionally: every column but the last is a feature,
# the last column is the class label.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

In [None]:
# Convert categorical labels into numerical format.
class_map = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
# Encode from the raw label column of `data` rather than from `y` itself, so
# this cell can be re-run safely: re-encoding an already-encoded `y` would
# raise KeyError because the integer codes are not keys of class_map.
# An unexpected label in the data still raises KeyError.
y = np.array([class_map[label] for label in data.iloc[:, -1]])

In [None]:
# Step 3: Stratified Sampling
def stratified_sampling(X, y, train_size, random_state=None):
    """Split sample indices into train and test sets, class by class.

    For every distinct label in ``y`` the indices of that class are shuffled
    and the first ``int(train_size * n_class)`` of them go to the training
    set; the remainder go to the test set. Class proportions are therefore
    (up to truncation) the same in both sets.

    Parameters
    ----------
    X : array-like
        Feature matrix. It is not read when computing the split and is only
        part of the signature.
    y : array-like of shape (n_samples,)
        Class label of each sample.
    train_size : float
        Fraction of each class assigned to the training set; must lie in
        the closed interval [0, 1].
    random_state : None, int or numpy.random.Generator, optional
        ``None`` (default) shuffles with NumPy's global RNG, so a preceding
        ``np.random.seed(...)`` still controls the result. Any other value
        is passed to ``np.random.default_rng`` to obtain a private generator,
        giving a reproducible split without touching global state.

    Returns
    -------
    train_indices, test_indices : list of int
        Positions into ``y`` (and the rows of ``X``), grouped by class in
        the order returned by ``np.unique``.

    Raises
    ------
    ValueError
        If ``train_size`` is outside [0, 1].
    """
    if not 0.0 <= train_size <= 1.0:
        raise ValueError(
            f"train_size must be between 0 and 1, got {train_size!r}"
        )

    # A plain list would make `y == c` a single bool and silently yield
    # empty splits, so compare against an ndarray.
    y = np.asarray(y)

    if random_state is None:
        shuffle = np.random.shuffle
    else:
        shuffle = np.random.default_rng(random_state).shuffle

    train_indices = []
    test_indices = []
    for c in np.unique(y):
        indices = np.flatnonzero(y == c)
        shuffle(indices)  # in-place permutation of this class's indices
        train_count = int(train_size * len(indices))
        train_indices.extend(indices[:train_count].tolist())
        test_indices.extend(indices[train_count:].tolist())
    return train_indices, test_indices

In [None]:
# Seed NumPy's global RNG before splitting: stratified_sampling shuffles with
# it, so without a fixed seed the split (and every metric computed from it)
# changes on each run.
SEED = 42
np.random.seed(SEED)

train_indices, test_indices = stratified_sampling(X, y, train_size=0.7)
X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

# Every sample must land in exactly one of the two sets.
assert len(train_indices) + len(test_indices) == len(y)
assert not set(train_indices) & set(test_indices)

In [None]:
class NaiveBayesClassifier:
    """Gaussian Naive Bayes classifier for continuous features.

    Within each class every feature is modelled as an independent normal
    distribution whose mean and standard deviation are estimated from the
    training data. A sample is assigned to the class with the largest
    log-posterior: log prior plus the sum of per-feature log densities.
    """

    def __init__(self):
        """Create an unfitted classifier; call ``fit`` before predicting."""
        pass

    def fit(self, X, y):
        """Estimate per-class feature means and standard deviations.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Class label of each training sample.

        Sets ``classes`` (sorted unique labels), ``parameters`` (a dict
        mapping each class to its ``'mean'`` and ``'std'`` vectors) and keeps
        the training data in ``X_train`` / ``y_train``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.classes = np.unique(y)
        self.parameters = {}
        self.X_train = X
        self.y_train = y
        for c in self.classes:
            X_c = X[y == c]
            self.parameters[c] = {
                'mean': X_c.mean(axis=0),
                # The small constant keeps std strictly positive, so a feature
                # that is constant within a class never causes division by zero.
                'std': X_c.std(axis=0) + 1e-8
            }

    def calculate_prior(self, c):
        """Return the fraction of training samples whose label is ``c``."""
        return np.mean(self.y_train == c)

    def calculate_likelihood(self, x, c):
        """Return the per-feature Gaussian density of ``x`` under class ``c``.

        Parameters
        ----------
        x : array-like of shape (n_features,)
            A single sample.
        c : label
            One of the classes seen in ``fit``.

        Returns
        -------
        numpy.ndarray of shape (n_features,)
            Density of each feature value. Values far from the class mean
            can underflow to 0.0; ``predict_instance`` therefore works with
            log-densities instead of taking the log of this result.
        """
        mean = self.parameters[c]['mean']
        std = self.parameters[c]['std']
        x = np.asarray(x, dtype=float)
        # np.exp / np.sqrt operate element-wise; math.exp only accepts a
        # scalar and raises TypeError on a feature vector.
        numerator = np.exp(-((x - mean) ** 2) / (2 * (std ** 2)))
        denominator = np.sqrt(2 * np.pi) * std
        return numerator / denominator

    def _log_likelihood(self, x, c):
        """Return the per-feature Gaussian log-density of ``x`` under class ``c``."""
        mean = self.parameters[c]['mean']
        std = self.parameters[c]['std']
        # Expanding log(N(x | mean, std)) analytically avoids exp() underflow.
        return (
            -0.5 * np.log(2 * np.pi)
            - np.log(std)
            - ((x - mean) ** 2) / (2 * (std ** 2))
        )

    def predict_instance(self, x):
        """Return the class with the highest log-posterior for one sample."""
        x = np.asarray(x, dtype=float)
        posteriors = []
        for c in self.classes:
            prior = np.log(self.calculate_prior(c))
            likelihood = np.sum(self._log_likelihood(x, c))
            posteriors.append(prior + likelihood)
        return self.classes[np.argmax(posteriors)]

    def predict(self, X):
        """Return a list with the predicted class of every row in ``X``."""
        y_pred = [self.predict_instance(x) for x in X]
        return y_pred

In [None]:
# Estimate per-class feature means and standard deviations from the training split.
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train, y_train)

In [None]:
# Step 6: Model Evaluation
# Predict a class label for every sample of the held-out test split.
y_pred = nb_classifier.predict(X_test)

In [None]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels.
    y_pred : array-like of shape (n_samples,)
        Predicted labels, in the same order as ``y_true``.

    Returns
    -------
    float
        Number of matching positions divided by ``n_samples``.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    # Coerce both sides so `==` is element-wise; comparing two plain lists
    # would produce a single bool instead of one result per sample.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty inputs")
    correct = np.count_nonzero(y_true == y_pred)
    return correct / y_true.size

In [None]:
# Fraction of test samples whose predicted label equals the true label.
print("Accuracy:", accuracy(y_test, y_pred))

Accuracy: 0.9111111111111111
