In [21]:
import numpy as np
import pandas as pd

In [22]:
df = pd.read_csv('/content/Breast_cancer_data.csv')

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   mean_radius      569 non-null    float64
 1   mean_texture     569 non-null    float64
 2   mean_perimeter   569 non-null    float64
 3   mean_area        569 non-null    float64
 4   mean_smoothness  569 non-null    float64
 5   diagnosis        569 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 26.8 KB


In [50]:
df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0


In [24]:
X = df.drop('diagnosis', axis=1).values
y = df['diagnosis'].values

In [25]:

#alpha= []
class NaiveBayes:
    
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.mean = np.zeros((len(self.classes), X.shape[1]))
        self.variance = np.zeros((len(self.classes), X.shape[1]))
        self.prior = np.zeros(len(self.classes))
        for i, c in enumerate(self.classes):
            X_c = X[y==c]
            self.mean[i, :] = X_c.mean(axis=0)
            self.variance[i, :] = X_c.var(axis=0)
            self.prior[i] = X_c.shape[0] / X.shape[0]
            
    def predict(self, X):
        posterior = np.zeros((X.shape[0], len(self.classes)))
        for i, c in enumerate(self.classes):
            likelihood = np.exp(-(X - self.mean[i, :]) ** 2 / (2 * self.variance[i, :])) / np.sqrt(2 * np.pi * self.variance[i, :])
            posterior[:, i] = np.log(likelihood).sum(axis=1) + np.log(self.prior[i])
        return self.classes[np.argmax(posterior, axis=1)]

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [27]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [28]:
nb=NaiveBayes()
nb.fit(X_train,y_train)

In [29]:
# Checking Accuracy
def accuracy(y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy

In [30]:
nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

In [31]:
print("Naive Bayes classification test data accuracy", accuracy(y_test, predictions)*100)

Naive Bayes classification test data accuracy 90.9090909090909


In [32]:
# Predicting the labels for the test dataset
y_pred = nb.predict(X_test)

# Calculating true positives, false positives, false negatives, and true negatives
TP = sum((y_test == 1) & (y_pred == 1))
FP = sum((y_test == 0) & (y_pred == 1))
FN = sum((y_test == 1) & (y_pred == 0))
TN = sum((y_test == 0) & (y_pred == 0))

# Calculating precision, recall, and F1 score
precision = TP / (TP + FP)*100
recall = TP / (TP + FN)*100
f1_score = 2 * (precision * recall) / (precision + recall)

# Printing the precision, recall, and F1 score values
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1_score)

Precision: 88.77551020408163
Recall: 97.75280898876404
F1 Score: 93.04812834224597


# cross validation

In [33]:
def cross_validation(X, y, k, alpha):
    accuracies = []
    folds = np.array_split(np.arange(X.shape[0]), k)
    for i in range(k):
        X_train = X[np.concatenate(folds[:i] + folds[i+1:]), :]
        y_train = y[np.concatenate(folds[:i] + folds[i+1:])]
        X_test = X[folds[i], :]
        y_test = y[folds[i]]
        model = NaiveBayes()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = np.mean(y_pred == y_test)
        accuracies.append(accuracy)
    return np.mean(accuracies)

In [34]:
def grid_search(X, y, k, alpha_values):
    results = []
    for alpha in alpha_values:
        model = NaiveBayes()
        accuracy = cross_validation(X, y, k,alpha)
        results.append((alpha, accuracy))
    return sorted(results, key=lambda x: x[1], reverse=True)[0]

In [35]:
k = 5
alpha_values = [0.01, 0.1, 1, 10, 100]
best_alpha, best_accuracy = grid_search(X, y, 5, alpha_values)
print('Best alpha:', best_alpha)
print('Best accuracy:', best_accuracy*100)

Best alpha: 0.01
Best accuracy: 89.63359726750507


# **grid search**

In [36]:
import itertools
import numpy as np
from collections import Counter


class NaiveBayes:
    def __init__(self):
        self.class_priors = None
        self.feature_likelihoods = None

    def train(self, X, y):
        # Compute class priors
        self.class_priors = Counter(y)
        total_samples = len(y)
        for c in self.class_priors:
            self.class_priors[c] /= total_samples

        # Compute feature likelihoods
        n_samples, n_features = X.shape
        self.feature_likelihoods = np.zeros((n_features, 2))
        for feature_idx in range(n_features):
            for feature_value in [0, 1]:
                for c in self.class_priors:
                    X_c = X[y == c]
                    self.feature_likelihoods[feature_idx, feature_value] += (
                        (X_c[:, feature_idx] == feature_value).sum()
                        / float(self.class_priors[c] * total_samples)
                    )

    def predict(self, X):
        y_pred = []
        for x in X:
            probs = []
            for c in self.class_priors:
                likelihoods = self.feature_likelihoods[np.arange(len(x)), x]
                log_prob = np.log(self.class_priors[c]) + np.log(likelihoods).sum()
                probs.append(log_prob)
            y_pred.append(np.argmax(probs))
        return np.array(y_pred)

    def accuracy(self, X, y):
        y_pred = self.predict(X)
        return (y_pred == y).mean()


def grid_search(X, y, param_grid):
    best_accuracy = 0
    best_params = None

    for params in itertools.product(*param_grid.values()):
        # Train Naive Bayes classifier
        nb = NaiveBayes()
        nb.train(X, y)

        # Evaluate accuracy
        accuracy = nb.accuracy(X, y)

        # Check if current params are better than previous ones
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params

    return best_params, best_accuracy


In [37]:
# Generate example data
X = np.random.randint(0, 2, size=(100, 10))
y = np.random.randint(0, 2, size=100)

# Define hyperparameter search space
param_grid = {"alpha": [0.1, 1.0, 10.0], "fit_prior": [True, False]}

# Perform grid search
best_params, best_accuracy = grid_search(X, y, param_grid)

print("Best hyperparameters:", best_params)
print("Best accuracy:", best_accuracy*100)


Best hyperparameters: (0.1, True)
Best accuracy: 46.0


# **Bayesian Optimization**

In [38]:
!pip install -U bayesian-optimization

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [39]:
import numpy as np
from scipy.stats import uniform
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

In [42]:
def nb_cv_score(alpha, X, y):
    """
    Calculates the cross-validation score for Naive Bayes classifier using Laplace smoothing.
    """
    nb = NaiveBayes()

    nb.fit(X_train,y_train)
    y_pred = nb.predict(X_test)
    

    clf = NaiveBayes(alpha=alpha)
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)

def target_func(alpha):
    """
    Target function to be optimized by Bayesian Optimization.
    """
    return nb_cv_score(alpha, X, y)

# Create a Bayesian Optimization object with the target function to optimize
optimizer = BayesianOptimization(
    f=target_func,
    pbounds={'alpha': (1e-5, 10)},
    verbose=2,
    random_state=42,
)

# Perform Bayesian Optimization for 10 iterations
optimizer.maximize(
    init_points=5,
    n_iter=5,
)


|   iter    |  target   |   alpha   |
-------------------------------------


AttributeError: ignored

# random search

In [48]:
n_iter = 10
alpha_range = (0.1, 1.0)
best_score, best_params = random_search(X_train, y_train, n_iter, alpha_range)
print(best_params)

NameError: ignored

In [49]:
import random
import math
import numpy as np

class NaiveBayesClassifier:
    def __init__(self, alpha=1):
        self.alpha = alpha
        self.classes = None
        self.priors = None
        self.likelihoods = None

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.priors = np.zeros(len(self.classes))
        self.likelihoods = {}

        for i, c in enumerate(self.classes):
            X_c = X[y == c]
            self.priors[i] = X_c.shape[0] / X.shape[0]
            self.likelihoods[c] = {}

            for j in range(X.shape[1]):
                self.likelihoods[c][j] = {}

                for val in np.unique(X[:, j]):
                    count = X_c[X_c[:, j] == val].shape[0]
                    self.likelihoods[c][j][val] = (count + self.alpha) / (X_c.shape[0] + self.alpha * len(np.unique(X[:, j])))

    def predict(self, X):
      y_pred = np.zeros(X.shape[0])

      for i, x in enumerate(X):
        posteriors = []

        for j, c in enumerate(self.classes):
          prior = math.log(self.priors[j])
          likelihood = 0

          for k, val in enumerate(x):
                if val in self.likelihoods[c][k]:
                    likelihood += math.log(self.likelihoods[c][k][val])

                # if value not in likelihoods dictionary, assume zero likelihood
                else:
                    likelihood += math.log(1e-6)

          posterior = prior + likelihood
          posteriors.append(posterior)

    def random_search(X, y, n_iter, alpha_range):
      best_score = 0
      best_params = {}

    for i in range(n_iter):
      
      alpha = random.uniform(alpha_range[0], alpha_range[1])
      clf = NaiveBayesClassifier(alpha=alpha)
      clf.fit(X_train, y_train)
      y_pred = clf.predict(X_test)
      score = sum(y_pred == y_test) / len(y_test)
if score > best_score:
    best_score = score
    best_params['alpha'] = alpha
    return best_score, best_params

  



NameError: ignored

In [None]:
clf = NaiveBayesClassifier(alpha=best_params.get('alpha'))


In [None]:
clf = NaiveBayesClassifier(alpha=best_params['alpha'])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = sum(y_pred == y_test) / len(y_test)
print("Accuracy:", score)
