# Naive Bayes

# Naive Bayes From Scratch

In [1]:
import pandas as pd
import numpy as np

In [15]:
# Load the stroke dataset from CSV; the path is relative to the notebook's
# working directory.
data = pd.read_csv('dataset/stroke/healthcare-dataset-stroke-data.csv')

In [16]:
# Keep only the listed columns, in this order.
# NOTE(review): `stroke` is presumably the target; it is placed last, which
# matches the label-in-last-column layout separate_by_class reads -- confirm.
# NOTE(review): this frame is rebound to a toy array further down before any
# visible cell consumes it.
data = data[['age', 'hypertension', 'heart_disease', 'avg_glucose_level','bmi','stroke']]

(5110, 6)

In [31]:
# Separate by class
def separate_by_class(data):
    """Group feature vectors by their class label.

    Every row of ``data`` is one sample: the last column holds the class
    label and all preceding columns hold the features.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features + 1)
        A 2-D NumPy array, or anything ``np.asarray`` converts into one
        (nested lists, a pandas DataFrame, ...). Plain tuple indexing such as
        ``data[i, :-1]`` only works on ndarrays, so the input is coerced
        first.

    Returns
    -------
    dict
        Maps each distinct label to a list of 1-D feature arrays (label
        column stripped), in the order the rows appear. Empty input yields
        an empty dict.
    """
    rows = np.asarray(data)
    separated = {}
    for row in rows:
        # setdefault creates the bucket the first time a label is seen.
        separated.setdefault(row[-1], []).append(row[:-1])
    return separated


In [32]:
# Calculate mean and standard deviation
def summarize_dataset(dataset):
    """Compute per-feature statistics for a collection of samples.

    ``dataset`` is iterated row by row; transposing it with ``zip`` yields
    one tuple of values per feature column.

    Returns a list holding one ``(mean, std)`` pair per column, in column
    order. ``np.std`` is called with its defaults, i.e. the population
    standard deviation.
    """
    stats = []
    for column in zip(*dataset):
        stats.append((np.mean(column), np.std(column)))
    return stats

In [33]:
# Gaussian probability density calculation
def calculate_probability(x, mean, std, min_std=1e-9):
    """Gaussian probability density of ``x`` under N(mean, std**2).

    Parameters
    ----------
    x : float or ndarray
        Value(s) at which to evaluate the density.
    mean : float or ndarray
        Mean of the distribution.
    std : float or ndarray
        Standard deviation of the distribution.
    min_std : float, optional
        Lower bound applied to ``std``. A feature that is constant within a
        class has ``std == 0``, which would otherwise divide by zero and
        produce NaN/inf. Any ``std`` above this floor is used unchanged.

    Returns
    -------
    float or ndarray
        The density value(s); always finite for finite inputs.
    """
    # Floor the spread so zero-variance features stay numerically usable.
    std = np.maximum(std, min_std)
    exponent = np.exp(-(np.power(x - mean, 2) / (2 * np.power(std, 2))))
    return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

In [34]:
# Make predictions
def _log_gaussian_density(x, mean, std, min_std=1e-9):
    """Return the log of the Gaussian density N(mean, std**2) at ``x``."""
    # Floor the spread so a zero-variance feature does not divide by zero.
    std = np.maximum(std, min_std)
    return (
        -0.5 * np.log(2 * np.pi)
        - np.log(std)
        - np.power(x - mean, 2) / (2 * np.power(std, 2))
    )


def predict(summaries, input_data):
    """Return the class whose Gaussian model best explains ``input_data``.

    Parameters
    ----------
    summaries : dict
        Maps each class value to a sequence of ``(mean, std)`` pairs, one
        pair per feature.
    input_data : sequence
        Feature values of the sample to classify; ``input_data[i]`` is
        scored against the i-th ``(mean, std)`` pair of every class.

    Returns
    -------
    The key of ``summaries`` with the highest score. On ties the class that
    comes first in ``summaries`` wins.

    Raises
    ------
    ValueError
        If ``summaries`` is empty (raised by ``max``).

    Notes
    -----
    Scores are accumulated as sums of log-densities rather than products of
    densities. A product of many small densities underflows to 0.0 for
    every class, which makes the argmax meaningless; the log-sum ranks the
    classes identically without underflowing. No class prior is applied,
    so all classes are treated as equally likely a priori.
    """
    log_scores = {}
    for class_value, class_summaries in summaries.items():
        total = 0.0  # log(1): neutral element for the running sum
        for i, (mean, std) in enumerate(class_summaries):
            total += _log_gaussian_density(input_data[i], mean, std)
        log_scores[class_value] = total
    return max(log_scores, key=log_scores.get)

In [35]:
# Toy dataset: each row is [feature_1, feature_2, label]. The label sits in
# the last column, which is the layout separate_by_class reads.
# NOTE(review): this rebinds `data`, replacing the DataFrame loaded earlier.
data = np.array([[1.0, 2.0, 0],
                 [1.5, 1.8, 0],
                 [5.0, 8.0, 1],
                 [6.0, 9.0, 1]])

In [37]:
# "Training": bucket the rows per class, then reduce every bucket to its
# per-feature (mean, std) pairs.
separated = separate_by_class(data)
summaries = dict(
    (label, summarize_dataset(np.array(rows)))
    for label, rows in separated.items()
)
# Classify one unseen sample with the per-class statistics.
prediction = predict(summaries, np.array([1.2, 1.9]))
print(f'Predicted class: {prediction}')

Predicted class: 0.0


-----------

In [38]:
# Using scikit-learn

In [40]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [41]:
# Load the iris dataset shipped with scikit-learn and split it into the
# feature matrix (x) and the target vector (y).
# NOTE(review): `data` is rebound again here, replacing the toy array above.
data = datasets.load_iris()
x,y = data.data, data.target

In [42]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [44]:
model = GaussianNB()  # Naive Bayes with a normal (Gaussian) distribution per feature
# Fit on the training split only; the test split stays unseen until evaluation.
model.fit(x_train, y_train)

In [48]:
# Predict labels for the held-out samples and report the fraction that
# match the true labels.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'accuracy: {accuracy:.3f}')

accuracy: 1.000


----------

# Text Classification with scikit-learn

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [55]:
# Tiny labelled corpus of (text, label) pairs: 1 = spam, 0 = not spam.
data = [
     ('Free money now!!!', 1),  # Spam
    ('Hi Bob, how about a game of golf tomorrow?', 0),  # Not spam
    ('Limited time offer!', 1),  # Spam
    ('Are we still on for lunch?', 0)  # Not spam
]
# The * unpacks the list so zip regroups the pairs column-wise:
# x becomes a tuple of texts, y a tuple of labels.
x , y = zip(*data)

In [57]:
# Chain the two steps so raw strings can be passed straight to fit/predict:
# CountVectorizer turns text into token counts, MultinomialNB classifies them.
model = make_pipeline(CountVectorizer(), MultinomialNB())

In [58]:
# Train on the toy corpus, then classify one unseen message.
model.fit(x, y)
predicted = model.predict(['Win a free ticket to the concert!'])
# predicted[0] is the label for the single input: truthy (1) means spam.
print(f'Predicted class: {"Spam" if predicted[0] else "Not Spam"}')

Predicted class: Spam
