In [None]:
# Naive Bayes is a simple but effective probabilistic classifier based on Bayes' theorem with an assumption of independence between features.
# Despite its simplicity, it's often used in text classification, spam filtering, and other tasks involving high-dimensional data.
# Here's a breakdown of how it works:

In [None]:
# In the context of Naive Bayes classification, the assumption of independence between features means
# that, once the class label is known, the presence or absence of one feature does not affect
# the presence or absence of any other feature.
# In other words, the features are assumed to be conditionally independent given the class label.

In [None]:
# Bayes' Theorem:
# Bayes' theorem describes the probability of an event, based on prior knowledge of conditions that might be related to the event. It is formulated as:
# $$P(A \mid B) = \frac{P(B \mid A) \times P(A)}{P(B)}$$
 
# Where:

# - $P(A \mid B)$ is the posterior probability of $A$ given $B$.
# - $P(B \mid A)$ is the likelihood of $B$ given $A$.
# - $P(A)$ and $P(B)$ are the prior probabilities of $A$ and $B$ respectively ($P(B)$ is also called the evidence, or marginal probability of $B$).

In [None]:
# Naive Assumption:
# Naive Bayes assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature.
# This is a strong assumption and is why it's called "naive." Despite its simplifying assumption,
# Naive Bayes often performs surprisingly well in practice, especially for text classification tasks.

In [None]:

# Classification:
# Given a dataset with features $X_1, X_2, \ldots, X_n$ and a class label $C$, the classifier calculates the posterior probability of each class label given the features using Bayes' theorem:
# $$P(C \mid X_1, X_2, \ldots, X_n) = \frac{P(X_1, X_2, \ldots, X_n \mid C) \times P(C)}{P(X_1, X_2, \ldots, X_n)}$$



In [9]:
# Independence Assumption:
# Because of the naive assumption, the joint probability $P(X_1, X_2, \ldots, X_n \mid C)$ can be calculated as the product of the individual probabilities of each feature given the class:
# $$P(X_1, X_2, \ldots, X_n \mid C) = P(X_1 \mid C) \times P(X_2 \mid C) \times \ldots \times P(X_n \mid C)$$

# Model Training:
# During training, Naive Bayes calculates the prior probabilities $P(C)$ and the conditional probabilities $P(X_i \mid C)$ for each feature given each class from the training data.

# Model Prediction:
# When a new data point with features $X_1, X_2, \ldots, X_n$ is encountered, the classifier calculates the posterior probability of each class label given the features and assigns the class label with the highest probability.

In [None]:





# Model Prediction:
# When a new data point with features $X_1, X_2, \ldots, X_n$ is encountered, the classifier calculates the posterior probability of each class label given the features and assigns the class label with the highest probability.

# Naive Bayes classifiers are efficient, easy to implement, and can handle large amounts of data. However, 
# their performance may degrade if the independence assumption is violated or if there are strong correlations between features.
# Despite its simplicity, Naive Bayes is often used as a baseline model for comparison with more complex classifiers in many applications.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Read the ads data; every column but the last becomes the feature matrix X,
# and the last column becomes the target vector y (both as NumPy arrays).
dataset = pd.read_csv('Social_Network_Ads.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [4]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test data does not influence them.
# NOTE: this cell overwrites X_train / X_test, so re-running it without first
# re-running the split cell would scale already-scaled data.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [5]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, trained on the scaled training split.
classifier = GaussianNB()
# Kept as the cell's final expression so the fitted estimator is still displayed.
classifier.fit(X=X_train, y=y_train)

In [6]:
# One hand-written observation with two feature values
# (presumably age and estimated salary -- TODO confirm against the CSV columns).
# It must pass through the same fitted scaler as the training data before prediction.
new_sample = [[30, 87000]]
print(classifier.predict(sc.transform(new_sample)))

[0]


In [7]:
# Predict the held-out split and print predictions next to the true labels:
# column 0 is the predicted class, column 1 is the actual class.
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]]


In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix: rows are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Left as the cell's final expression so the notebook displays the accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[65  3]
 [ 7 25]]


0.9