**Practical-5**

**Aim: Implement Naïve-Bayes – Multivariate Bernoulli, Multinomial and Gaussian using sklearn.  
Apply it on following Datasets: https://www.kaggle.com/datasets/himanshunakrani/naive-bayes-classification-data**

**Gaussian Naive Bayes**

In [1]:
# Import required libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a Gaussian Naive Bayes classifier
gaussian_naive_bayes = GaussianNB()

# Train the model
gaussian_naive_bayes.fit(X_train, y_train)

# Make predictions
y_pred = gaussian_naive_bayes.predict(X_test)
-
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9777777777777777

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      0.92      0.96        13
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45


Confusion Matrix:
 [[19  0  0]
 [ 0 12  1]
 [ 0  0 13]]


**Multinomial Naive Bayes**

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load only the first 2 columns: Category (ham/spam) and Message
df = pd.read_csv("spam.csv", encoding="latin-1", usecols=[0, 1], names=["Category", "Message"], skiprows=1)

# Map ham → 0, spam → 1
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})

# Drop any missing rows just in case
df = df.dropna()

# Features (X) and target (y)
X, y = df['Message'], df['Category']

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

# Multinomial Naive Bayes works best with TF-IDF
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predict
y_pred = nb.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['ham', 'spam']))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9712918660287081

Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1448
        spam       0.99      0.79      0.88       224

    accuracy                           0.97      1672
   macro avg       0.98      0.89      0.93      1672
weighted avg       0.97      0.97      0.97      1672


Confusion Matrix:
 [[1447    1]
 [  47  177]]


**Bernoulli Naive Bayes**

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset (only first two columns are relevant)
df = pd.read_csv("spam.csv", encoding="latin-1", usecols=[0, 1], names=["Category", "Message"], skiprows=1)

# Convert labels: ham -> 0, spam -> 1
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})
df = df.dropna()  # drop any invalid rows

# Features (X) and target (y)
X, y = df['Message'], df['Category']

vectorizer = CountVectorizer(binary=True, stop_words='english')
X_bin = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_bin, y, test_size=0.3, random_state=42)

# Bernoulli NaiveBayes
bnb = BernoulliNB()
bnb.fit(X_train, y_train)

# Predictions
y_pred = bnb.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['ham', 'spam']))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9736842105263158

Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1448
        spam       0.97      0.83      0.89       224

    accuracy                           0.97      1672
   macro avg       0.97      0.91      0.94      1672
weighted avg       0.97      0.97      0.97      1672


Confusion Matrix:
 [[1442    6]
 [  38  186]]
