In [1]:
"""
Naïve Bayes Classifier Implementation
Student Practical Exam - Data Warehouse and Mining
"""

# Import required libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np

print("="*70)
print("           NAÏVE BAYES CLASSIFIER IMPLEMENTATION")
print("="*70)

# Step 1: Load Dataset
print("\nStep 1: Loading Dataset...")
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target labels

print(f"Dataset: Iris Flower Dataset")
print(f"Total samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")
print(f"Feature names: {iris.feature_names}")
print(f"Classes: {iris.target_names}")

# Display first 5 samples
print("\nFirst 5 samples of dataset:")
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y
print(df.head())

# Step 2: Split Dataset into Training and Testing Sets
print("\n" + "-"*70)
print("Step 2: Splitting Dataset (70% Train, 30% Test)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# Step 3: Create and Train Naïve Bayes Classifier
print("\n" + "-"*70)
print("Step 3: Training Gaussian Naïve Bayes Classifier...")

# Create Gaussian Naïve Bayes model
model = GaussianNB()

# Train the model
model.fit(X_train, y_train)
print("Training completed successfully!")

# Step 4: Make Predictions on Test Data
print("\n" + "-"*70)
print("Step 4: Making Predictions on Test Data...")

y_pred = model.predict(X_test)

print("Predictions made successfully!")
print(f"\nSample predictions (first 10):")
print(f"Actual:    {y_test[:10]}")
print(f"Predicted: {y_pred[:10]}")

# Step 5: Evaluate Model Performance
print("\n" + "-"*70)
print("Step 5: Model Evaluation")
print("-"*70)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\n✓ Accuracy: {accuracy * 100:.2f}%")

# Confusion Matrix
print("\n✓ Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("\nConfusion Matrix Explanation:")
print(f"   Columns: Predicted Classes")
print(f"   Rows: Actual Classes")

# Classification Report
print("\n✓ Classification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Step 6: Test with New Sample Data
print("\n" + "-"*70)
print("Step 6: Testing with New Sample Data")
print("-"*70)

# New sample (hypothetical flower measurements)
new_samples = [
    [5.1, 3.5, 1.4, 0.2],  # Likely Setosa
    [6.7, 3.0, 5.2, 2.3],  # Likely Virginica
]

print("\nNew Samples:")
for i, sample in enumerate(new_samples, 1):
    print(f"Sample {i}: {sample}")

# Predict class
predictions = model.predict(new_samples)

# Predict probabilities
probabilities = model.predict_proba(new_samples)

print("\nPredictions:")
for i, (sample, pred, prob) in enumerate(zip(new_samples, predictions, probabilities), 1):
    print(f"\nSample {i}: {sample}")
    print(f"Predicted Class: {iris.target_names[pred]}")
    print(f"Prediction Probabilities:")
    for j, class_name in enumerate(iris.target_names):
        print(f"  {class_name}: {prob[j]*100:.2f}%")

# Summary
print("\n" + "="*70)
print("                          SUMMARY")
print("="*70)
print(f"\n✓ Algorithm Used: Gaussian Naïve Bayes")
print(f"✓ Dataset: Iris (150 samples, 4 features, 3 classes)")
print(f"✓ Training Samples: {len(X_train)}")
print(f"✓ Testing Samples: {len(X_test)}")
print(f"✓ Model Accuracy: {accuracy * 100:.2f}%")
print(f"✓ Status: Successfully Classified!")

print("\n" + "="*70)
print("KEY CONCEPTS:")
print("="*70)
print("""
1. Naïve Bayes Theorem:
   P(Class|Features) = P(Features|Class) * P(Class) / P(Features)

2. Assumption:
   All features are independent (Naïve assumption)

3. Gaussian Distribution:
   Used for continuous numerical features
   P(x) = (1/√(2πσ²)) * e^(-(x-μ)²/(2σ²))

4. Advantages:
   • Fast and efficient
   • Works well with small datasets
   • Good for multi-class classification
   • Requires less training data

5. Disadvantages:
   • Assumes feature independence (rarely true in real world)
   • Can be outperformed by complex models
   • Zero frequency problem in categorical data
""")

print("="*70)
print("           IMPLEMENTATION COMPLETED SUCCESSFULLY!")
print("="*70)


           NAÏVE BAYES CLASSIFIER IMPLEMENTATION

Step 1: Loading Dataset...
Dataset: Iris Flower Dataset
Total samples: 150
Number of features: 4
Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Classes: ['setosa' 'versicolor' 'virginica']

First 5 samples of dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  

----------------------------------------------------------------------
Step 2: Splitting Dataset (70% Train, 30% Test)...
Training se

Option 2

In [2]:
# Naive Bayes Classifier Implementation in Python
# Author: James Lewis

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Step 1: Sample Dataset (Example)
# Each record = [feature1, feature2, ..., label]
# Here we’ll use a simple dataset for demonstration

# Features: [age, income]
# Labels: 0 = No, 1 = Yes (e.g., “buys_computer”)
X = [[25, 40000], [35, 60000], [45, 80000], [20, 20000], [50, 100000]]
y = [0, 1, 1, 0, 1]

# Step 2: Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Create and train the model
model = GaussianNB()
model.fit(X_train, y_train)

# Step 4: Predict using test data
y_pred = model.predict(X_test)

# Step 5: Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("Predictions:", y_pred)
print("Accuracy:", accuracy)


Predictions: [0]
Accuracy: 0.0


In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load data
iris = load_iris()
X, y = iris.data, iris.target

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train
model = GaussianNB()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")

# Test new sample
new = [[5.1, 3.5, 1.4, 0.2]]
print(f"Prediction: {iris.target_names[model.predict(new)[0]]}")

Accuracy: 95.56%
Prediction: setosa


In [4]:
"""
Naïve Bayes Classifier - Simple Implementation
"""

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

print("="*60)
print("      NAÏVE BAYES CLASSIFIER")
print("="*60)

# Step 1: Load Data
print("\n1. Loading Iris Dataset...")
iris = load_iris()
X = iris.data
y = iris.target

print(f"   Total Samples: {len(X)}")
print(f"   Features: {iris.feature_names}")
print(f"   Classes: {iris.target_names}")

# Step 2: Split Data
print("\n2. Splitting Data (70-30)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
print(f"   Training: {len(X_train)}, Testing: {len(X_test)}")

# Step 3: Train Model
print("\n3. Training Naïve Bayes...")
model = GaussianNB()
model.fit(X_train, y_train)
print("   ✓ Training Complete!")

# Step 4: Make Predictions
print("\n4. Making Predictions...")
y_pred = model.predict(X_test)
print(f"   First 5 Predictions: {y_pred[:5]}")
print(f"   Actual Values:       {y_test[:5]}")

# Step 5: Evaluate
print("\n5. Model Evaluation")
accuracy = accuracy_score(y_test, y_pred)
print(f"   Accuracy: {accuracy*100:.2f}%")

print("\n   Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Step 6: Test New Sample
print("\n6. Testing New Sample")
new_sample = [[5.1, 3.5, 1.4, 0.2]]
prediction = model.predict(new_sample)
probability = model.predict_proba(new_sample)

print(f"   Sample: {new_sample[0]}")
print(f"   Predicted: {iris.target_names[prediction[0]]}")
print(f"   Probabilities:")
for i, prob in enumerate(probability[0]):
    print(f"      {iris.target_names[i]}: {prob*100:.2f}%")

print("\n" + "="*60)
print("✓ COMPLETE! Accuracy: {:.2f}%".format(accuracy*100))
print("="*60)

      NAÏVE BAYES CLASSIFIER

1. Loading Iris Dataset...
   Total Samples: 150
   Features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
   Classes: ['setosa' 'versicolor' 'virginica']

2. Splitting Data (70-30)...
   Training: 105, Testing: 45

3. Training Naïve Bayes...
   ✓ Training Complete!

4. Making Predictions...
   First 5 Predictions: [1 0 2 1 1]
   Actual Values:       [1 0 2 1 1]

5. Model Evaluation
   Accuracy: 97.78%

   Confusion Matrix:
[[19  0  0]
 [ 0 12  1]
 [ 0  0 13]]

6. Testing New Sample
   Sample: [5.1, 3.5, 1.4, 0.2]
   Predicted: setosa
   Probabilities:
      setosa: 100.00%
      versicolor: 0.00%
      virginica: 0.00%

✓ COMPLETE! Accuracy: 97.78%
