Data Analytics III
1. Implement Simple Naïve Bayes classification algorithm using Python/R on iris.csv dataset.
2. Compute Confusion matrix to find TP, FP, TN, FN, Accuracy, Error rate, Precision, Recall
on the given dataset.

In [1]:
# Step 1: Import Libraries and Load Data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report

# Load the dataset
df = pd.read_csv('/content/Iris Dataset Prac (1,3.2,6).csv')

# Drop the 'Id' column as it is not a predictive feature
if 'Id' in df.columns:
    df = df.drop('Id', axis=1)

print("Dataset Head:")
display(df.head())

Dataset Head:


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [2]:
# Step 2: Data Exploration with describe()

print("\n--- Statistical Summary ---")
display(df.describe())

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())


--- Statistical Summary ---


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5



Missing values:
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [3]:
# Step 3: Split Data into Training and Testing Sets

# Features (Sepal/Petal lengths and widths)
X = df.drop('Species', axis=1)
# Target (The species name)
y = df['Species']

# Split: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Data split successfully: {len(X_train)} training samples, {len(X_test)} testing samples.")

Data split successfully: 120 training samples, 30 testing samples.


In [4]:
# Step 4: Implement Naïve Bayes Classification

# Initialize the model
nb_model = GaussianNB()

# Train the model
nb_model.fit(X_train, y_train)

# Make predictions
y_pred = nb_model.predict(X_test)

In [5]:
#Step 5: Compute Confusion Matrix and Evaluation Metrics

# Generate Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("\n--- Confusion Matrix ---")
print(cm)

# Accuracy and Error Rate
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy

# Precision and Recall (using 'weighted' average for multi-class)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f"\nAccuracy:   {accuracy:.4f}")
print(f"Error Rate: {error_rate:.4f}")
print(f"Precision:  {precision:.4f}")
print(f"Recall:     {recall:.4f}")

# Detailed Report (Shows TP/FP/TN/FN indirectly via Precision/Recall per class)
print("\n--- Detailed Classification Report ---")
print(classification_report(y_test, y_pred))


--- Confusion Matrix ---
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]

Accuracy:   1.0000
Error Rate: 0.0000
Precision:  1.0000
Recall:     1.0000

--- Detailed Classification Report ---
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      1.00      1.00         9
 Iris-virginica       1.00      1.00      1.00        11

       accuracy                           1.00        30
      macro avg       1.00      1.00      1.00        30
   weighted avg       1.00      1.00      1.00        30

