In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the dataset
data = pd.read_csv('iris.csv')

# Display the first few rows
print("First few rows of the dataset:")
print(data.head())

# Get some basic information about the dataset
print("\nInformation about the dataset:")
print(data.info())

# Summary statistics
print("\nSummary statistics of the dataset:")
print(data.describe())

# Check the distribution of the target variable
print("\nDistribution of the target variable:")
print(data['species'].value_counts())

First few rows of the dataset:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None

Summary statistics of the dataset:
       sepal_length  sepal_width  petal_length  petal_w

In [3]:
# Select features (independent variables) and target (dependent variable)
X = data[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = data['species']

# Encode the target variable (species) into numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("\nEncoded target variable (first 10 values):", y_encoded[:10])
print("\nMapping of encoded values to original species:", dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_)))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

print("\nShape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Encoded target variable (first 10 values): [0 0 0 0 0 0 0 0 0 0]

Mapping of encoded values to original species: {np.int64(0): 'setosa', np.int64(1): 'versicolor', np.int64(2): 'virginica'}

Shape of X_train: (105, 4)
Shape of X_test: (45, 4)
Shape of y_train: (105,)
Shape of y_test: (45,)


In [4]:

# Initialize the Gaussian Naïve Bayes model
model = GaussianNB()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

print("\nPredictions on the test set (encoded):")
print(y_pred)

# Convert the encoded predictions back to original species names (optional, for better understanding)
y_pred_original = label_encoder.inverse_transform(y_pred)
print("\nPredictions on the test set (original species):")
print(y_pred_original)



Predictions on the test set (encoded):
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 2 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]

Predictions on the test set (original species):
['versicolor' 'setosa' 'virginica' 'versicolor' 'versicolor' 'setosa'
 'versicolor' 'virginica' 'versicolor' 'versicolor' 'virginica' 'setosa'
 'setosa' 'setosa' 'setosa' 'virginica' 'virginica' 'versicolor'
 'versicolor' 'virginica' 'setosa' 'virginica' 'setosa' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'setosa' 'setosa'
 'setosa' 'setosa' 'versicolor' 'setosa' 'setosa' 'virginica' 'versicolor'
 'setosa' 'setosa' 'setosa' 'virginica' 'versicolor' 'versicolor' 'setosa'
 'setosa']


In [5]:
# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# For multi-class, TP, FP, TN, FN are not as straightforward as in binary classification.
# The confusion matrix itself provides a detailed view of the classification performance per class.

# Compute Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)

# Compute Error Rate
error_rate = 1 - accuracy
print("Error Rate:", error_rate)

# Compute Precision
precision = precision_score(y_test, y_pred, average='weighted') # Using 'weighted' for multi-class
print("Precision:", precision)

# Compute Recall
recall = recall_score(y_test, y_pred, average='weighted') # Using 'weighted' for multi-class
print("Recall:", recall)


Confusion Matrix:
[[19  0  0]
 [ 0 12  1]
 [ 0  0 13]]

Accuracy: 0.9777777777777777
Error Rate: 0.022222222222222254
Precision: 0.9793650793650793
Recall: 0.9777777777777777
