Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Load and Inspect the Data

In [None]:
# Location of the Iris CSV inside a Google Colab session.
file_name = '/content/IRIS.csv'

# Keep the try body limited to the one call that can raise FileNotFoundError,
# so the handler only ever reports a genuinely missing file.
try:
    df = pd.read_csv(file_name)
except FileNotFoundError:
    print(f"Error: The file '{file_name}' was not found.")
    print("Please make sure you have uploaded 'IRIS.csv' to your Colab session.")
    # Re-raise so the run stops here: without `df`, every later cell would
    # fail with a NameError that hides the real cause.
    raise

print(f"Successfully loaded '{file_name}'.")

# Display the first 5 rows
print("\n--- Data Head ---")
print(df.head())

# Display info about columns, data types, and non-null counts
# (df.info() writes its report directly, so it is not wrapped in print)
print("\n--- Initial Data Info ---")
df.info()

# Check the distribution of the target variable
print("\n--- Species Distribution ---")
print(df['species'].value_counts())

Successfully loaded '/content/IRIS.csv'.

--- Data Head ---
   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa

--- Initial Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB

--- Species Distribution ---
species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

Encode the Target Variable

In [None]:
# Convert the text labels in 'species' to integer codes and store them in a
# new 'species_encoded' column (the original 'species' column is kept).
encoder = LabelEncoder()
df['species_encoded'] = encoder.fit_transform(df['species'])

# Keep the class names for later use in evaluation
class_names = encoder.classes_

# .tolist() turns the NumPy integer scalars into plain Python ints, so the
# mapping prints as ('Iris-setosa', 0) instead of ('Iris-setosa', np.int64(0)).
class_codes = encoder.transform(class_names).tolist()
print(f"Encoded classes: {list(zip(class_names, class_codes))}")

# Display the DataFrame to see the new column
print("\n--- Data with Encoded Target ---")
print(df.head())

Encoded classes: [('Iris-setosa', np.int64(0)), ('Iris-versicolor', np.int64(1)), ('Iris-virginica', np.int64(2))]

--- Data with Encoded Target ---
   sepal_length  sepal_width  petal_length  petal_width      species  \
0           5.1          3.5           1.4          0.2  Iris-setosa   
1           4.9          3.0           1.4          0.2  Iris-setosa   
2           4.7          3.2           1.3          0.2  Iris-setosa   
3           4.6          3.1           1.5          0.2  Iris-setosa   
4           5.0          3.6           1.4          0.2  Iris-setosa   

   species_encoded  
0                0  
1                0  
2                0  
3                0  
4                0  


Define Features (X) and Target (y)

In [None]:
# Features (X): the four flower measurements, selected by column name
feature_columns = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]
X = df[feature_columns]

# Target (y): the integer-encoded species column
y = df['species_encoded']

print("Features (X) defined with columns:", list(X.columns))
print("Target (y) defined.")

Features (X) defined with columns: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
Target (y) defined.


Split Data into Training and Testing Sets

In [None]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# Stratifying on y keeps the class proportions the same in both subsets, and
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Data split into:")
print(f"{X_train.shape[0]} training samples")
print(f"{X_test.shape[0]} testing samples")

Data split into:
120 training samples
30 testing samples


Feature Scaling

In [None]:
# Standardize the features with scikit-learn's StandardScaler
scaler = StandardScaler()

# Learn the scaling statistics from the training split ONLY, then apply them
# to that same split.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Re-use the already-fitted scaler on the test split, so no information from
# the test data influences the scaling.
X_test_scaled = scaler.transform(X_test)

print("Features have been scaled.")

Features have been scaled.


Train the Model

In [None]:
print("Training the k-NN model...")

# Build a k-NN classifier that consults the 5 closest training examples and
# fit it on the scaled training features (fit returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

print("Model training complete.")

Training the k-NN model...
Model training complete.


Evaluate the Model

In [None]:
print("--- Model Evaluation ---")

# Predict labels for the scaled test features
y_pred = knn.predict(X_test_scaled)

# 1. Accuracy, shown both as a fraction and as a percentage
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100
print(f"\nModel Accuracy: {accuracy:.4f} (or {accuracy_pct:.2f}%)")

# 2. Confusion matrix: rows are the true labels, columns the predicted labels,
#    both in encoded order [Setosa, Versicolor, Virginica]
print("\n--- Confusion Matrix ---")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# 3. Classification report; target_names swaps the integer codes for the
#    actual species names
print("\n--- Classification Report ---")
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

--- Model Evaluation ---

Model Accuracy: 0.9333 (or 93.33%)

--- Confusion Matrix ---
[[10  0  0]
 [ 0 10  0]
 [ 0  2  8]]

--- Classification Report ---
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.83      1.00      0.91        10
 Iris-virginica       1.00      0.80      0.89        10

       accuracy                           0.93        30
      macro avg       0.94      0.93      0.93        30
   weighted avg       0.94      0.93      0.93        30



Example Prediction on New Data

In [None]:
print("--- Example Prediction ---")

# Measurements for a single flower, in the same order as the training features:
# sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2
# This is a classic 'Iris-setosa' (the same values as the first row shown in
# the data head above, so it is an illustration rather than unseen data).
new_flower = np.array([[5.1, 3.5, 1.4, 0.2]])

# The scaler was fitted on a DataFrame, so it remembers the feature names.
# Passing a bare array would make scikit-learn warn that X lacks valid
# feature names; wrapping the sample in a DataFrame with the training column
# names avoids that and keeps the column order explicit.
new_flower_df = pd.DataFrame(new_flower, columns=feature_columns)

# We must scale this new data using the *same scaler* we trained
new_flower_scaled = scaler.transform(new_flower_df)

# Make the prediction
prediction_encoded = knn.predict(new_flower_scaled)

# The prediction is a number (0, 1, or 2). We use the encoder to get the name back.
prediction_species = encoder.inverse_transform(prediction_encoded)

print(f"Measurements: {new_flower[0]}")
print(f"Predicted species: {prediction_species[0]}")

--- Example Prediction ---
Measurements: [5.1 3.5 1.4 0.2]
Predicted species: Iris-setosa


