In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

In [4]:
# Read the Iris measurements from the CSV file that sits next to this notebook,
# then show the first five rows as a quick sanity check of the parse.
df = pd.read_csv(filepath_or_buffer='iris.csv')
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
# List the column labels so the feature and target names are visible.
column_index = df.columns
print(column_index)

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the quartiles are spelled out here and match the pandas default.
df.describe(percentiles=[0.25, 0.50, 0.75])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [8]:
# Report the frame dimensions as a (rows, columns) tuple.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(150, 5)


In [9]:
# Show column dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None


In [10]:
# Count missing entries per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [14]:
# Inspect the target column: how many distinct labels exist, and what they are.
species_column = df['species']
unique_count = species_column.nunique()
print(f"Number of unique values: {unique_count}")
print(species_column.unique())

Number of unique values: 3
['setosa' 'versicolor' 'virginica']


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Features / target ---
# The label column is 'species'; every remaining column is used as a feature.
y = df['species']
X = df.drop(columns='species')

# --- Hold-out split ---
# test_size=0.2 keeps 20% of the rows for evaluation; the fixed random_state
# makes the shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Standardisation ---
# KNN is distance based, so features are put on a common scale. The scaler is
# fitted on the training rows only and then applied to both partitions, so the
# test rows never influence the fitted mean/variance.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("Data Split & Scaled!")
print(f"Training shape: {X_train.shape}")
print(f"Testing shape:  {X_test.shape}")

Data Split & Scaled!
Training shape: (120, 4)
Testing shape:  (30, 4)


In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- Build and fit the classifier ---
# n_neighbors=3: each prediction is decided by the 3 closest training samples.
# fit() returns the estimator itself, so construction and training are chained.
knn_model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# --- Predict on the held-out rows ---
# Only the features are given to the model; y_test is kept aside for scoring.
predictions = knn_model.predict(X_test)

# --- Score the predictions ---
# Compute all three metrics first, then print them in a fixed order.
accuracy = accuracy_score(y_test, predictions)
report_text = classification_report(y_test, predictions)
confusion = confusion_matrix(y_test, predictions)

# Overall fraction of correct predictions, shown as a percentage.
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1.
print("\n--- Classification Report ---")
print(report_text)

# Rows are true classes, columns are predicted classes.
print("\n--- Confusion Matrix ---")
print(confusion)

Model Accuracy: 100.00%

--- Classification Report ---
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


--- Confusion Matrix ---
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
