In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input
# directory, then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [13]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the Iris dataset. `iris` is a special kind of object called a Bunch,
# which behaves much like a dictionary.
iris = load_iris()

# List the keys available on the Bunch.
print(iris.keys())



dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [None]:
# Build the viewing DataFrame in a single chain: the measurement columns
# first, then the numeric class label, then the matching species name.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    target=iris.target,
    target_name=lambda frame: frame['target'].map(
        lambda code: iris.target_names[code]
    ),
)

# Preview the first 5 rows
print(df.head())

# Names of the physical measurements recorded for each Iris flower
print(iris.feature_names)


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target target_name  
0       0      setosa  
1       0      setosa  
2       0      setosa  
3       0      setosa  
4       0      setosa  


In [15]:
##################################################################
## This gives us a function to split our dataset into training and testing sets.
## In ML, we train on one part of the data and test on another to see how well the model generalizes.
from sklearn.model_selection import train_test_split 
## This imports the k-Nearest Neighbors (kNN) model.
## It’s a simple ML model that classifies a data point by looking at its nearest neighbors in the training set.
from sklearn.neighbors import KNeighborsClassifier
## This is a helper function to check how well the model performs.
## It compares predictions to actual values and tells us what percentage were correct.
from sklearn.metrics import accuracy_score

##################################################################
# Step 1: Define features (X) and labels (y)
feature_columns = list(iris.feature_names)
X = df.loc[:, feature_columns]  ## features: the 4 measurement columns (sepal/petal length/width)
y = df.loc[:, 'target']  ## label: which species the flower is (0, 1, or 2)

In [20]:
# Step 2: Split into training and testing sets
TEST_FRACTION = 0.2  ## 20% of the data is saved for testing
SPLIT_SEED = 7       ## can be any integer; fixing it gives the same split every time

## train_test_split splits the data and returns, in this order:
##   X_train = training features, X_test = test features,
##   y_train = training labels,   y_test = test labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [50]:
# Step 3: Create the model
N_NEIGHBORS = 5  ## k: how many nearest training flowers vote on each prediction
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

## This creates a k-Nearest Neighbors classifier, using k = N_NEIGHBORS (5).
## It means it will classify based on the 5 closest flowers in the training set.
## You can try 1, 3, 7, etc. Odd numbers are often preferred to reduce ties,
## although with three classes a tie is still possible (e.g. a 2-2-1 vote).
## A value too large (k = 100) might "over-smooth" the decision and hurt accuracy.



In [51]:
# Step 4: Train the model
knn.fit(X_train, y_train)
## knn.fit() means “learn from the training data”:
## X_train holds the training features and y_train the matching labels.
## It stores the training data and prepares the model for prediction.

In [52]:
# Step 5: Predict on test data
y_pred = knn.predict(X_test)

## Now we ask the trained model to predict the species of the flowers in the test set.



In [54]:

# Step 6: Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")  # shown with two decimal places

## Compares y_pred (your model’s guesses) to y_test (the real answers).
## Tells you the fraction the model got right — e.g., Accuracy: 0.97 = 97% correct.

Accuracy: 0.90


In [55]:
## Step: Compare results 

from sklearn.metrics import accuracy_score

# Re-run the prediction on the test data so this cell has its own y_pred
y_pred = knn.predict(X_test)

# Pair each actual label with the model's prediction, row by row
species_names = iris.target_names
comparison = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred})

# Swap the numeric class codes for species names in both label columns
for label_column in ('Actual', 'Predicted'):
    comparison[label_column] = comparison[label_column].map(
        lambda code: species_names[code]
    )

# Per-row correctness flag: 1 = correct, 0 = wrong
comparison['Correct'] = comparison['Actual'].eq(comparison['Predicted']).astype(int)

print(comparison.head(10))

# The mean of the 0/1 flags is the overall model accuracy
accuracy = comparison['Correct'].mean()
print(f"\nModel Accuracy: {accuracy:.2f}")


       Actual   Predicted  Correct
0   virginica   virginica        1
1  versicolor  versicolor        1
2      setosa      setosa        1
3  versicolor  versicolor        1
4   virginica  versicolor        0
5      setosa      setosa        1
6  versicolor  versicolor        1
7  versicolor  versicolor        1
8      setosa      setosa        1
9  versicolor  versicolor        1

Model Accuracy: 0.90


In [None]:
## Model accuracy drops to 70% for n_neighbors = 99, whereas for n_neighbors = 5 it is 90%.