<a href="https://colab.research.google.com/github/T-RexBytes/iris-ml-project/blob/main/iris_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.datasets import load_iris

# 1. Load the dataset bundled with scikit-learn
iris_data = load_iris()

# 2. Create a table (DataFrame) with the measurements
# 'data' contains the numbers, 'feature_names' contains the column headers
df = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)

# 3. Add the target (the species, stored as integer class codes) to the table
df['species'] = iris_data.target

# 4. Show the first 5 rows to check if it worked.
# A notebook only renders the LAST expression of a cell automatically, so a
# bare `df.head()` here would be computed and then silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(df.head())

# 5. Summary statistics for every column (rendered as the cell's output).
# Note: 'species' holds class codes, so its mean/std are not meaningful.
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [5]:
from sklearn.model_selection import train_test_split

# Features vs. label: 'species' is the answer the model has to predict,
# every other column is a measurement it is allowed to look at.
y = df['species']
X = df.drop(columns='species')

# Hold out 20% of the rows for testing and train on the remaining 80%.
# Pinning random_state makes the shuffle (and so the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity check: report how many rows landed in each partition.
print("Training rows:", len(X_train))
print("Testing rows:", len(X_test))

Training rows: 120
Testing rows: 30


In [6]:
from sklearn.preprocessing import MinMaxScaler

# 1. Create the empty scaler tool
scaler = MinMaxScaler()

# 2. Learn each column's range from the Training data only, then rescale it
X_train_scaled = scaler.fit_transform(X_train)

# 3. Only transform the Test data (re-using the range learned above, no refit)
# so that nothing about the test set influences the preprocessing.
X_test_scaled = scaler.transform(X_test)

# 4. Turn the scaled training data back into a DataFrame just for viewing.
# Passing index=X_train.index keeps the original (shuffled) row labels, so this
# frame stays aligned with X_train and y_train instead of being renumbered 0..n-1.
X_train_scaled_df = pd.DataFrame(
    X_train_scaled,
    columns=X_train.columns,
    index=X_train.index,
)

# The scaled maximum may print as 0.9999999999999998 rather than exactly 1.0;
# that is ordinary floating-point rounding, not a scaling error.
print("Original Max Value (Sepal Length):", X_train['sepal length (cm)'].max())
print("Scaled Max Value (Sepal Length):", X_train_scaled_df['sepal length (cm)'].max())

Original Max Value (Sepal Length): 7.7
Scaled Max Value (Sepal Length): 0.9999999999999998


In [7]:
from sklearn.neighbors import KNeighborsClassifier

# Build a k-nearest-neighbours classifier that considers the 3 closest
# training points, and fit it on the scaled training features in one step
# (fit() hands back the fitted estimator itself, so the calls can be chained).
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)

print("The model has learned the patterns!")

The model has learned the patterns!


In [8]:
# Run the scaled, held-out test features through the fitted classifier;
# the result holds one predicted species code per test row.
y_pred = knn.predict(X=X_test_scaled)

print("Predictions:", y_pred)

Predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]


In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the predictions against the true test labels.
#  - accuracy: fraction of test rows whose predicted species matches the truth
#  - cm: confusion matrix, where rows are the ACTUAL species and columns
#    are the PREDICTED species
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Report both metrics: accuracy first, then the matrix under its heading.
print("Accuracy Score:", accuracy)
print("\nConfusion Matrix:", cm, sep="\n")

Accuracy Score: 1.0

Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
