# IRIS-CLASSIFICATION

## 1. Setup

In [16]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

## 2. Prepare input data

In [18]:
# Load the Iris dataset from a local CSV file.
# (The same data is also available via sklearn.datasets.load_iris(); this
# notebook reads the CSV copy instead.)
# The path is relative, so 'iris.csv' must be in the notebook's working directory.
csv_path = "iris.csv"
try:
    iris_data = pd.read_csv(csv_path)
except FileNotFoundError as err:
    # Stop here with a clear message. Printing and carrying on would only
    # resurface later as a confusing NameError on `iris_data`.
    raise FileNotFoundError(
        f"The file '{csv_path}' was not found. "
        "Please ensure the file is in the same directory as this notebook."
    ) from err

print("--- Dataset Information from CSV ---")
print(f"Dataset shape: {iris_data.shape}")
print("First 5 rows of the dataset:")
print(iris_data.head())
print("-" * 35)

# Separate features (X) and labels (y).
# The last column is assumed to hold the species label; every other column is
# treated as a feature. .iloc selects columns by position.
X = iris_data.iloc[:, :-1].values      # all rows, every column except the last
y_raw = iris_data.iloc[:, -1].values   # all rows, last column only

# Encode the species names as integers.
# pd.factorize returns (codes, uniques): `y` holds one integer per row and
# `target_species_names[code]` maps a code back to its species name.
y, target_species_names = pd.factorize(y_raw)

# pd.factorize marks missing values with the sentinel -1; the classifier would
# otherwise treat that as an extra class, so fail early if any are present.
if (y < 0).any():
    raise ValueError("The label column contains missing values.")

print("\n--- Processed Data Dimensions ---")
print(f"Features (X) shape: {X.shape}")
print(f"Labels (y) shape: {y.shape}")
print(f"Target species names: {target_species_names}")
print("-" * 35)

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# identical on every run so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

--- Dataset Information from CSV ---
Dataset shape: (150, 5)
First 5 rows of the dataset:
   Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
-----------------------------------

--- Processed Data Dimensions ---
Features (X) shape: (150, 4)
Labels (y) shape: (150,)
Target species names: ['setosa' 'versicolor' 'virginica']
-----------------------------------


## 3. Creating a model

In [19]:
# Chain preprocessing and the estimator into a single object so that fitting
# and predicting always apply the same steps in the same order:
#   1. 'scaler'     - StandardScaler for feature scaling
#   2. 'classifier' - RandomForestClassifier with 100 trees and a fixed seed
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "classifier",
            RandomForestClassifier(n_estimators=100, random_state=42),
        ),
    ]
)

## 4. Training and evaluating the model

In [20]:
# Fit every step of the pipeline (scaler, then classifier) on the training split.
pipeline.fit(X_train, y_train)
print("Model training complete.")
print("-" * 25)

# Predict on the held-out test features; the fitted pipeline applies the
# scaler before the classifier.
y_pred = pipeline.predict(X_test)

# Overall accuracy plus a per-class precision/recall/F1 report.
# The class names come from `target_species_names` (produced by pd.factorize
# when the labels were encoded), so they line up with the integer codes in
# y_test / y_pred. The previous `iris.target_names` referred to a variable that
# is never defined in this notebook.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=target_species_names)

print("\n--- Model Evaluation ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)

Model training complete.
-------------------------

--- Model Evaluation ---
Accuracy: 1.0000

Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



## 5. Prediction

In [21]:
# Measurements for one new flower, in the same column order as the training
# features: [sepal_length, sepal_width, petal_length, petal_width].
# These values match the first row of the dataset, which is a setosa.
new_flower = np.array([[5.1, 3.5, 1.4, 0.2]])

# The fitted pipeline applies the StandardScaler before the classifier, so the
# raw measurements can be passed in directly.
predicted_class = pipeline.predict(new_flower)

# Decode the integer class back to a species name using the same lookup that
# pd.factorize produced when the labels were encoded. The previous
# `iris.target_names` referred to a variable that is never defined here.
predicted_species = target_species_names[predicted_class[0]]

print("\n--- Predicting a New Species ---")
print(f"New flower features: {new_flower[0]}")
print(f"Predicted species: {predicted_species}")


--- Predicting a New Species ---
New flower features: [5.1 3.5 1.4 0.2]
Predicted species: setosa
