In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np
import io
import os # Import os for checking file existence

# Initialize df outside the try block to ensure it is always in scope.
df = None

# Keys under which the hosting environment may expose the uploaded CSV content.
ENV_FILE_KEYS = ("uploaded:IRIS.csv", "uploaded:archive (5).zip/IRIS.csv")


def load_iris_dataframe(env_files=None, local_path="IRIS.csv", env_keys=ENV_FILE_KEYS):
    """Load the Iris CSV from the environment mapping or from a local file.

    Sources are tried in priority order:

    1. ``env_files`` -- an optional mapping of key -> CSV content. The first
       key from ``env_keys`` that is present with non-``None`` content wins.
       Content may be ``str`` or ``bytes``.
    2. ``local_path`` -- a CSV file on disk. This is tried whenever the
       mapping is absent *or* holds none of the expected keys, so a partially
       populated environment no longer blocks the local fallback.

    Args:
        env_files: Mapping of environment keys to CSV content, or ``None``.
        local_path: Path of the local CSV file used as the fallback.
        env_keys: Keys to look up in ``env_files``, in priority order.

    Returns:
        A ``(dataframe, found_key)`` tuple. ``found_key`` is the environment
        key that supplied the data, or ``None`` when the local file was read.

    Raises:
        FileNotFoundError: If neither source provides the data.
    """
    if env_files is not None:
        for key in env_keys:
            if key not in env_files:
                continue
            content = env_files[key]
            if content is None:
                continue
            # StringIO rejects bytes, so pick the buffer type from the content.
            if isinstance(content, (bytes, bytearray)):
                buffer = io.BytesIO(content)
            else:
                buffer = io.StringIO(content)
            return pd.read_csv(buffer), key

    if os.path.exists(local_path):
        return pd.read_csv(local_path), None

    raise FileNotFoundError(
        f"{local_path} not found in the current directory and custom environment access failed."
    )


# --- 1. Load Data ---
print("--- 1. Data Loading ---")

try:
    # '__file_content' is only defined in the custom hosting environment;
    # globals().get() yields None everywhere else, which selects the local file.
    df, found_key = load_iris_dataframe(globals().get('__file_content'))
except Exception as e:
    # Deliberate top-level guard: report the problem and leave df as None so
    # the rest of the script can halt cleanly instead of raising a traceback.
    print(f"Error loading data: {e}")
    print("Data loading failed. Cannot proceed with model training.")
else:
    if found_key is not None:
        print(f"Data loaded successfully using environment key: {found_key}")
    else:
        print("Data loaded successfully from local file 'IRIS.csv'.")

    print("-" * 30)
    print("First 5 rows of the dataset:")
    print(df.head())
    print("-" * 30)

# --- CHECK FOR SUCCESSFUL LOAD BEFORE PROCEEDING ---
if df is None:
    # No DataFrame was produced by the loading step, so announce the halt
    # instead of attempting to train.
    print("\n[CLASSIFIER HALTED] Please resolve the data loading error to continue.")
else:
    # --- 2. Data Preprocessing and Feature Engineering ---

    # The 'species' column is the target (y); every remaining column is a
    # feature (X).
    y = df['species']
    X = df.drop(columns='species')

    # Convert the species names into integer class labels and keep the
    # ordered class names for the reports below.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    species_names = list(le.classes_)
    print(f"Species mapping: {species_names}")
    print("-" * 30)

    # --- 3. Split Data ---
    # Hold out 20% of the rows for testing; stratifying on the encoded labels
    # and fixing the seed make the split class-balanced and repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y_encoded,
        test_size=0.2,
        stratify=y_encoded,
        random_state=42,
    )

    print(f"Training set size: {X_train.shape[0]} samples")
    print(f"Test set size: {X_test.shape[0]} samples")
    print("-" * 30)

    # --- 4. Model Training ---
    # Logistic Regression classifier; max_iter is raised to 300 iterations.
    model = LogisticRegression(random_state=42, max_iter=300)

    print("Training the Logistic Regression model...")
    model.fit(X_train, y_train)
    print("Model training complete.")
    print("-" * 30)

    # --- 5. Model Evaluation ---
    # Score the held-out rows and summarise the results per species.
    y_pred = model.predict(X_test)

    print("--- Model Performance Evaluation ---")

    print("Classification Report:")
    report_text = classification_report(y_test, y_pred, target_names=species_names)
    print(report_text)

    print("Confusion Matrix:")
    conf_matrix = confusion_matrix(y_test, y_pred)
    # Label both axes with the species names for readability.
    labelled_matrix = pd.DataFrame(conf_matrix, index=species_names, columns=species_names)
    print(labelled_matrix)
    print("-" * 30)


    # --- 6. Prediction Function ---

    def predict_iris_species(sepal_length, sepal_width, petal_length, petal_width):
        """
        Classify a single Iris flower from its four measurements.

        The measurements are placed in a one-row DataFrame that reuses the
        column labels of the feature frame ``X``, and the trained ``model``
        is asked for both the predicted class and the class probabilities.

        Returns a ``(species, confidence)`` tuple: the predicted label decoded
        back to its species name through ``le``, and the highest class
        probability expressed as a percentage.
        """
        # One-row frame with the same column labels the model was fitted on.
        measurements = [sepal_length, sepal_width, petal_length, petal_width]
        sample = pd.DataFrame([measurements], columns=X.columns)

        # Encoded class label for the sample, decoded to the species name.
        encoded_label = model.predict(sample)[0]
        species = le.inverse_transform([encoded_label])[0]

        # Confidence is the largest class probability, scaled to percent.
        class_probabilities = model.predict_proba(sample)[0]
        confidence_pct = np.max(class_probabilities) * 100

        return species, confidence_pct

    # --- Example Prediction ---
    # Two reference flowers are scored in turn: first a known Iris-virginica
    # sample (6.3, 3.3, 6.0, 2.5), then a known Iris-setosa sample
    # (5.1, 3.5, 1.4, 0.2).
    example_samples = (
        (6.3, 3.3, 6.0, 2.5),
        (5.1, 3.5, 1.4, 0.2),
    )

    for sample_index, sample in enumerate(example_samples):
        example_sl, example_sw, example_pl, example_pw = sample

        predicted_name, confidence = predict_iris_species(
            example_sl, example_sw, example_pl, example_pw
        )

        # Every report after the first is preceded by a blank line.
        if sample_index == 0:
            header = "--- Example Prediction ---"
        else:
            header = "\n--- Example Prediction ---"

        print(header)
        print(f"Input: SL={example_sl}, SW={example_sw}, PL={example_pl}, PW={example_pw}")
        print(f"Predicted Species: {predicted_name}")
        print(f"Confidence: {confidence:.2f}%")

--- 1. Data Loading ---
Data loaded successfully from local file 'IRIS.csv'.
------------------------------
First 5 rows of the dataset:
   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
------------------------------
Species mapping: ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
------------------------------
Training set size: 120 samples
Test set size: 30 samples
------------------------------
Training the Logistic Regression model...
Model training complete.
------------------------------
--- Model Performance Evaluation ---
Classification Report:
                 precision    recall  f1-score   support

    Iris-seto