In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings

# Sample data: 20 hand-entered flower measurements used as a tiny classification set.
# Each key is a column; every list holds 20 values, one per flower, in the same row order.
# Class balance in 'species': 4 Setosa, 8 Versicolor, 8 Virginica.
data = {
    'sepal_length': [5.1, 4.9, 7.0, 6.4, 5.8, 6.3, 5.0, 6.5, 6.2, 5.9, 5.0, 7.7, 6.0, 6.9, 5.6, 6.7, 6.3, 5.7, 6.4, 5.5],
    'sepal_width': [3.5, 3.0, 3.2, 3.2, 2.7, 3.3, 3.6, 2.8, 2.2, 3.2, 2.3, 3.8, 2.9, 3.1, 3.0, 2.5, 2.8, 2.8, 3.1, 2.4],
    'petal_length': [1.4, 1.4, 4.7, 4.5, 5.1, 6.0, 1.4, 4.6, 4.5, 4.8, 3.3, 6.7, 4.0, 4.9, 3.9, 5.7, 5.1, 4.5, 5.3, 3.8],
    'petal_width': [0.2, 0.2, 1.3, 1.5, 1.9, 2.5, 0.2, 1.5, 1.5, 1.8, 1.0, 2.0, 1.2, 1.5, 1.1, 2.1, 1.8, 1.3, 1.9, 1.1],
    'species': ['Setosa', 'Setosa', 'Versicolor', 'Versicolor', 'Virginica', 'Virginica', 'Setosa', 'Versicolor', 'Versicolor', 'Virginica', 'Setosa', 'Virginica', 'Versicolor', 'Virginica', 'Versicolor', 'Virginica', 'Virginica', 'Versicolor', 'Virginica', 'Versicolor']
}
# Module-level frame built from the raw columns. main() constructs its own
# DataFrame from `data`, so this one is not read by main().
df = pd.DataFrame(data)

# 1. Feature Engineering: Create a new feature
#   - Assumption:  Setosa species has smaller petal area than others.
#   - New Feature: 'petal_area' = petal_length * petal_width
def create_petal_area(df):
    """
    Return a copy of the DataFrame with an added 'petal_area' feature.

    'petal_area' is the element-wise product of 'petal_length' and
    'petal_width'. The input DataFrame is left unmodified, so calling this
    function has no side effects on the caller's data.

    Args:
        df (pd.DataFrame): Input DataFrame containing 'petal_length' and 'petal_width'.

    Returns:
        pd.DataFrame: New DataFrame with every original column plus 'petal_area'
        (appended as the last column, or replaced in place if it already existed).

    Raises:
        KeyError: If 'petal_length' or 'petal_width' is missing from ``df``.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(petal_area=df['petal_length'] * df['petal_width'])

# 2. Model Training and Evaluation
def train_and_evaluate_model(X_train, y_train, X_test, y_test, model_name="Decision Tree"):
    """
    Fit a decision tree on the training data, score it on the test data, and print the results.

    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training labels.
        X_test (pd.DataFrame): Features to predict on.
        y_test (pd.Series): True labels for ``X_test``.
        model_name (str, optional): Label used in the printed header. Defaults to "Decision Tree".

    Returns:
        tuple: (accuracy, report) where ``accuracy`` comes from ``accuracy_score``
        and ``report`` is the text produced by ``classification_report``.
    """
    # Fixed seed so repeated runs build the same tree.
    classifier = DecisionTreeClassifier(random_state=42)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)

    # One print call, newline-separated: header, accuracy line, then the full report.
    print(
        f"\nResults for {model_name}:",
        f"Accuracy: {accuracy:.4f}",
        report,
        sep="\n",
    )

    return accuracy, report

# 3. Feature Importance
def display_feature_importance(model, X):
    """
    Print a model's feature importances, first raw and then ranked highest to lowest.

    Args:
        model: Trained model. When it has no ``feature_importances_`` attribute,
            a notice naming the model's type is printed instead.
        X (pd.DataFrame): Feature DataFrame used for training; its column labels
            name the importances and its column count sets how many ranking
            lines are printed.
    """
    # Guard clause: nothing to rank if the model exposes no importances.
    if not hasattr(model, 'feature_importances_'):
        print(f"\nFeature Importance not available for model type: {type(model).__name__}")
        return

    print("\nFeature Importances:")
    scores = model.feature_importances_
    print(scores)

    columns = X.columns
    # argsort is ascending; reversing it puts the largest importance first.
    ranked = np.argsort(scores)[::-1]

    print("Feature ranking:")
    for position in range(X.shape[1]):
        col = ranked[position]
        print(f"{position+1}. {columns[col]} ({scores[col]:.3f})")

def main():
    """
    Run the whole experiment: build features, compare models, and print the results.

    Steps, in order:
        1. Build a DataFrame from the module-level ``data`` and add 'petal_area'.
        2. Split off 20% of the rows as a test set (fixed seed).
        3. Fit and report a decision tree without, then with, 'petal_area'.
        4. Print feature importances for a tree fitted on all training features.
        5. Run shuffled 5-fold cross-validation on the training portion for both
           feature sets and print mean / standard deviation of accuracy.
        6. Report once more on the test set using all features.

    NOTE(review): steps 3 and 6 score the same test split with the same
    training data, so the "Final Test" report repeats the "With Petal Area" one.
    """
    frame = create_petal_area(pd.DataFrame(data))

    # Everything except the label column is a feature (this includes 'petal_area').
    features = frame.drop('species', axis=1)
    labels = frame['species']

    X_train_val, X_test_final, y_train_val, y_test_final = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Baseline: the four raw measurements only.
    train_and_evaluate_model(
        X_train_val.drop('petal_area', axis=1),
        y_train_val,
        X_test_final.drop('petal_area', axis=1),
        y_test_final,
        model_name="Decision Tree (No Petal Area)",
    )

    # Same model with the engineered feature included.
    train_and_evaluate_model(
        X_train_val,
        y_train_val,
        X_test_final,
        y_test_final,
        model_name="Decision Tree (With Petal Area)",
    )

    # train_and_evaluate_model does not return the fitted tree, so fit one here
    # to inspect its importances.
    importance_tree = DecisionTreeClassifier(random_state=42)
    importance_tree.fit(X_train_val, y_train_val)
    display_feature_importance(importance_tree, X_train_val)

    print("\nModel Stability Check (Multiple Train/Test Splits):")
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    scores_with_area = []
    scores_no_area = []

    for fit_rows, check_rows in folds.split(X_train_val):
        X_fit = X_train_val.iloc[fit_rows]
        X_check = X_train_val.iloc[check_rows]
        y_fit = y_train_val.iloc[fit_rows]
        y_check = y_train_val.iloc[check_rows]

        # All features, including 'petal_area'.
        full_tree = DecisionTreeClassifier(random_state=42)
        full_tree.fit(X_fit, y_fit)
        scores_with_area.append(accuracy_score(y_check, full_tree.predict(X_check)))

        # Same fold with the engineered column removed.
        X_fit_raw = X_fit.drop('petal_area', axis=1)
        X_check_raw = X_check.drop('petal_area', axis=1)
        raw_tree = DecisionTreeClassifier(random_state=42)
        raw_tree.fit(X_fit_raw, y_fit)
        scores_no_area.append(accuracy_score(y_check, raw_tree.predict(X_check_raw)))

    print(f"Average Accuracy with petal_area: {np.mean(scores_with_area):.4f}, Std Dev: {np.std(scores_with_area):.4f}")
    print(f"Average Accuracy without petal_area: {np.mean(scores_no_area):.4f}, Std Dev: {np.std(scores_no_area):.4f}")

    print("\nFinal Evaluation on Held-Out Test Set:")
    train_and_evaluate_model(
        X_train_val,
        y_train_val,
        X_test_final,
        y_test_final,
        model_name="Decision Tree (Final Test)",
    )


# Run the experiment only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

# Title: Feature Engineering for Classification
# Description: Create a new feature that could help distinguish between species based on
# logical assumptions and verify its utility.



Results for Decision Tree (No Petal Area):
Accuracy: 1.0000
              precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00         2
  Versicolor       1.00      1.00      1.00         1
   Virginica       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4


Results for Decision Tree (With Petal Area):
Accuracy: 1.0000
              precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00         2
  Versicolor       1.00      1.00      1.00         1
   Virginica       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4


Feature Importances:
[0.         0.         0.32323232 0.         0.67676768]
Feature ranking:
1. petal_area (0.6