In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Simulated flower measurements used as a toy classification data set.
# Each list is one column; position i across the lists describes sample i.
data = {
    'sepal_length': [
        5.1, 4.9, 7.0, 6.4, 5.8, 6.3, 5.0, 6.5, 6.2, 5.9,
        5.0, 7.7, 6.0, 6.9, 5.6, 6.7, 6.3, 5.7, 6.4, 5.5,
    ],
    'sepal_width': [
        3.5, 3.0, 3.2, 3.2, 2.7, 3.3, 3.6, 2.8, 2.2, 3.2,
        2.3, 3.8, 2.9, 3.1, 3.0, 2.5, 2.8, 2.8, 3.1, 2.4,
    ],
    'petal_length': [
        1.4, 1.4, 4.7, 4.5, 5.1, 6.0, 1.4, 4.6, 4.5, 4.8,
        3.3, 6.7, 4.0, 4.9, 3.9, 5.7, 5.1, 4.5, 5.3, 3.8,
    ],
    'petal_width': [
        0.2, 0.2, 1.3, 1.5, 1.9, 2.5, 0.2, 1.5, 1.5, 1.8,
        1.0, 2.0, 1.2, 1.5, 1.1, 2.1, 1.8, 1.3, 1.9, 1.1,
    ],
    'species': [
        'Setosa', 'Setosa', 'Versicolor', 'Versicolor', 'Virginica',
        'Virginica', 'Setosa', 'Versicolor', 'Versicolor', 'Virginica',
        'Setosa', 'Virginica', 'Versicolor', 'Virginica', 'Versicolor',
        'Virginica', 'Virginica', 'Versicolor', 'Virginica', 'Versicolor',
    ],
}

# 1. Feature engineering: derive one extra column.
#    Working assumption: Setosa has a smaller petal area than the other species.
#    New feature: 'petal_area' = petal_length * petal_width, appended as the
#    last column of the frame.
df = pd.DataFrame(data).assign(
    petal_area=lambda frame: frame['petal_length'] * frame['petal_width']
)

# 2. Check whether the engineered feature is useful, using a simple classifier.
#    A decision tree (simple, interpretable) is trained twice on the same
#    train/test split - once without and once with 'petal_area' - and the two
#    sets of test results are printed for comparison.


def _fit_and_score(train_features, test_features, train_labels, test_labels):
    """Fit a fresh decision tree and evaluate it on the held-out rows.

    Returns a 4-tuple: (fitted model, test predictions, accuracy,
    classification report text).
    """
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(train_features, train_labels)
    predicted = tree.predict(test_features)
    return (
        tree,
        predicted,
        accuracy_score(test_labels, predicted),
        classification_report(test_labels, predicted),
    )


# Target is the 'species' column; the features are every other column,
# which includes the new 'petal_area'.
y = df['species']
X = df.drop(columns='species')

# Hold out 30% of the rows for testing; the seed is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 2.a: baseline - same split, with the 'petal_area' column removed.
X_train_no_area = X_train.drop(columns='petal_area')
X_test_no_area = X_test.drop(columns='petal_area')

(
    model_no_area,
    y_pred_no_area,
    accuracy_no_area,
    report_no_area,
) = _fit_and_score(X_train_no_area, X_test_no_area, y_train, y_test)

print("Results WITHOUT 'petal_area':")
print(f"Accuracy: {accuracy_no_area:.4f}")
print(report_no_area)

# 2.b: same procedure on the full feature set, including 'petal_area'.
(
    model_with_area,
    y_pred_with_area,
    accuracy_with_area,
    report_with_area,
) = _fit_and_score(X_train, X_test, y_train, y_test)

print("\nResults WITH 'petal_area':")
print(f"Accuracy: {accuracy_with_area:.4f}")
print(report_with_area)

# 3. Feature importance
#    Show how much the tree trained WITH 'petal_area' relied on each feature:
#    first the raw importance array (in column order), then a ranked list.
print("\nFeature Importances (WITH 'petal_area'):")
print(model_with_area.feature_importances_)

feature_names = X.columns
importances = model_with_area.feature_importances_
# argsort is ascending, so reverse it to get most-important-first positions.
indices = np.argsort(importances)[::-1]

# One line per feature: "<rank>. <name> (<importance to 3 decimals>)".
print("Feature ranking:")
for rank, column_pos in enumerate(indices, start=1):
    print(f"{rank}. {feature_names[column_pos]} ({importances[column_pos]:.3f})")

# Title: Feature Engineering for Classification
# Description: Create a new feature that could help distinguish between species based on
# logical assumptions and verify its utility.


Results WITHOUT 'petal_area':
Accuracy: 1.0000
              precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00         2
  Versicolor       1.00      1.00      1.00         2
   Virginica       1.00      1.00      1.00         2

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6


Results WITH 'petal_area':
Accuracy: 1.0000
              precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00         2
  Versicolor       1.00      1.00      1.00         2
   Virginica       1.00      1.00      1.00         2

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6


Feature Importances (WITH 'petal_area'):
[0.   0.   0.35 0.   0.65]
Feature ranking:
1. petal_area (0.650)
2. petal_length (0.350)
3. petal_width