# Supervised Learning with the Iris Dataset

This notebook demonstrates various supervised learning models using the famous Iris dataset. We'll cover:
1. Data loading and exploration
2. Data preprocessing
3. Model training and evaluation
4. Model comparison and selection

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Set random seed for reproducibility
np.random.seed(42)

# Set style for plots.
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and is absent from newer releases, where asking for it
# raises an error. Pick whichever of the two names this installation provides
# so the cell survives "Restart & Run All" on both old and new Matplotlib.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')

In [None]:
# Fetch the bundled Iris dataset and unpack the pieces used by later cells:
# the feature matrix, the integer class labels, and the human-readable names.
iris = load_iris()
X, y = iris.data, iris.target
feature_names, target_names = iris.feature_names, iris.target_names

# Tabular view for plotting: one column per feature plus a 'species' column
# that maps each integer label to its class name.
df = pd.DataFrame(X, columns=feature_names).assign(
    species=[target_names[label] for label in y]
)

# Quick overview of the dataset's size and classes
print(f"Number of samples: {len(df)}")
print(f"Number of features: {len(feature_names)}")
print(f"Target classes: {target_names}")
print("\nFirst few rows of the dataset:")
df.head()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the numeric columns
print("Statistical Summary:")
summary_stats = df.describe()
summary_stats

In [None]:
# Scatter-plot matrix of every feature pair, coloured by species, to see
# which feature combinations separate the classes.
pair_grid = sns.pairplot(data=df, hue='species', height=2.5)
plt.show()

In [None]:
# One box plot per feature (2x2 grid), each split by species
fig, axes = plt.subplots(2, 2, figsize=(15, 6))
for ax, feature in zip(axes.flat, feature_names):
    sns.boxplot(x='species', y=feature, data=df, ax=ax)
    ax.set_title(f'Distribution of {feature}')
fig.tight_layout()
plt.show()

In [None]:
# Hold out 20% of the samples for testing; stratify so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize the features. The scaler learns its statistics from the
# training split only and is then applied unchanged to both splits, so no
# test-set information enters preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the model on the standardized training features
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Make predictions on the held-out test set
y_pred = log_reg.predict(X_test_scaled)

# Evaluate the model
print("Logistic Regression Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))

# Plot confusion matrix.
# confusion_matrix(y_true, y_pred) puts true classes on rows and predicted
# classes on columns; label both axes so the figure can be read on its own.
log_reg_cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(log_reg_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names, ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion Matrix - Logistic Regression')
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Initialize and train the model on the standardized training features
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)

# Make predictions on the held-out test set
y_pred = dt.predict(X_test_scaled)

# Evaluate the model
print("Decision Tree Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))

# Plot the decision tree.
# NOTE: the tree was fitted on standardized features, so the split thresholds
# shown in the nodes are in standardized units, not in the units named in
# the feature labels.
# class_names is passed as a plain list: target_names is a NumPy array, and
# plot_tree's parameter validation in scikit-learn 1.3.x is believed to
# accept only a list here. A list works on every version.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(dt, feature_names=list(feature_names), class_names=list(target_names),
          filled=True, ax=ax)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest (seeded for reproducibility) and fit it on
# the standardized training features.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train_scaled, y_train)

# Predict the held-out test set
y_pred = rf.predict(X_test_scaled)

# Report test-set metrics
print("Random Forest Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))

# Pair each feature with the forest's importance score, most important first
importance_table = pd.DataFrame({
    'Feature': feature_names,
    'Importance': rf.feature_importances_
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance, ax=ax)
ax.set_title('Feature Importance - Random Forest')
plt.show()

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier, fitted on the standardized training features
svm = SVC(random_state=42, kernel='rbf').fit(X_train_scaled, y_train)

# Predict the held-out test set
y_pred = svm.predict(X_test_scaled)

# Report test-set metrics
svm_report = classification_report(y_test, y_pred, target_names=target_names)
print("SVM Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(svm_report)

# Plot decision boundaries for first two features
def plot_decision_boundary(model, X, y, feature_names, target_names,
                           title='SVM Decision Boundary', step=0.02):
    """Plot a fitted classifier's decision regions over the first two features.

    The model is evaluated on a dense grid spanning the first two columns of
    ``X``. Any remaining feature columns are held fixed at their mean in ``X``
    (for standardized data that is ~0), so the plot is a 2-D slice through
    the full feature space. The data points are overlaid on the regions.

    Parameters
    ----------
    model : fitted classifier
        Object with a ``predict`` method accepting an array with the same
        number of columns as ``X``.
    X : array-like of shape (n_samples, n_features), n_features >= 2
        Samples to overlay; also defines the plotting range.
    y : array-like of shape (n_samples,)
        Integer class labels used to colour the points.
    feature_names : sequence of str
        Feature names; the first two label the axes.
    target_names : sequence of str
        Class names used for the legend, indexed by the labels in ``y``.
    title : str, optional
        Figure title.
    step : float, optional
        Grid spacing along both axes.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or has fewer than two feature columns.
    """
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[1] < 2:
        raise ValueError("X must be a 2-D array with at least two feature columns")

    # Create a mesh grid padded by one unit around the observed range
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    # Build model inputs for every grid point: the first two features vary
    # over the grid, the rest stay at their mean, so the function works for
    # any number of features rather than exactly four.
    grid = np.tile(X.mean(axis=0), (xx.size, 1))
    grid[:, 0] = xx.ravel()
    grid[:, 1] = yy.ravel()
    Z = model.predict(grid).reshape(xx.shape)

    # Plot the decision regions with the samples on top
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.contourf(xx, yy, Z, alpha=0.4)
    points = ax.scatter(X[:, 0], X[:, 1], c=y, alpha=0.8)

    # Legend entries come back in sorted label order; attach class names only
    # when every class is present so names and colours cannot be mismatched.
    handles, _ = points.legend_elements()
    if len(handles) == len(target_names):
        ax.legend(handles, list(target_names), title='Class')

    ax.set_xlabel(feature_names[0])
    ax.set_ylabel(feature_names[1])
    ax.set_title(title)
    plt.show()

plot_decision_boundary(svm, X_train_scaled, y_train, feature_names, target_names)

In [None]:
# Compare models using cross-validation
from sklearn.pipeline import make_pipeline

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42)
}

# Mean 5-fold accuracy per model.
# Scaling happens inside a pipeline on the *unscaled* training data, so each
# fold fits its own StandardScaler on that fold's training part only.
# Cross-validating on X_train_scaled would let every validation fold
# influence the scaler's statistics (data leakage).
results = {}
for name, model in models.items():
    pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    results[name] = scores.mean()

# Plot the results
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=list(results.keys()), y=list(results.values()), ax=ax)
ax.set_title('Model Comparison (5-fold Cross Validation)')
ax.set_ylabel('Mean Accuracy')
ax.tick_params(axis='x', rotation=45)
plt.show()

## Conclusion

In this notebook, we explored various supervised learning models using the Iris dataset. Here are the key findings:

1. **Data Exploration**: The Iris dataset is well-balanced and shows clear separation between classes, especially in petal measurements.

2. **Model Performance**:
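   - All models performed well on this dataset, with accuracies above 90%
   - Random Forest and SVM showed the best performance
   - Logistic Regression and Decision Tree also performed well but slightly lower
   - Caveat: the test set holds only 30 samples (20% of 150), so a single misclassification shifts test accuracy by about 3.3 percentage points; treat small gaps between models as noise and rely on the 5-fold cross-validation comparison when ranking them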

3. **Feature Importance**:
   - Petal measurements were more important for classification than sepal measurements
   - This aligns with our visual observations from the pairplot

4. **Model Selection**:
   - For this specific dataset, any of the models would work well
   - The choice might depend on other factors like interpretability (Decision Tree) vs. performance (SVM/Random Forest)

This notebook serves as a good starting point for understanding supervised learning models and their application to classification problems. 