# Machine Learning Starter Notebook

This notebook demonstrates a complete machine learning workflow using the Iris dataset. It covers data loading, preprocessing, feature engineering, model training, evaluation, and predictions.

## Workflow Overview:
1. **Import Required Libraries** - Essential data science libraries
2. **Load and Inspect Data** - Dataset exploration and understanding
3. **Data Preprocessing** - Cleaning and preparing the data
4. **Feature Engineering** - Creating and transforming features
5. **Model Training** - Training machine learning models
6. **Model Evaluation** - Assessing model performance
7. **Make Predictions** - Using the model on new data

## 1. Import Required Libraries

Import all the essential libraries for data manipulation, visualization, and machine learning.

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple, Dict, Any

# Scikit-learn imports
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_curve, auc, precision_recall_curve
)

# Import our custom utilities
import sys
import os

# Make ../src importable. The membership check keeps this cell idempotent:
# re-running it does not keep appending the same entry to sys.path.
src_dir = os.path.join(os.getcwd(), '..', 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# The project helpers are optional; the flag records whether they loaded so the
# closing status message below stays truthful.
try:
    from data_processing import clean_data, encode_categorical_features, scale_features
    from model_utils import train_model, evaluate_classification_model, save_model
    from visualization import plot_correlation_matrix, plot_confusion_matrix, plot_feature_importance
    custom_utils_available = True
    print("✅ Custom utilities imported successfully!")
except ImportError as e:
    custom_utils_available = False
    print(f"⚠️ Could not import custom utilities: {e}")
    print("Don't worry, we'll use built-in functions instead.")

# Set display options (None = do not truncate columns / auto-detect width)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

if custom_utils_available:
    print("📚 All libraries imported successfully!")
else:
    print("📚 Core libraries imported successfully (custom utilities unavailable).")

## 2. Load and Inspect Data

Load the Iris dataset and perform initial exploration to understand the data structure and characteristics.

In [None]:
# Fetch the bundled Iris dataset (a Bunch with .data, .target, .feature_names, .target_names)
iris = load_iris()

# Build one frame holding the measurements, the integer label and the
# human-readable species name looked up from that label.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    target=iris.target,
    species=[iris.target_names[label] for label in iris.target],
)

print("🌸 Iris Dataset Loaded Successfully!")
print(f"Dataset shape: {df.shape}")
print(f"Features: {list(iris.feature_names)}")
print(f"Target classes: {list(iris.target_names)}")

# Preview: the bare last expression is rendered as a rich table by the notebook
print("\n📊 First 5 rows of the dataset:")
df.head()

In [None]:
# Exploratory Data Analysis: structure, summary statistics, class balance,
# missing values and dtypes of the loaded frame.
print("🔍 Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

print("\n📈 Statistical Summary:")
print(df.describe())

print("\n🎯 Target Distribution:")
print(df['species'].value_counts())

print("\n❓ Missing Values:")
print(df.isnull().sum())

print("\n🔗 Data Types:")
print(df.dtypes)

In [None]:
# Data Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# One panel per feature (row-major over the 2x2 grid), with the per-species
# histograms overlaid so class separation is visible at a glance.
features = iris.feature_names
for ax, feature in zip(axes.flat, features):
    # sort=False keeps the species in order of first appearance in the frame
    for species, species_data in df.groupby('species', sort=False):
        ax.hist(species_data[feature], alpha=0.7, label=species, bins=15)
    ax.set(title=f'Distribution of {feature}', xlabel=feature, ylabel='Frequency')
    ax.legend()

fig.tight_layout()
plt.show()

# Pairwise correlation between the four raw measurements
correlation_matrix = df[iris.feature_names].corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.2f',
    cbar_kws={'shrink': 0.8},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Feature Correlation Matrix')
heatmap_fig.tight_layout()
plt.show()

## 3. Data Preprocessing

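Check the data quality and prepare the inputs for machine learning. This section checks for missing values and duplicate rows, then separates the features from the target. Iris has no categorical features to encode, and feature scaling is applied later (Section 5), after the train/test split.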

In [None]:
# Check for missing values and duplicates
missing_count = int(df.isnull().sum().sum())
duplicate_count = int(df.duplicated().sum())

print("🧹 Data Cleaning Check:")
print(f"Missing values: {missing_count}")
print(f"Duplicate rows: {duplicate_count}")

# Report the outcome of the checks instead of asserting cleanliness
# unconditionally, so the message can never contradict the counts above.
if missing_count == 0 and duplicate_count == 0:
    print("✅ Data is already clean!")
else:
    if missing_count:
        print("⚠️ Missing values found - impute or drop them before training.")
    if duplicate_count:
        print("ℹ️ Duplicate rows found - kept as-is (repeated measurements can be valid observations).")

# Separate features and target (copies, so later edits never touch df)
X = df[iris.feature_names].copy()
y = df['target'].copy()

print(f"\n📊 Features shape: {X.shape}")
print(f"🎯 Target shape: {y.shape}")

# Display feature statistics
print("\n📈 Feature Statistics:")
X.describe()

## 4. Feature Engineering

Create new features or transform existing ones to potentially improve model performance.

In [None]:
def add_engineered_features(features: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``features`` with five derived columns appended.

    Expects the four raw Iris measurement columns ('sepal length (cm)',
    'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'); a missing
    column raises ``KeyError``. The input frame is not modified.

    Added columns, in order:
      * petal_area               - petal length x petal width
      * sepal_area               - sepal length x sepal width
      * petal_sepal_length_ratio - petal length / sepal length
      * petal_sepal_width_ratio  - petal width / sepal width
      * total_area               - petal_area + sepal_area

    Note: the ratios use plain pandas division, so a zero sepal measurement
    would produce inf/NaN rather than raise.
    """
    engineered = features.copy()
    engineered['petal_area'] = features['petal length (cm)'] * features['petal width (cm)']
    engineered['sepal_area'] = features['sepal length (cm)'] * features['sepal width (cm)']
    engineered['petal_sepal_length_ratio'] = features['petal length (cm)'] / features['sepal length (cm)']
    engineered['petal_sepal_width_ratio'] = features['petal width (cm)'] / features['sepal width (cm)']
    engineered['total_area'] = engineered['petal_area'] + engineered['sepal_area']
    return engineered


# Create new features
X_engineered = add_engineered_features(X)

# Derive the list of added columns from the frames themselves so it cannot
# drift out of sync with the function above.
new_features = [col for col in X_engineered.columns if col not in X.columns]

print("🔧 Feature Engineering Complete!")
print(f"Original features: {X.shape[1]}")
print(f"Engineered features: {X_engineered.shape[1]}")
print(f"New features added: {len(new_features)}")

print("\n🆕 New Features:")
for feature in new_features:
    print(f"  • {feature}")

# Display first few rows of engineered features
print("\n📊 Sample of Engineered Features:")
X_engineered[new_features].head()

## 5. Model Training

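Split the data into training and test sets, standardize the features using statistics computed from the training set only, and train multiple machine learning models to compare their performance.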

In [None]:
# Hold out 20% of the samples for testing; stratify=y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_engineered,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("🔄 Data Split Complete!")
print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Features: {X_train.shape[1]}")

# Standardize: the scaler learns mean/std from the training split only and
# the same transformation is then applied to the test split. The results are
# wrapped back into DataFrames so column names and row labels are preserved.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

print("\n⚖️ Feature Scaling Complete!")
print("All features are now standardized (mean=0, std=1)")

# Sanity check: average of the per-column means / standard deviations
print(f"\nScaled training data mean: {X_train_scaled.mean().mean():.2e}")
print(f"Scaled training data std: {X_train_scaled.std().mean():.2f}")

In [None]:
# Candidate classifiers, keyed by display name. Seeds are fixed for
# reproducibility; SVC needs probability=True to expose predict_proba later.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True)
}

# Fitted estimators and their accuracy on the training split, by model name
trained_models = {}
training_scores = {}

print("🚀 Training Models...")
print("=" * 50)

for model_name, estimator in models.items():
    print(f"\n🔄 Training {model_name}...")

    # fit() returns the estimator itself, so the stored object is the fitted model
    trained_models[model_name] = estimator.fit(X_train_scaled, y_train)

    # Accuracy on the data the model was fitted on (an optimistic baseline)
    training_scores[model_name] = accuracy_score(
        y_train, estimator.predict(X_train_scaled)
    )

    print(f"✅ {model_name} trained successfully!")
    print(f"   Training Accuracy: {training_scores[model_name]:.4f}")

print("\n🎯 All models trained successfully!")
print("\n📊 Training Scores Summary:")
for model_name, train_score in training_scores.items():
    print(f"  • {model_name}: {train_score:.4f}")

## 6. Model Evaluation

Evaluate the trained models using various metrics and visualizations to determine the best performing model.

In [None]:
# Score every fitted model on the held-out test split.
# predictions: model name -> predicted labels; test_scores: model name -> accuracy
predictions = {
    name: model.predict(X_test_scaled) for name, model in trained_models.items()
}
test_scores = {
    name: accuracy_score(y_test, y_hat) for name, y_hat in predictions.items()
}

print("📊 Model Evaluation Results")
print("=" * 60)

for name, test_accuracy in test_scores.items():
    train_accuracy = training_scores[name]

    print(f"\n🎯 {name} Results:")
    print(f"   Test Accuracy: {test_accuracy:.4f}")
    print(f"   Training Accuracy: {train_accuracy:.4f}")
    print(f"   Difference: {abs(train_accuracy - test_accuracy):.4f}")

    # Per-class precision / recall / F1
    print(f"\n📋 Classification Report for {name}:")
    print(classification_report(y_test, predictions[name], target_names=iris.target_names))

# Side-by-side table, best test accuracy first. 'Overfitting' is the signed
# gap between training and test accuracy.
print("\n🏆 Model Comparison:")
comparison_df = (
    pd.DataFrame({
        'Model': list(test_scores),
        'Test Accuracy': list(test_scores.values()),
        'Training Accuracy': [training_scores[name] for name in test_scores],
    })
    .assign(Overfitting=lambda frame: frame['Training Accuracy'] - frame['Test Accuracy'])
    .sort_values('Test Accuracy', ascending=False)
)
print(comparison_df)

# The top row of the sorted table is the winner
best_model_name = comparison_df['Model'].iloc[0]
best_model = trained_models[best_model_name]
print(f"\n🥇 Best Model: {best_model_name} (Test Accuracy: {test_scores[best_model_name]:.4f})")

In [None]:
# Visualize model performance on a 2x2 dashboard:
#   [0,0] train vs. test accuracy   [0,1] confusion matrix of the best model
#   [1,0] Random Forest importances [1,1] 5-fold cross-validation spread
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Model accuracy comparison
# Grouped bars: the two series are shifted half a bar width to either side of
# each tick so they sit next to each other instead of being drawn on top of
# one another.
ax1 = axes[0, 0]
x_pos = np.arange(len(comparison_df))
bar_width = 0.4
ax1.bar(x_pos - bar_width / 2, comparison_df['Test Accuracy'], width=bar_width,
        alpha=0.7, color='skyblue', label='Test')
ax1.bar(x_pos + bar_width / 2, comparison_df['Training Accuracy'], width=bar_width,
        alpha=0.7, color='lightcoral', label='Training')
ax1.set_xlabel('Models')
ax1.set_ylabel('Accuracy')
ax1.set_title('Model Accuracy Comparison')
ax1.set_xticks(x_pos)
ax1.set_xticklabels(comparison_df['Model'], rotation=45, ha='right')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Confusion matrix for best model
ax2 = axes[0, 1]
cm = confusion_matrix(y_test, predictions[best_model_name])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2,
            xticklabels=iris.target_names, yticklabels=iris.target_names)
ax2.set_title(f'Confusion Matrix - {best_model_name}')
ax2.set_xlabel('Predicted')
ax2.set_ylabel('Actual')

# 3. Feature importance (for Random Forest)
ax3 = axes[1, 0]
if 'Random Forest' in trained_models:
    rf_model = trained_models['Random Forest']
    # Label the importances with the columns of the frame the model was fit on
    feature_importance = pd.DataFrame({
        'feature': X_train_scaled.columns,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=True)

    importance_pos = np.arange(len(feature_importance))
    ax3.barh(importance_pos, feature_importance['importance'])
    ax3.set_yticks(importance_pos)
    ax3.set_yticklabels(feature_importance['feature'])
    ax3.set_xlabel('Importance')
    ax3.set_title('Feature Importance (Random Forest)')
    ax3.grid(True, alpha=0.3)
else:
    # Nothing to show in this panel: hide it rather than leave empty axes
    ax3.set_visible(False)

# 4. Cross-validation scores
# cross_val_score clones each estimator, so the fitted models are untouched.
# NOTE: X_train_scaled was standardized with statistics from the whole
# training split, so these fold scores are slightly optimistic; wrapping the
# scaler and model in a Pipeline would give a strictly leak-free estimate.
ax4 = axes[1, 1]
cv_scores = {
    name: cross_val_score(model, X_train_scaled, y_train, cv=5)
    for name, model in trained_models.items()
}
cv_positions = np.arange(len(cv_scores))
ax4.boxplot(list(cv_scores.values()), positions=cv_positions,
            widths=0.6, patch_artist=True)
ax4.set_xticks(cv_positions)
ax4.set_xticklabels(list(cv_scores.keys()), rotation=45, ha='right')
ax4.set_ylabel('CV Accuracy')
ax4.set_title('Cross-Validation Scores')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print cross-validation results (mean and +/- two standard deviations)
print("\n🔄 Cross-Validation Results:")
for name, scores in cv_scores.items():
    print(f"  • {name}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

## 7. Make Predictions

Use the best performing model to make predictions on new data and save the model for future use.

In [None]:
# Create some sample new data for prediction
# These are made-up measurements that could represent new iris flowers
new_data = pd.DataFrame({
    'sepal length (cm)': [5.1, 6.2, 4.9],
    'sepal width (cm)': [3.5, 2.8, 3.1],
    'petal length (cm)': [1.4, 4.8, 1.5],
    'petal width (cm)': [0.2, 1.8, 0.1]
})

print("🌸 New Data for Prediction:")
print(new_data)

# Apply the same feature engineering that was applied to the training data
new_data_engineered = new_data.copy()
new_data_engineered['petal_area'] = new_data['petal length (cm)'] * new_data['petal width (cm)']
new_data_engineered['sepal_area'] = new_data['sepal length (cm)'] * new_data['sepal width (cm)']
new_data_engineered['petal_sepal_length_ratio'] = new_data['petal length (cm)'] / new_data['sepal length (cm)']
new_data_engineered['petal_sepal_width_ratio'] = new_data['petal width (cm)'] / new_data['sepal width (cm)']
new_data_engineered['total_area'] = new_data_engineered['petal_area'] + new_data_engineered['sepal_area']

# The scaler and models were fitted on X_train's columns; select them in that
# exact order so a reordered or incomplete frame fails loudly here (KeyError)
# instead of being scaled with the wrong statistics.
new_data_engineered = new_data_engineered[list(X_train.columns)]

# Scale the new data using the same (already fitted) scaler
new_data_scaled = pd.DataFrame(
    scaler.transform(new_data_engineered),
    columns=new_data_engineered.columns,
    index=new_data_engineered.index,
)

# Make predictions with the best model
predictions_new = best_model.predict(new_data_scaled)
prediction_probabilities = best_model.predict_proba(new_data_scaled)

# predict_proba columns follow best_model.classes_. Report the probability of
# the class that predict() actually returned rather than the row maximum: for
# some estimators (e.g. SVC with probability=True) the two can disagree.
class_column = {label: col for col, label in enumerate(best_model.classes_)}
confidences = [
    probs[class_column[pred]]
    for pred, probs in zip(predictions_new, prediction_probabilities)
]

# Display results
print(f"\n🔮 Predictions using {best_model_name}:")
print("=" * 50)

for i, (pred, probs, confidence) in enumerate(
    zip(predictions_new, prediction_probabilities, confidences)
):
    print(f"\nSample {i+1}:")
    print(f"  Predicted Species: {iris.target_names[pred]}")
    print(f"  Confidence: {confidence:.4f}")
    print("  Probabilities:")
    for label, prob in zip(best_model.classes_, probs):
        print(f"    • {iris.target_names[label]}: {prob:.4f}")

# Create a summary DataFrame
prediction_summary = pd.DataFrame({
    'Sample': [f'Sample {i+1}' for i in range(len(predictions_new))],
    'Predicted_Species': [iris.target_names[pred] for pred in predictions_new],
    'Confidence': confidences
})

print("\n📊 Prediction Summary:")
print(prediction_summary)

In [None]:
# Save the best model and scaler for future use
import joblib
import os

# Create models directory if it doesn't exist
models_dir = os.path.join('..', 'models')
os.makedirs(models_dir, exist_ok=True)

# File name embeds the model name, lower-cased with spaces turned into underscores
model_slug = best_model_name.lower().replace(" ", "_")
model_path = os.path.join(models_dir, f'best_iris_model_{model_slug}.pkl')
scaler_path = os.path.join(models_dir, 'iris_scaler.pkl')
metadata_path = os.path.join(models_dir, 'iris_model_metadata.pkl')

# Everything needed to reproduce a prediction later: which model won, how it
# scored, the expected feature columns (in order) and the class names.
metadata = {
    'model_name': best_model_name,
    'test_accuracy': test_scores[best_model_name],
    'features': list(X_engineered.columns),
    'target_names': list(iris.target_names),
    'model_params': best_model.get_params(),
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}

# NOTE: joblib files are pickles - only load them back from trusted sources.
joblib.dump(best_model, model_path)
joblib.dump(scaler, scaler_path)
joblib.dump(metadata, metadata_path)

print("💾 Model Saved Successfully!")
print(f"  • Model: {model_path}")
print(f"  • Scaler: {scaler_path}")
print(f"  • Metadata: {metadata_path}")

# model_params is skipped here only to keep the printout short
print("\n📝 Model Metadata:")
for key, value in metadata.items():
    if key != 'model_params':
        print(f"  • {key}: {value}")

# Summary - the sample and feature counts are read from the data rather than
# hard-coded, so the report stays correct if the dataset changes.
print("\n" + "="*60)
print("🎉 MACHINE LEARNING WORKFLOW COMPLETE!")
print("="*60)
print(f"✅ Dataset: Iris ({X_engineered.shape[0]} samples, {X_engineered.shape[1]} features)")
print(f"✅ Best Model: {best_model_name}")
print(f"✅ Test Accuracy: {test_scores[best_model_name]:.4f}")
print(f"✅ Models Trained: {len(trained_models)}")
print(f"✅ Features Engineered: {X_engineered.shape[1] - X.shape[1]}")
print(f"✅ Model Saved: {model_path}")
print("\n🚀 Ready for production use!")