In [None]:
# Machine Learning Workflow Template

**Purpose**: End-to-end machine learning pipeline template

**Author**: [Your Name]

**Date**: [Date]

**Project**: [Project Name]

**Model Type**: [Classification/Regression/Clustering]

---

## Template Instructions

This template provides a structured approach to machine learning workflows:

1. **Environment Setup & Data Loading**
2. **Exploratory Data Analysis**
3. **Data Preprocessing**
4. **Feature Engineering**
5. **Model Selection & Training**
6. **Model Evaluation**
7. **Hyperparameter Tuning**
8. **Model Deployment Preparation**
9. **Documentation & Reporting**

**Remember to**:
- Set up MLflow experiment tracking
- Version control your data and models
- Document all assumptions and decisions
- Follow reproducible ML practices
- Validate results thoroughly


In [None]:
# Environment Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
from datetime import datetime
import joblib
import pickle
import os
from pathlib import Path

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# MLflow for experiment tracking
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

# ---------------------------------------------------------------------------
# Notebook-wide configuration
# ---------------------------------------------------------------------------

# Reproducibility: a single seed shared by NumPy's global RNG and any
# estimator that is later given `random_state=RANDOM_SEED`.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Display and plotting defaults.
# NOTE: the blanket 'ignore' filter silences *every* Python warning for the
# rest of the session, including deprecation and convergence warnings.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Experiment tracking: all subsequent runs are recorded under this experiment.
EXPERIMENT_NAME = "ml-workflow-template"
mlflow.set_experiment(EXPERIMENT_NAME)

# Output folders (relative to the current working directory); creating them
# is a no-op when they already exist.
for artifact_dir in ("models", "data", "reports"):
    Path(artifact_dir).mkdir(exist_ok=True)

# Short summary so the rendered notebook records how it was configured.
print(f"🚀 ML Environment setup complete")
print(f"🔬 MLflow experiment: {EXPERIMENT_NAME}")
print(f"🎲 Random seed: {RANDOM_SEED}")
print(f"📅 Analysis date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


In [None]:
## 1. Data Loading & Initial Setup

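Load your dataset and perform initial data exploration. The example cell below loads the Iris sample dataset and, as a placeholder, also splits the data, trains a baseline model, and logs it inside a single MLflow run — replace the loading code with your own data source.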


In [None]:
# Model Training with MLflow Tracking
#
# The whole cell runs inside one MLflow run so that the parameters, metrics,
# and artifacts logged below are recorded together under the active experiment.
with mlflow.start_run(run_name=f"ml-template-{datetime.now().strftime('%Y%m%d_%H%M%S')}"):

    # TODO: Replace with your data loading code
    # df = pd.read_csv('your_dataset.csv')

    # Example: Load sample data
    from sklearn.datasets import load_iris
    iris = load_iris()
    X = pd.DataFrame(iris.data, columns=iris.feature_names)
    y = pd.Series(iris.target)

    # Log dataset info
    mlflow.log_param("dataset_shape", X.shape)
    mlflow.log_param("n_features", X.shape[1])
    mlflow.log_param("n_samples", X.shape[0])
    mlflow.log_param("target_classes", len(np.unique(y)))

    # Train-test split (stratified so both splits keep the class proportions)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
    )

    # Feature scaling: statistics are fitted on the training split only and
    # then re-applied to the test split, so no test information leaks in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Model training
    model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
    model.fit(X_train_scaled, y_train)

    # Bundle the already-fitted scaler and classifier into one estimator.
    # The classifier was trained on standardized features, so logging it on
    # its own would produce an artifact that silently mispredicts when fed
    # raw features; the pipeline applies the scaling itself.
    inference_pipeline = Pipeline([("scaler", scaler), ("model", model)])

    # Predictions
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)

    # Evaluate model (each score is computed once and reused for logging and
    # for the printed summary)
    accuracy = accuracy_score(y_test, y_pred)
    train_score = model.score(X_train_scaled, y_train)
    test_score = model.score(X_test_scaled, y_test)

    # Log metrics
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("train_score", train_score)
    mlflow.log_metric("test_score", test_score)

    # Log model: the logged artifact is the scaler + classifier pipeline, so
    # it can be loaded and called directly on unscaled feature frames.
    mlflow.sklearn.log_model(inference_pipeline, "model")

    # Save preprocessing pipeline (kept as a standalone artifact as well, for
    # consumers that only need the fitted scaler)
    joblib.dump(scaler, "models/scaler.pkl")
    mlflow.log_artifact("models/scaler.pkl")

    print(f"✅ Model trained successfully")
    print(f"📊 Test Accuracy: {accuracy:.4f}")
    print(f"📈 Training Score: {train_score:.4f}")
    print(f"📉 Test Score: {test_score:.4f}")

    # Feature importance (impurity-based importances exposed by the forest),
    # sorted from most to least important
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"\n🔍 Feature Importance:")
    print(feature_importance)
