In [23]:
import sys
# Extend the import search path so that `plotflow` can be imported below.
# NOTE(review): this absolute path is specific to one machine; presumably it
# points at a local plot-flow checkout — adjust it (or install the package)
# before running the notebook elsewhere.
sys.path.append(r"C:\Users\Rudra\plot-flow")
from plotflow import Pipeline, plot_flow, clear_registry

In [24]:
import os
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from plotflow import Pipeline, plot_flow, clear_registry

# Reset the plotflow registry before the decorated steps below are defined.
# NOTE(review): presumably this keeps re-runs of this cell from registering the
# same steps twice — confirm against plotflow's clear_registry.
clear_registry()

# --- Step 1: Load data ---
@Pipeline(rank=1, name="Load Data")
def load_data():
    """Load the iris dataset.

    Returns:
        A 3-tuple ``(X, y, target_names)``: the measurements as a DataFrame
        whose columns are the dataset's feature names, the targets as a
        Series, and the dataset's ``target_names`` attribute.
    """
    dataset = load_iris()
    features = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    labels = pd.Series(dataset.target)
    class_names = dataset.target_names
    return features, labels, class_names

# --- Step 2: Feature Engineering & Cleaning ---
@Pipeline(rank=2, connect=[1], name="Feature Engineering")
def feature_engineer(data):
    """Report missing values per column and standardize the features.

    Args:
        data: The ``(X, y, target_names)`` tuple produced by ``load_data``.

    Returns:
        A 4-tuple ``(X_scaled, y, target_names, scaler)`` where ``X_scaled``
        is the output of the fitted ``StandardScaler`` and ``scaler`` is that
        fitted instance, kept so later steps can transform new samples.
    """
    features, labels, class_names = data

    # Per-column count of null entries, printed for a quick sanity check.
    null_counts = features.isnull().sum()
    print("Missing values check:\n", null_counts)

    # NOTE(review): the scaler is fitted on the full dataset here, i.e. before
    # the train/test split performed by the next step, so test rows influence
    # the scaling statistics. Fixing that means reordering the pipeline steps.
    standardizer = StandardScaler()
    scaled_features = standardizer.fit(features).transform(features)

    return scaled_features, labels, class_names, standardizer

# --- Step 3: Train-test split ---
@Pipeline(rank=3, connect=[2], name="Train-Test Split")
def split_data(data):
    """Split the scaled features and targets into train and test parts.

    Args:
        data: The ``(X_scaled, y, target_names, scaler)`` tuple produced by
            ``feature_engineer``.

    Returns:
        A 6-tuple ``(X_train, X_test, y_train, y_test, target_names, scaler)``.
        20% of the rows go to the test part; the split is seeded with 42.
    """
    features, labels, class_names, fitted_scaler = data

    # train_test_split yields [X_train, X_test, y_train, y_test] in that order.
    partitions = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
    )

    # The class names and the fitted scaler are passed through untouched.
    return (*partitions, class_names, fitted_scaler)

# --- Step 4: Train Models ---
@Pipeline(rank=4, connect=[3], name="Train Models")
def train_models(data):
    """Fit a soft-voting ensemble of four classifiers and print its accuracy.

    Args:
        data: The ``(X_train, X_test, y_train, y_test, target_names, scaler)``
            tuple produced by ``split_data``.

    Returns:
        A 3-tuple ``(ensemble, scaler, target_names)``: the fitted
        ``VotingClassifier`` plus the scaler and class names passed through
        unchanged for the prediction step.
    """
    X_train, X_test, y_train, y_test, target_names, scaler = data

    # Base learners are deliberately left unfitted: VotingClassifier.fit
    # trains its own clones of them, so fitting them here first would only
    # train every model twice and discard the first result.
    lr = LogisticRegression()
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
    # probability=True gives SVC the predict_proba that soft voting needs.
    # Its probability estimates rely on a random number generator, so the
    # seed is pinned like the other models to keep runs reproducible.
    svm = SVC(probability=True, random_state=42)

    # Ensemble: soft voting over the four base learners.
    ensemble = VotingClassifier(
        estimators=[('lr', lr), ('rf', rf), ('gb', gb), ('svm', svm)],
        voting='soft'
    )
    ensemble.fit(X_train, y_train)

    # Evaluate on the held-out test part.
    y_pred = ensemble.predict(X_test)
    print("Ensemble Accuracy:", accuracy_score(y_test, y_pred))

    return ensemble, scaler, target_names

# --- Step 5: Make final prediction ---
@Pipeline(rank=5, connect=[4], name="Final Prediction")
def final_prediction(data):
    """Predict the class of one hard-coded sample with the trained ensemble.

    Args:
        data: The ``(ensemble, scaler, target_names)`` tuple produced by
            ``train_models``.

    Returns:
        The array returned by ``ensemble.predict`` for the single sample.
    """
    ensemble, scaler, target_names = data

    # A single sample; its values must follow the column order of the
    # features the scaler was fitted on.
    new_data = np.array([[5.1, 3.5, 1.4, 0.2]])

    # The scaler in this pipeline is fitted on a DataFrame, so it records the
    # column names. Passing a bare array to transform() then makes
    # scikit-learn warn about missing feature names; labelling the sample with
    # the recorded names avoids that. Scalers fitted without names (or older
    # scikit-learn versions) lack the attribute and keep the plain array.
    fitted_names = getattr(scaler, "feature_names_in_", None)
    if fitted_names is not None:
        new_data = pd.DataFrame(new_data, columns=fitted_names)

    new_data_scaled = scaler.transform(new_data)
    pred = ensemble.predict(new_data_scaled)
    # target_names is indexed with the prediction array; [0] picks the single
    # sample's class name.
    print("Final Prediction (class):", target_names[pred][0])
    return pred

# --- Plot the pipeline ---
# Absolute path of the image file, located in the current working directory.
output_file = os.path.abspath("iris_pipeline.png")

# Render the pipeline to the file without opening a window.
plot_flow(
    save_as=output_file,
    show=False,
    orientation="vertical",
    figsize=(12, 8),
)
print(f"Pipeline plot saved at: {output_file}")


Pipeline flow saved to 'c:\Users\Rudra\plot-flow\test\notebooks\iris_pipeline.png'
Pipeline plot saved at: c:\Users\Rudra\plot-flow\test\notebooks\iris_pipeline.png
