In [21]:
#Designing ML Pipelines with Scikit-learn: Create multistep pipelines for data preprocessing, training, and evaluation.
#single pipeline with different classifier
# Import necessary libraries
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [22]:
 # Load the dataset
data = load_breast_cancer()
X, y = data.data, data.target

In [23]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
# Define the classifiers to compare
classifiers = {
    'Support Vector Classifier': SVC(),
    'Random Forest': RandomForestClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

In [25]:
# Define the parameter grids for each classifier
param_grids = {
    'Support Vector Classifier': {
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    },
    'Random Forest': {
        'classifier__n_estimators': [10, 50, 100],
        'classifier__max_depth': [None, 10, 20]
    },
    'K-Nearest Neighbors': {
        'classifier__n_neighbors': [3, 5, 7],
        'classifier__weights': ['uniform', 'distance']
    }
}

In [26]:
# Iterate over classifiers
for name, classifier in classifiers.items():
    # Create a pipeline with a standard scaler and the classifier
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', classifier)
    ])

In [27]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grids[name], cv=5)
grid_search.fit(X_train, y_train)

In [28]:
# Predict on the test set
y_pred = grid_search.predict(X_test)

In [29]:
# Evaluate the model
print(f"Classifier: {name}")
print("Best parameters found:", grid_search.best_params_)
print("Classification report:\n", classification_report(y_test, y_pred, target_names=data.target_names))
print("="*60)

Classifier: K-Nearest Neighbors
Best parameters found: {'classifier__n_neighbors': 5, 'classifier__weights': 'uniform'}
Classification report:
               precision    recall  f1-score   support

   malignant       0.93      0.93      0.93        43
      benign       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [30]:
#Combining Multiple Pipelines - combine multiple pipelines or apply different preprocessing steps to various subsets of features, Scikit-learn offers tools like FeatureUnion and ColumnTransformer to facilitate these more complex workflows
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV    #Manually tuning the hyperparameters can be time-consuming and may not yield the best results. This is where GridSearchCV from Scikit-learn becomes invaluable.
#It evaluates all possible combinations of hyperparameters using cross-validation and identifies the set that results in the best model performance.
from sklearn.preprocessing import StandardScaler, MinMaxScaler   #StandardScaler is applied to features 0 to 9, MinMaxScaler is applied to features 10 to 19 and The remaining features are left unchanged
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer  #create a pipeline that includes the ColumnTransformer for preprocessing and the classifier itself
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [31]:
# Load the dataset
data = load_breast_cancer()
X, y = data.data, data.target

In [32]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
# Define preprocessing pipelines for different feature subsets
preprocessor = ColumnTransformer(
    transformers=[
        ('num_standard', StandardScaler(), slice(0, 10)),  # Standardize features 0 to 9
        ('num_minmax', MinMaxScaler(), slice(10, 20))      # Min-Max scale features 10 to 19
    ],
    remainder='passthrough'  # Keep the rest of the features unchanged
)

In [34]:
# Define the classifiers to compare
classifiers = {
    'Support Vector Classifier': SVC(),
    'Random Forest': RandomForestClassifier()
}

In [35]:
# Define the parameter grids for each classifier  #parameter grid refers to a structured set of hyperparameters and their corresponding values that you define for the purpose of tuning a model
param_grids = {
    'Support Vector Classifier': {
        'classifier__C': [0.1, 1, 10],   #hyperparameter controls the regularization strength, Regularization helps prevent overfitting, The code will test the values 0.1, 1, and 10
        'classifier__kernel': ['linear', 'rbf'] #hyperparameter specifies the type of kernel used by the SVC
    },
    'Random Forest': {
        'classifier__n_estimators': [10, 50, 100], #hyperparameter determines the number of decision trees in the random forest, The code will test 10, 50, and 100 trees
        'classifier__max_depth': [None, 10, 20]   #depth of a tree, here none means no limit
    }
}

In [36]:
# Iterate over classifiers
for name, classifier in classifiers.items():
    # Create a pipeline with the preprocessor and the classifier
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

In [37]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grids[name], cv=5)
grid_search.fit(X_train, y_train)

In [38]:
# Predict on the test set
y_pred = grid_search.predict(X_test)

In [39]:
# Evaluate the model
print(f"Classifier: {name}")
print("Best parameters found:", grid_search.best_params_)
print("Classification report:\n", classification_report(y_test, y_pred, target_names=data.target_names))
print("="*60)

Classifier: Random Forest
Best parameters found: {'classifier__max_depth': None, 'classifier__n_estimators': 50}
Classification report:
               precision    recall  f1-score   support

   malignant       0.98      0.93      0.95        43
      benign       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [40]:
#Example 2: Using Scikit-learn for Machine Learning Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import pandas as pd

# Sample dataset
df = pd.DataFrame({
    'age': [25, 30, 35, None, 45],
    'salary': [50000, 60000, 70000, 80000, None],
    'gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
    'purchased': [0, 1, 0, 1, 0]
})

# Splitting data
X = df.drop(columns=['purchased'])
y = df['purchased']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Defining transformers
numeric_features = ['age', 'salary']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_features = ['gender']
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combining transformers
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Training
pipeline.fit(X_train, y_train)

# Evaluation
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.00


In [41]:
#Python Script to Automate the Pipeline
import os
import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [42]:
# Setup logging
logging.basicConfig(filename='pipeline_execution.log', level=logging.INFO, format='%(asctime)s - %(message)s')

In [43]:
# Step 1: Load the dataset
def load_data(file_path):
    """Load data from a CSV file"""
    try:
        data = pd.read_csv(file_path)
        logging.info(f"Data loaded successfully from {file_path}")
        return data
    except Exception as e:
        logging.error(f"Error loading data from {file_path}: {e}")
        raise

In [44]:
# Step 2: Data Preprocessing Pipeline
def preprocess_data(df):
    """Preprocess the data (missing value handling, scaling, encoding)"""
    numeric_features = ['age', 'salary']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_features = ['gender']
    categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [45]:
# Separate features and target
X = df.drop(columns=['purchased'])
y = df['purchased']