### Importing necessary libraries

In [None]:
import os
import pandas as pd
import numpy as np
import json
import logging
from google.colab import files
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet,
    LogisticRegression, SGDClassifier
)
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    explained_variance_score, accuracy_score, f1_score, roc_auc_score, confusion_matrix
)
import pickle

### Machine Learning Pipeline Building

**MLPipeline** automates the creation and execution of a machine learning pipeline from settings specified in a JSON configuration file. It reads the configuration, tries several candidate locations for the dataset, loads it, and prints its columns so that its structure can be checked. The pipeline preprocesses features through imputation, scaling, and encoding, and it can generate new features from the linear, polynomial, and explicit pairwise interactions listed in the configuration. It supports feature reduction with methods such as tree-based selection and builds configurable models for regression or classification tasks. Model training and hyperparameter tuning rely on **Pipeline** and **GridSearchCV** to search for the best-performing settings. Because every stage is driven by the configuration, the pipeline is modular and can be adapted to different datasets and predictive tasks.

In [None]:
class MLPipeline:
    def __init__(self, config_path, dataset_path=None):
        # Reading the configuration file
        with open(config_path, 'r') as file:
            self.full_config = json.load(file)

        # Extracting nested configurations
        self.config = self.full_config['design_state_data']

        # Dataset loading
        if dataset_path:
            self.data_path = dataset_path
        else:
            self.data_path = self.config['session_info'].get('dataset')

        # Validating and loading the dataset
        self.data = self._load_dataset()

        # Setup logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Printing dataset columns (Debugging)
        print("Dataset columns:", self.data.columns)


    def _load_dataset(self):
        # File paths to try
        potential_paths = [
            self.data_path,  # Direct giving the path
            os.path.join('/content', self.data_path),  # Giving the path from colab content directory
            os.path.join(os.getcwd(), self.data_path),  # From current working directory
            os.path.expanduser(f"~/{self.data_path}"),  # From home directory
        ]

        # Trying each potential path
        for path in potential_paths:
            try:
                # Checking if file exists
                if not os.path.exists(path):
                    continue

                # Attempting to read the file based on extension
                if path.endswith('.csv'):
                    return pd.read_csv(path)
                elif path.endswith('.xlsx') or path.endswith('.xls'):
                    return pd.read_excel(path)
                elif path.endswith('.json'):
                    return pd.read_json(path)
                elif path.endswith('.parquet'):
                    return pd.read_parquet(path)
            except Exception as e:
                print(f"Error loading dataset from {path}: {e}")
                continue

        # Raising an error incase no dataset is found
        raise FileNotFoundError(f"Could not find dataset. Tried paths: {potential_paths}")

    # Handling feature preprocessing based on JSON configurations
    def _preprocess_features(self):
        """Build the preprocessing transformer described by ``feature_handling``.

        Selected ``numerical`` features are imputed and standard-scaled;
        selected ``text`` features are imputed with the most frequent value
        and one-hot encoded. The target column is never treated as an input
        feature, whatever its type.

        Numerical features whose configuration asks for imputation with
        something other than ``'Average of values'`` are filled with their
        configured ``impute_value``; every other numerical feature is
        filled with the column mean.

        Returns:
            tuple: ``(preprocessor, numeric_features, categorical_features)``
            where ``preprocessor`` is an unfitted ``ColumnTransformer`` and
            the two lists hold the selected feature names.
        """
        feature_configs = self.config['feature_handling']
        target_column = self.config['target']['target']

        numeric_features = []
        categorical_features = []
        constant_fill = {}  # feature name -> custom imputation value

        for feature, details in feature_configs.items():
            # The target is what we predict; it is dropped from X by the
            # caller, so listing it here would make the transformer fail.
            if not details['is_selected'] or feature == target_column:
                continue

            variable_type = details['feature_variable_type']
            if variable_type == 'numerical':
                numeric_features.append(feature)

                feature_details = details.get('feature_details', {})
                wants_custom_value = (
                    feature_details.get('missing_values') == 'Impute'
                    and feature_details.get('impute_with') != 'Average of values'
                )
                if wants_custom_value:
                    try:
                        constant_fill[feature] = float(feature_details['impute_value'])
                    except (KeyError, TypeError, ValueError):
                        # No usable custom value: keep the mean imputation
                        # that every numerical feature gets by default.
                        logging.getLogger(__name__).warning(
                            "No numeric impute_value for %r; using the mean.",
                            feature,
                        )
            elif variable_type == 'text':
                categorical_features.append(feature)

        def numeric_pipeline(imputer):
            # Every numerical branch shares the same scaling step.
            return Pipeline(steps=[
                ('imputer', imputer),
                ('scaler', StandardScaler())
            ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        mean_features = [f for f in numeric_features if f not in constant_fill]
        transformers = [
            ('num', numeric_pipeline(SimpleImputer(strategy='mean')), mean_features)
        ]
        # One branch per custom-filled feature. Names use an index because
        # transformer names must not contain a double underscore, which a
        # feature name might.
        for index, (feature, fill_value) in enumerate(constant_fill.items()):
            transformers.append((
                f'num_const_{index}',
                numeric_pipeline(SimpleImputer(strategy='constant', fill_value=fill_value)),
                [feature]
            ))
        transformers.append(('cat', categorical_transformer, categorical_features))

        preprocessor = ColumnTransformer(transformers=transformers)

        return preprocessor, numeric_features, categorical_features

    # Generate additional features based on configuration
    def _feature_generation(self, X):
        feature_gen_config = self.config.get('feature_generation', {})

        print("Feature generation config:", feature_gen_config)

        # Linear interactions
        for interaction in feature_gen_config.get('linear_interactions', []):
            if interaction[0] in X.columns and interaction[1] in X.columns:
                new_col_name = '_'.join(interaction + ['interaction'])
                X[new_col_name] = X[interaction[0]] * X[interaction[1]]
                print(f"Created new column: {new_col_name}")
                print(X[new_col_name].head())
            else:
                print(f"Values: {X[interaction[0]] if interaction[0] in X.columns else 'Feature not found'} and {X[interaction[1]] if interaction[1] in X.columns else 'Feature not found'}")

        # Polynomial interactions
        for poly_interaction in feature_gen_config.get('polynomial_interactions', []):
            feat1, feat2 = poly_interaction.split('/')
            if feat1 in X.columns and feat2 in X.columns:
                if X[feat1].dtype in [np.float64, np.int64] and X[feat2].dtype in [np.float64, np.int64]:
                    new_col_name = '_'.join([feat1, feat2, 'poly_interaction'])
                    X[new_col_name] = X[feat1] / X[feat2]
                    print(f"Created new column: {new_col_name}")
                    print(X[new_col_name].head())
                else:
                    print(f"Values: {X[feat1]} and {X[feat2]}")

            else:
                print(f"Values: {X[feat1] if feat1 in X.columns else 'Feature not found'} and {X[feat2] if feat2 in X.columns else 'Feature not found'}")


        # Explicit pairwise interactions
        for pair_interaction in feature_gen_config.get('explicit_pairwise_interactions', []):
            feat1, feat2 = pair_interaction.split('/')
            if feat1 in X.columns and feat2 in X.columns:
                new_col_name = '_'.join([feat1, feat2, 'pair_interaction'])
                X[new_col_name] = X[feat1] * X[feat2]
                print(f"Created new column: {new_col_name}")
                print(X[new_col_name].head())
            else:
                print(f"Values: {X[feat1] if feat1 in X.columns else 'Feature not found'} and {X[feat2] if feat2 in X.columns else 'Feature not found'}")


        return X

    # Applying feature reduction based on configuration
    def _feature_reduction(self, X, y):
        reduction_config = self.config.get('feature_reduction', {})
        method = reduction_config.get('feature_reduction_method', 'No Reduction')

        if method == 'Tree-based':
            num_features = int(reduction_config.get('num_of_features_to_keep', 'auto'))
            return SelectFromModel(
                RandomForestRegressor(
                    n_estimators=int(reduction_config.get('num_of_trees', 5)),
                    max_depth=int(reduction_config.get('depth_of_trees', 6))
                ),
                max_features=num_features
            )

        return 'passthrough'

    # Selecting and configuring models based on configurations
    def _select_models(self):
        models = []
        algorithm_config = self.config.get('algorithms', {})
        prediction_type = self.config['target']['prediction_type'].lower()

        for algo_name, algo_details in algorithm_config.items():
            if prediction_type == 'regression' and 'regress' in algo_name.lower():
                model = self._configure_regressor(algo_name, algo_details)
            elif prediction_type == 'classification' and 'classif' in algo_name.lower():
                model = self._configure_classifier(algo_name, algo_details)
            else:
                continue
            if model:
                models.append(model)

        return models

    # Specifically configuring regressors
    def _configure_regressor(self, algo_name, algo_details):
        if 'randomforestregressor' in algo_name.lower():
            return (
                'random_forest',
                RandomForestRegressor(),
                {
                    'n_estimators': list(range(
                        algo_details.get('min_trees', 10),
                        algo_details.get('max_trees', 30)
                    )),
                    'max_depth': list(range(
                        algo_details.get('min_depth', 20),
                        algo_details.get('max_depth', 30)
                    ))
                }
            )

        if 'linearregression' in algo_name.lower():
            return (
                'linear_regression',
                LinearRegression(),
                {}
            )

        if 'ridge' in algo_name.lower():
            return (
                'ridge_regression',
                Ridge(),
                {
                    'alpha': [0.1, 1.0, 10.0]
                }
            )

        if 'lasso' in algo_name.lower():
            return (
                'lasso_regression',
                Lasso(),
                {
                    'alpha': [0.1, 1.0, 10.0]
                }
            )

        if 'elasticnet' in algo_name.lower():
            return (
                'elastic_net_regression',
                ElasticNet(),
                {
                    'alpha': [0.1, 1.0, 10.0],
                    'l1_ratio': [0.1, 0.5, 0.9]
                }
            )

        if 'decisiontreeregressor' in algo_name.lower():
            return (
                'decision_tree',
                DecisionTreeRegressor(),
                {
                    'max_depth': list(range(
                        algo_details.get('min_depth', 2),
                        algo_details.get('max_depth', 10)
                    ))
                }
            )

        if 'svr' in algo_name.lower():
            return (
                'svr',
                SVR(),
                {
                    'C': [0.1, 1.0, 10.0],
                    'epsilon': [0.01, 0.1, 1.0]
                }
            )

        return None

    # Specifically configuring classifiers
    def _configure_classifier(self, algo_name, algo_details):
        if 'randomforestclassifier' in algo_name.lower():
            return (
                'random_forest',
                RandomForestClassifier(),
                {
                    'n_estimators': list(range(
                        algo_details.get('min_trees', 10),
                        algo_details.get('max_trees', 30)
                    )),
                    'max_depth': list(range(
                        algo_details.get('min_depth', 20),
                        algo_details.get('max_depth', 30)
                    ))
                }
            )

        if 'logisticregression' in algo_name.lower():
            return (
                'logistic_regression',
                LogisticRegression(),
                {
                    'C': [0.1, 1.0, 10.0]
                }
            )

        if 'sgdclassifier' in algo_name.lower():
            return (
                'sgd_classifier',
                SGDClassifier(),
                {
                    'alpha': [0.0001, 0.001, 0.01]
                }
            )

        if 'decisiontreeclassifier' in algo_name.lower():
            return (
                'decision_tree',
                DecisionTreeClassifier(),
                {
                    'max_depth': list(range(
                        algo_details.get('min_depth', 2),
                        algo_details.get('max_depth', 10)
                    ))
                }
            )

        if 'svc' in algo_name.lower():
            return (
                'svc',
                SVC(),
                {
                    'C': [0.1, 1.0, 10.0],
                    'kernel': ['linear', 'rbf', 'poly']
                }
            )

        return None

    # Executing complete machine learning pipeline
    def run_pipeline(self):
        """Train, tune and evaluate every configured model.

        The data is split 80/20, interaction features are generated, and
        each selected model is tuned with 5-fold ``GridSearchCV`` inside a
        preprocessing / feature-reduction / model pipeline. Metrics are
        computed on the held-out 20%.

        Side effects: writes ``results.csv`` and ``best_model.pkl`` to the
        current working directory and prints / logs the metrics.

        Returns:
            dict: Model name -> dict of best parameters and test metrics.

        Raises:
            ValueError: If the configuration selects no supported model.
        """
        target_column = self.config['target']['target']
        is_regression = self.config['target']['prediction_type'].lower() == 'regression'
        scoring = 'neg_mean_squared_error' if is_regression else 'accuracy'

        # Model selection comes first so a configuration without any usable
        # algorithm fails with a clear message before any work is done.
        models = self._select_models()
        if not models:
            raise ValueError(
                "No supported algorithm is configured for prediction type "
                f"{self.config['target']['prediction_type']!r}."
            )

        # Preparing data
        X = self.data.drop(columns=[target_column])
        y = self.data[target_column]

        # Splitting the data into train-test
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.2,
            random_state=self.config.get('train', {}).get('random_seed', 42)
        )

        # Preprocessing
        preprocessor, numeric_features, categorical_features = self._preprocess_features()

        # Feature generation. Copies are passed so the split frames are
        # never modified in place.
        # NOTE(review): the preprocessor only lists the configured features,
        # so generated columns reach the model only if the preprocessor is
        # extended to include them.
        X_train = self._feature_generation(X_train.copy())
        X_test = self._feature_generation(X_test.copy())

        # Feature reduction
        feature_reducer = self._feature_reduction(X_train, y_train)

        def compute_roc_auc(search):
            # ROC AUC needs scores, not hard labels. Returns NaN when the
            # fitted estimator exposes no usable scores.
            try:
                if hasattr(search, 'predict_proba'):
                    scores = search.predict_proba(X_test)
                    if scores.shape[1] == 2:
                        return roc_auc_score(y_test, scores[:, 1])
                    return roc_auc_score(
                        y_test, scores, multi_class='ovr', labels=search.classes_
                    )
                if hasattr(search, 'decision_function'):
                    scores = search.decision_function(X_test)
                    if scores.ndim == 1:  # binary problems only
                        return roc_auc_score(y_test, scores)
            except ValueError as error:
                self.logger.warning("ROC AUC could not be computed: %s", error)
            return float('nan')

        results = {}
        best_model = None
        best_cv_score = None
        for model_name, model, params in models:
            # Pipeline Creation
            pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('feature_reducer', feature_reducer),
                ('model', model)
            ])

            # Using grid search
            grid_search = GridSearchCV(
                pipeline,
                param_grid={f'model__{k}': v for k, v in params.items()},
                cv=5,
                scoring=scoring
            )

            # Fit and predict
            grid_search.fit(X_train, y_train)
            y_pred = grid_search.predict(X_test)

            # Metrics for regression and classification
            if is_regression:
                results[model_name] = {
                    'best_params': grid_search.best_params_,
                    'mse': mean_squared_error(y_test, y_pred),
                    'mae': mean_absolute_error(y_test, y_pred),
                    'r2': r2_score(y_test, y_pred),
                    'explained_variance': explained_variance_score(y_test, y_pred)
                }
            else:
                results[model_name] = {
                    'best_params': grid_search.best_params_,
                    'accuracy': accuracy_score(y_test, y_pred),
                    'f1_score': f1_score(y_test, y_pred, average='weighted'),
                    'roc_auc': compute_roc_auc(grid_search),
                    'confusion_matrix': confusion_matrix(y_test, y_pred)
                }

            # Keep the estimator with the best cross-validated score. Both
            # scorers are "higher is better", and using the CV score keeps
            # the test set out of model selection.
            if best_cv_score is None or grid_search.best_score_ > best_cv_score:
                best_cv_score = grid_search.best_score_
                best_model = grid_search.best_estimator_

            # Printing the results
            print(f"Model: {model_name}")
            print(f"Best params: {results[model_name]['best_params']}")
            for metric, value in results[model_name].items():
                print(f"{metric}: {value}")

            self.logger.info("Model: %s", model_name)
            for metric, value in results[model_name].items():
                self.logger.info("%s: %s", metric, value)

        # Saving the results to CSV
        results_df = pd.DataFrame(results).T
        results_df.to_csv('results.csv')
        print("Results saved to results.csv")

        # Saving the best model
        with open('best_model.pkl', 'wb') as f:
            pickle.dump(best_model, f)
        print("Best model saved to best_model.pkl")

        # Displaying the pipeline for the best model
        print("Pipeline for the best model:")
        print(best_model)

        return results


### Providing the dataset and JSON file paths and running the pipeline

The JSON configuration file was prepared to follow the structure that the pipeline expects.

In [None]:
# Build the pipeline from the JSON configuration and the dataset, then run it
if __name__ == '__main__':
    pipeline = MLPipeline('/content/data.json', dataset_path='/content/iris.csv')
    pipeline.run_pipeline()

Dataset columns: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')
Feature generation config: {'linear_interactions': [['petal_length', 'sepal_width']], 'linear_scalar_type': 'robust', 'polynomial_interactions': ['petal_length/sepal_width', 'petal_width/species'], 'explicit_pairwise_interactions': ['sepal_width/sepal_length', 'petal_width/sepal_length']}
Created new column: petal_length_sepal_width_interaction
137    17.05
84     13.50
27      5.25
127    14.70
132    15.68
Name: petal_length_sepal_width_interaction, dtype: float64
Created new column: petal_length_sepal_width_poly_interaction
137    1.774194
84     1.500000
27     0.428571
127    1.633333
132    2.000000
Name: petal_length_sepal_width_poly_interaction, dtype: float64
Values: Feature not found and 137     Iris-virginica
84     Iris-versicolor
27         Iris-setosa
127     Iris-virginica
132     Iris-virginica
            ...       
9          Iris-setosa
103  

  _data = np.array(data, dtype=dtype, copy=copy,


Model: random_forest
Best params: {'model__max_depth': 22, 'model__n_estimators': 12}
best_params: {'model__max_depth': 22, 'model__n_estimators': 12}
mse: 0.12438732351515476
mae: 0.2729475999711435
r2: 0.7409137189853056
explained_variance: 0.7636678163152325
Model: linear_regression
Best params: {}
best_params: {}
mse: 0.0621836868158592
mae: 0.18394422315469672
r2: 0.8704776362927323
explained_variance: 0.8739186032721652
Model: ridge_regression
Best params: {'model__alpha': 0.1}
best_params: {'model__alpha': 0.1}
mse: 0.06218398325551821
mae: 0.1842834069038237
r2: 0.8704770188387456
explained_variance: 0.8739697635099681
Model: lasso_regression
Best params: {'model__alpha': 0.1}
best_params: {'model__alpha': 0.1}
mse: 0.0630903605453565
mae: 0.18344992683432743
r2: 0.8685891261292303
explained_variance: 0.8752439935185841
Model: elastic_net_regression
Best params: {'model__alpha': 0.1, 'model__l1_ratio': 0.1}
best_params: {'model__alpha': 0.1, 'model__l1_ratio': 0.1}
mse: 0.06144

### Displaying the pipeline

In [None]:
from sklearn import set_config
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Rebuild, piece by piece, the pipeline reported as output by the code above

# Numerical branch: default (mean) imputation followed by standard scaling
numeric_branch = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding
categorical_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

column_preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_branch, ['sepal_length', 'sepal_width', 'petal_length']),
    ('cat', categorical_branch, ['species']),
])

# Tree-based selection keeping at most four of the transformed features
tree_selector = SelectFromModel(
    estimator=RandomForestRegressor(max_depth=6, n_estimators=5),
    max_features=4,
)

steps = [
    ('preprocessor', column_preprocessor),
    ('feature_reducer', tree_selector),
    ('model', DecisionTreeRegressor(max_depth=4)),
]
pipe = Pipeline(steps)

# Render estimators as an interactive diagram, then show the pipeline
set_config(display="diagram")
pipe

The pipeline first preprocesses the data by imputing missing values and scaling the numerical features (sepal_length, sepal_width, petal_length) and by one-hot encoding the categorical feature (species). (The feature petal_width does not appear among the inputs because it is the prediction target here, and the target is removed from the features before preprocessing.) The feature set is then reduced by selecting the most important features (up to 4) with a RandomForestRegressor. Finally, a DecisionTreeRegressor with max_depth set to 4 is trained, so that preprocessing, feature selection, and predictive modeling all run as one streamlined pipeline.