In [1]:
import os
os.environ["EXPERIMENT_DETAILS"] = '{"name": "testSAEEEEE", "algo_details": {"sklearn.ensemble.GradientBoostingClassifier": null}, "id": "383", "dataset": "Employee 1.csv", "target_column": "LeaveOrNot"}'


In [None]:
import os
import json
import cloudpickle
import numpy as np
import pandas as pd
from tpot import TPOTClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import (accuracy_score, average_precision_score, f1_score,
                             precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import train_test_split
from mlflow.models import infer_signature
import mlflow.sklearn
from fosforio import get_local_dataframe, get_dataframe


def experiment(exp_details, tpot_config, generations, population_size, cv, random_state, verbosity):
    """
    exp_details: {
        name: exp_name
        id: exp_id
        description: exp_description
        dataset: exp_dataset
        target_column: exp_target_column
        algo_details: None
    }
    tpot_config: dict
    generations: 0.5
    population_size: 0.5
    cv: 5
    random_state: 42
    verbosity: 2
    """
    # Tracking URI set
    mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URL", "http://mlflow-server"))

    # Setting experiment name
    mlflow.set_experiment(exp_details.get("name", "sample_experiment"))

    # Adding description to the experiment
    tags = {'mlflow.note.content': exp_details.get("description", "sample_description")}

    # Reading input data
    try:
        if exp_details.get("dataset").split(".")[-1].lower() in ["csv", "tsv", "xlsx", "xls"]:
            # Read the input data file from /data mount attached to this notebook pod
            input_file = "/data/" + exp_details.get("dataset")
            data = get_local_dataframe(input_file)
        else:
            # Input data is a dataset
            data = get_dataframe(exp_details.get("dataset"))
            # Adding this to log input data in mlflow
            input_file = '/tmp/input_data.csv'
            data.to_csv(input_file, index=False)
    except Exception as ex:
        print(f"Unable to read input dataset.\nError: {ex}")

    # Data Preprocessing: Validating and encoding the data if required and imputing null values.
    data = data.fillna(method='pad')  # Filling null values with the previous ones
    data = data.fillna(method='bfill')  # Filling null value with the next ones
    df_target, le_target, df_feature, le_dict_feature, oh_enc_feature, le_column_feature, oh_column_feature = encoding(
        data, exp_details.get("target_column"))

    # Split the data into training and test sets. (0.75, 0.25) split.
    train_x, test_x, train_y, test_y = train_test_split(df_feature, df_target, train_size=0.75, test_size=0.25)

    # Registering datasets with mlflow experiment run
    dataset = mlflow.data.from_pandas(data, source=input_file)

    for algo, hyperparam in exp_details.get("algo_details").items():
        if not hyperparam and type(hyperparam) != dict:
            hyperparam = TPOTClassifier.default_config_dict[algo]
        print(f"Starting experiment run for {algo} with hyperparam: {hyperparam}")
        # Adding individual algorithm and its hyperparam to tpot config along with preprocessors and selectors
        tpot_config_dict = {**tpot_config, **{algo: hyperparam}}
        with mlflow.start_run(tags=tags):
            pipeline_optimizer = TPOTClassifier(
                generations=generations,
                population_size=population_size,
                cv=cv,
                random_state=random_state,
                verbosity=verbosity,
                config_dict=tpot_config_dict
            )

            pipeline_optimizer.fit(train_x, train_y)

            predicted_qualities = pipeline_optimizer.predict(test_x)

            matrices = eval_metrics(test_y, predicted_qualities)

            print(f"{str(pipeline_optimizer.fitted_pipeline_)} algorithms best selected by TPOT")
            print(f"  score: {pipeline_optimizer.score(test_x, test_y)}")

            try:
                # logging hyper params for the best run/pipeline chosen
                for step, _ in pipeline_optimizer.fitted_pipeline_.named_steps.items():
                    # Checking if the step used in TPOT pipeline is present in TPOT default config dict.
                    # if yes, then log only the hyperparams which are present in TPOT default config and not all.
                    step_name = [s for s in TPOTClassifier.default_config_dict.keys() if step.lower() in s.lower()]
                    for k, v in pipeline_optimizer.fitted_pipeline_.named_steps[step].get_params().items():
                        if step_name:
                            if k in TPOTClassifier.default_config_dict.get(step_name[0]):
                                mlflow.log_param(str(step) + "_" + str(k), v)
                        else:
                            mlflow.log_param(str(step) + "_" + str(k), v)
            except Exception as ex:
                print(f"Exception occurred in logging params to mlflow.\nex: {ex}")

            # logging model metric
            for i in matrices:
                if matrices[i]:
                    mlflow.log_metric(i, matrices[i])
                    print(i, matrices[i])
            mlflow.log_metric("score", pipeline_optimizer.score(test_x, test_y))

            # Log input data to MLflow run artifact.
            mlflow.log_artifact(input_file)

            # Registering datasets with mlflow experiment run
            mlflow.log_input(dataset, context="input")

            # Set custom tags
            mlflow.set_tags({
                "template_id": os.getenv("template_id", "sample_template_id"),
                "notebook_name": os.getenv("notebook_name", "sample_notebook_name"),
                "algorithm": algo,
                "algo_details": exp_details.get("algo_details"),
                "tpot_selected_algo": str(pipeline_optimizer.fitted_pipeline_)
            })

            predictions = pipeline_optimizer.fitted_pipeline_.predict(train_x)
            signature = infer_signature(train_x, predictions)

            # Storing score function for the model
            score_and_dump_func("/tmp/scoring_func")
            mlflow.log_artifact("/tmp/scoring_func")

            # Register the model
            mlflow.sklearn.log_model(
                pipeline_optimizer.fitted_pipeline_, "model",
                registered_model_name=exp_details.get("name", "sample_experiment"), signature=signature,
                pip_requirements=['mlflow==2.10.0', 'sqlalchemy==1.3.5']
            )

            # Exporting the autogenerated code of tpot for best pipeline
            pipeline_optimizer.export(f'tpot_exported_{exp_details.get("name")}_{algo}.py')
            mlflow.log_artifact(f'tpot_exported_{exp_details.get("name")}_{algo}.py')


def try_or(fn):
    try:
        out = fn()
        return out
    except:
        return None


def eval_metrics(y_actual, y_pred):
    """
    :param
    actual
    pred
    :returns
    rmse, mae, r2
    """
    return {
        "accuracy_score": try_or(lambda: accuracy_score(y_actual, y_pred)),
        "average_precision_score": try_or(lambda: average_precision_score(y_actual, y_pred)),
        "f1_score": try_or(lambda: f1_score(y_actual, y_pred, average="weighted", labels=np.unique(y_pred))),
        "precision_score": try_or(
            lambda: precision_score(y_actual, y_pred, average="weighted", labels=np.unique(y_pred))),
        "recall_score": try_or(lambda: recall_score(y_actual, y_pred, average="weighted")),
        "roc_auc_score": try_or(lambda: roc_auc_score(y_actual, y_pred))
    }


def score_and_dump_func(file_path):
    """
    :param
    file_path
    """

    def score_func(model, request):
        """
        :param
        model
        request
        :returns
        score_output
        """
        # Enter your custom score function here

        score_output = "Success"
        return score_output

    with open(file_path, "wb") as out:
        cloudpickle.dump(score_func, out)


def encoding(df, target_column):
    """
    Checking whether encoding required in target and feature datasets.
    If required, then encoding them with label and one hot encoding.
    :param:
    df: input dataframe
    target_column: target column
    :returns:
    df_target: target dataframe
    le_target: target label encoder object
    df_feature: feature dataframe
    le_dict_feature: dict of feature label encoder objects
    oh_enc_feature: feature one hot encoder object
    le_column_feature: list of feature label encoder columns
    oh_column_feature: list of feature one hot encoder columns
    """
    
    print('in encoding starts')
    df_target = df[[target_column]]
    le_target = None
    # Target column validation and encoding
    if df.dtypes[target_column].name in ['object', 'bool']:
        print(f"target_column is of {df.dtypes[target_column].name} datatype, encoding required.")
        le_target = LabelEncoder()
        df_target[target_column] = pd.DataFrame(le_target.fit_transform(df_target[target_column].astype(str)))
        print(f"Target column label encoded {df_target[target_column]}, object: {le_target}")

    # Feature column validation and encoding
    df_feature = df.drop(target_column, axis=1)
    non_numeric_cols = df_feature.select_dtypes(include=['object', 'bool']).columns.tolist()
    le_dict_feature = {}
    le_column_feature = []
    oh_column_feature = []
    oh_enc_feature = None
    if len(non_numeric_cols) >= 1:
        print(f"{non_numeric_cols} columns are non numeric in feature dataset, encoding required.")
        for col in non_numeric_cols:
            if df_feature[col].nunique() >= 10:
                le_column_feature.append(col)
            else:
                oh_column_feature.append(col)

        print(f"Columns identified to be encoded with label encoder: {le_column_feature}\n"
              f"Columns identified to be encoded with one hot encoder: {oh_column_feature}")

        # columns to be label encoded
        if len(le_column_feature) == 0:
            df_feature = df_feature
        else:
            for col in le_column_feature:
                le_dict_feature[col] = LabelEncoder()
                df_feature[col] = le_dict_feature[col].fit_transform(df_feature[col].astype(str))
                print(f"{col} column label encoded {df_feature[col]}, object: {le_dict_feature[col]}")

        # columns to be one hot encoded
        if len(oh_column_feature) == 0:
            df_feature = df_feature
        else:
            unique_combinations = pd.get_dummies(df_feature[oh_column_feature])
            unique_combinations_list = unique_combinations.columns.tolist()
            oh_enc_feature = OneHotEncoder()
            oh_encoded_array = oh_enc_feature.fit_transform(df_feature[oh_column_feature]).toarray() if len(
                oh_column_feature) > 1 else oh_enc_feature.fit_transform(df_feature[oh_column_feature]).toarray()
            df_oh_enc = pd.DataFrame(oh_encoded_array, columns=unique_combinations_list)
            df_feature = df_feature.drop(columns=oh_column_feature)
            df_feature = df_feature.join(df_oh_enc)
            print(f"new one hot encoded df: {oh_encoded_array}\n"
                  f"one hot encoder object: {oh_enc_feature}\n")
        print(f"final feature df created: {df_feature}")
        print(' encoding ends')

        print(df_feature)
        df_feature.to_csv('/data/finalOutput.csv')
        print('tar')
        print(df_target)
        
    return df_target, le_target, df_feature, le_dict_feature, oh_enc_feature, le_column_feature, oh_column_feature


# Adding Preprocessors and Selectors with tpot default hyperparameter tuning
preprocessors_selectors = [
    # Preprocessors
    "sklearn.preprocessing.Binarizer",
    "sklearn.decomposition.FastICA",
    "sklearn.cluster.FeatureAgglomeration",
    "sklearn.preprocessing.MaxAbsScaler",
    "sklearn.preprocessing.MinMaxScaler",
    "sklearn.preprocessing.Normalizer",
    "sklearn.kernel_approximation.Nystroem",
    "sklearn.decomposition.PCA",
    "sklearn.preprocessing.PolynomialFeatures",
    "sklearn.kernel_approximation.RBFSampler",
    "sklearn.preprocessing.RobustScaler",
    "sklearn.preprocessing.StandardScaler",
    "tpot.builtins.ZeroCount",
    "tpot.builtins.OneHotEncoder",
    # Selectors
    "sklearn.feature_selection.SelectFwe",
    "sklearn.feature_selection.SelectPercentile",
    "sklearn.feature_selection.VarianceThreshold",
    "sklearn.feature_selection.RFE",
    "sklearn.feature_selection.SelectFromModel"
]
tpot_config = {key: TPOTClassifier.default_config_dict[key] for key in preprocessors_selectors}

# Running Experiment with user configured params.
print(f"Starting Experiment Execution with the following params:\n{os.getenv('EXPERIMENT_DETAILS')}\n")
experiment(exp_details=json.loads(os.getenv("EXPERIMENT_DETAILS")), tpot_config=tpot_config, generations=5,
           population_size=20, cv=5, random_state=42, verbosity=2)


Starting Experiment Execution with the following params:
{"name": "testSAEEEEE", "algo_details": {"sklearn.ensemble.GradientBoostingClassifier": null}, "id": "383", "dataset": "Employee 1.csv", "target_column": "LeaveOrNot"}

in encoding starts
['Education', 'City', 'Gender', 'EverBenched'] columns are non numeric in feature dataset, encoding required.
Columns identified to be encoded with label encoder: []
Columns identified to be encoded with one hot encoder: ['Education', 'City', 'Gender', 'EverBenched']
new one hot encoded df: [[1. 0. 0. ... 1. 1. 0.]
 [1. 0. 0. ... 0. 1. 0.]
 [1. 0. 0. ... 0. 1. 0.]
 ...
 [0. 1. 0. ... 1. 1. 0.]
 [1. 0. 0. ... 1. 0. 1.]
 [1. 0. 0. ... 1. 0. 1.]]
one hot encoder object: OneHotEncoder()

final feature df created:       JoiningYear  PaymentTier  Age  ExperienceInCurrentDomain  \
0            2017            3   34                          0   
1            2013            1   28                          3   
2            2014            3   38       

  return _dataset_source_registry.resolve(


Starting experiment run for sklearn.ensemble.GradientBoostingClassifier with hyperparam: {'n_estimators': [100], 'learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0], 'max_depth': range(1, 11), 'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21), 'subsample': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]), 'max_features': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ])}


Version 0.12.1 of tpot is outdated. Version 0.12.2 was released Friday February 23, 2024.
  y = column_or_1d(y, warn=True)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8503895121540126

Generation 2 - Current best internal CV score: 0.8526834201428143
