<a href="https://colab.research.google.com/github/Rohanghotane/AutoML-Pipeline-Builder/blob/main/Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
                                                # MOUNTING GOOGLE DRIVE

In [None]:
# Attach the user's Google Drive to the Colab filesystem so the dataset and
# the JSON configuration stored there can be read like local files.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
!ls '/content/drive/My drive/iris.csv'

ls: cannot access '/content/drive/My drive/iris.csv': No such file or directory


In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
                                        #LOADING IRIS DATASET AND JSON CONFIGURATION

In [None]:
# Read the Iris CSV from the mounted Drive into a DataFrame and display it.
iris_csv_path = '/content/drive/My Drive/iris.csv'
df = pd.read_csv(iris_csv_path)
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [None]:
# Parse the UI-exported JSON configuration that drives the rest of the
# notebook, then list its top-level sections.
path = '/content/drive/MyDrive/algoparams_from_ui.json'
with open(path, 'r') as config_file:
    config = json.loads(config_file.read())

print(config.keys())

dict_keys(['session_name', 'session_description', 'design_state_data'])


In [None]:
                                                              #LOGGING SETUP

In [None]:
import logging

# basicConfig() does nothing when the root logger already has handlers, which
# is the case in hosted notebook runtimes; force=True replaces those handlers
# so the INFO-level messages emitted later in the notebook are actually shown.
logging.basicConfig(level=logging.INFO, force=True)

In [None]:
                                                            #FEATURE HANDLING

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Pull the target column name and the task type out of the configuration.
target_settings = config['design_state_data']['target']
target = target_settings['target']
prediction = target_settings['prediction_type']

In [None]:
# Per-column handling instructions from the configuration.
features_config = config['design_state_data']['feature_handling']

# Input columns for the model: every column flagged as selected, except the
# target itself.
chosen_attributes = []
for attr_name, attr_spec in features_config.items():
    if attr_spec['is_selected'] and attr_name != target:
        chosen_attributes.append(attr_name)


In [None]:
# Empty containers that the feature-classification loop fills in:
# column names grouped by type, plus the imputation plan for each numerical
# column as a (strategy, fill_value) tuple.
numerical_features, categorical_features = list(), list()
num_impute_strategies = dict()

In [None]:
# Sort every selected, non-target column into the numerical or categorical
# list and record how each numerical column should be imputed.
for feat, details in features_config.items():
    if not details['is_selected'] or feat == target:
        continue

    variable_type = details['feature_variable_type']
    if variable_type == 'numerical':
        # Membership guards keep this cell idempotent: running it a second
        # time must not register the same column twice.
        if feat not in numerical_features:
            numerical_features.append(feat)

        feature_details = details['feature_details']
        if feature_details['impute_with'] == 'custom':
            # A custom strategy fills missing values with the configured constant.
            num_impute_strategies[feat] = ('constant', feature_details['impute_value'])
        else:
            # Every other configured strategy falls back to mean imputation.
            num_impute_strategies[feat] = ('mean', None)
    elif variable_type == 'text':
        if feat not in categorical_features:
            categorical_features.append(feat)

In [None]:
                                                              #TRANSFORMERS

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# One (name, pipeline, columns) entry per numerical column: impute with that
# column's configured strategy, then standardise.
num_transformers = [
    (
        feat,
        Pipeline([
            ('imputer', SimpleImputer(strategy=num_impute_strategies[feat][0],
                                      fill_value=num_impute_strategies[feat][1])),
            ('scaler', StandardScaler()),
        ]),
        [feat],
    )
    for feat in numerical_features
]

# Categorical columns share a single pipeline: fill gaps with the most
# frequent value, then one-hot encode (categories unseen during fit are
# ignored at transform time).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = ('cat', Pipeline(categorical_steps), categorical_features)

In [None]:
                                                            #FEATURE GENERATION

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
# Build the pairwise product features requested under
# feature_generation -> linear_interactions (each entry is a pair of columns).
feature_generation = config['design_state_data'].get('feature_generation', {})
for f1, f2 in feature_generation.get('linear_interactions', []):
    interaction = f'{f1}_times_{f2}'
    df[interaction] = df[f1] * df[f2]

    # Guarded appends keep the cell idempotent when it is re-run.
    if interaction not in chosen_attributes:
        chosen_attributes.append(interaction)
    # The preprocessing ColumnTransformer only forwards columns listed as
    # numerical or categorical; without this registration the generated
    # column would be selected into X and then silently dropped.
    if interaction not in numerical_features:
        numerical_features.append(interaction)

In [None]:
                                                              #FEATURE REDUCTION

In [None]:
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier, RandomForestRegressor, RandomForestClassifier
from sklearn.decomposition import PCA

In [None]:
# Choose the optional feature-reduction stage of the pipeline. Any method
# other than 'PCA' or 'Tree-based' leaves the features untouched.
reduction_config = config['design_state_data']['feature_reduction']
reduction_type = reduction_config['feature_reduction_method']
reduction_step = ('reduction', 'passthrough')

if reduction_type in ('PCA', 'Tree-based'):
    n_features = int(reduction_config['num_of_features_to_keep'])

    if reduction_type == 'PCA':
        reduction_step = ('reduction', PCA(n_components=n_features))
    else:
        # Deliberately NOT bound to the global name `model`: that name is
        # reserved for the final estimator, and reusing it here would let this
        # small helper forest leak into the pipeline's 'model' step.
        if prediction == 'Regression':
            selector_estimator = ExtraTreesRegressor(n_estimators=5)
        else:
            selector_estimator = ExtraTreesClassifier(n_estimators=5)
        reduction_step = ('reduction', SelectFromModel(
            selector_estimator, max_features=n_features, prefit=False))

In [None]:
                                                                #MODEL SELECTION AND PARAMETER GRIDS

In [None]:
import logging

from sklearn.linear_model import LinearRegression, LogisticRegression

models = config['design_state_data']['algorithms']


def _forest_param_grid(model_info):
    """Return the GridSearchCV grid for a random forest.

    Each hyper-parameter is searched at the configured minimum and maximum;
    identical bounds collapse to a single value so the same fit is not
    repeated.
    """
    bounds = {
        'model__n_estimators': (model_info['min_trees'], model_info['max_trees']),
        'model__max_depth': (model_info['min_depth'], model_info['max_depth']),
        'model__min_samples_leaf': (model_info['min_samples_per_leaf_min_value'],
                                    model_info['min_samples_per_leaf_max_value']),
    }
    return {name: list(dict.fromkeys(values)) for name, values in bounds.items()}


# prediction type -> algorithm key -> (estimator factory, uses the forest grid?)
# Matching on the exact key (rather than on a 'Regressor'/'Classifier'
# substring) is what makes LinearRegression and LogisticRegression reachable.
supported_models = {
    'Regression': {
        'RandomForestRegressor': (RandomForestRegressor, True),
        'LinearRegression': (LinearRegression, False),
    },
    'Classification': {
        'RandomForestClassifier': (RandomForestClassifier, True),
        'LogisticRegression': (lambda: LogisticRegression(max_iter=1000), False),
    },
}

# Every selected algorithm that this notebook knows how to build.
model_candidates = {}
for candidate_key, model_info in models.items():
    if not model_info['is_selected']:
        continue

    builders = supported_models.get(prediction, {})
    if candidate_key not in builders:
        logging.warning("Skipping %s: not supported for prediction type %s",
                        candidate_key, prediction)
        continue

    make_estimator, uses_forest_grid = builders[candidate_key]
    model_candidates[candidate_key] = (
        make_estimator(),
        _forest_param_grid(model_info) if uses_forest_grid else {},
    )

if not model_candidates:
    raise ValueError(
        f"No selected algorithm is supported for prediction type {prediction!r}")

# The cells below train a single estimator. As before, the last supported
# selection wins, but `model_key` now names that estimator instead of
# whatever key happened to be last in the config.
model_key, (model, param_grid) = list(model_candidates.items())[-1]

In [None]:
                                                          #BUILDING PIPELINE

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
def _numeric_pipeline(feature_name):
    """Return the impute-then-scale pipeline for one numerical column.

    The imputation strategy comes from ``num_impute_strategies``; columns
    without an entry there are imputed with the mean.
    """
    strategy, fill_value = num_impute_strategies.get(feature_name, ('mean', None))
    return Pipeline([
        ('imputer', SimpleImputer(strategy=strategy, fill_value=fill_value)),
        ('scaler', StandardScaler()),
    ])


# One transformer per numerical column so the configured imputation strategy
# (including custom constants) is honoured instead of imputing everything
# with the mean. Index-based names keep the transformer names valid whatever
# the column is called.
preprocessor = ColumnTransformer(
    transformers=[
        *[(f'num_{position}', _numeric_pipeline(feature_name), [feature_name])
          for position, feature_name in enumerate(numerical_features)],
        cat_transformer,
    ])

# Full pipeline: preprocessing -> optional feature reduction -> estimator.
# The step name 'model' is what the 'model__*' grid keys refer to.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    reduction_step,
    ('model', model)
])

In [None]:
                                                            #TRAIN TEST SPLIT

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
# Separate inputs from the target, then hold out 30% of the rows for
# evaluation with a fixed seed so the split is repeatable.
X, Y = df[chosen_attributes], df[target]
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=10,
)

In [None]:
                                                            #GRID SEARCH

In [None]:
# Search the hyper-parameter grid with 3-fold cross-validation on the
# training split.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3)
grid.fit(X=X_train, y=Y_train)

In [None]:
# Predict the held-out rows with the fitted grid search.
held_out_inputs = X_test
y_pred = grid.predict(held_out_inputs)

In [None]:
# Report which algorithm was tuned and the winning hyper-parameters.
best_params = grid.best_params_
logging.info(f"Model: {model_key}")
logging.info("Best Params: %s", best_params)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score

In [None]:
# Log hold-out metrics appropriate for the task type.
if prediction == 'Regression':
    logging.info("MSE: %f", mean_squared_error(Y_test, y_pred))
    logging.info("R2 Score: %f", r2_score(Y_test, y_pred))
else:
    # The predictions live in `y_pred` (lower-case y); `Y_pred` is never
    # defined and would raise NameError on this branch.
    logging.info("Accuracy: %f", accuracy_score(Y_test, y_pred))
    logging.info("F1 Score: %f", f1_score(Y_test, y_pred, average='weighted'))

In [None]:
# Visual separator marking the end of this model's log output.
separator = "-" * 30
logging.info(separator)

In [None]:
import joblib

# Persist the refitted best pipeline; the file name carries the algorithm key.
model_artifact_path = f"best_model_{model_key}.pkl"
joblib.dump(grid.best_estimator_, model_artifact_path)


['best_model_neural_network.pkl']

In [None]:
# Write the hold-out metrics for this model to a small text report.
with open(f"metrics_{model_key}.txt", "w") as metrics_file:
    if prediction == 'Regression':
        report_lines = [
            f"MSE: {mean_squared_error(Y_test, y_pred)}\n",
            f"R2 Score: {r2_score(Y_test, y_pred)}\n",
        ]
    else:
        report_lines = [
            f"Accuracy: {accuracy_score(Y_test, y_pred)}\n",
            f"F1 Score: {f1_score(Y_test, y_pred, average='weighted')}\n",
        ]
    metrics_file.writelines(report_lines)


In [None]:
!python run_pipeline.ipynb

python3: can't open file '/content/run_pipeline.ipynb': [Errno 2] No such file or directory


In [None]:
!jupyter nbconvert run_pipeline.ipynb --to python

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr