In [94]:
import json, re

# Load the UI-exported algorithm parameters. The export is an RTF document, so
# the JSON payload is embedded in it with its braces backslash-escaped.
with open('algoparams_from_ui.json.rtf', 'r') as file:
    data = file.read()

# Raw strings keep the backslashes literal. Written as normal strings, "\{",
# "\}" and "\p" are invalid escape sequences and trigger a warning on recent
# Python versions.
start = data.find(r"\{")
end = data.rfind(r"\}")
if start == -1 or end == -1:
    raise ValueError("No escaped JSON object found in 'algoparams_from_ui.json.rtf'")

# The closing marker "\}" is two characters long, hence end + 2.
json_data = data[start:end + 2]

# Drop the RTF paragraph marks and unescape the braces so the text is plain JSON.
json_data = json_data.replace(r"\par", "").replace(r"\{", "{").replace(r"\}", "}")
json_data = json.loads(json_data)

#print(json_data)

In [95]:
# Narrow the parsed config to its 'design_state_data' section; the later cells
# read 'target', 'train', 'feature_handling', 'algorithms' and 'hyperparameters' from it.
# NOTE: json_data is rebound here, so re-running this cell without re-running
# the loading cell indexes the already-narrowed dict a second time.
json_data = json_data['design_state_data']

In [96]:
# Algorithm names accepted for each prediction type; the model-selection cell
# only considers config entries whose name appears in the matching list.
# NOTE(review): 'KNN' is absent from the 'Regression' list although
# get_required_model has a regression branch for it -- confirm which is intended.
models = {
    'Regression': [
        'RandomForestRegressor',
        'GBTRegressor',
        'LinearRegression',
        'LogisticRegression',
        'RidgeRegression',
        'LassoRegression',
        'ElasticNetRegression',
        'DecisionTreeRegressor',
        'SVM',
        'SGD',
        'neural_network',
    ],
    'Classification': [
        'RandomForestClassifier',
        'GBTClassifier',
        'xg_boost',
        'DecisionTreeClassifier',
        'KNN',
        'neural_network',
    ],
}

In [97]:
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, train_test_split, GridSearchCV 
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet, SGDClassifier, SGDRegressor
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor

In [98]:
import pandas as pd
# Read the dataset from the relative path 'iris.csv'. This rebinds `data`,
# which held the raw RTF text in the parameter-loading cell.
data = pd.read_csv('iris.csv')
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [99]:
#find the target variable
def get_target(d):
    """Return the value stored under ``d['target']['target']`` (the target variable)."""
    target_section = d['target']
    return target_section['target']
target = get_target(json_data)

# Split the dataset into train and test partitions.
split_settings = json_data['train']
# A configured ratio of 0 falls back to a 0.7 training share.
train_ratio = 0.7 if split_settings['train_ratio'] == 0 else split_settings['train_ratio']
seed = split_settings['random_seed']

# Both partitions get a fresh 0..n-1 index.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(data, train_size=train_ratio, random_state=seed)
)

In [100]:
# Sanity check: (rows, columns) of the two partitions.
train.shape, test.shape

((105, 5), (45, 5))

In [101]:
# Apply the per-feature preprocessing configured under 'feature_handling'.
# Anything that is learned (imputation mean, hashed width) comes from the
# training partition and is then applied to BOTH partitions.
for key, value in json_data['feature_handling'].items():
    if not value['is_selected']:
        print(f"The feature {key} is not selected.")
        continue

    feature_type = value['feature_variable_type']

    if feature_type == 'numerical':
        details = value['feature_details']
        if details['missing_values'] == 'Impute':
            if details['impute_with'] == 'custom':
                # Fill with the user-supplied constant. The test partition has
                # to be filled as well, otherwise its missing values survive
                # into prediction.
                impute_value = details['impute_value']
                train[key] = train[key].fillna(impute_value)
                test[key] = test[key].fillna(impute_value)
            else:
                # Every other setting imputes with the training-set mean.
                imputer = SimpleImputer(strategy='mean')
                imputer.fit(train[[key]])
                train[key] = imputer.transform(train[[key]])
                test[key] = imputer.transform(test[[key]])

    elif feature_type == 'text':
        # One hashed output column per distinct training value.
        n_features = len(train[key].unique())
        vect = HashingVectorizer(n_features=n_features)
        vect.fit(train[key])

        # NOTE: these column names do not include the feature name, so two
        # text features would produce clashing columns.
        hashed_columns = [f"hashed_{i+1}" for i in range(n_features)]
        XH_train = pd.DataFrame(vect.transform(train[key]).toarray(), columns=hashed_columns)
        XH_test = pd.DataFrame(vect.transform(test[key]).toarray(), columns=hashed_columns)

        # Replace the raw text column with its hashed representation. Both
        # partitions carry a fresh 0..n-1 index, so the concat aligns row-wise.
        train = pd.concat([train.drop(columns=[key]), XH_train], axis=1)
        test = pd.concat([test.drop(columns=[key]), XH_test], axis=1)

    print(f"Feature handling performed successfully for {key}")

Feature handling performed successfully for sepal_length
Feature handling performed successfully for sepal_width
Feature handling performed successfully for petal_length
Feature handling performed successfully for petal_width
Feature handling performed successfully for species


In [102]:
# Separate the predictors from the configured target column (`target` is read
# from the config in the train/test split cell) instead of a hard-coded name.
# Both targets are kept as 1-D Series so y_train and y_test have the same shape.
X_train, y_train = train.drop(columns=[target]), train[target]
X_test, y_test = test.drop(columns=[target]), test[target]

In [103]:
#Find prediction type
prediction_type = json_data['target']['prediction_type']

# Start from a clean slate so a selection left over from an earlier run of this
# cell cannot leak through when the current config selects nothing.
model_name = None
model_params = None

for k, v in json_data['algorithms'].items():
    if k in models[prediction_type] and v['is_selected']:
        # When several algorithms are selected, the last one in the config wins.
        model_name = k
        model_params = v

if model_name is None:
    raise ValueError(f"No selected algorithm found for prediction type {prediction_type!r}")

In [104]:
def get_required_model(prediction_type):
    """Fit the selected algorithm and print its score on the test partition.

    Reads the notebook-level ``model_name``, ``model_params``, ``X_test`` and
    ``y_test``. The builder registered for ``model_name`` is called with
    ``model_params``; the fitted model it returns then predicts ``X_test``.

    Parameters
    ----------
    prediction_type : str
        'Regression' reports the mean squared error; any other value is
        treated as classification and reports accuracy.

    Raises
    ------
    ValueError
        If no builder is registered for ``model_name`` under the given
        prediction type (previously this surfaced as an unbound local).
    """
    is_regression = prediction_type == 'Regression'

    # Config algorithm name -> builder returning a fitted estimator.
    if is_regression:
        builders = {
            'LinearRegression': get_Linear,
            'LogisticRegression': get_Logistic,
            'RandomForestRegressor': get_RandomForestR,
            'RidgeRegression': get_Rigde,
            'LassoRegression': get_Lasso,
            'ElasticNetRegression': get_Elastic,
            'DecisionTreeRegressor': get_DecisionR,
            'KNN': get_KNN_R,
            'SVM': get_SVR,
            'SGD': get_SGD_R,
        }
    else:
        builders = {
            'RandomForestClassifier': get_RandomForestC,
            'DecisionTreeClassifier': get_DecisionC,
            'KNN': get_KNN_C,
        }

    if model_name not in builders:
        raise ValueError(
            f"No implementation available for model {model_name!r} "
            f"with prediction type {prediction_type!r}"
        )

    final_model = builders[model_name](model_params)
    y_predicted = final_model.predict(X_test)

    if is_regression:
        print(f"Mean squared error for {model_name} model on test set is: ", mean_squared_error(y_test,y_predicted))
    else:
        acc = accuracy_score(y_test, y_predicted)
        print(f"Accuracy measure for {model_name} model : ", acc)

### Regression Models

In [105]:
def get_Linear(model_params):
    """Fit a LinearRegression on the training split and return it.

    ``model_params['parallelism']`` is forwarded as ``n_jobs``. The training
    data comes from the notebook-level ``X_train`` / ``y_train``.
    """
    regressor = LinearRegression(n_jobs=model_params['parallelism'])
    # fit() returns the estimator itself.
    return regressor.fit(X_train, y_train)

In [106]:
def get_Logistic(model_params):
    """Grid-search a LogisticRegression and return the best estimator.

    The grid covers every ``max_iter`` from ``model_params['min_iter']`` to
    ``model_params['max_iter']`` (inclusive) and three fixed values of ``C``.
    No ``cv`` argument is passed, so GridSearchCV's default splitting is used.
    ``model_params['parallelism']`` is forwarded as ``n_jobs``.
    """
    iteration_counts = list(range(model_params['min_iter'], model_params['max_iter'] + 1))
    search = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid={'max_iter': iteration_counts, 'C': [1, 1.5, 2]},
        n_jobs=model_params['parallelism'],
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

In [107]:
def get_RandomForestR(model_params):
    """Grid-search a RandomForestRegressor and return the best estimator.

    Every integer between the configured minimum and maximum (inclusive) is
    tried for the number of trees, the tree depth and the minimum samples per
    leaf. Fold count and ``n_jobs`` come from the notebook-level
    ``json_data['hyperparameters']`` section, not from ``model_params``.
    """
    def inclusive(low_key, high_key):
        # Integer range [low, high] taken from two config entries.
        return list(range(model_params[low_key], model_params[high_key] + 1))

    search_space = {
        'n_estimators': inclusive('min_trees', 'max_trees'),
        'max_depth': inclusive('min_depth', 'max_depth'),
        'min_samples_leaf': inclusive('min_samples_per_leaf_min_value', 'min_samples_per_leaf_max_value'),
    }
    tuning = json_data['hyperparameters']
    search = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid=search_space,
        cv=KFold(n_splits=tuning['num_of_folds']),
        n_jobs=tuning['parallelism'],
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

In [108]:
def get_Rigde(model_params):
    """Grid-search a Ridge regressor and return the best estimator.

    The grid covers every ``max_iter`` from ``min_iter`` to ``max_iter``
    (inclusive) and five evenly spaced ``alpha`` values between
    ``min_regparam`` and ``max_regparam``. Cross-validation uses KFold with the
    fold count from ``json_data['hyperparameters']``.
    """
    search_space = {
        'max_iter': list(range(model_params['min_iter'], model_params['max_iter'] + 1)),
        'alpha': list(np.linspace(model_params['min_regparam'], model_params['max_regparam'], 5)),
    }
    folds = KFold(n_splits=json_data['hyperparameters']['num_of_folds'])
    search = GridSearchCV(
        estimator=Ridge(),
        param_grid=search_space,
        cv=folds,
        n_jobs=model_params['parallelism'],
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

In [109]:
def get_Lasso(model_params):
    """Grid-search a Lasso regressor and return the best estimator.

    The grid covers every ``max_iter`` from ``min_iter`` to ``max_iter``
    (inclusive) and five evenly spaced ``alpha`` values between
    ``min_regparam`` and ``max_regparam``. Cross-validation uses KFold with the
    fold count from ``json_data['hyperparameters']``.
    """
    search_space = {
        'max_iter': list(range(model_params['min_iter'], model_params['max_iter'] + 1)),
        'alpha': list(np.linspace(model_params['min_regparam'], model_params['max_regparam'], 5)),
    }
    folds = KFold(n_splits=json_data['hyperparameters']['num_of_folds'])
    search = GridSearchCV(
        estimator=Lasso(),
        param_grid=search_space,
        cv=folds,
        n_jobs=model_params['parallelism'],
    )
    search.fit(X_train, y_train)
    return search.best_estimator_


In [110]:
def get_Elastic(model_params):
    """Grid-search an ElasticNet regressor and return the best estimator.

    The grid covers every ``max_iter`` from ``min_iter`` to ``max_iter``
    (inclusive), five evenly spaced ``alpha`` values between ``min_regparam``
    and ``max_regparam``, and five evenly spaced ``l1_ratio`` values between
    ``min_elasticnet`` and ``max_elasticnet``. Cross-validation uses KFold with
    the fold count from ``json_data['hyperparameters']``.
    """
    search_space = {
        'max_iter': list(range(model_params['min_iter'], model_params['max_iter'] + 1)),
        'alpha': list(np.linspace(model_params['min_regparam'], model_params['max_regparam'], 5)),
        'l1_ratio': list(np.linspace(model_params['min_elasticnet'], model_params['max_elasticnet'], 5)),
    }
    folds = KFold(n_splits=json_data['hyperparameters']['num_of_folds'])
    search = GridSearchCV(
        estimator=ElasticNet(),
        param_grid=search_space,
        cv=folds,
        n_jobs=model_params['parallelism'],
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

In [111]:
def get_KNN_R(model_params):
    """Grid-search a KNeighborsRegressor and return the best estimator.

    Parameters
    ----------
    model_params : dict
        Reads ``k_value`` (one neighbour count or a list of them),
        ``neighbour_finding_algorithm`` ("Automatic" selects ``auto``, anything
        else ``ball_tree``) and ``distance_weighting`` (truthy selects
        ``distance`` weights, otherwise ``uniform``).

    Cross-validation uses KFold with the fold count from
    ``json_data['hyperparameters']``; training data comes from the
    notebook-level ``X_train`` / ``y_train``.
    """
    # Every param_grid entry must be a sequence of candidates, so a bare
    # neighbour count is wrapped; a list from the config passes through as is.
    k_values = model_params['k_value']
    if not isinstance(k_values, (list, tuple)):
        k_values = [k_values]

    M = KNeighborsRegressor()
    param_grid = {
        'n_neighbors' : k_values,
        'algorithm' : ["auto" if model_params['neighbour_finding_algorithm']== "Automatic" else "ball_tree"],
        'weights' : ["distance" if model_params['distance_weighting'] else "uniform"]
    }
    gridModel = GridSearchCV(estimator = M,
                            param_grid=param_grid,
                            cv = KFold(n_splits = json_data['hyperparameters']['num_of_folds'])
                            )
    gridModel.fit(X_train, y_train)
    final_model = gridModel.best_estimator_
    return final_model

In [112]:
def get_DecisionR(model_params):
    """Grid-search a DecisionTreeRegressor and return the best estimator.

    Parameters
    ----------
    model_params : dict
        Reads ``min_depth`` / ``max_depth`` (inclusive depth range),
        ``use_best`` (truthy selects the ``best`` splitter, otherwise
        ``random``) and ``min_samples_per_leaf`` (one value or a list).

    Cross-validation uses KFold with the fold count from
    ``json_data['hyperparameters']``; training data comes from the
    notebook-level ``X_train`` / ``y_train``.
    """
    # Every param_grid entry must be a sequence of candidates, so a bare value
    # is wrapped; a list from the config passes through unchanged.
    leaf_sizes = model_params['min_samples_per_leaf']
    if not isinstance(leaf_sizes, (list, tuple)):
        leaf_sizes = [leaf_sizes]

    M7 = DecisionTreeRegressor()
    param_grid = {
        'max_depth' : list(range(model_params['min_depth'], model_params['max_depth']+1)),
        'splitter' : ["best" if model_params['use_best'] else "random"],
        # The estimator parameter is spelled ``min_samples_leaf``; the search
        # rejects a grid key that is not a parameter of the estimator.
        'min_samples_leaf' : leaf_sizes
    }
    gridModel = GridSearchCV(estimator = M7,
                            param_grid=param_grid,
                            cv = KFold(n_splits = json_data['hyperparameters']['num_of_folds'])
                            )
    gridModel.fit(X_train, y_train)
    final_model = gridModel.best_estimator_
    return final_model

In [113]:
def get_SVR(model_params):
    """Grid-search an SVR and return the best estimator.

    Kernels are the ones whose flag in ``model_params`` is truthy. ``gamma``
    candidates are three fixed numbers when ``custom_gamma_values`` is truthy,
    otherwise 'auto' and 'scale'. ``tolerance`` and ``max_iterations`` are each
    searched as a single value. Cross-validation uses KFold with the fold count
    from ``json_data['hyperparameters']``.
    """
    # Config flag -> sklearn kernel name.
    # NOTE(review): the flag spellings ("reb_kernel", "polynomail_kernel") are
    # presumably the keys used by the config file -- confirm before renaming.
    kernel_flags = {
        "linear_kernel": "linear",
        "reb_kernel": "rbf",
        "polynomail_kernel": "poly",
        "sigmoid_kernel": "sigmoid",
    }
    enabled_kernels = [kernel for flag, kernel in kernel_flags.items() if model_params[flag]]

    if model_params['custom_gamma_values']:
        gamma_candidates = [0.5, 0.1, 1.0]
    else:
        gamma_candidates = ['auto', 'scale']

    search_space = {
        'kernel': enabled_kernels,
        'gamma': gamma_candidates,
        'tol': [model_params['tolerance']],
        'max_iter': [model_params['max_iterations']],
    }
    search = GridSearchCV(
        estimator=SVR(),
        param_grid=search_space,
        cv=KFold(n_splits=json_data['hyperparameters']['num_of_folds']),
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

In [114]:
def get_SGD_R(model_params):
    """Grid-search an SGDRegressor and return the best estimator.

    Parameters
    ----------
    model_params : dict
        Reads ``tolerance`` and ``alpha_values`` (each one value or a list of
        candidates) and ``parallelism`` (forwarded as ``n_jobs``).

    The 'l1', 'l2' and 'elasticnet' penalties are always searched.
    Cross-validation uses KFold with the fold count from
    ``json_data['hyperparameters']``.
    """
    def as_candidates(value):
        # Every param_grid entry must be a sequence of candidates; wrap a bare
        # value and pass a list from the config through unchanged.
        return value if isinstance(value, (list, tuple)) else [value]

    M = SGDRegressor()
    param_grid = {
        'tol' : as_candidates(model_params['tolerance']),
        'penalty' : ['l1', 'l2', 'elasticnet'],
        'alpha' : as_candidates(model_params['alpha_values'])
    }
    gridModel = GridSearchCV(estimator = M,
                            param_grid=param_grid,
                            cv = KFold(n_splits = json_data['hyperparameters']['num_of_folds']),
                            n_jobs= model_params['parallelism']
                            )
    gridModel.fit(X_train, y_train)
    final_model = gridModel.best_estimator_
    return final_model

### Classification Models

In [115]:
def get_RandomForestC(model_params):
    """Grid-search a RandomForestClassifier and return the best estimator.

    Every integer between the configured minimum and maximum (inclusive) is
    tried for the number of trees, the tree depth and the minimum samples per
    leaf. Fold count and ``n_jobs`` come from the notebook-level
    ``json_data['hyperparameters']`` section, not from ``model_params``.
    """
    def inclusive(low_key, high_key):
        # Integer range [low, high] taken from two config entries.
        return list(range(model_params[low_key], model_params[high_key] + 1))

    search_space = {
        'n_estimators': inclusive('min_trees', 'max_trees'),
        'max_depth': inclusive('min_depth', 'max_depth'),
        'min_samples_leaf': inclusive('min_samples_per_leaf_min_value', 'min_samples_per_leaf_max_value'),
    }
    tuning = json_data['hyperparameters']
    search = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=search_space,
        cv=KFold(n_splits=tuning['num_of_folds']),
        n_jobs=tuning['parallelism'],
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

In [116]:
def get_DecisionC(model_params):
    """Grid-search a DecisionTreeClassifier and return the best estimator.

    Parameters
    ----------
    model_params : dict
        Reads ``min_depth`` / ``max_depth`` (inclusive depth range),
        ``use_best`` (truthy selects the ``best`` splitter, otherwise
        ``random``), ``min_samples_per_leaf`` (one value or a list) and
        ``use_gini`` (truthy selects ``gini``, otherwise ``entropy``).

    Cross-validation uses KFold with the fold count from
    ``json_data['hyperparameters']``; training data comes from the
    notebook-level ``X_train`` / ``y_train``.
    """
    # Every param_grid entry must be a sequence of candidates, so a bare value
    # is wrapped; a list from the config passes through unchanged.
    leaf_sizes = model_params['min_samples_per_leaf']
    if not isinstance(leaf_sizes, (list, tuple)):
        leaf_sizes = [leaf_sizes]

    M = DecisionTreeClassifier()
    param_grid = {
        'max_depth' : list(range(model_params['min_depth'], model_params['max_depth']+1)),
        'splitter' : ["best" if model_params['use_best'] else "random"],
        # The estimator parameter is spelled ``min_samples_leaf``; the search
        # rejects a grid key that is not a parameter of the estimator.
        'min_samples_leaf' : leaf_sizes,
        'criterion' : ["gini" if model_params['use_gini'] else "entropy"]
    }
    # Search over the classifier created above; the name M7 used here before is
    # local to get_DecisionR and does not exist in this function.
    gridModel = GridSearchCV(estimator = M,
                            param_grid=param_grid,
                            cv = KFold(n_splits = json_data['hyperparameters']['num_of_folds'])
                            )
    gridModel.fit(X_train, y_train)
    final_model = gridModel.best_estimator_
    return final_model

In [117]:
def get_KNN_C(model_params):
    """Grid-search a KNeighborsClassifier and return the best estimator.

    Parameters
    ----------
    model_params : dict
        Reads ``k_value`` (one neighbour count or a list of them),
        ``neighbour_finding_algorithm`` ("Automatic" selects ``auto``, anything
        else ``ball_tree``) and ``distance_weighting`` (truthy selects
        ``distance`` weights, otherwise ``uniform``).

    Cross-validation uses KFold with the fold count from
    ``json_data['hyperparameters']``; training data comes from the
    notebook-level ``X_train`` / ``y_train``.
    """
    # Every param_grid entry must be a sequence of candidates, so a bare
    # neighbour count is wrapped; a list from the config passes through as is.
    k_values = model_params['k_value']
    if not isinstance(k_values, (list, tuple)):
        k_values = [k_values]

    M = KNeighborsClassifier()
    param_grid = {
        'n_neighbors' : k_values,
        'algorithm' : ["auto" if model_params['neighbour_finding_algorithm']== "Automatic" else "ball_tree"],
        'weights' : ["distance" if model_params['distance_weighting'] else "uniform"]
    }
    gridModel = GridSearchCV(estimator = M,
                            param_grid=param_grid,
                            cv = KFold(n_splits = json_data['hyperparameters']['num_of_folds'])
                            )
    gridModel.fit(X_train, y_train)
    final_model = gridModel.best_estimator_
    return final_model

### Evaluation

In [118]:
# Fit the selected model and print its metric on the test partition.
get_required_model(prediction_type)

Mean squared error for RandomForestRegressor model on test set is:  0.043670732578331546
