In [244]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import r2_score, mean_squared_error, classification_report
from xgboost import XGBRegressor, XGBClassifier


def automated_modelling(input_file):
    """Train and evaluate the models selected in a JSON design-state file.

    The JSON document is expected to carry a top-level ``design_state_data``
    object from which this function reads:

    * ``session_info.dataset`` - path of the CSV file to load.
    * ``feature_handling[col].is_selected`` - whether a column is used.
    * ``feature_handling[col].feature_details`` - ``missing_values`` flag and
      ``impute_with`` strategy used for imputation.
    * ``target.target`` / ``target.prediction_type`` - target column name and
      ``'Regression'`` (anything else is treated as classification).
    * ``train.train_ratio`` / ``train.random_seed`` - train/test split settings.
    * ``algorithms[name].is_selected`` / ``.hyperparameters`` - for the names
      ``DecisionTree``, ``RandomForest`` and ``XGBoost``; the hyperparameters
      are passed unchanged as the ``param_grid`` of ``GridSearchCV``.

    Parameters
    ----------
    input_file : str
        Path to the JSON configuration file.

    Returns
    -------
    pandas.DataFrame
        One row per selected algorithm. Regression rows hold ``train_r2``,
        ``test_r2``, ``train_rmse`` and ``test_rmse``; classification rows hold
        the ``train_report`` and ``test_report`` dictionaries produced by
        ``classification_report``.

    Raises
    ------
    KeyError
        If a required configuration key is missing.
    """
    with open(input_file, 'r') as f:
        data = json.load(f)

    design = data['design_state_data']
    feature_cfg = design['feature_handling']
    target_col = design['target']['target']
    is_regression = design['target']['prediction_type'] == 'Regression'

    # Read the dataset
    df = pd.read_csv(design['session_info']['dataset'])

    # Keep the selected columns. Columns missing from the configuration are
    # treated as unselected, and the target is always retained so that the
    # feature/target split below cannot fail on a dropped target column.
    selected_cols = [
        col for col in df.columns
        if col == target_col or feature_cfg.get(col, {}).get('is_selected', False)
    ]
    df = df[selected_cols]

    # Split features and target
    x = df.drop(columns=target_col)
    y = df[target_col]

    # Handle missing values: translate the UI wording into SimpleImputer
    # strategy names; any other value is forwarded to SimpleImputer unchanged.
    strategy_names = {
        'Average of values': 'mean',
        'Median of values': 'median',
        'Mode of values': 'most_frequent',
    }
    for col in x.columns:
        details = feature_cfg[col].get('feature_details', {})
        if details.get('missing_values'):
            strategy = details['impute_with']
            imputer = SimpleImputer(strategy=strategy_names.get(strategy, strategy))
            x[col] = imputer.fit_transform(x[[col]])

    # Encode categorical features
    cat_cols = x.select_dtypes(include='object').columns
    if len(cat_cols) > 0:
        try:
            # `sparse` was renamed to `sparse_output` in newer scikit-learn.
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoded = encoder.fit_transform(x[cat_cols])
        feature_names = (getattr(encoder, 'get_feature_names_out', None)
                         or encoder.get_feature_names)
        # String column names avoid a mixed str/int header, and reusing x's
        # index keeps the encoded rows aligned with the remaining columns.
        x_encoded = pd.DataFrame(encoded, columns=feature_names(cat_cols), index=x.index)
        x = pd.concat([x.drop(columns=cat_cols), x_encoded], axis=1)

    # Split dataset into train and test
    x_train, x_test, y_train, y_test = train_test_split(
        x, y,
        train_size=design['train']['train_ratio'],
        random_state=design['train']['random_seed'])

    # Model training and evaluation
    estimators = {
        'DecisionTree': (DecisionTreeRegressor, DecisionTreeClassifier),
        'RandomForest': (RandomForestRegressor, RandomForestClassifier),
        'XGBoost': (XGBRegressor, XGBClassifier),
    }
    scoring = 'r2' if is_regression else 'f1_macro'
    results = []

    for algo, (regressor_cls, classifier_cls) in estimators.items():
        algo_cfg = design['algorithms'][algo]
        if not algo_cfg['is_selected']:
            continue

        model = regressor_cls() if is_regression else classifier_cls()

        # Hyperparameter tuning using GridSearchCV
        grid_search = GridSearchCV(estimator=model, param_grid=algo_cfg['hyperparameters'],
                                   scoring=scoring, cv=3)
        grid_search.fit(x_train, y_train)

        best_model = grid_search.best_estimator_
        y_pred_train = best_model.predict(x_train)
        y_pred_test = best_model.predict(x_test)

        if is_regression:
            # RMSE is taken as sqrt(MSE) rather than via the deprecated
            # `squared=False` keyword of mean_squared_error.
            results.append({
                'algorithm': algo,
                'train_r2': r2_score(y_train, y_pred_train),
                'test_r2': r2_score(y_test, y_pred_test),
                'train_rmse': mean_squared_error(y_train, y_pred_train) ** 0.5,
                'test_rmse': mean_squared_error(y_test, y_pred_test) ** 0.5,
            })
        else:
            results.append({
                'algorithm': algo,
                'train_report': classification_report(y_train, y_pred_train, output_dict=True),
                'test_report': classification_report(y_test, y_pred_test, output_dict=True),
            })

    return pd.DataFrame(results)

In [247]:
# Run the pipeline on the UI-exported configuration file. The call is kept as
# the cell's last expression so the returned results DataFrame is displayed.
config_path = "algoparams_from_ui1.json.rtf"
automated_modelling(config_path)


Random Forest Classifier:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00         7

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


Decision Tree Classifier:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.92      0.85      0.88        13
           2       0.75      0.86      0.80         7

    accuracy                           0.90        30
   macro avg       0.89      0.90      0.89        30
weighted avg       0.91      0.90      0.90        30


Logistic Regression:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      