In [1]:
#Install the striprtf module
!pip install striprtf

#Read the RTF file and convert it to plain text
from striprtf.striprtf import rtf_to_text

# Read the RTF content from the file
with open('/content/algoparams_from_ui.json.rtf', 'r') as file:
    rtf_content = file.read()

# Convert the RTF content to plain text
plain_text = rtf_to_text(rtf_content)

# Parse the JSON data
import json

# Since the plain text might have extra characters or invalid JSON, we need to clean it up

try:
    config = json.loads(plain_text)
except json.JSONDecodeError as e:
    print("Error decoding JSON:", e)




In [2]:
#importing all required libraries

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

#### 1) Read the target and type of regression to be run

In [3]:
# Extract the target and type of regression
target_info = config['design_state_data']['target']
target = target_info['target']
regression_type = target_info['type']

print(f"Target: {target}")
print(f"Regression Type: {regression_type}")

Target: petal_width
Regression Type: regression


#### 2) Read the features (which are column names in the csv) and figure out what missing imputation needs to be applied and apply that to the columns loaded in a dataframe


In [4]:
# Step 1: Load the CSV file into a DataFrame
csv_file_path = '/content/iris.csv'
df = pd.read_csv(csv_file_path)

# Step 2: Read the imputation details from the JSON
feature_handling = config['design_state_data']['feature_handling']

# Step 3: Apply the imputation to the DataFrame
for feature, details in feature_handling.items():
    if details['is_selected']:
        # Check if 'feature_details' exists and contains 'impute_with'
        if 'feature_details' in details and 'impute_with' in details['feature_details']:
            impute_method = details['feature_details']['impute_with']
            impute_value = details['feature_details']['impute_value']

            if impute_method == 'Average of values':
                df[feature].fillna(df[feature].mean(), inplace=True)
            elif impute_method == 'custom':
                df[feature].fillna(impute_value, inplace=True)
            else:
                print(f"Unknown imputation method for feature {feature}")
        else:
            print(f"Missing 'feature_details' or 'impute_with' for feature: {feature}")

print("DataFrame after imputation:")
print(df.head())

Missing 'feature_details' or 'impute_with' for feature: species
DataFrame after imputation:
   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


#### 3) Compute feature reduction based on input. See the screenshot below where there can be No Reduction, Corr with Target, Tree-based, PCA. Please make sure you write code so that all options can work. If we rerun your code with a different Json it should work if we switch No Reduction to say PCA.

In [5]:
# Step 1: Read the reduction method from JSON
reduction_method = config['design_state_data']['feature_reduction']['feature_reduction_method']

# Step 2: Handle categorical columns if present
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
if categorical_columns:
    df = pd.get_dummies(df, columns=categorical_columns)

# Step 3: Apply feature reduction based on the method chosen
if reduction_method == 'No Reduction':
    reduced_df = df

elif reduction_method == 'Corr with Target':
    target_variable = config['design_state_data']['target']['target']
    corr = df.corrwith(df[target_variable])
    selected_features = corr[abs(corr) > 0.2].index.tolist()
    reduced_df = df[selected_features]

elif reduction_method == 'Tree-based':
    target_variable = config['design_state_data']['target']['target']
    if config['design_state_data']['target']['type'] == 'classification':
        clf = RandomForestClassifier(n_estimators=100, random_state=42)
    else:
        clf = RandomForestRegressor(n_estimators=100, random_state=42)
    clf.fit(df.drop(columns=[target_variable]), df[target_variable])
    feature_importances = pd.Series(clf.feature_importances_, index=df.drop(columns=[target_variable]).columns)
    selected_features = feature_importances[feature_importances > 0.01].index.tolist()
    reduced_df = df[selected_features]

elif reduction_method == 'PCA':
    pca = PCA(n_components=3)
    principal_components = pca.fit_transform(df)
    reduced_df = pd.DataFrame(data=principal_components, columns=[f"PC{i+1}" for i in range(pca.n_components_)])

else:
    print(f"Unknown feature reduction method: {reduction_method}")

print("DataFrame after feature reduction:")
print(reduced_df.head())


DataFrame after feature reduction:
   sepal_length  sepal_width  petal_length  species_Iris-setosa  \
0           5.1          3.5           1.4                 True   
1           4.9          3.0           1.4                 True   
2           4.7          3.2           1.3                 True   
3           4.6          3.1           1.5                 True   
4           5.0          3.6           1.4                 True   

   species_Iris-versicolor  species_Iris-virginica  
0                    False                   False  
1                    False                   False  
2                    False                   False  
3                    False                   False  
4                    False                   False  


#### 4) Parse the Json and make the model objects (using sklean) that can handle what is required in the “prediction_type” specified in the JSON (See 1 where “prediction_type” is specified). Keep in mind not to pick models that don’t apply for the prediction_type specified

In [6]:
# Step 1: Parse the prediction type
prediction_type = config['design_state_data']['target']['prediction_type']

# Step 2: Create model objects based on prediction type
def create_model_objects(config, prediction_type):
    models = []

    for model_key, model_info in config['design_state_data']['algorithms'].items():
        if model_info['is_selected']:
            if prediction_type == "Regression":
                if model_key == "RandomForestRegressor":
                    model = RandomForestRegressor(
                        n_estimators=model_info.get('max_trees', 100),
                        max_depth=model_info.get('max_depth', None),
                        min_samples_leaf=model_info.get('min_samples_per_leaf_min_value', 1)
                    )
                    models.append(model)
                elif model_key == "LinearRegression":
                    model = LinearRegression()
                    models.append(model)
                elif model_key == "DecisionTreeRegressor":
                    model = DecisionTreeRegressor(
                        max_depth=model_info.get('max_depth', None),
                        min_samples_leaf=model_info.get('min_samples_per_leaf_min_value', 1)
                    )
                    models.append(model)
                elif model_key == "SVR":
                    model = SVR()
                    models.append(model)

            elif prediction_type == "Classification":
                if model_key == "RandomForestClassifier":
                    model = RandomForestClassifier(
                        n_estimators=model_info.get('max_trees', 100),
                        max_depth=model_info.get('max_depth', None),
                        min_samples_leaf=model_info.get('min_samples_per_leaf_min_value', 1)
                    )
                    models.append(model)
                elif model_key == "LogisticRegression":
                    model = LogisticRegression()
                    models.append(model)
                elif model_key == "DecisionTreeClassifier":
                    model = DecisionTreeClassifier(
                        max_depth=model_info.get('max_depth', None),
                        min_samples_leaf=model_info.get('min_samples_per_leaf_min_value', 1)
                    )
                    models.append(model)
                elif model_key == "SVC":
                    model = SVC()
                    models.append(model)
    return models

# Step 3: Create the models
model_objects = create_model_objects(config, prediction_type)

# Output the models created
for model in model_objects:
    print(model)


RandomForestRegressor(max_depth=25, min_samples_leaf=5, n_estimators=20)


#### 5) Run the fit and predict on each model – keep in mind that you need to do hyper parameter tuning i.e., use GridSearchCV

In [7]:
# Step 1: Parse the prediction type
prediction_type = config['design_state_data']['target']['prediction_type']
target_variable = config['design_state_data']['target']['target']

# Load the dataset
df = pd.read_csv('/content/iris.csv')

# Step 2: Preprocessing - Encode categorical variables
# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

# Ensure target variable is not included in feature columns
categorical_cols = [col for col in categorical_cols if col != target_variable]
numerical_cols = [col for col in numerical_cols if col != target_variable]

# Define the preprocessing for numerical and categorical data
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Step 3: Create model objects based on prediction type
def create_model_objects(config, prediction_type):
    models = []
    param_grids = {}

    for model_key, model_info in config['design_state_data']['algorithms'].items():
        if model_info['is_selected']:
            if prediction_type == "Regression":
                if model_key == "RandomForestRegressor":
                    model = RandomForestRegressor(random_state=42)
                    param_grid = {
                        'model__n_estimators': range(model_info['min_trees'], model_info['max_trees']+1, 10),
                        'model__max_depth': range(model_info['min_depth'], model_info['max_depth']+1, 5),
                        'model__min_samples_leaf': range(model_info['min_samples_per_leaf_min_value'], model_info['min_samples_per_leaf_max_value']+1, 5)
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

                elif model_key == "LinearRegression":
                    model = LinearRegression()
                    param_grid = {}  # No hyperparameters to tune for LinearRegression
                    models.append(model)
                    param_grids[model_key] = param_grid

                elif model_key == "DecisionTreeRegressor":
                    model = DecisionTreeRegressor(random_state=42)
                    param_grid = {
                        'model__max_depth': range(model_info['min_depth'], model_info['max_depth']+1, 5),
                        'model__min_samples_leaf': range(model_info['min_samples_per_leaf_min_value'], model_info['min_samples_per_leaf_max_value']+1, 5)
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

                elif model_key == "SVR":
                    model = SVR()
                    param_grid = {
                        'model__C': [0.1, 1, 10],
                        'model__gamma': ['scale', 'auto']
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

            elif prediction_type == "Classification":
                if model_key == "RandomForestClassifier":
                    model = RandomForestClassifier(random_state=42)
                    param_grid = {
                        'model__n_estimators': range(model_info['min_trees'], model_info['max_trees']+1, 10),
                        'model__max_depth': range(model_info['min_depth'], model_info['max_depth']+1, 5),
                        'model__min_samples_leaf': range(model_info['min_samples_per_leaf_min_value'], model_info['min_samples_per_leaf_max_value']+1, 5)
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

                elif model_key == "LogisticRegression":
                    model = LogisticRegression(max_iter=10000)
                    param_grid = {
                        'model__C': [0.1, 1, 10],
                        'model__solver': ['liblinear', 'lbfgs']
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

                elif model_key == "DecisionTreeClassifier":
                    model = DecisionTreeClassifier(random_state=42)
                    param_grid = {
                        'model__max_depth': range(model_info['min_depth'], model_info['max_depth']+1, 5),
                        'model__min_samples_leaf': range(model_info['min_samples_per_leaf_min_value'], model_info['min_samples_per_leaf_max_value']+1, 5)
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

                elif model_key == "SVC":
                    model = SVC()
                    param_grid = {
                        'model__C': [0.1, 1, 10],
                        'model__gamma': ['scale', 'auto']
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid
    return models, param_grids

# Step 4: Create the models and parameter grids
model_objects, param_grids = create_model_objects(config, prediction_type)

# Step 5: Fit models using GridSearchCV and make predictions
X = df.drop(columns=[target_variable])
y = df[target_variable]

best_models = []
for model, param_grid in zip(model_objects, param_grids.values()):
    # Create a pipeline that includes preprocessing and the model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error' if prediction_type == 'Regression' else 'accuracy')
    grid_search.fit(X, y)
    best_models.append(grid_search.best_estimator_)
    print(f"Best parameters for {type(model).__name__}: {grid_search.best_params_}")

# Step 6: Make predictions with the best models
predictions = {}
for best_model in best_models:
    pred = best_model.predict(X)
    predictions[type(best_model.named_steps['model']).__name__] = pred

# Output the predictions
for model_name, pred in predictions.items():
    print(f"Predictions for {model_name}: {pred}")


Best parameters for RandomForestRegressor: {'model__max_depth': 20, 'model__min_samples_leaf': 5, 'model__n_estimators': 20}
Predictions for RandomForestRegressor: [0.24897008 0.17452089 0.20873373 0.18459271 0.22793789 0.33564868
 0.23166045 0.23253419 0.19647637 0.16139195 0.26921806 0.26966758
 0.18457155 0.19897637 0.2573159  0.27219425 0.2573159  0.24897008
 0.33564868 0.26283808 0.32287325 0.25986189 0.23082712 0.30212578
 0.28572313 0.25255717 0.27855646 0.25041895 0.23848931 0.26661676
 0.25264051 0.26027298 0.25335731 0.2573159  0.16139195 0.193282
 0.2545613  0.16139195 0.19897637 0.25223305 0.23427122 0.19897637
 0.21092123 0.29359813 0.32800222 0.18457155 0.31578    0.20842123
 0.25997203 0.20485397 1.51128149 1.49669816 1.59200663 1.15960363
 1.4207612  1.42560796 1.52361483 1.02702048 1.42667029 1.25893994
 1.06271492 1.31951723 1.14952946 1.4504322  1.188829   1.34992109
 1.48712337 1.24615743 1.42993977 1.1655322  1.58718917 1.25135584
 1.48010562 1.4445231  1.31776647 

####6) Log to the console the standard model metrics that apply

In [8]:
# Step 1: Parse the prediction type
prediction_type = config['design_state_data']['target']['prediction_type']
target_variable = config['design_state_data']['target']['target']

# Load the dataset
df = pd.read_csv('/content/iris.csv')

# Step 2: Preprocessing - Encode categorical variables
# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

# Ensure target variable is not included in feature columns
categorical_cols = [col for col in categorical_cols if col != target_variable]
numerical_cols = [col for col in numerical_cols if col != target_variable]

# Define the preprocessing for numerical and categorical data
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Step 3: Create model objects based on prediction type
def create_model_objects(config, prediction_type):
    models = []
    param_grids = {}

    for model_key, model_info in config['design_state_data']['algorithms'].items():
        if model_info['is_selected']:
            if prediction_type == "Regression":
                if model_key == "RandomForestRegressor":
                    model = RandomForestRegressor(random_state=42)
                    param_grid = {
                        'model__n_estimators': range(model_info['min_trees'], model_info['max_trees']+1, 10),
                        'model__max_depth': range(model_info['min_depth'], model_info['max_depth']+1, 5),
                        'model__min_samples_leaf': range(model_info['min_samples_per_leaf_min_value'], model_info['min_samples_per_leaf_max_value']+1, 5)
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

                elif model_key == "LinearRegression":
                    model = LinearRegression()
                    param_grid = {}  # No hyperparameters to tune for LinearRegression
                    models.append(model)
                    param_grids[model_key] = param_grid

                elif model_key == "DecisionTreeRegressor":
                    model = DecisionTreeRegressor(random_state=42)
                    param_grid = {
                        'model__max_depth': range(model_info['min_depth'], model_info['max_depth']+1, 5),
                        'model__min_samples_leaf': range(model_info['min_samples_per_leaf_min_value'], model_info['min_samples_per_leaf_max_value']+1, 5)
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

                elif model_key == "SVR":
                    model = SVR()
                    param_grid = {
                        'model__C': [0.1, 1, 10],
                        'model__gamma': ['scale', 'auto']
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

            elif prediction_type == "Classification":
                if model_key == "RandomForestClassifier":
                    model = RandomForestClassifier(random_state=42)
                    param_grid = {
                        'model__n_estimators': range(model_info['min_trees'], model_info['max_trees']+1, 10),
                        'model__max_depth': range(model_info['min_depth'], model_info['max_depth']+1, 5),
                        'model__min_samples_leaf': range(model_info['min_samples_per_leaf_min_value'], model_info['min_samples_per_leaf_max_value']+1, 5)
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

                elif model_key == "LogisticRegression":
                    model = LogisticRegression(max_iter=10000)
                    param_grid = {
                        'model__C': [0.1, 1, 10],
                        'model__solver': ['liblinear', 'lbfgs']
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

                elif model_key == "DecisionTreeClassifier":
                    model = DecisionTreeClassifier(random_state=42)
                    param_grid = {
                        'model__max_depth': range(model_info['min_depth'], model_info['max_depth']+1, 5),
                        'model__min_samples_leaf': range(model_info['min_samples_per_leaf_min_value'], model_info['min_samples_per_leaf_max_value']+1, 5)
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

                elif model_key == "SVC":
                    model = SVC()
                    param_grid = {
                        'model__C': [0.1, 1, 10],
                        'model__gamma': ['scale', 'auto']
                    }
                    models.append(model)
                    param_grids[model_key] = param_grid

    return models, param_grids

# Step 4: Create the models and parameter grids
model_objects, param_grids = create_model_objects(config, prediction_type)

# Step 5: Fit models using GridSearchCV and make predictions
X = df.drop(columns=[target_variable])
y = df[target_variable]

best_models = []
for model, param_grid in zip(model_objects, param_grids.values()):
    # Create a pipeline that includes preprocessing and the model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error' if prediction_type == 'Regression' else 'accuracy')
    grid_search.fit(X, y)
    best_models.append(grid_search.best_estimator_)
    print(f"Best parameters for {type(model).__name__}: {grid_search.best_params_}")

# Step 6: Make predictions with the best models and log metrics
predictions = {}
for best_model in best_models:
    pred = best_model.predict(X)
    predictions[type(best_model.named_steps['model']).__name__] = pred

# Log metrics
for model_name, pred in predictions.items():
    if prediction_type == "Regression":
        mse = mean_squared_error(y, pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y, pred)
        r2 = r2_score(y, pred)
        print(f"{model_name} - MSE: {mse}, RMSE: {rmse}, MAE: {mae}, R²: {r2}")
    elif prediction_type == "Classification":
        accuracy = accuracy_score(y, pred)
        precision = precision_score(y, pred, average='macro')
        recall = recall_score(y, pred, average='macro')
        f1 = f1_score(y, pred, average='macro')
        print(f"{model_name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

# Output the predictions
for model_name, pred in predictions.items():
    print(f"Predictions for {model_name}: {pred}")


Best parameters for RandomForestRegressor: {'model__max_depth': 20, 'model__min_samples_leaf': 5, 'model__n_estimators': 20}
RandomForestRegressor - MSE: 0.019268274620558613, RMSE: 0.1388102107935818, MAE: 0.10395376163868812, R²: 0.9666945140061451
Predictions for RandomForestRegressor: [0.24897008 0.17452089 0.20873373 0.18459271 0.22793789 0.33564868
 0.23166045 0.23253419 0.19647637 0.16139195 0.26921806 0.26966758
 0.18457155 0.19897637 0.2573159  0.27219425 0.2573159  0.24897008
 0.33564868 0.26283808 0.32287325 0.25986189 0.23082712 0.30212578
 0.28572313 0.25255717 0.27855646 0.25041895 0.23848931 0.26661676
 0.25264051 0.26027298 0.25335731 0.2573159  0.16139195 0.193282
 0.2545613  0.16139195 0.19897637 0.25223305 0.23427122 0.19897637
 0.21092123 0.29359813 0.32800222 0.18457155 0.31578    0.20842123
 0.25997203 0.20485397 1.51128149 1.49669816 1.59200663 1.15960363
 1.4207612  1.42560796 1.52361483 1.02702048 1.42667029 1.25893994
 1.06271492 1.31951723 1.14952946 1.450432