In [9]:
# Importing Libraries
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score

# Load JSON file with error handling
try:
    with open('algoparams_from_ui.json') as f:
        params = json.load(f)
except FileNotFoundError:
    raise FileNotFoundError("The JSON file was not found.")
except json.JSONDecodeError:
    raise ValueError("Error decoding JSON file. Please check the file format.")

# Load CSV file with error handling
try:
    data = pd.read_csv('iris.csv')
except FileNotFoundError:
    raise FileNotFoundError("The CSV file was not found.")
except pd.errors.ParserError:
    raise ValueError("Error parsing CSV file. Please check the file format.")

# Validate the JSON structure
required_keys = ['design_state_data']
if not all(key in params for key in required_keys):
    raise KeyError("Missing required keys in JSON.")

design_data = params['design_state_data']

# Validate target and prediction type
if 'target' not in design_data or 'prediction_type' not in design_data['target']:
    raise KeyError("Missing target or prediction type information in JSON.")

target = design_data['target']['target']
prediction_type = design_data['target']['prediction_type']

# Validate features
if 'feature_handling' not in design_data:
    raise KeyError("Missing feature handling information in JSON.")

features = [
    feature for feature in design_data['feature_handling'].keys()
    if design_data['feature_handling'][feature]['is_selected']
]

if not features:
    raise ValueError("No features selected. Please select at least one feature.")

# Prepare the data
X = data[features]
y = data[target]

# Encode the target variable if it's categorical
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()

# Impute missing values with validation
imputer_strategies = {}
for feature in features:
    if feature in design_data['feature_handling']:
        feature_details = design_data['feature_handling'][feature]['feature_details']
        if 'missing_values' in feature_details and feature_details['missing_values'] == 'Impute':
            strategy = 'mean' if feature_details['impute_with'] == 'Average of values' else 'constant'
            fill_value = feature_details.get('impute_value', 0)
            imputer_strategies[feature] = (strategy, fill_value)

# Transformers for the pipeline
transformers = []

for feature in numerical_features:
    if feature in imputer_strategies:
        strategy, fill_value = imputer_strategies[feature]
        if strategy == 'mean':
            transformers.append((feature, SimpleImputer(strategy='mean'), [feature]))
        else:
            transformers.append((feature, SimpleImputer(strategy='constant', fill_value=fill_value), [feature]))
    else:
        transformers.append((feature, SimpleImputer(strategy='mean'), [feature]))

for feature in categorical_features:
    transformers.append((feature, OneHotEncoder(handle_unknown='ignore'), [feature]))

preprocessor = ColumnTransformer(transformers, remainder='passthrough')

# Feature reduction with validation
if 'feature_reduction' not in design_data:
    raise KeyError("Missing feature reduction information in JSON.")

feature_reduction_method = design_data['feature_reduction'].get('feature_reduction_method', 'No Reduction')

if feature_reduction_method == 'PCA':
    n_components = int(design_data['feature_reduction'].get('num_of_features_to_keep', 2))
    feature_reduction = PCA(n_components=n_components)
elif feature_reduction_method == 'Tree-based':
    tree_model = DecisionTreeRegressor(max_depth=5)
    feature_reduction = SelectFromModel(tree_model)
elif feature_reduction_method == 'Corr with Target':
    corr_threshold = float(design_data['feature_reduction'].get('correlation_threshold', 0.1))
    corrs = X.corrwith(pd.Series(y))
    selected_features = corrs[abs(corrs) > corr_threshold].index.tolist()
    feature_reduction = ColumnTransformer([(col, 'passthrough', [col]) for col in selected_features], remainder='drop')
else:  # No Reduction
    feature_reduction = 'passthrough'

# Model selection with validation
if prediction_type == 'Regression':
    if 'algorithms' not in design_data:
        raise KeyError("Missing algorithms information in JSON.")
    if design_data['algorithms']['RandomForestRegressor']['is_selected']:
        model = RandomForestRegressor(
            n_estimators=design_data['algorithms']['RandomForestRegressor'].get('max_trees', 100),
            max_depth=design_data['algorithms']['RandomForestRegressor'].get('max_depth', None)
        )
    else:
        raise ValueError("No valid regression algorithm selected.")
else:
    raise ValueError("Unsupported prediction type. Only 'Regression' is supported.")

# Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_reduction', feature_reduction),
    ('model', model)
])

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'model__n_estimators': [10, 20, 30],
    'model__max_depth': [10, 20, 30]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model with error handling
try:
    grid_search.fit(X_train, y_train)
except ValueError as e:
    raise ValueError(f"Error during model fitting: {e}")

# Predict
y_pred = grid_search.predict(X_test)

# Metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 7.500000000000044e-05
R^2 Score: 0.9998820115716058
