#  Assignment

In [None]:
import pandas as pd
import json
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVC, SVR
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import GridSearchCV


 1) Read the target and type of regression to be run

In [None]:
def read_target_and_prediction_type(json_data):
    """Extract the target column name and the prediction type from the config.

    Both values live under ``json_data['design_state_data']['target']``, in
    the ``'target'`` and ``'prediction_type'`` entries respectively.

    Args:
        json_data: Parsed configuration (nested mapping).

    Returns:
        A ``(target, prediction_type)`` tuple.

    Raises:
        KeyError: If any of the expected keys is missing.
    """
    target_section = json_data['design_state_data']['target']
    return target_section['target'], target_section['prediction_type']

2) Read the features (which are column names in the CSV), figure out which missing-value imputation needs to be applied, and apply it to the corresponding columns loaded in a dataframe

In [None]:
def read_features_and_impute(json_data, data):
    """Fill missing values in the selected numerical feature columns.

    For every entry of ``json_data['design_state_data']['feature_handling']``
    that is selected, numerical, and whose ``missing_values`` setting is
    ``'Impute'``, the NaNs of the matching column in ``data`` are replaced:

    * with the column mean when ``feature_details['impute_with']`` is
      ``'Average of values'``;
    * otherwise with the constant ``feature_details['impute_value']``.

    NOTE(review): the optional ``'impute_with'`` key and its
    ``'Average of values'`` value are assumed from the UI schema, which is
    not visible here -- confirm against a real config. When the key is
    absent the constant ``impute_value`` is used.

    Args:
        json_data: Parsed configuration (nested mapping).
        data: DataFrame holding the feature columns. It is modified in place.

    Returns:
        The same ``data`` object, with the imputed columns updated.

    Raises:
        KeyError: If a required config key is missing, or a feature that
            needs imputation is not a column of ``data``.
    """
    feature_handling = json_data['design_state_data']['feature_handling']
    for feature, details in feature_handling.items():
        if not details['is_selected']:
            continue
        if details['feature_variable_type'] != 'numerical':
            continue
        feature_details = details['feature_details']
        if feature_details['missing_values'] != 'Impute':
            continue

        if feature_details.get('impute_with') == 'Average of values':
            # The mean ignores NaNs; an all-NaN column yields NaN and is
            # therefore left unchanged by fillna below.
            fill_value = data[feature].mean()
        else:
            fill_value = feature_details['impute_value']
        # A plain fillna does the same job as fitting an imputer and keeps
        # the result a 1-D column.
        data[feature] = data[feature].fillna(fill_value)
    return data

3) Compute feature reduction based on the input. See the screenshot below, where the options are No Reduction, Corr with Target, Tree-based, and PCA. Please make sure you write code so that all options work: if we rerun your code with a different JSON, it should still work if we switch, say, No Reduction to PCA.

In [None]:
def feature_reduction(json_data, data):
    """Reduce the feature columns of ``data`` as requested by the config.

    The method is read from
    ``json_data['design_state_data']['feature_reduction']['feature_reduction_method']``
    and the target column from ``json_data['design_state_data']['target']``.

    Supported methods:

    * ``'Corr with Target'`` -- keep the ``num_of_features_to_keep`` numeric
      features with the largest absolute correlation with the target. A
      non-numeric target is integer-coded with ``pd.factorize`` first.
    * ``'Tree-based'`` -- keep the ``num_of_features_to_keep`` numeric
      features with the highest extra-trees importance (``num_of_trees``
      trees of depth ``depth_of_trees``). A regressor is used when the
      prediction type is ``'Regression'``, a classifier otherwise.
    * ``'PCA'`` -- replace the numeric features with principal components
      named ``pc_1``, ``pc_2``, ... The number of components is
      ``num_of_features_to_keep`` when present (capped at what the data
      allows), else scikit-learn's ``'mle'`` estimate.
    * Anything else (e.g. ``'No Reduction'``) -- ``data`` is returned as is.

    Only numeric feature columns are ranked or transformed; non-numeric
    feature columns and the target column are always carried over unchanged.

    Args:
        json_data: Parsed configuration (nested mapping).
        data: DataFrame containing the feature columns and the target column.

    Returns:
        A DataFrame with the reduced features plus the target column. The
        input frame is not modified.

    Raises:
        KeyError: If a required config key or the target column is missing.
        ValueError: If ``num_of_features_to_keep`` is smaller than 1.
    """
    design_state = json_data['design_state_data']
    config = design_state['feature_reduction']
    method = config['feature_reduction_method']
    if method not in ('Corr with Target', 'Tree-based', 'PCA'):
        return data

    target_config = design_state['target']
    target = target_config['target']
    labels = data[target]
    # Work on the feature columns only, so scores map back to the right names
    # wherever the target column sits in the frame.
    features = data.drop(columns=[target]).select_dtypes(include='number')
    if features.shape[1] == 0:
        return data

    if method == 'PCA' and 'num_of_features_to_keep' not in config:
        num_to_keep = None
    else:
        num_to_keep = int(config['num_of_features_to_keep'])
        if num_to_keep < 1:
            raise ValueError("num_of_features_to_keep must be at least 1.")

    if method == 'PCA':
        if num_to_keep is None:
            n_components = 'mle'
        else:
            # PCA cannot produce more components than features or samples.
            n_components = min(num_to_keep, features.shape[1], len(features))
        components = PCA(n_components=n_components).fit_transform(features)
        reduced = pd.DataFrame(
            components,
            index=data.index,
            columns=[f'pc_{i}' for i in range(1, components.shape[1] + 1)],
        )
        carried_over = data.drop(columns=list(features.columns))
        return pd.concat([reduced, carried_over], axis=1)

    if method == 'Tree-based':
        if target_config.get('prediction_type') == 'Regression':
            forest_cls = ExtraTreesRegressor
        else:
            forest_cls = ExtraTreesClassifier
        forest = forest_cls(
            n_estimators=int(config['num_of_trees']),
            max_depth=int(config['depth_of_trees']),
        )
        forest.fit(features, labels)
        scores = pd.Series(forest.feature_importances_, index=features.columns)
    else:  # 'Corr with Target'
        if pd.api.types.is_numeric_dtype(labels):
            numeric_labels = labels
        else:
            numeric_labels = pd.Series(pd.factorize(labels)[0], index=labels.index)
        # Constant columns have undefined correlation; rank them last.
        scores = features.corrwith(numeric_labels).abs().fillna(0.0)

    kept = set(scores.nlargest(num_to_keep).index)
    rejected = [column for column in features.columns if column not in kept]
    return data.drop(columns=rejected)

4) Parse the JSON and make the model objects (using sklearn) that can handle what is required by the “prediction_type” specified in the JSON (see #1, where “prediction_type” is specified). Keep in mind not to pick models that don’t apply to the specified prediction_type.

In [None]:
def create_model_objects(prediction_type):
    """Build the candidate estimators for the requested prediction type.

    Args:
        prediction_type: Either ``'Regression'`` or ``'Classification'``.

    Returns:
        A list of ``(name, estimator)`` tuples, each estimator freshly
        constructed with its default hyper-parameters.

    Raises:
        ValueError: If ``prediction_type`` is neither of the two values.
    """
    if prediction_type not in ('Regression', 'Classification'):
        raise ValueError("Invalid prediction_type specified in the JSON.")

    if prediction_type == 'Regression':
        estimator_classes = {
            'RandomForestRegressor': RandomForestRegressor,
            'GradientBoostingRegressor': GradientBoostingRegressor,
            'LinearRegression': LinearRegression,
            'RidgeRegression': Ridge,
            'LassoRegression': Lasso,
            'ElasticNetRegression': ElasticNet,
            'SGDRegressor': SGDRegressor,
            'KNNRegressor': KNeighborsRegressor,
            'ExtraTreesRegressor': ExtraTreesRegressor,
            'NeuralNetworkRegressor': MLPRegressor,
        }
    else:
        estimator_classes = {
            'RandomForestClassifier': RandomForestClassifier,
            'GradientBoostingClassifier': GradientBoostingClassifier,
            'LogisticRegression': LogisticRegression,
            'SVM': SVC,
            'SGDClassifier': SGDClassifier,
            'KNNClassifier': KNeighborsClassifier,
            'ExtraTreesClassifier': ExtraTreesClassifier,
            'NeuralNetworkClassifier': MLPClassifier,
        }

    # Dicts preserve insertion order, so the list order matches the table.
    return [(label, estimator_cls()) for label, estimator_cls in estimator_classes.items()]

5) Run the fit and predict on each model – keep in mind that you need to do hyperparameter tuning, i.e., use GridSearchCV

In [None]:
def run_model_fit_and_predict(models, data, target, json_data=None, param_grids=None):
    """Grid-search every selected model and report its best CV result.

    A model is run only when ``json_data['design_state_data']['algorithms']``
    has an entry under the model's name whose ``'is_selected'`` is truthy;
    models with no entry are skipped instead of raising ``KeyError``.

    The scoring metric follows the configured prediction type: ``'accuracy'``
    for ``'Classification'``, ``'neg_mean_squared_error'`` otherwise.

    Args:
        models: Iterable of ``(name, estimator)`` tuples.
        data: DataFrame with the feature columns and the target column.
        target: Name of the target column in ``data``.
        json_data: Parsed configuration. When omitted, a module-level
            ``json_data`` variable is used, which keeps existing
            three-argument callers working.
        param_grids: Optional mapping ``name -> parameter grid``. Each
            estimator is wrapped in a pipeline step called ``'model'``, so
            grid keys must be prefixed with ``model__`` (for example
            ``{'model__n_estimators': [50, 100]}``). Models without an entry
            are searched over an empty grid, i.e. their defaults.

    Returns:
        A dict mapping each run model's name to its fitted ``GridSearchCV``.

    Raises:
        ValueError: If no configuration is passed and none exists at module
            level.
        KeyError: If the configuration lacks the ``algorithms`` section.
    """
    if json_data is None:
        json_data = globals().get('json_data')
        if json_data is None:
            raise ValueError(
                "No configuration available: pass json_data explicitly."
            )

    design_state = json_data['design_state_data']
    algorithms = design_state['algorithms']
    prediction_type = design_state.get('target', {}).get('prediction_type')
    if prediction_type == 'Classification':
        scoring, score_label = 'accuracy', 'Accuracy'
    else:
        scoring, score_label = 'neg_mean_squared_error', 'Negative MSE'

    selected = [
        (model_name, model)
        for model_name, model in models
        if (algorithms.get(model_name) or {}).get('is_selected')
    ]
    results = {}
    if not selected:
        return results

    # Split once; the same matrices are reused for every model.
    features = data.drop(columns=[target])
    labels = data[target]
    grids = param_grids or {}

    for model_name, model in selected:
        pipeline = Pipeline([
            ('model', model),
        ])
        grid_search = GridSearchCV(
            pipeline,
            grids.get(model_name, {}),
            cv=5,
            scoring=scoring,
        )
        grid_search.fit(features, labels)

        print(f"Model: {model_name}")
        print("Best Parameters:", grid_search.best_params_)
        print(f"Best Score ({score_label}):", grid_search.best_score_)
        print("\n")
        results[model_name] = grid_search

    return results

6) Log to the console the standard model metrics that apply

In [None]:
 mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"Model Metrics for {model.named_steps['model']} - MSE: {mse}, R^2: {r2}")
