### Import the necessary packages

In [None]:
# Warnings alert the developer to situations that are not necessarily exceptions.
# A warning is usually issued when a programming element is obsolete or deprecated.
# A Python program terminates immediately when an unhandled error occurs; a warning, by contrast, is not critical.
# Note: the 'ignore' filter below suppresses every warning raised after this cell runs.

import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost
from xgboost import XGBRegressor
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Import data from Feature Selection Split 3

In [None]:
# Restore (-r) the previously stored DataFrames from the IPython %store database
# into this kernel's namespace.
%store -r df_train_selected
%store -r df_val_selected
%store -r df_test_selected
%store -r df_validation1

In [None]:
# Restore the scaler object and the input/target column selectors from the
# IPython %store database (presumably saved by the feature-selection notebook
# named in the section header -- confirm).
%store -r scaler
%store -r input_col1
# Un-suffixed alias for the restored input column selector.
input_col = input_col1
%store -r target_col1
# Un-suffixed alias for the restored target column selector.
target_col = target_col1

In [None]:
# Separate every restored DataFrame into model inputs (x_*) and targets (y_*)
# by selecting the input_col1 / target_col1 columns.

# Training data (df_train_selected)
x_train, y_train = df_train_selected[input_col1], df_train_selected[target_col1]

# Validation data (df_val_selected)
x_val, y_val = df_val_selected[input_col1], df_val_selected[target_col1]

# Testing data (df_test_selected)
x_test, y_test = df_test_selected[input_col1], df_test_selected[target_col1]

# Additional validation data (df_validation1)
x_validation, y_validation = df_validation1[input_col1], df_validation1[target_col1]

### Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

# Random forest with fixed hyperparameters; it serves both as the source of the
# impurity-based importances (fit on all features) and as the RFE base estimator.
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42,
)
rf.fit(x_train, y_train)

# Recursive feature elimination: one feature is removed per iteration (step=1)
# until n_features_to_select remain. Adjust n_features_to_select to keep more
# or fewer features.
rfe = RFE(estimator=rf, n_features_to_select=5, step=1)
rfe.fit(x_train, y_train)

# Collect, per feature: its name, its RFE rank (lower is better) and the
# importance reported by the forest fitted on the full feature set above.
feature_names = x_train.columns
ranking = rfe.ranking_
importance = rf.feature_importances_

feature_rank_importance = pd.DataFrame(
    {'Feature': feature_names, 'Rank': ranking, 'Importance': importance}
)

# Show the five best-ranked features
print(feature_rank_importance.sort_values('Rank').head(5))

### Sobol and Morris Sensitivity Analysis

In [None]:
import SALib
import numpy as np
import pandas as pd
from SALib.sample import saltelli, morris as morris_sample
from SALib.analyze import sobol, morris
from sklearn.ensemble import RandomForestRegressor

# Requires x_train and y_train from the data-splitting cell above.

# Surrogate model whose predictions are analysed: a default-hyperparameter
# random forest (seeded for reproducibility) fitted on the training data.
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)

# SALib problem definition: one variable per x_train column, each bounded by
# the [min, max] range observed for that column in x_train.
problem = {
    'num_vars': len(x_train.columns),
    'names': x_train.columns.tolist(),
    'bounds': np.column_stack((x_train.min().values, x_train.max().values)).tolist()
}

# Generate samples for Sobol SA with the Saltelli sampler; the second argument
# (1000) controls the sample size -- raise or lower it for more/fewer samples.
sobol_samples = saltelli.sample(problem, 1000)

# Generate samples for Morris SA (second argument 1000, 4 grid levels).
# NOTE(review): no seed is passed here, so these samples may differ between runs -- confirm.
morris_samples = morris_sample.sample(problem, 1000, num_levels=4)

# Evaluate the Sobol samples with the fitted model. Sample columns follow the
# order of problem['names'], i.e. the x_train column order.
sobol_y = model.predict(sobol_samples)

# Evaluate the Morris samples with the same model.
morris_y = model.predict(morris_samples)

# Perform Sobol SA on the model outputs and print the indices.
sobol_analysis = sobol.analyze(problem, sobol_y, print_to_console=True)

# Perform Morris SA (needs both the samples and their outputs) and print the results.
morris_analysis = morris.analyze(problem, morris_samples, morris_y, print_to_console=True)

### Grid Search for Random Forest, Extreme Gradient Boosting (XGBoost) and AdaBoost

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Hyperparameter grid for the random forest regressor.
# max_features=1.0 (consider every feature at each split) replaces the legacy
# 'auto' alias, which meant the same thing for regressors but was deprecated in
# scikit-learn 1.1 and removed in 1.3. On newer versions every 'auto' candidate
# fails to fit and is scored NaN; with warnings globally ignored in this
# notebook, that silently discards half of this grid.
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, None],
    'max_features': [1.0, 'sqrt'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Hyperparameter grid for the XGBoost regressor
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.5],
    'subsample': [0.5, 0.8, 1],
    'colsample_bytree': [0.5, 0.8, 1]
}

# Hyperparameter grid for the AdaBoost regressor
ada_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'loss': ['linear', 'square', 'exponential']
}

# Base estimators to tune, all seeded for reproducibility
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
ada = AdaBoostRegressor(random_state=42)

# All grids in one list. Not used by the loop below; kept because other cells
# may reference it.
param_grids = [rf_param_grid, xgb_param_grid, ada_param_grid]

# Each estimator paired with its own parameter grid
models = [(rf, rf_param_grid), (xgb, xgb_param_grid), (ada, ada_param_grid)]

# For every (estimator, grid) pair: run an exhaustive 5-fold cross-validated
# search on the training data (scored by negative MSE, all CPU cores), then
# report the best hyperparameters and the MSE of the refitted best estimator
# on the held-out validation split.
# Note: the loop rebinds the global name `model`.
for model, param_grid in models:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(x_train, y_train)
    print(f"Best hyperparameters for {type(model).__name__}:", grid_search.best_params_)

    # Predict on the validation set using the best model
    val_preds = grid_search.best_estimator_.predict(x_val)

    # Calculate the mean squared error on the validation set
    val_mse = mean_squared_error(y_val, val_preds)
    print(f'Validation MSE for {type(model).__name__}:', val_mse)