In [24]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.datasets import load_diabetes

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate

### Create a data set
- Load the Diabetes data bunch from SciKit Learn
    - The Diabetes data bunch is located in sklearn.datasets
    - Pass $as\_frame=True$ to the load function (`load_diabetes`) so the data is returned as pandas objects, which eases creating a DataFrame
- Create a Pandas DataFrame
- Show the first few rows

In [25]:
# Load the diabetes data bunch; as_frame=True asks scikit-learn for pandas objects
diabetes = load_diabetes(as_frame=True)

# Build the working DataFrame from the feature matrix, one column per feature name
feature_columns = diabetes.feature_names
diabetes_df = pd.DataFrame(data=diabetes.data, columns=feature_columns)

# Preview the first few rows
first_rows = diabetes_df.head()
print(first_rows)


        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  
0 -0.002592  0.019907 -0.017646  
1 -0.039493 -0.068332 -0.092204  
2 -0.002592  0.002861 -0.025930  
3  0.034309  0.022688 -0.009362  
4 -0.002592 -0.031988 -0.046641  


### Split the DataFrame into Training and Testing
- The target variable is "target"
- Set aside 20% of the data for testing
- Show the dimensions of the Training and Testing Input data sets

In [31]:
# Attach the response so the features and the target live in one DataFrame
diabetes_df["target"] = diabetes.target

# Separate the response from the inputs (every column except "target")
y = diabetes_df["target"]
X = diabetes_df.drop(columns="target")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=204,
)

# Report the dimensions of the training and testing input sets
print("\nDataset partitioning:")
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")




Dataset partitioning:
Training set shape: (353, 10)
Testing set shape: (89, 10)


### Construct a simple model
- Instantiate a Decision Tree Regressor with default hyperparameter settings
- Fit the regressor with the Training data
- Extract and show the depth of the tree

In [32]:
# Decision Tree Regressor with default hyperparameters (only the seed is pinned
# for reproducibility), fitted on the training data. fit() returns the fitted
# estimator itself, so construction and fitting are chained.
dt_model = DecisionTreeRegressor(random_state=204).fit(X_train, y_train)

# Depth of the fully grown tree; later cells use it as the upper bound
# of the max_depth search.
tree_depth = dt_model.get_depth()
print("Depth of the Decision Tree:", tree_depth)



Depth of the Decision Tree: 18


### Evaluate the performance of the simple model 
- Calculate the RMSE on the Training and Testing Set
- Save the RMSE on the Testing Data into a variable for later use

In [34]:


# Predict with the default tree on both splits
y_train_pred = dt_model.predict(X_train)
y_test_pred = dt_model.predict(X_test)

# RMSE is the square root of the mean squared error
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

# Keep the default model's testing RMSE for the later comparison with the tuned model
simple_testing_rmse = rmse_test

# Report both values
print("RMSE on Training Set:", rmse_train)
print("RMSE on Testing Set:", rmse_test)

# Notes
# A zero training error together with a much higher testing error indicates
# overfitting: the tree is too complex, has learned the noise in the training
# data, does not generalize well, and therefore needs to be simplified.

RMSE on Training Set: 0.0
RMSE on Testing Set: 72.1490342698183


### Calculate the baseline metrics from cross validation
- Execute $cross\_validate$, with the decision tree regressor instantiated above, the Training Data, cv=3, and the following evaluation metrics:
    - Root Mean Squared Error
    - Mean Absolute Error
- Save the results into a variable
- Show the results

In [36]:

def _rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Custom scorers for RMSE and MAE.
# make_scorer is used with its defaults here, so the scores reported below are
# the raw (positive) error values -- no sign flip is needed when printing them.
rmse_scorer = make_scorer(_rmse)
mae_scorer = make_scorer(mean_absolute_error)

# Baseline 3-fold cross-validation of the default tree on the training data
cv_results = cross_validate(
    dt_model,       # The Decision Tree Regressor
    X_train,        # Training input data
    y_train,        # Training target data
    cv=3,           # Number of cross-validation folds
    scoring={
        'rmse': rmse_scorer,  # RMSE scorer
        'mae': mae_scorer     # MAE scorer
    },
    return_train_score=True  # Also report the scores on the training folds
)

# Tabulate the per-fold results (one row per fold)
cv_results_df = pd.DataFrame(cv_results)
print(cv_results_df)

# Mean score across folds, labelled by split so that the training and
# validation figures cannot be confused with each other
print("\nMean CV Scores:")
print(f"Mean Test RMSE: {cv_results['test_rmse'].mean()}")
print(f"Mean Test MAE: {cv_results['test_mae'].mean()}")
print(f"Mean Train RMSE: {cv_results['train_rmse'].mean()}")
print(f"Mean Train MAE: {cv_results['train_mae'].mean()}")

   fit_time  score_time  test_rmse  train_rmse   test_mae  train_mae
0  0.041466    0.034521  80.685149         0.0  65.516949        0.0
1  0.019363    0.006017  79.191198         0.0  60.161017        0.0
2  0.012376    0.004987  82.546905         0.0  64.897436        0.0

Mean CV Scores:
Mean RMSE: 80.8077502562048
Mean MAE: 63.52513399971027
Mean RMSE: 0.0
Mean MAE: 0.0


- Show the mean and standard deviation of the RMSE across the cross-validation folds, along with the RMSE of each individual fold

In [39]:
# Per-fold RMSE scores from the baseline cross-validation
train_rmse_scores = cv_results['train_rmse']
test_rmse_scores = cv_results['test_rmse']

# Mean and standard deviation of the RMSE across the folds
train_rmse_mean = train_rmse_scores.mean()
train_rmse_std = train_rmse_scores.std()
test_rmse_mean = test_rmse_scores.mean()
test_rmse_std = test_rmse_scores.std()

# Show the summary statistics
print("Cross-Validation RMSE Results:")
print(f"Train RMSE (mean): {train_rmse_mean:.4f}")
print(f"Train RMSE (std): {train_rmse_std:.4f}")
print(f"Test RMSE (mean): {test_rmse_mean:.4f}")
print(f"Test RMSE (std): {test_rmse_std:.4f}")

# Show RMSE for each fold.
# The f-string is kept on a single line: a line break inside a replacement
# field of a single-quoted f-string is a SyntaxError before Python 3.12.
print("\nRMSE for Each Fold:")
for fold, (train_rmse, test_rmse) in enumerate(zip(train_rmse_scores, test_rmse_scores), start=1):
    print(f"Fold {fold}: Train RMSE = {train_rmse:.4f}, Test RMSE = {test_rmse:.4f}, ")

Cross-Validation RMSE Results:
Train RMSE (mean): 0.0000
Train RMSE (std): 0.0000
Test RMSE (mean): 80.8078
Test RMSE (std): 1.3727

RMSE for Each Fold:
Fold 1: Train RMSE = 0.0000, Test RMSE = 80.6851, 
Fold 2: Train RMSE = 0.0000, Test RMSE = 79.1912, 
Fold 3: Train RMSE = 0.0000, Test RMSE = 82.5469, 


### Brute Force identification of the optimal max_depth setting
- Write a loop that will iteratively execute $cross\_validate$ using the decision tree regressor and with the max_depth setting in the range from one to the maximum depth found above
    - Tip: The max_depth hyperparameter can be updated in the loop by calling $set\_params()$
    - Within $cross\_validate$, use three folds, the Training Data, and calculate the RMSE scores
    - For each value of depth, print the depth and average RMSE
    - From the output printed, identify the optimal depth for each metric.

In [46]:
# Track the depth with the lowest mean cross-validated RMSE
best_depth = None
best_rmse = float('inf')

# The loop changes max_depth on the shared estimator; remember the current
# setting so it can be restored afterwards and no hidden state is left behind.
original_max_depth = dt_model.get_params()["max_depth"]

# Try every max_depth from 1 up to the depth of the fully grown tree
for depth in range(1, tree_depth + 1):
    # Update the max_depth hyperparameter
    dt_model.set_params(max_depth=depth)

    # 3-fold cross-validation on the training data. The RMSE scorer built for
    # the baseline cross-validation is reused instead of being re-created on
    # every iteration, and only the validation scores are requested because the
    # training scores are not used here. A dedicated result name keeps the
    # baseline `cv_results` intact for the cells that read it.
    depth_cv_results = cross_validate(
        dt_model,
        X_train,
        y_train,
        cv=3,
        scoring={'rmse': rmse_scorer},
    )

    # Average RMSE over the validation folds
    avg_test_rmse = depth_cv_results['test_rmse'].mean()

    # Print the depth and average RMSE
    print(f"Depth: {depth}, Average Test RMSE: {avg_test_rmse:.4f}")

    # Keep this depth if it is the best so far
    if avg_test_rmse < best_rmse:
        best_rmse = avg_test_rmse
        best_depth = depth

# Put the shared estimator back to the setting it had before the loop
dt_model.set_params(max_depth=original_max_depth)

# Print the optimal depth and corresponding RMSE (single-line f-string: a line
# break inside a replacement field is a SyntaxError before Python 3.12)
print(f"\nOptimal Depth: {best_depth}, Best Average Test RMSE: {best_rmse:.4f}")

Depth: 1, Average Test RMSE: 67.4903
Depth: 2, Average Test RMSE: 63.3295
Depth: 3, Average Test RMSE: 62.9330
Depth: 4, Average Test RMSE: 67.2458
Depth: 5, Average Test RMSE: 70.0244
Depth: 6, Average Test RMSE: 73.4565
Depth: 7, Average Test RMSE: 75.0895
Depth: 8, Average Test RMSE: 78.8285
Depth: 9, Average Test RMSE: 80.1774
Depth: 10, Average Test RMSE: 81.3454
Depth: 11, Average Test RMSE: 82.3884
Depth: 12, Average Test RMSE: 83.3118
Depth: 13, Average Test RMSE: 82.8036
Depth: 14, Average Test RMSE: 81.8235
Depth: 15, Average Test RMSE: 80.5003
Depth: 16, Average Test RMSE: 81.4539
Depth: 17, Average Test RMSE: 81.4878
Depth: 18, Average Test RMSE: 81.5140

Optimal Depth: 3, Best Average Test RMSE: 62.9330


### Use GridSearchCV to identify the optimal depth
- Construct a grid to search the max_depth setting from one to the maximum depth found above
- Instantiate GridSearchCV with the following:
    - estimator = decision tree regressor
    - param_grid = the above grid for max_depth
    - cv = 3
    - refit = 'neg_root_mean_squared_error'
    - scoring= ['neg_root_mean_squared_error', 'neg_mean_absolute_error']
- Call .fit() with the Training Data
- With multiple scoring metrics being calculated, .best_params_ and .best_score_ correspond to the refit strategy
- Compare .best_params_ and .best_score_ to the values found from brute force calculations

In [48]:

# Grid of max_depth values: 1 up to the depth of the fully grown tree
param_grid = {'max_depth': list(range(1, tree_depth + 1))}

# Grid search with 3-fold cross-validation and two metrics; with several
# metrics, `refit` names the one that drives best_params_ / best_score_.
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    cv=3,
    refit='neg_root_mean_squared_error',  # Refit using RMSE
    scoring=['neg_root_mean_squared_error',
             'neg_mean_absolute_error'],
    return_train_score=True
)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Extract the best parameters and best score.
# The refit metric is the *negated* RMSE, so flip the sign to report an RMSE.
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

# Print the best parameters and best score
print("Best Parameters from GridSearchCV:", best_params)
print("Best RMSE from GridSearchCV:", best_score)

# Compare to brute-force results (f-strings kept on one line each: a line break
# inside a replacement field is a SyntaxError before Python 3.12)
print("\nComparison to Brute-Force Results:")
print(f"Brute-Force Optimal Depth: {best_depth}, Best RMSE: {best_rmse:.4f}")
print(f"GridSearchCV Optimal Depth: {best_params['max_depth']}, Best RMSE: {best_score:.4f}")

Best Parameters from GridSearchCV: {'max_depth': 3}
Best RMSE from GridSearchCV: 62.93302278782746

Comparison to Brute-Force Results:
Brute-Force Optimal Depth: 3, Best RMSE: 62.9330
GridSearchCV Optimal Depth: 3, Best RMSE: 62.9330


### Evaluate the final model
- Assume identifying the optimal value of max_depth will construct the optimal tree
- Calculate the RMSE on the Training and Testing Set from the tree found from Grid Search above
- Compare the RMSE on the Testing Data from the default setting and the tuned setting 

In [50]:
# Rebuild the tree at the depth selected by the grid search, with the same seed
tuned_depth = best_params['max_depth']
final_model = DecisionTreeRegressor(random_state=204, max_depth=tuned_depth)

# Train on the training split
final_model.fit(X_train, y_train)

# Predictions of the tuned tree on both splits
y_train_pred_final = final_model.predict(X_train)
y_test_pred_final = final_model.predict(X_test)

# RMSE (square root of the MSE) of the tuned tree on each split
mse_train_final = mean_squared_error(y_train, y_train_pred_final)
mse_test_final = mean_squared_error(y_test, y_test_pred_final)
rmse_train_final = np.sqrt(mse_train_final)
rmse_test_final = np.sqrt(mse_test_final)

# Default vs. tuned RMSE: testing figures first, then training figures
rmse_comparison = {
    "RMSE from Testing Data with default settings:": rmse_test,
    "RMSE from Testing Data after tuning settings:": rmse_test_final,
    "RMSE from Training Data with default settings:": rmse_train,
    "RMSE from Training Data after tuning settings:": rmse_train_final,
}

rmse_comparison

{'RMSE from Testing Data with default settings:': 72.1490342698183,
 'RMSE from Testing Data after tuning settings:': 60.237152516027294,
 'RMSE from Training Data with default settings:': 0.0,
 'RMSE from Training Data after tuning settings:': 54.364994825858155}

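The tuned model (max_depth = 3) performs better on the testing set: its testing RMSE drops from about 72.15 with the default settings to about 60.24. At the same time, the training RMSE rises from 0.0 to about 54.36, so the tuned tree no longer memorizes the training data. The much smaller gap between training and testing error indicates reduced overfitting, which is why tuning leads to more accurate predictions on unseen (testing) data.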