__Necessary Library Imports__

In [1]:
import ipywidgets as widgets
from bqplot import pyplot as plt
from bqplot import topo_load
from bqplot.interacts import panzoom
import numpy as np
import pandas as pd
import datetime as dt
from bqplot import (
    Figure, LinearScale, OrdinalScale, Bars, Axis, ColorScale, GridHeatMap, Layout, pyplot as plt
)
from bqplot.marks import Label
from ipywidgets import VBox, HBox, Layout
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

__Load and Explore the Data__
* Load data from housing.csv into Pandas DataFrame

*Drop the Null values and save the results to the data object to get rid of total_bedroom non-null entries*

In [2]:
data = pd.read_csv('housing.csv')
data.dropna(inplace=True)

In [3]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


### Function to incorporate bqplots of feature data

* bqplot.pyplot used to create Histograms for each feature column with numeric values
* Calculate correlation matrix to use bqplot's heatmap functionality to display it

In [4]:
def plot_data_bqplot(df, title):
    # Plot Histograms
    num_columns = df.select_dtypes(include=[np.number]).columns
    figures = []
    for col in num_columns:
        hist, edges = np.histogram(df[col], bins=15)
        x_sc = LinearScale()
        y_sc = LinearScale()
        bar = Bars(x=edges[:-1], y=hist, scales={'x': x_sc, 'y': y_sc})
        ax_x = Axis(scale=x_sc, label=col)
        ax_y = Axis(scale=y_sc, orientation='vertical', tick_format='0.2f')
        fig = Figure(marks=[bar], axes=[ax_x, ax_y,], title=f'Histograms of {col}')
        figures.append(fig)

    # Plot Heatmaps
    corr_matrix = df.corr()

    color_sc = ColorScale(scheme='RdBu', min=-1, max=1)
    x_sc = LinearScale(min=0, max=len(corr_matrix.columns)-1)
    y_sc = LinearScale(min=0, max=len(corr_matrix.index)-1)

    grid_map = GridHeatMap(color=np.array(corr_matrix), scales={'color': color_sc, 'column': x_sc, 'row': y_sc},
                           stroke='black')  # Adding stroke to delineate cells

    # Adding text labels to the heatmap
    texts = [str(np.round(val, 2)) for val in corr_matrix.values.flatten()]
    text_x = [i % len(corr_matrix.columns) for i in range(len(texts))]
    text_y = [i // len(corr_matrix.columns) for i in range(len(texts))]

    labels = Label(x=text_x, y=text_y, text=texts, colors=['black'],
                   scales={'x': x_sc, 'y': y_sc}, default_size=12, font_weight='bold')

    ax_x = Axis(scale=x_sc, orientation='horizontal', label='Features',
                tick_values=list(range(len(corr_matrix.columns))),
                tick_labels={i: label for i, label in enumerate(corr_matrix.columns.tolist())})
    ax_y = Axis(scale=y_sc, orientation='vertical', label='Features',
                tick_values=list(range(len(corr_matrix.index))),
                tick_labels={i: label for i, label in enumerate(corr_matrix.index.tolist())})

    heatmap_fig = Figure(marks=[grid_map, labels], axes=[ax_x, ax_y], title='Correlation Heatmap', layout=Layout(width='100%', height='500px'))

    return VBox([HBox(figures, layout=Layout(flex_flow='row wrap', width='100%')), heatmap_fig])



__Split data into features (X set) and target variable (y set)__

In [5]:
X = data.drop(['median_house_value'], axis=1) # dropping median house value from input features because it is the output label of the prediction testing
y = data['median_house_value'] # evaluation dataset (output label)

*Split data into training and test set by calling train_test_split function, with 20% used for testing*
* Random_state set to constant for model reproducibility 

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

*Ensure Training and Test sets are compatible for model computation*

In [7]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (16346, 9)
y_train shape: (16346,)
X_test shape: (4087, 9)
y_test shape: (4087,)


In [8]:
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
17727,-121.80,37.32,14.0,4412.0,924.0,2698.0,891.0,4.7027,<1H OCEAN
2057,-119.63,36.64,33.0,1036.0,181.0,620.0,174.0,3.4107,INLAND
6453,-118.06,34.12,25.0,3891.0,848.0,1848.0,759.0,3.6639,INLAND
4619,-118.31,34.07,28.0,2362.0,949.0,2759.0,894.0,2.2364,<1H OCEAN
15266,-117.27,33.04,27.0,1839.0,392.0,1302.0,404.0,3.5500,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
11397,-117.97,33.72,24.0,2991.0,500.0,1437.0,453.0,5.4286,<1H OCEAN
12081,-117.54,33.76,5.0,5846.0,1035.0,3258.0,1001.0,4.7965,<1H OCEAN
5447,-118.42,34.01,42.0,1594.0,369.0,952.0,362.0,3.0990,<1H OCEAN
866,-122.04,37.57,12.0,5719.0,1064.0,3436.0,1057.0,5.2879,NEAR BAY


*Make copy and remove ocean_proximity feature to display data before transformations, because ocean_proximity features are not floats*

In [9]:
X_train_without_ocean_proximity = X_train.copy().drop('ocean_proximity', axis=1)

*Display data before transformations*
* Resulting Histograms show most features to be right-skewed
* Resulting Heat Map also show mostly positively correlated data

In [10]:
plot_data_bqplot(X_train_without_ocean_proximity, 'Before Transformations')

VBox(children=(HBox(children=(Figure(axes=[Axis(label='longitude', scale=LinearScale()), Axis(orientation='ver…

*Check for any missing values in the test data*
* If any are found, display the number of NaNs in each column

In [11]:
# Check for NaNs in the original test data
nan_columns = X_test.isna().sum()
print("NaNs in each column of X_test:\n", nan_columns[nan_columns > 0])


NaNs in each column of X_test:
 Series([], dtype: int64)


*Function to apply transformations taking a dataframe as its input*

In [12]:
def transform_data(df, one_hot_encoder=None):
    # Create a copy
    df = df.copy()
    
    # Apply log transformations to normalize the data and shift skewed data
    for col in ['total_rooms', 'total_bedrooms', 'population', 'households']:
        # Validate logarithm is postive value
        if (df[col] <= 0).any():
            raise ValueError(f'Non-positive values detected in {col}, which is not allowed for logarithmic transformations.')
        df[col] = np.log(df[col] + 1e-1)  # plus 1e-1 to prevent log(0) 

    # One-hot encoding
        # If first time to One-hot encode data
    if one_hot_encoder is None:
        one_hot_encoded = pd.get_dummies(df['ocean_proximity'], prefix='OP')  # pd.get_dummies One-hot encode ocean_proximity to columns of binary values
        one_hot_encoder = one_hot_encoded.columns  # save column names for later use for One-hot encoding test data
        df = pd.concat([df.drop('ocean_proximity', axis=1), one_hot_encoded], axis=1)  # Concatnate original df without ocean_proximity column and One-hot encoded df
    else:
        one_hot_encoded = pd.get_dummies(df['ocean_proximity'], prefix='OP')  # pd.get_dummies One-hot encode ocean_proximity to columns of binary values
        one_hot_encoded = one_hot_encoded.reindex(columns=one_hot_encoder, fill_value=0)   # Align the columns with one_hot_encoder
        df = pd.concat([df.drop('ocean_proximity', axis=1), one_hot_encoded], axis=1)  # Concatnate original df without ocean_proximity column and One-hot encoded df

    # Feature engineering
    df['bedrooms_ratio'] = df['total_bedrooms'] / df['total_rooms']  # New feature with bedroom ratio per house
    df['household_rooms'] = df['total_rooms'] / df['households']  # New feature with rooms per household

    return df, one_hot_encoder  # Return transformed data

*Transform training and test data*

In [13]:
# Apply transformations
X_train_transformed, one_hot_encoder = transform_data(X_train)  # returns transformed training dataset and One-hot encoding used for consistency of categorical variables
X_test_transformed, _ = transform_data(X_test, one_hot_encoder)  # returns transformed test data set with same One-hot encoding for consistency along with a placeholder

__Print Transformed Data shapes__
* This is done to verify transformation process did not alter the number of features or samples in the datasets

In [14]:
print("X_train_transformed shape:", X_train_transformed.shape)
print("X_test_transformed shape:", X_test_transformed.shape)

X_train_transformed shape: (16346, 15)
X_test_transformed shape: (4087, 15)


*Visual inspection of the data to see transformed data with One-hot encoded values*

In [15]:
X_train_transformed.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,OP_<1H OCEAN,OP_INLAND,OP_ISLAND,OP_NEAR BAY,OP_NEAR OCEAN,bedrooms_ratio,household_rooms
17727,-121.8,37.32,14.0,8.392106,6.82882,7.900303,6.792457,4.7027,True,False,False,False,False,0.813719,1.235504
2057,-119.63,36.64,33.0,6.943219,5.199049,6.429881,5.15963,3.4107,False,True,False,False,False,0.748795,1.345682
6453,-118.06,34.12,25.0,8.266447,6.742999,7.521913,6.632134,3.6639,False,True,False,False,False,0.815707,1.246424
4619,-118.31,34.07,28.0,7.767306,6.855514,7.92266,6.795818,2.2364,True,False,False,False,False,0.882612,1.142954
15266,-117.27,33.04,27.0,7.517032,5.971517,7.171734,6.001662,3.55,False,False,False,False,True,0.794398,1.252492


__Display data after logarithmic transformations to visualize normalized data correcting for skew__
* Histograms show the right skew for total_rooms, total_bedrooms, population, and household corrected
* Resulting Heap Map has more even distribution of negatively and positively correlated variables

In [16]:
plot_data_bqplot(X_train_transformed, 'After Tranformations')

VBox(children=(HBox(children=(Figure(axes=[Axis(label='longitude', scale=LinearScale()), Axis(orientation='ver…

### Last check for missing values before training the Linear Regression Model

In [17]:
# Check for NaNs in the transformed datasets
print("NaNs in X_train_transformed:", X_train_transformed.isna().any().sum())
print("NaNs in X_test_transformed:", X_test_transformed.isna().any().sum())

NaNs in X_train_transformed: 0
NaNs in X_test_transformed: 0


__Multicollinearity Check__
* See if any features should be removed due to an overly higly correlated natured between them
* Results of the Heat Map show no removals necessary

In [18]:
# Check for multicollinearity in X_train_transformed
corr_matrix = X_train_transformed.corr()
plot_data_bqplot(corr_matrix, 'Correlation Matix')

VBox(children=(HBox(children=(Figure(axes=[Axis(label='longitude', scale=LinearScale()), Axis(orientation='ver…

*While result of Multicollinearity check shows high correlation between total rooms, total bedrooms, population, and households, plus longitude and latitude, none of these features should be removed as they are
all still vital to Linear Regression model. Also, in this case correlation is not causation with many plausible exterior factors acting independently.*

# Train the models

## Instantiate **Linear Regression** Model
* **Fit** Linear Regression model with train dataset

In [19]:
linear_reg = LinearRegression()
linear_reg.fit(X_train_transformed, y_train) # Train model to fit features and labels together, learning from their relationships to produce model's rules

## Instantiate Random Forest Regressor Model
* **Fit** Randon Forest Regressor with train dataset

In [20]:
rf = RandomForestRegressor(random_state=42) # Random state set to constant to ensure Random Forest will generate same sequence of random numbers as Linear Regression Model
rf.fit(X_train_transformed, y_train) # Train model to fit features and labels together, learning from their relationships to produce model's rules

## Instantiate Ridge and Lasso Regression Models
* Fit Ridge and Lasso Regression models with train dataset

In [21]:
ridge_reg = Ridge(alpha=1.0)  # Alpha regulates the strength; for Ridge Regression larger values indicate stronger regularization
lasso_reg = Lasso(alpha=1.0, max_iter=10000)  # Alpha for Lasso Regression indicates the same about stregth of regularization

In [22]:
# Train the Lasso model
lasso_reg.fit(X_train_transformed, y_train)

In [23]:
# Train the Ridge model
ridge_reg.fit(X_train_transformed, y_train)

# Evaluate the models

*Use .score() to return the coefficient of determination (R^2) or the models' accuracy on predictions*

In [24]:
# Compute R^2 score
linear_score = linear_reg.score(X_test_transformed, y_test)
ridge_score = ridge_reg.score(X_test_transformed, y_test)
lasso_score = lasso_reg.score(X_test_transformed, y_test)
rf_score = rf.score(X_test_transformed, y_test)

# Display R^2 score
print('Linear Regression R^2 Score:', linear_score)
print('Ridge Regression R^2 Score:', ridge_score)
print('Lasso Regression R^2 Score:', lasso_score)
print('Random Forest R^2 Score:', rf_score)

# Compute Root Mean Squared Error
linear_rmse = root_mean_squared_error(y_test, linear_reg.predict(X_test_transformed))
ridge_rmse = root_mean_squared_error(y_test, ridge_reg.predict(X_test_transformed))
lasso_rmse = root_mean_squared_error(y_test, lasso_reg.predict(X_test_transformed))
rf_rmse = root_mean_squared_error(y_test, rf.predict(X_test_transformed))

# Display Root Mean Squared Error
print('Linear Regression RMSE:', linear_rmse)
print('Ridge Regression RMSE:', ridge_rmse)
print('Lasso Regression RMSE:', lasso_rmse)
print('Random Forest RMSE:', rf_rmse)

# Compute Mean Absolute Error 
linear_mae = mean_absolute_error(y_test, linear_reg.predict(X_test_transformed))
ridge_mae = mean_absolute_error(y_test, ridge_reg.predict(X_test_transformed))
lasso_mae = mean_absolute_error(y_test, lasso_reg.predict(X_test_transformed))
rf_mae = mean_absolute_error(y_test, rf.predict(X_test_transformed))

# Display the Mean Absolute Error
print('Linear Regression MAE:', linear_mae)
print('Ridge Regression MAE:', ridge_mae)
print('Lasso Regression MAE:', lasso_mae)
print('Random Forest MAE:', rf_mae)

Linear Regression R^2 Score: 0.6685342447889026
Ridge Regression R^2 Score: 0.6690006470272725
Lasso Regression R^2 Score: 0.6686181679250747
Random Forest R^2 Score: 0.8195670813765679
Linear Regression RMSE: 67326.474190291
Ridge Regression RMSE: 67279.09030276192
Lasso Regression RMSE: 67317.95052463075
Random Forest RMSE: 49673.46154567695
Linear Regression MAE: 48671.44429952458
Ridge Regression MAE: 48640.741023753275
Lasso Regression MAE: 48666.75652672692
Random Forest MAE: 32578.906457058965


### Interpreting the Resutls
* **R^2 or the coefficient of determination** - measures the variance in the dependent varaible that is predictable from the independent variable
* * **Random Forest: R^2 = 0.820** indicating about 82% of the variance in the median house price is predictable from the features in the corresponding dataset
  * * * Best Performing model
* * **Linear, Ridge, and Lasso Regressions models:**  all three models have R^2 scores in the range of 0.668 to 0.669, suggesting moderate predictability.
* * * Ridge Regression is the best out of these linear models tested         
    

* **Root Mean Squared Error (RMSE)** - measures the average magintude of the errors in a set of predictions, without considering their direction
* * **Random Forset RMSE:** lowest RMSE of $49673.46
     
  * * Indicates it had the smallest average error in predictions
* * **Linear, Ridge, and Lasso Regression models:** each had considerably higher RMSE's, hovering around $67,300, indicating a larger average errors comparatively.
    
* **Mean Absolute Error (MAE)** - is the average of the absolute differences between predicted values and actual values.
  
  * **Random Forest MAE:** $32,578.91, suggesting  on average the prediction error is smaller than that of the other models
    
  * **Linear, Ridge, and Lasso Regression models:** all show similar performance with MAE's around $48,600


__Visualize **feature importances** in predicting the median house value__
* Function to plot feature importances

In [25]:
def plot_feature_importances(features, importances, title):
    # Create scales
    x_sc = OrdinalScale()
    y_sc = LinearScale()

    # Create bar plot
    bars = Bars(x=features, y=importances, scales={'x': x_sc, 'y': y_sc}, orientation='vertical', colors=['skyblue'])

    # Create axes
    ax_x = Axis(scale=x_sc, tick_rotate=45, tick_style={'text-anchor': 'start'}, label='Features')
    ax_y = Axis(scale=y_sc, orientation='vertical', tick_format='0.2f', label='Importances')

    # Create figure
    fig = Figure(marks=[bars], axes=[ax_x, ax_y], title=title, layout=Layout(width='1000px', height='500px'))
    return fig

# Create the plot
feature_importances = rf.feature_importances_
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_features = X_train_transformed.columns[sorted_indices]
sorted_importances = feature_importances[sorted_indices]    

print("Features:", sorted_features)
print("Importances:", sorted_importances)     

Features: Index(['median_income', 'OP_INLAND', 'longitude', 'latitude',
       'housing_median_age', 'bedrooms_ratio', 'population', 'household_rooms',
       'total_rooms', 'total_bedrooms', 'households', 'OP_NEAR OCEAN',
       'OP_<1H OCEAN', 'OP_NEAR BAY', 'OP_ISLAND'],
      dtype='object')
Importances: [4.82549146e-01 1.42739407e-01 9.86609591e-02 8.77397101e-02
 4.81063416e-02 3.16907128e-02 2.78158692e-02 2.41965410e-02
 1.90330849e-02 1.42417066e-02 1.28822962e-02 5.83236301e-03
 3.10772838e-03 1.22293782e-03 1.81195697e-04]


*Plot the feature importances with a call to **plot_feature_importances***

In [26]:
from IPython.display import display
fig = plot_feature_importances(sorted_features, sorted_importances, 'Feature Importances of Random Forest Model')
display(fig)

Figure(axes=[Axis(label='Features', scale=OrdinalScale(), tick_rotate=45, tick_style={'text-anchor': 'start'})…

## Hyperparameter Tuning
* Ultilize Grid Search with Cross-Validation to find best hyperparameters for Random Forest model
* Hyperparameters are external values set before the model's learning processes and control the model's behavior
* The goal of tuning Hyperparameters is to find the optimal combination of hyperparameters yielding the best performance for a machine learning model on a given dataset

*Import the GridSearchCV module*

In [27]:
from sklearn.model_selection import GridSearchCV

*Define the parameters used for the Grid Search*

In [28]:
param_grid = {
    'n_estimators': [100, 200, 300],  # number of estimators (from first hypertuning increased range to include 300)
    'max_depth': [None, 10, 20, 30],  # maximum depth (from first hyperparameter tuning added more depth level)
    'min_samples_split': [2, 5, 10],  # minimum samples split (from first hyperparameter tuning added more splitting criterion)
    'max_features': [None, 'sqrt', 'log2']  # Added max_features after first hyperparameter tuning
}

__Intialize **Random Forest Regressor**__

In [29]:
rf = RandomForestRegressor(random_state=42)

*Initialize the Grid Search model*

In [30]:
# GridSeachCV performs a cross-validated grid search over the specified parameter grid (updated parameters after first tuning)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2) 

## Perform Cross-Validation Grid-Search

In [31]:
grid_search.fit(X_train_transformed, y_train) # fit the best combination of hyperparameters based on cross-validated performance on the training data

Fitting 5 folds for each of 108 candidates, totalling 540 fits


*Best parameters - Display Results*

In [32]:
print('Best Parameters:', grid_search.best_params_)

Best Parameters: {'max_depth': 20, 'max_features': None, 'min_samples_split': 2, 'n_estimators': 300}


# Validate on Test Data
* Use parameters from second hyperparameter tuning to initialize a new Random Forest model
* Evaluate new Random Forest model with test data to evaluate its performance on unseen data

__Intialize **Random Forest Regressor** with best parameters from second GridSearchCV__

In [33]:
best_rf = RandomForestRegressor(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_depth=grid_search.best_params_['max_depth'],
    min_samples_split=grid_search.best_params_['min_samples_split'],
    max_features=grid_search.best_params_['max_features'],
    random_state=42,
    n_jobs=-1
)

__Fit the Radom Forest model with the best parameters found__

In [34]:
best_rf.fit(X_train_transformed, y_train)

__Predict on test data__

In [35]:
y_pred = best_rf.predict(X_test_transformed)

__Calculate and display the R^2 and RMSE metrics__

In [36]:
print('Random Forest R^2 Score:', r2_score(y_test, y_pred))
rf_rmse = root_mean_squared_error(y_pred, y_test)
print('Random Forest RMSE:', rf_rmse)

Random Forest R^2 Score: 0.8219713771671516
Random Forest RMSE: 49341.39846346553


## Results After Second GridSearchCV and Retuning Random Forest Parameters
* R^2 Score Delta = 0.00241
* RMSE Delta = - $332.063
* * Results demonstrate model holds validity when testing unseen data
  * Results demonstrate only sight improvement in predictive capability after hyperparameter tuning twice

## Reevaluating Feature Importance
* Calculate difference between first set of feature importances and second set

In [37]:
importances = best_rf.feature_importances_
sorted_indices = np.argsort(importances)[::-1]
sorted_features = X_train_transformed.columns[sorted_indices]
sorted_importances = importances[sorted_indices]

# Display each feature with corresponding importance
for feature, importance in zip(sorted_features, sorted_importances):
    print(f'{feature}: {importance}')

from IPython.display import display
fig =  plot_feature_importances(sorted_features, sorted_importances, 'Feature Importances of Random Forest Model')
display(fig)

median_income: 0.4843099754834302
OP_INLAND: 0.1429250368677952
longitude: 0.0984435800677344
latitude: 0.0889519106180725
housing_median_age: 0.047460280384427324
bedrooms_ratio: 0.031039095224933224
population: 0.02736525387181601
household_rooms: 0.023802665522440106
total_rooms: 0.019075797970125877
total_bedrooms: 0.013782172838988334
households: 0.012450745746285208
OP_NEAR OCEAN: 0.005883920901525585
OP_<1H OCEAN: 0.0031845219243445606
OP_NEAR BAY: 0.0011496453876854067
OP_ISLAND: 0.00017539719039612106


Figure(axes=[Axis(label='Features', scale=OrdinalScale(), tick_rotate=45, tick_style={'text-anchor': 'start'})…

In [38]:
# Defining the first and second sets of feature importances
feature_importances_1st = {
    "median_income": 0.4825491462799604,
    "OP_INLAND": 0.1427394073414866,
    "longitude": 0.09866095906283241,
    "latitude": 0.08773971006041177,
    "housing_median_age": 0.04810634159759981,
    "bedrooms_ratio": 0.03169071275110768,
    "population": 0.027815869163680684,
    "household_rooms": 0.02419654104492851,
    "total_rooms": 0.01903308494111618,
    "total_bedrooms": 0.014241706612090066,
    "households": 0.01288229624221246,
    "OP_NEAR OCEAN": 0.00583236300500928,
    "OP_<1H OCEAN": 0.0031077283803938125,
    "OP_NEAR BAY": 0.0012229378199948363,
    "OP_ISLAND": 0.00018119569717531885
}

feature_importances_2nd = {
    "median_income": 0.4843099754834302,
    "OP_INLAND": 0.1429250368677952,
    "longitude": 0.0984435800677344,
    "latitude": 0.0889519106180725,
    "housing_median_age": 0.047460280384427324,
    "bedrooms_ratio": 0.031039095224933224,
    "population": 0.02736525387181601,
    "household_rooms": 0.023802665522440106,
    "total_rooms": 0.019075797970125877,
    "total_bedrooms": 0.013782172838988334,
    "households": 0.012450745746285208,
    "OP_NEAR OCEAN": 0.005883920901525585,
    "OP_<1H OCEAN": 0.0031845219243445606,
    "OP_NEAR BAY": 0.0011496453876854067,
    "OP_ISLAND": 0.00017539719039612106
}

# Calculate the differences in feature importances between the first and second sets
differences = {feature: feature_importances_2nd[feature] - feature_importances_1st[feature]
               for feature in feature_importances_1st}

# Sorting the differences by absolute value in descending order
sorted_differences = sorted(differences.items(), key=lambda item: abs(item[1]), reverse=True)

sorted_differences

[('median_income', 0.0017608292034698159),
 ('latitude', 0.0012122005576607336),
 ('bedrooms_ratio', -0.0006516175261744549),
 ('housing_median_age', -0.000646061213172483),
 ('total_bedrooms', -0.00045953377310173175),
 ('population', -0.00045061529186467275),
 ('households', -0.0004315504959272517),
 ('household_rooms', -0.0003938755224884051),
 ('longitude', -0.00021737899509800818),
 ('OP_INLAND', 0.00018562952630860052),
 ('OP_<1H OCEAN', 7.679354395074807e-05),
 ('OP_NEAR BAY', -7.329243230942955e-05),
 ('OP_NEAR OCEAN', 5.1557896516305456e-05),
 ('total_rooms', 4.27130290096972e-05),
 ('OP_ISLAND', -5.79850677919779e-06)]

## Differences in Feature Importances between first and second testing of Random Forest Regressor model
* median_income: +0.00176
* latitude: +0.00121
* bedrooms_ratio: -0.00065
* housing_median_age: -0.00065
* total_bedrooms: -0.00046
* population: -0.00045
* households: -0.00043
* household_rooms: -0.00039
* longitude: -0.00022
* OP_INLAND: +0.00019
* OP_<1H OCEAN: +0.00008
* OP_NEAR BAY: -0.00007
* OP_NEAR OCEAN: +0.00005
* total_rooms: +0.00004
* OP_ISLAND: -0.00001

__Changes in Feature Importances__ 
* *The changes in feature importances, while relatively minor, indicate that the model has adjusted its reliance on certain features. An increase in the importance of median_income and latitude may suggest these features are more predictive after the tuning, and the model has become slightly more attuned to them.*

## Model Performance Metrics
* __R^2 Score Delta = +0.00241:__
*  *This increase in R^2 score, although small, indicates that the model is able to explain a slightly higher proportion of the variance in the target variable (median house prices) after tuning. This is generally a positive sign, indicating an improvement, albeit modest, in model performance.*
* __RMSE Delta =-$332.063:__
  
* *The decrease in RMSE (Root Mean Squared Error) signifies that the average error made by the model in predicting house prices has reduced by $332.063. This reduction in RMSE is a clear indicator of improved predictive accuracy, as it means the model's predictions are closer to the actual values.*


## Interpretation

__Given these deltas, the model is likely finely tuned for the dataset.__
__The improvements in both R^2 and RMSE suggest that the hyperparameter adjustments have effectively optimized the model's performance, making it more reliable and accurate.__  
__This conclusion is especially valid if:__
* * Consistency Across Data Splits: The improvements are consistent across different splits of the data, not just the specific test set or a particular fold in cross-validation.
* * No Overfitting: The model does not show significantly better performance on training data compared to unseen test data, which would be an indicator of overfitting.