### Importing Basic Python Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score

In [2]:
# Ignore warnings in below cell runs
import warnings
warnings.filterwarnings('ignore')

### Read Input File for Training and understanding the Dataset

In [3]:
train_df = pd.read_csv("./house-prices-advanced-regression-techniques/train.csv", sep=",")
test_df = pd.read_csv("./house-prices-advanced-regression-techniques/test.csv", sep=",")
sample_sub_df = pd.read_csv("./house-prices-advanced-regression-techniques/sample_submission.csv", sep=",")

train_df.shape, test_df.shape, sample_sub_df.shape

((1460, 81), (1459, 80), (1459, 2))

In [4]:
test_df = pd.merge(test_df, sample_sub_df[["Id", "SalePrice"]], on = "Id", how = 'left') # "SalePrice" is the target output

In [5]:
train_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [6]:
train_df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [7]:
# Here we can use strip if needed
for col in train_df.columns:
    print("Number of null values present in the", col, " : ",train_df[col].isnull().sum())

print("Total number of Null/Blank Values is:", train_df.isnull().sum().sum())
print("Median of Number of rows with any blank value:", train_df[train_df.isnull().any(axis=1)].count().median(numeric_only=True))
print("Max of Number of rows with any blank value:", train_df[train_df.isnull().any(axis=1)].count().max(numeric_only=True)) 
# Max = Median : the number of blanks in columns is extremely right skeweed, therefore, there is too many blanks are present compare with the rows 1460
# Based on this situation entire rows cannot be deleted, instead imputation mechanism can work in this case 

Number of null values present in the Id  :  0
Number of null values present in the MSSubClass  :  0
Number of null values present in the MSZoning  :  0
Number of null values present in the LotFrontage  :  259
Number of null values present in the LotArea  :  0
Number of null values present in the Street  :  0
Number of null values present in the Alley  :  1369
Number of null values present in the LotShape  :  0
Number of null values present in the LandContour  :  0
Number of null values present in the Utilities  :  0
Number of null values present in the LotConfig  :  0
Number of null values present in the LandSlope  :  0
Number of null values present in the Neighborhood  :  0
Number of null values present in the Condition1  :  0
Number of null values present in the Condition2  :  0
Number of null values present in the BldgType  :  0
Number of null values present in the HouseStyle  :  0
Number of null values present in the OverallQual  :  0
Number of null values present in the OverallCon

In [8]:
# blank_train_df = train_df[(train_df.isnull().any(axis=1)) | (train_df.isin([np.nan]))]
# print(f"The Number of Blank rows {blank_train_df.shape[0]}")
# list_blank = list(blank_train_df["Id"])
# train_df1 = train_df.loc[train_df["Id"].isin(list_blank)]

In [9]:
from sklearn.impute import KNNImputer
    
def imputation_technique(df, column, technique, n_neighbors = None, ):
    if technique == "knn":
        print(f"\n--- Performing KNN Imputation (n_neighbors={n_neighbors}) ---")
        df_imputed = df.copy()

        # Identify numerical columns for imputation
        numerical_cols = df_imputed.select_dtypes(include=np.number).columns

        if numerical_cols.empty:
            print("No numerical columns found for KNN imputation.")
            return df_imputed

        imputer = KNNImputer(n_neighbors=n_neighbors)
        df_imputed[numerical_cols] = imputer.fit_transform(df_imputed[numerical_cols])

        print("KNN Imputation complete.")
        return df_imputed

train_df1 = imputation_technique(train_df, "", "knn", 3)
test_df1 = imputation_technique(test_df, "", "knn", 3)



--- Performing KNN Imputation (n_neighbors=3) ---
KNN Imputation complete.

--- Performing KNN Imputation (n_neighbors=3) ---
KNN Imputation complete.


In [10]:
# Here we can use strip if needed
for col in train_df1.columns:
    print("Number of null values present in the", col, " : ",train_df1[col].isnull().sum())

print("Total number of Null/Blank Values is:", train_df1.isnull().sum().sum())
print("Median of Number of rows with any blank value:", train_df1[train_df1.isnull().any(axis=1)].count().median(numeric_only=True))
print("Max of Number of rows with any blank value:", train_df1[train_df1.isnull().any(axis=1)].count().max(numeric_only=True)) 
# Max = Median : the number of blanks in columns is extremely right skeweed, therefore, there is too many blanks are present compare with the rows 1460
# Based on this situation entire rows cannot be deleted, instead imputation mechanism can work in this case 
# the value is coming 7481, so that the so much data is still blank

Number of null values present in the Id  :  0
Number of null values present in the MSSubClass  :  0
Number of null values present in the MSZoning  :  0
Number of null values present in the LotFrontage  :  0
Number of null values present in the LotArea  :  0
Number of null values present in the Street  :  0
Number of null values present in the Alley  :  1369
Number of null values present in the LotShape  :  0
Number of null values present in the LandContour  :  0
Number of null values present in the Utilities  :  0
Number of null values present in the LotConfig  :  0
Number of null values present in the LandSlope  :  0
Number of null values present in the Neighborhood  :  0
Number of null values present in the Condition1  :  0
Number of null values present in the Condition2  :  0
Number of null values present in the BldgType  :  0
Number of null values present in the HouseStyle  :  0
Number of null values present in the OverallQual  :  0
Number of null values present in the OverallCond 

#### Now let's see the number of blanks in each column

In [11]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.models import ColumnDataSource, HoverTool

def plot_null_values(df):
    """
    Generates a Bokeh bar graph visualizing null values per attribute in a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
    """
    null_counts = df.isnull().sum() + df.isin([np.nan, "NaN"]).sum()
    data_types = df.dtypes
    columns = df.columns

    summary_df = pd.DataFrame({"Attributes" : columns, 'Null Count': null_counts, 'Data Type': data_types})
    # print(summary_df)

    summary_df1 = pd.DataFrame()
    for i_in, i in summary_df.iterrows():
        if i["Null Count"] != 0:
            summary_df1 = pd.concat([summary_df1, i.to_frame().T], ignore_index=True)
    # print(summary_df1)
    attributes = list(summary_df1["Attributes"])
    counts = list(summary_df1["Null Count"])
    d_types = list(summary_df1["Data Type"].astype(str)) # Convert data types to strings

    source = ColumnDataSource(data=dict(attributes=attributes, counts=counts, data_type=d_types)) # Added 'data_type'

    p = figure(x_range=attributes, height=350, width=1000, title="Null Values per Attribute", toolbar_location="below", sizing_mode="stretch_both")

    p.vbar(x='attributes', top='counts', width=0.9, source=source)
    hover = HoverTool(tooltips=[("Null Count", "@counts"), ("Data Type", "@data_type")]) # Changed "@d_types" to "@data_type"
    p.add_tools(hover)
    p.xgrid.grid_line_color = None
    p.y_range.start = 0
    p.xaxis.axis_label = "Attributes"
    p.yaxis.axis_label = "Number of Null Values"

    show(p)
plot_null_values(train_df1)

In [12]:
train_df1.shape

(1460, 81)

In [13]:
# Here unique values are found for each attributes
categorical_features = []
for col in train_df1.columns:
    print(f"The number of unique values present in the {col} is {train_df1[col].nunique()} and they are {train_df1[col].unique()}")
    if train_df1[col].nunique() < 5:
        categorical_features.append(col)
    

The number of unique values present in the Id is 1460 and they are [1.000e+00 2.000e+00 3.000e+00 ... 1.458e+03 1.459e+03 1.460e+03]
The number of unique values present in the MSSubClass is 15 and they are [ 60.  20.  70.  50. 190.  45.  90. 120.  30.  85.  80. 160.  75. 180.
  40.]
The number of unique values present in the MSZoning is 5 and they are ['RL' 'RM' 'C (all)' 'FV' 'RH']
The number of unique values present in the LotFrontage is 202 and they are [ 65.          80.          68.          60.          84.
  85.          75.          74.66666667  51.          50.
  70.          91.          80.33333333  84.33333333  72.
  66.         101.          57.          44.         110.
  98.          47.         108.         112.          74.
 115.          76.66666667  60.66666667  61.          48.
  33.          52.         100.          24.          89.
  63.          71.          76.          75.33333333  81.
  95.          69.          21.          69.66666667  32.
  78.          69

### Feature Engineering

In [14]:
from sklearn.preprocessing import LabelEncoder
def encode_categorical_column(df: pd.DataFrame, column_name: str, encoding_style):
    if column_name not in df.columns:
            raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    if encoding_style == "labelencoder":
        df_encoded = df.copy()
        label_encoder = LabelEncoder()
        df_encoded[f'{column_name}_Encoded'] = label_encoder.fit_transform(df_encoded[column_name])

    else:
        df_encoded = pd.get_dummies(df, columns=[column_name], drop_first=False)
        new_cols = [col for col in df_encoded.columns if col.startswith(f"{column_name}_")]
        df_encoded[new_cols] = df_encoded[new_cols].astype(int)
        # print(new_cols)

    return df_encoded



In [15]:
print(len(train_df1.columns))
for col in train_df1.columns:
    if col in categorical_features:
        train_df1 = encode_categorical_column(train_df1, col, "labelencoder")

print(len(train_df1.columns))

81
105


In [16]:
print(len(test_df1.columns))
for col in test_df1.columns:
    if col in categorical_features:
        test_df1 = encode_categorical_column(test_df1, col, "labelencoder")

print(len(test_df1.columns))

81
105


In [17]:
# train_df.to_excel("Cat-int.xlsx", index= False)

In [18]:
train_df1.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,FullBath_Encoded,HalfBath_Encoded,KitchenAbvGr_Encoded,KitchenQual_Encoded,Fireplaces_Encoded,GarageFinish_Encoded,PavedDrive_Encoded,PoolQC_Encoded,Fence_Encoded,MiscFeature_Encoded
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.831507,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.896347,443.639726,...,1.565068,0.382877,1.046575,2.339726,0.613014,1.284247,1.856164,2.991096,3.504795,3.922603
std,421.610009,42.300571,23.894239,9981.264932,1.382997,1.112799,30.202904,20.645407,180.80441,456.098091,...,0.550916,0.502885,0.220338,0.830161,0.644666,0.892831,0.496592,0.140703,1.082912,0.404103
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,365.75,20.0,60.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,1.0,0.0,1.0,2.0,0.0,1.0,2.0,3.0,4.0,4.0
50%,730.5,50.0,70.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,2.0,0.0,1.0,3.0,1.0,1.0,2.0,3.0,4.0,4.0
75%,1095.25,70.0,80.75,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,2.0,1.0,1.0,3.0,1.0,2.0,2.0,3.0,4.0,4.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,3.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0,4.0,4.0


In [19]:
all_features = train_df1.columns
numeric_features = []
for col in train_df1.columns:
    if train_df1[col].dtype != 'object' and col != "Id":
        numeric_features.append(col)

print(numeric_features)

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice', 'Street_Encoded', 'Alley_Encoded', 'LotShape_Encoded', 'LandContour_Encoded', 'Utilities_Encoded', 'LandSlope_Encoded', 'MasVnrType_Encoded', 'ExterQual_Encoded', 'BsmtQual_Encoded', 'BsmtCond_Encoded', 'BsmtExposure_Encoded', 'CentralAir_Encoded', 'BsmtFullBath_Encoded', 'BsmtHalfBath_Encoded', 'FullBath_Encoded', 'HalfBath_Encoded', 'KitchenAbvGr_Encoded', 'KitchenQual_Encoded', 'Fireplaces_Encoded', 'GarageFinish_Encoded', 'PavedDrive_Encoded', 'PoolQC_Encoded', 'Fence_Encoded',

In [20]:
for col in train_df1.columns:
    print("Number of null values present in the", col, " : ",train_df1[col].isnull().sum())

Number of null values present in the Id  :  0
Number of null values present in the MSSubClass  :  0
Number of null values present in the MSZoning  :  0
Number of null values present in the LotFrontage  :  0
Number of null values present in the LotArea  :  0
Number of null values present in the Street  :  0
Number of null values present in the Alley  :  1369
Number of null values present in the LotShape  :  0
Number of null values present in the LandContour  :  0
Number of null values present in the Utilities  :  0
Number of null values present in the LotConfig  :  0
Number of null values present in the LandSlope  :  0
Number of null values present in the Neighborhood  :  0
Number of null values present in the Condition1  :  0
Number of null values present in the Condition2  :  0
Number of null values present in the BldgType  :  0
Number of null values present in the HouseStyle  :  0
Number of null values present in the OverallQual  :  0
Number of null values present in the OverallCond 

In [21]:
# train_df1[numeric_features].to_excel("ccc.xlsx",index=False)

### Feature Selection & Data Visualization in a Heat map

In [22]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import RdBu11

In [23]:
def plot_correlation_heatmap(df, title="Correlation Heatmap"):
    """
    Generates an interactive correlation heatmap using Bokeh that covers the whole window.

    Args:
        df (pd.DataFrame): The input DataFrame.
        title (str, optional): The title of the heatmap. Defaults to "Correlation Heatmap".
    """
    corr_df = df.corr()
    columns = list(corr_df.columns)

    # Prepare data for Bokeh
    x, y, colors, alphas = [], [], [], []
    for i, col_x in enumerate(columns):
        for j, col_y in enumerate(columns):
            x.append(col_x)
            y.append(col_y)
            correlation = corr_df.loc[col_x, col_y]
            colors.append(RdBu11[int((correlation + 1) * 5.4)]) # Scale -1 to 1 to 0 to 10
            alphas.append(abs(correlation))

    source = ColumnDataSource(data=dict(x=x, y=y, colors=colors, alphas=alphas,
                                       correlation=[f"{corr:.2f}" for corr in corr_df.values.flatten()]))

    # Create the figure with sizing_mode="stretch_both"
    p = figure(title=title, tools="pan,wheel_zoom,box_zoom,reset,save,reset,crosshair", toolbar_location="below",
               x_range=columns, y_range=list(reversed(columns)), # Reverse y-axis for better visualization
               sizing_mode="stretch_both") # Make the plot responsive

    p.rect(x='x', y='y', width=1, height=1, fill_color='colors', fill_alpha='alphas',
           line_color=None, source=source)

    # Customize appearance
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 1.0
    p.yaxis.major_label_orientation = "horizontal"

    # Add HoverTool
    hover = HoverTool(tooltips=[
        ("Columns", "@y, @x"),
        ("Correlation", "@correlation"),
    ])
    p.add_tools(hover)

    show(p)

In [24]:
plot_correlation_heatmap(train_df1[numeric_features])

In [25]:
train_df_corr = train_df1[numeric_features].corr()

train_df_corr["SalePrice"]

MSSubClass             -0.084284
LotFrontage             0.349580
LotArea                 0.263843
OverallQual             0.790982
OverallCond            -0.077856
                          ...   
GarageFinish_Encoded   -0.549247
PavedDrive_Encoded      0.231357
PoolQC_Encoded         -0.126070
Fence_Encoded           0.140640
MiscFeature_Encoded     0.073609
Name: SalePrice, Length: 61, dtype: float64

In [26]:
most_imp_features_df = train_df_corr.loc[train_df_corr["SalePrice"] > 0.5]
most_imp_features = list(most_imp_features_df.index)
most_imp_features

['OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'TotalBsmtSF',
 '1stFlrSF',
 'GrLivArea',
 'FullBath',
 'TotRmsAbvGrd',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'SalePrice',
 'FullBath_Encoded']

In [27]:
# Observations: 
# 1. The SalePrice is correlated with two attributes a. OverallQual(0.79) and b. GrLivArea (0.71).
# 2. There are some other attributes like YearBuilt, YearRemodAdd, GarageArea and so on as showing in above cell

In [28]:
# Some features are not part of test dataset. Therfore, we are taking common features to train and test the data
common_features = list(set(most_imp_features) & set(test_df.columns))
common_features

['FullBath',
 'SalePrice',
 'OverallQual',
 'TotalBsmtSF',
 'GrLivArea',
 'TotRmsAbvGrd',
 'GarageCars',
 '1stFlrSF',
 'YearBuilt',
 'GarageArea',
 'GarageYrBlt',
 'YearRemodAdd']

In [29]:
train_comm_features = [feature for feature in common_features if feature != "SalePrice"]
# train_comm_features = ['GrLivArea','YearRemodAdd', 'YearRemodAdd', 'GrLivArea', '1stFlrSF']
train_df_x = train_df1[train_comm_features]
train_df_y = train_df1[['SalePrice']]

In [30]:
common_features

['FullBath',
 'SalePrice',
 'OverallQual',
 'TotalBsmtSF',
 'GrLivArea',
 'TotRmsAbvGrd',
 'GarageCars',
 '1stFlrSF',
 'YearBuilt',
 'GarageArea',
 'GarageYrBlt',
 'YearRemodAdd']

### Models and their comparisons

In [31]:
# Metric calculator
def metrics_calculation(y_true, y_pred, phase = "Train"):
    print(f'{phase} score: ', r2_score(y_true, y_pred))

    return r2_score(y_true, y_pred)

#### Linear Regression : https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [32]:
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()

In [33]:
linear_model.fit(train_df_x, train_df_y)

In [34]:
y_train_hat = linear_model.predict(train_df_x)
metrics_calculation(train_df_y.SalePrice, y_train_hat)

Train score:  0.7743167762148548


0.7743167762148548

In [35]:
test_df_x = test_df1[train_comm_features]
test_df_y = test_df1[["SalePrice"]]

In [36]:
y_test_hat = linear_model.predict(test_df_x)
metrics_calculation(test_df_y.SalePrice, y_test_hat, "Test")

Test score:  -15.457057204560176


-15.457057204560176

In [37]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
import numpy as np
import pandas as pd

def plot_regression_results(
    true_y: np.ndarray,
    predicted_y: np.ndarray,
    x_values: np.ndarray = None, # Make x_values explicitly optional with None as default
    title: str = "True vs. Predicted Y Values (Regression)",
    x_axis_label: str = "X Value",
    y_axis_label: str = "Y Value",
    output_filename: str = "regression_plot_with_circles.html" # Added default output filename
):
    """
    Generates and displays an interactive Bokeh plot showing true vs. predicted Y values for regression,
    with lines and highlighted points (circles) for both true and predicted values.

    Args:
        true_y (np.ndarray): Array of actual (true) Y values.
        predicted_y (np.ndarray): Array of predicted Y values from a regression model.
        x_values (np.ndarray, optional): Array of X values corresponding to Y.
                                        If None, a simple numerical index (0, 1, 2...) will be used.
        title (str): Title of the plot.
        x_axis_label (str): Label for the X-axis.
        y_axis_label (str): Label for the Y-axis.
        output_filename (str): Name of the HTML file to save the plot. Defaults to "regression_plot_with_circles.html".
    """
    # Input validation
    if len(true_y) != len(predicted_y):
        raise ValueError("`true_y` and `predicted_y` must have the same length.")

    if x_values is None:
        x_values = np.arange(len(true_y))
    elif len(x_values) != len(true_y):
        raise ValueError("`x_values` must have the same length as `true_y` and `predicted_y` if provided.")

    # Prepare data for Bokeh using ColumnDataSource
    data = pd.DataFrame({
        'x': x_values,
        'true_y': true_y,
        'predicted_y': predicted_y
    })
    source = ColumnDataSource(data)

    # Create the Bokeh figure (plot)
    p = figure(
        title=title,
        x_axis_label=x_axis_label,
        y_axis_label=y_axis_label,
        height=700,
        width=1400,
        tools="pan,wheel_zoom,box_zoom,reset,save,crosshair", # Added hover tool directly here
        toolbar_location="below",
    )

    # Add the line for True Y Values
    p.line(
        x='x',
        y='true_y',
        source=source,
        line_color="blue",
        line_width=2,
        legend_label="True Y Values"
    )

    # Add **circles** for True Y Values
    p.circle(
        x='x',
        y='true_y',
        source=source,
        size=8,              # Size of the circle
        color="blue",        # Color of the circle
        alpha=0.6,           # Transparency of the circle
        legend_label="True Y Values" # Make sure legend label matches line to combine
    )

    # Add the line for Predicted Y Values
    p.line(
        x='x',
        y='predicted_y',
        source=source,
        line_color="red",
        line_width=2,
        line_dash="dashed",
        legend_label="Predicted Y Values"
    )

    # Add **circles** for Predicted Y Values
    p.circle(
        x='x',
        y='predicted_y',
        source=source,
        size=8,
        color="red",
        alpha=0.6,
        legend_label="Predicted Y Values" # Make sure legend label matches line to combine
    )

    # --- Customized Hover Tool (already good from previous step) ---
    hover_tool = HoverTool(
        tooltips=[
            ("X Value", "@x{0.00}"),
            ("True Y", "@true_y{0.00}"),
            ("Predicted Y", "@predicted_y{0.00}")
        ]
    )
    p.add_tools(hover_tool) # Add the customized hover tool

    # Customize the legend
    p.legend.location = "top_left"
    p.legend.click_policy = "hide" # Allows clicking legend items to hide/show lines
    show(p)


In [38]:
plot_regression_results(list(test_df_y.SalePrice), list(y_test_hat), range(0, len(y_test_hat)))

