- CRIM: Per capita crime rate by town.
- ZN: Proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS: Proportion of non-retail business acres per town.
- CHAS: Charles River dummy variable (1 if tract bounds river; 0 otherwise).
- NOX: Nitric oxides concentration (parts per 10 million).
- RM: Average number of rooms per dwelling.
- AGE: Proportion of owner-occupied units built prior to 1940.
- DIS: Weighted distances to five Boston employment centres.
- RAD: Index of accessibility to radial highways.
- TAX: Full-value property tax rate per \$10,000.
- PTRATIO: Pupil-teacher ratio by town.
- B: $1000(B_k - 0.63)^2$, where $B_k$ is the proportion of Black people by town.
- LSTAT: Percentage of lower status of the population.
- MEDV: Median value of owner-occupied homes in \$1000s.

Imports and Data Loading

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices raised by the libraries imported above.
# Consider narrowing the filter to a specific category or message.
warnings.filterwarnings('ignore')

# Load the whitespace-delimited housing file. It has no header row, so the
# column labels are assigned explicitly right after parsing.
# `sep=r'\s+'` is the supported spelling of the deprecated
# `delim_whitespace=True` (deprecated since pandas 2.2) and parses the file
# the same way: any run of whitespace separates two fields.
df = pd.read_csv('House Prediction Data Set.csv', sep=r'\s+', header=None)
columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
           'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# Assigning the labels after the read keeps the length check: a file whose
# column count differs from the 14 names above raises here instead of being
# silently mislabelled.
df.columns = columns
print("DataFrame head:")
print(df.head())
print("\nDataFrame info:")
df.info()


DataFrame head:
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null   

Model Training and Evaluation

In [3]:

# MEDV is the regression target; every remaining column is a predictor.
y = df['MEDV']
X = df.drop(columns=['MEDV'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit ordinary least squares on the training portion only.
model = LinearRegression()
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred_test = model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)

# Rank the features by the absolute size of their fitted coefficient.
# The features are not rescaled before fitting, so each coefficient is
# expressed in the raw units of its own column.
feature_importance = pd.DataFrame(
    {
        'Feature': X.columns,
        'Coefficient': model.coef_,
    }
).sort_values(by='Coefficient', key=lambda coefs: coefs.abs(), ascending=False)

# Report the metrics; MEDV is stored in $1000s, hence the *1000 when
# printing the two error figures as dollar amounts.
print(
    "--- Model Performance Metrics ---",
    f"R² Score: {r2_test:.4f}",
    f"Mean Absolute Error (MAE): ${mae_test*1000:,.0f}",
    f"Root Mean Squared Error (RMSE): ${rmse_test*1000:,.0f}",
    sep="\n",
)


--- Model Performance Metrics ---
R² Score: 0.6688
Mean Absolute Error (MAE): $3,189
Root Mean Squared Error (RMSE): $4,929


Visual Data Analysis

In [4]:
# Shared dark-theme palette so every figure in the notebook looks the same.
COLORS = {
    'background': "#071B38",
    'card_bg': '#161B22',
    'text': '#E0E0E0',
    'header_text': '#FFFFFF',
    'accent_color': '#58A6FF',
    'negative_color': '#F85149',
    'grid_color': '#30363D',
    'plot_bg': '#161B22',
}


def update_plot_layout(fig, title):
    """Apply the notebook's dark theme and a title to a Plotly figure.

    Parameters
    ----------
    fig : plotly figure
        Figure to restyle; it is modified in place.
    title : str
        Text used as the figure title.

    Returns
    -------
    The same ``fig`` object, so the call can be chained or ignored.
    """
    fig.update_layout(
        title=title,
        plot_bgcolor=COLORS['plot_bg'],
        paper_bgcolor=COLORS['card_bg'],
        font=dict(color=COLORS['text']),
        title_font_color=COLORS['header_text'],
    )
    # Passing `xaxis=`/`yaxis=` to update_layout only touches the first axis
    # pair, which leaves the extra panels of a make_subplots figure with the
    # default grid colours. update_xaxes/update_yaxes style every axis.
    axis_style = dict(
        gridcolor=COLORS['grid_color'],
        zerolinecolor=COLORS['grid_color'],
    )
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)
    return fig

# Scatter of average room count (RM) against median home value, with an
# ordinary-least-squares trendline drawn over the points.
# NOTE(review): the 'ols' trendline presumably needs statsmodels installed — confirm.
fig1 = px.scatter(
    df,
    x='RM',
    y='MEDV',
    trendline='ols',
    labels={'MEDV': 'Median Price ($1000s)'},
    color_discrete_sequence=[COLORS['accent_color']],
    title='Average Rooms (RM) vs. Median Home Price',
)
update_plot_layout(fig1, 'Average Rooms (RM) vs. Median Home Price')
fig1.show()


Visualizations

In [5]:

# Horizontal bar chart of the fitted coefficients: accent colour for strictly
# positive values, the negative colour for everything else.
colors_bar = [
    COLORS['accent_color'] if coef > 0 else COLORS['negative_color']
    for coef in feature_importance['Coefficient']
]
fig2 = go.Figure()
fig2.add_trace(
    go.Bar(
        x=feature_importance['Coefficient'],
        y=feature_importance['Feature'],
        orientation='h',
        marker_color=colors_bar,
    )
)
update_plot_layout(fig2, 'Feature Importance')
fig2.update_layout(yaxis_title='Feature', xaxis_title='Coefficient Value')
fig2.show()

# Heatmap of the pairwise correlations between all columns, target included.
corr_matrix = df.corr()
fig3 = go.Figure()
fig3.add_trace(
    go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        colorscale='Cividis',
    )
)
update_plot_layout(fig3, 'Correlation Matrix')
fig3.show()


Model Prediction Visualizations

In [6]:
# Scatter of held-out actual prices against the model's predictions.
fig4 = go.Figure()
fig4.add_trace(
    go.Scatter(
        x=y_test,
        y=y_pred_test,
        mode='markers',
        name='Test Data',
        marker=dict(color=COLORS['accent_color'], opacity=0.6),
    )
)

# Dashed y = x reference line spanning the full range of both series;
# points on this line are exact predictions.
min_val = min(y_test.min(), y_pred_test.min())
max_val = max(y_test.max(), y_pred_test.max())
fig4.add_trace(
    go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode='lines',
        name='Perfect Prediction',
        line=dict(color=COLORS['negative_color'], dash='dash'),
    )
)

update_plot_layout(fig4, 'Actual vs. Predicted Home Prices')
fig4.update_layout(
    yaxis_title='Predicted Price ($1000s)',
    xaxis_title='Actual Price ($1000s)',
)
fig4.show()

# Residuals (actual minus predicted) plotted against the predicted value,
# with a dashed zero line for reference.
residuals = y_test - y_pred_test
fig5 = go.Figure()
fig5.add_trace(
    go.Scatter(
        x=y_pred_test,
        y=residuals,
        mode='markers',
        name='Residuals',
        marker=dict(color=COLORS['accent_color'], opacity=0.6),
    )
)
fig5.add_hline(y=0, line_dash="dash", line_color=COLORS['negative_color'])
update_plot_layout(fig5, 'Residual Plot')
fig5.update_layout(yaxis_title='Residuals', xaxis_title='Predicted Values')
fig5.show()


Price Distribution and Prediction Function

In [7]:
# Side-by-side histograms: held-out actual prices on the left, the model's
# predictions for the same rows on the right.
print("--- Distribution of Prices ---")
fig6 = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Actual Prices', 'Predicted Prices'),
)
fig6.add_trace(
    go.Histogram(x=y_test, name='Actual', marker_color=COLORS['accent_color']),
    row=1,
    col=1,
)
fig6.add_trace(
    go.Histogram(x=y_pred_test, name='Predicted', marker_color=COLORS['negative_color']),
    row=1,
    col=2,
)
update_plot_layout(fig6, 'Distribution of Actual vs Predicted Prices')
fig6.show()

# Prediction helper
print("\n--- House Price Prediction Tool ---")
def predict_price_from_inputs(input_values):
    """Predict a home price from one row of feature values.

    Reads the module-level ``X`` (for the feature names and their order) and
    the fitted ``model``.

    Parameters
    ----------
    input_values : sequence
        One value per column of ``X``, given in the same order as ``X.columns``.

    Returns
    -------
    str
        The prediction multiplied by 1000 and formatted as a dollar amount
        with thousands separators and no decimals.

    Raises
    ------
    ValueError
        If the number of supplied values differs from the number of features.
    """
    expected_count = len(X.columns)
    received_count = len(input_values)
    if received_count != expected_count:
        raise ValueError(f"Input must have {expected_count} values, but got {received_count}.")

    # Wrap the values in a one-row frame that carries the training column names.
    single_row = pd.DataFrame([input_values], columns=X.columns)
    predicted_thousands = model.predict(single_row)[0]
    return f"${predicted_thousands*1000:,.0f}"

# Demonstrate the helper on a row made of each feature's median value.
median_values = X.median().tolist()
predicted_price = predict_price_from_inputs(median_values)

print(
    "Example Prediction using median feature values:",
    f"Predicted Price: {predicted_price}",
    sep="\n",
)

--- Distribution of Prices ---



--- House Price Prediction Tool ---
Example Prediction using median feature values:
Predicted Price: $23,197
