In [1]:
#import dependencies for plotting
import os
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import hvplot.pandas

# import dependencies for model
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
#Load csv file

housing = ("Resources/Modified_housing.csv")

housing_df = pd.read_csv(housing)
housing_df

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,yr_built,yr_renovated,zipcode,lat,long,renovation_category,renovation_category_numeric
0,1954400510,510000.0,3,2.0,1680,8080,1.0,0,0,1987,0,98074,47.6168,-122.045,Never Renovated,0
1,7237550310,1225000.0,4,4.0,5420,101930,1.0,0,0,2001,0,98053,47.6561,-122.005,Never Renovated,0
2,1321400060,257500.0,3,2.0,1715,6819,2.0,0,0,1995,0,98003,47.3097,-122.327,Never Renovated,0
3,3793500160,323000.0,3,2.0,1890,6560,2.0,0,0,2003,0,98038,47.3684,-122.031,Never Renovated,0
4,1875500060,395000.0,3,2.0,1890,14040,2.0,0,0,1994,0,98019,47.7277,-121.962,Never Renovated,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8287,263000018,360000.0,3,2.0,1530,1131,3.0,0,0,2009,0,98103,47.6993,-122.346,Never Renovated,0
8288,6600060120,400000.0,4,2.0,2310,5813,2.0,0,0,2014,0,98146,47.5107,-122.362,Never Renovated,0
8289,1523300141,402101.0,2,1.0,1020,1350,2.0,0,0,2009,0,98144,47.5944,-122.299,Never Renovated,0
8290,291310100,400000.0,3,2.0,1600,2388,2.0,0,0,2004,0,98027,47.5345,-122.069,Never Renovated,0


In [3]:
# Drop the original 'yr_renovated' column and ID
housing_df_cleaned = housing_df.drop(columns=['yr_renovated', 'id','sqft_lot','waterfront','renovation_category_numeric'])
housing_df_cleaned.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,view,yr_built,zipcode,lat,long,renovation_category
0,510000.0,3,2.0,1680,1.0,0,1987,98074,47.6168,-122.045,Never Renovated
1,1225000.0,4,4.0,5420,1.0,0,2001,98053,47.6561,-122.005,Never Renovated
2,257500.0,3,2.0,1715,2.0,0,1995,98003,47.3097,-122.327,Never Renovated
3,323000.0,3,2.0,1890,2.0,0,2003,98038,47.3684,-122.031,Never Renovated
4,395000.0,3,2.0,1890,2.0,0,1994,98019,47.7277,-121.962,Never Renovated


In [4]:
# Generate summary statistics
housing_df_cleaned.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,view,yr_built,zipcode,lat,long
count,8292.0,8292.0,8292.0,8292.0,8292.0,8292.0,8292.0,8292.0,8292.0,8292.0
mean,596347.5,3.481187,2.39713,2446.813073,1.939098,0.189822,2000.392668,98063.190907,47.541343,-122.155945
std,403367.7,0.814036,0.743504,992.724085,0.453114,0.701259,8.621201,44.010729,0.142847,0.151216
min,153503.0,0.0,0.0,384.0,1.0,0.0,1985.0,98001.0,47.1593,-122.519
25%,353000.0,3.0,2.0,1690.0,2.0,0.0,1993.0,98030.0,47.42075,-122.291
50%,495000.0,3.0,2.0,2290.0,2.0,0.0,2002.0,98053.0,47.5535,-122.161
75%,710000.0,4.0,3.0,2990.0,2.0,0.0,2007.0,98092.0,47.6666,-122.035
max,6885000.0,10.0,8.0,13540.0,4.0,4.0,2015.0,98199.0,47.7776,-121.315


In [5]:
# Plot your data to see what's in your DataFrame
housing_df_cleaned.hvplot.line(
    width=800,
    height=400,
    rot=90
)

In [6]:
# Encode the categorical column into dummy/indicator variables
dummies = pd.get_dummies(housing_df_cleaned, columns=['zipcode','renovation_category'], dtype=int)
dummies

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,view,yr_built,lat,long,zipcode_98001,...,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199,renovation_category_Never Renovated,renovation_category_Renovated 2000-2010,renovation_category_Renovated 2010-2015,renovation_category_Renovated before 2000
0,510000.0,3,2.0,1680,1.0,0,1987,47.6168,-122.045,0,...,0,0,0,0,0,0,1,0,0,0
1,1225000.0,4,4.0,5420,1.0,0,2001,47.6561,-122.005,0,...,0,0,0,0,0,0,1,0,0,0
2,257500.0,3,2.0,1715,2.0,0,1995,47.3097,-122.327,0,...,0,0,0,0,0,0,1,0,0,0
3,323000.0,3,2.0,1890,2.0,0,2003,47.3684,-122.031,0,...,0,0,0,0,0,0,1,0,0,0
4,395000.0,3,2.0,1890,2.0,0,1994,47.7277,-121.962,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8287,360000.0,3,2.0,1530,3.0,0,2009,47.6993,-122.346,0,...,0,0,0,0,0,0,1,0,0,0
8288,400000.0,4,2.0,2310,2.0,0,2014,47.5107,-122.362,0,...,0,0,0,0,0,0,1,0,0,0
8289,402101.0,2,1.0,1020,2.0,0,2009,47.5944,-122.299,0,...,0,0,0,0,0,0,1,0,0,0
8290,400000.0,3,2.0,1600,2.0,0,2004,47.5345,-122.069,0,...,0,0,0,0,0,0,1,0,0,0


In [7]:
# Selecting numerical features for standardization
numerical_features = ['price', 'bedrooms', 'bathrooms', 'sqft_living','floors','lat', 'long']

# Standardizing the numerical features
scaler = StandardScaler()
dummies[numerical_features] = scaler.fit_transform(dummies[numerical_features])


In [8]:
kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust number of clusters as needed
dummies['cluster'] = kmeans.fit_predict(dummies)

# Display the DataFrame with cluster assignments
display(dummies)


  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,view,yr_built,lat,long,zipcode_98001,...,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199,renovation_category_Never Renovated,renovation_category_Renovated 2000-2010,renovation_category_Renovated 2010-2015,renovation_category_Renovated before 2000,cluster
0,-0.214079,-0.591148,-0.534165,-0.772480,-2.072667,0,1987,0.528267,0.733730,0,...,0,0,0,0,0,1,0,0,0,2
1,1.558604,0.637373,2.155962,2.995159,-2.072667,0,2001,0.803403,0.998268,0,...,0,0,0,0,0,1,0,0,0,0
2,-0.840097,-0.591148,-0.534165,-0.737221,0.134416,0,1995,-1.621715,-1.131263,0,...,0,0,0,0,0,1,0,0,0,2
3,-0.677704,-0.591148,-0.534165,-0.560928,0.134416,0,2003,-1.210761,0.826318,0,...,0,0,0,0,0,1,0,0,0,0
4,-0.499196,-0.591148,-0.534165,-0.560928,0.134416,0,1994,1.304668,1.282646,0,...,0,0,0,0,0,1,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8287,-0.585971,-0.591148,-0.534165,-0.923588,2.341499,0,2009,1.105842,-1.256918,0,...,0,0,0,0,0,1,0,0,0,1
8288,-0.486800,0.637373,-0.534165,-0.137824,0.134416,0,2014,-0.214531,-1.362733,0,...,0,0,0,0,0,1,0,0,0,1
8289,-0.481591,-1.819670,-1.879228,-1.437357,0.134416,0,2009,0.371446,-0.946086,0,...,0,0,0,0,0,1,0,0,0,1
8290,-0.486800,-0.591148,-0.534165,-0.853071,0.134416,0,2004,-0.047909,0.575007,0,...,0,0,0,0,0,1,0,0,0,0


In [9]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(dummies.drop(columns=['price']), dummies['price'], test_size=0.2, random_state=42)

# Initialize and train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.17541243674724524


In [10]:
import numpy as np

# Compute the variance of the actual house prices
variance = np.var(y_test)

# Print the variance
print("Variance of House Prices:", variance)

# Calculate the ratio of MSE to variance
mse_to_variance_ratio = mse / variance

# Print the ratio
print("MSE to Variance Ratio:", mse_to_variance_ratio)

Variance of House Prices: 0.9221216757093906
MSE to Variance Ratio: 0.19022699646692504
