In [1]:
#import dependencies for plotting
import os
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import hvplot.pandas

# import dependencies for model
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Load the modified housing dataset from the project's Resources folder
housing = "Resources/Modified_housing.csv"
housing_df = pd.read_csv(housing)

# Preview the first rows to confirm the file parsed as expected
housing_df.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,yr_built,yr_renovated,zipcode,lat,long,renovation_category,renovation_category_numeric
0,1954400510,510000.0,3,2.0,1680,8080,1.0,0,0,1987,0,98074,47.6168,-122.045,Never Renovated,0
1,7237550310,1225000.0,4,4.0,5420,101930,1.0,0,0,2001,0,98053,47.6561,-122.005,Never Renovated,0
2,1321400060,257500.0,3,2.0,1715,6819,2.0,0,0,1995,0,98003,47.3097,-122.327,Never Renovated,0
3,3793500160,323000.0,3,2.0,1890,6560,2.0,0,0,2003,0,98038,47.3684,-122.031,Never Renovated,0
4,1875500060,395000.0,3,2.0,1890,14040,2.0,0,0,1994,0,98019,47.7277,-121.962,Never Renovated,0


In [3]:
# Drop the columns that are not carried forward: the raw renovation year and
# its numeric encoding (the categorical 'renovation_category' column is kept),
# the 'id' column, and 'sqft_lot'.
columns_to_drop = ['yr_renovated', 'id', 'renovation_category_numeric', 'sqft_lot']
housing_df_cleaned = housing_df.drop(columns=columns_to_drop)
housing_df_cleaned.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,waterfront,view,yr_built,zipcode,lat,long,renovation_category
0,510000.0,3,2.0,1680,1.0,0,0,1987,98074,47.6168,-122.045,Never Renovated
1,1225000.0,4,4.0,5420,1.0,0,0,2001,98053,47.6561,-122.005,Never Renovated
2,257500.0,3,2.0,1715,2.0,0,0,1995,98003,47.3097,-122.327,Never Renovated
3,323000.0,3,2.0,1890,2.0,0,0,2003,98038,47.3684,-122.031,Never Renovated
4,395000.0,3,2.0,1890,2.0,0,0,1994,98019,47.7277,-121.962,Never Renovated


In [4]:
# Replace the raw price with its base-10 logarithm ('log_price').
# The log is computed from the untouched `housing_df` (same row index as
# `housing_df_cleaned`) and the result is bound to a new frame rather than
# dropped in place. This makes the cell safe to re-run: the previous version
# raised a KeyError on a second run because 'price' had already been removed.
housing_df_cleaned = (
    housing_df_cleaned
    .assign(log_price=np.log10(housing_df['price']))
    .drop(columns=['price'], errors='ignore')
)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the cleaned frame, including the new 'log_price' column
housing_df_cleaned.describe()

Unnamed: 0,bedrooms,bathrooms,sqft_living,floors,waterfront,view,yr_built,zipcode,lat,long,log_price
count,8292.0,8292.0,8292.0,8292.0,8292.0,8292.0,8292.0,8292.0,8292.0,8292.0,8292.0
mean,3.481187,2.39713,2446.813073,1.939098,0.005065,0.189822,2000.392668,98063.190907,47.541343,-122.155945,5.713329
std,0.814036,0.743504,992.724085,0.453114,0.070993,0.701259,8.621201,44.010729,0.142847,0.151216,0.217969
min,0.0,0.0,384.0,1.0,0.0,0.0,1985.0,98001.0,47.1593,-122.519,5.186117
25%,3.0,2.0,1690.0,2.0,0.0,0.0,1993.0,98030.0,47.42075,-122.291,5.547775
50%,3.0,2.0,2290.0,2.0,0.0,0.0,2002.0,98053.0,47.5535,-122.161,5.694605
75%,4.0,3.0,2990.0,2.0,0.0,0.0,2007.0,98092.0,47.6666,-122.035,5.851258
max,10.0,8.0,13540.0,4.0,1.0,4.0,2015.0,98199.0,47.7776,-121.315,6.837904


In [6]:
# Plot the cleaned frame as an interactive line chart for a first look at the data
housing_df_cleaned.hvplot.line(width=800, height=400, rot=90)

In [7]:
# One-hot encode the categorical features into integer 0/1 indicator columns
categorical_columns = ['zipcode', 'renovation_category']
dummies = pd.get_dummies(housing_df_cleaned, columns=categorical_columns, dtype=int)
dummies.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,floors,waterfront,view,yr_built,lat,long,log_price,...,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199,renovation_category_Never Renovated,renovation_category_Renovated 2000-2010,renovation_category_Renovated 2010-2015,renovation_category_Renovated before 2000
0,3,2.0,1680,1.0,0,0,1987,47.6168,-122.045,5.70757,...,0,0,0,0,0,0,1,0,0,0
1,4,4.0,5420,1.0,0,0,2001,47.6561,-122.005,6.088136,...,0,0,0,0,0,0,1,0,0,0
2,3,2.0,1715,2.0,0,0,1995,47.3097,-122.327,5.410777,...,0,0,0,0,0,0,1,0,0,0
3,3,2.0,1890,2.0,0,0,2003,47.3684,-122.031,5.509203,...,0,0,0,0,0,0,1,0,0,0
4,3,2.0,1890,2.0,0,0,1994,47.7277,-121.962,5.596597,...,0,0,0,0,0,0,1,0,0,0


In [8]:
# Standardize every column of the encoded frame with scikit-learn's StandardScaler.
# fit_transform returns a NumPy array, so the column labels are not carried over.
scaler = StandardScaler()
housing_df_scaled = scaler.fit_transform(dummies)
housing_df_scaled

array([[-0.59114817, -0.53416476, -0.77247981, ..., -0.03644643,
        -0.01553237, -0.01902434],
       [ 0.63737329,  2.15596228,  2.99515874, ..., -0.03644643,
        -0.01553237, -0.01902434],
       [-0.59114817, -0.53416476, -0.73722116, ..., -0.03644643,
        -0.01553237, -0.01902434],
       ...,
       [-1.81966962, -1.87922827, -1.4373572 , ..., -0.03644643,
        -0.01553237, -0.01902434],
       [-0.59114817, -0.53416476, -0.85307101, ..., -0.03644643,
        -0.01553237, -0.01902434],
       [-1.81966962, -1.87922827, -1.4373572 , ..., -0.03644643,
        -0.01553237, -0.01902434]])

In [9]:
# Show the column labels of the encoded frame; the scaled array's columns follow this order
dummies.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'waterfront', 'view',
       'yr_built', 'lat', 'long', 'log_price', 'zipcode_98001',
       'zipcode_98002', 'zipcode_98003', 'zipcode_98004', 'zipcode_98005',
       'zipcode_98006', 'zipcode_98007', 'zipcode_98008', 'zipcode_98010',
       'zipcode_98011', 'zipcode_98014', 'zipcode_98019', 'zipcode_98022',
       'zipcode_98023', 'zipcode_98024', 'zipcode_98027', 'zipcode_98028',
       'zipcode_98029', 'zipcode_98030', 'zipcode_98031', 'zipcode_98032',
       'zipcode_98033', 'zipcode_98034', 'zipcode_98038', 'zipcode_98039',
       'zipcode_98040', 'zipcode_98042', 'zipcode_98045', 'zipcode_98052',
       'zipcode_98053', 'zipcode_98055', 'zipcode_98056', 'zipcode_98058',
       'zipcode_98059', 'zipcode_98065', 'zipcode_98070', 'zipcode_98072',
       'zipcode_98074', 'zipcode_98075', 'zipcode_98077', 'zipcode_98092',
       'zipcode_98102', 'zipcode_98103', 'zipcode_98105', 'zipcode_98106',
       'zipcode_98107', 'zipcode

In [11]:
# Create a DataFrame with the scaled data.
# Column labels and the row index are taken from `dummies`, the frame that was
# scaled, instead of a hand-copied list of names. A pasted list goes stale, or
# raises a shape-mismatch error, whenever the zip codes or renovation
# categories present in the CSV change.
housing_df_scaled = pd.DataFrame(
    housing_df_scaled,
    columns=dummies.columns,
    index=dummies.index,
)
housing_df_scaled.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,floors,waterfront,view,yr_built,lat,long,log_price,...,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199,renovation_category_Never Renovated,renovation_category_Renovated 2000-2010,renovation_category_Renovated 2010-2015,renovation_category_Renovated before 2000
0,-0.591148,-0.534165,-0.77248,-2.072667,-0.071351,-0.270703,-1.553551,0.528267,0.73373,-0.026424,...,-0.049171,-0.057156,-0.058208,-0.058208,-0.080963,-0.092932,0.043969,-0.036446,-0.015532,-0.019024
1,0.637373,2.155962,2.995159,-2.072667,-0.071351,-0.270703,0.070451,0.803403,0.998268,1.719649,...,-0.049171,-0.057156,-0.058208,-0.058208,-0.080963,-0.092932,0.043969,-0.036446,-0.015532,-0.019024
2,-0.591148,-0.534165,-0.737221,0.134416,-0.071351,-0.270703,-0.62555,-1.621715,-1.131263,-1.388138,...,-0.049171,-0.057156,-0.058208,-0.058208,-0.080963,-0.092932,0.043969,-0.036446,-0.015532,-0.019024
3,-0.591148,-0.534165,-0.560928,0.134416,-0.071351,-0.270703,0.302451,-1.210761,0.826318,-0.936553,...,-0.049171,-0.057156,-0.058208,-0.058208,-0.080963,-0.092932,0.043969,-0.036446,-0.015532,-0.019024
4,-0.591148,-0.534165,-0.560928,0.134416,-0.071351,-0.270703,-0.74155,1.304668,1.282646,-0.535579,...,-0.049171,-0.057156,-0.058208,-0.058208,-0.080963,-0.092932,0.043969,-0.036446,-0.015532,-0.019024


In [12]:
# Candidate cluster counts for the elbow method: k = 1 through 10
k = [*range(1, 11)]

# Empty list that will collect one inertia value per candidate k
inertia = []

In [13]:
# Fit one K-Means model per candidate k and record its inertia for the elbow curve.
# `inertia` is reset here so that re-running this cell cannot append a second
# batch of values and leave the list longer than `k`.
inertia = []
for i in k:
    # n_init=10 is stated explicitly: it is the default that was already being
    # applied, and passing it avoids the n_init warning emitted on every fit.
    k_model = KMeans(n_clusters=i, n_init=10, random_state=2)
    k_model.fit(housing_df_scaled)
    inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [14]:
# Pair each candidate k with its inertia and load the pairs into a DataFrame
# that the elbow plot reads from
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.head()

Unnamed: 0,k,inertia
0,1,696528.0
1,2,677935.70504
2,3,663825.085195
3,4,657578.315656
4,5,645777.311331


In [15]:
# Elbow curve: inertia against k, with one x tick per candidate value, used to
# pick the number of clusters visually.
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [16]:
# Initialize the K-Means model with the cluster count chosen for this analysis.
# n_init=10 is stated explicitly: it is the default that was already being
# applied, and passing it avoids the n_init warning emitted when the model is fit.
# NOTE(review): the elbow sweep seeds with random_state=2 while this model uses
# random_state=1 — confirm the difference is intentional.
model = KMeans(n_clusters=9, n_init=10, random_state=1)

In [17]:
# Fit the final K-Means model on the scaled housing data; `model` is the
# estimator used for the cluster predictions afterwards
model.fit(housing_df_scaled)

  super()._check_params_vs_input(X, default_n_init=10)


In [18]:
# Predict the cluster for each house using the fitted model and the scaled data
k_lower = model.predict(housing_df_scaled)

# Display the resulting array of cluster labels (one integer per row)
k_lower

array([6, 7, 3, ..., 6, 6, 6])

In [19]:
# Work on a copy so that later changes to the predictions frame do not modify
# `housing_df_scaled`
housing_df_predictions_df = housing_df_scaled.copy()

In [20]:
# Attach the cluster label predicted for each row as a new column
housing_df_predictions_df = housing_df_predictions_df.assign(predicted_clusters=k_lower)

# Preview the labelled data
housing_df_predictions_df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,floors,waterfront,view,yr_built,lat,long,log_price,...,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199,renovation_category_Never Renovated,renovation_category_Renovated 2000-2010,renovation_category_Renovated 2010-2015,renovation_category_Renovated before 2000,predicted_clusters
0,-0.591148,-0.534165,-0.77248,-2.072667,-0.071351,-0.270703,-1.553551,0.528267,0.73373,-0.026424,...,-0.057156,-0.058208,-0.058208,-0.080963,-0.092932,0.043969,-0.036446,-0.015532,-0.019024,6
1,0.637373,2.155962,2.995159,-2.072667,-0.071351,-0.270703,0.070451,0.803403,0.998268,1.719649,...,-0.057156,-0.058208,-0.058208,-0.080963,-0.092932,0.043969,-0.036446,-0.015532,-0.019024,7
2,-0.591148,-0.534165,-0.737221,0.134416,-0.071351,-0.270703,-0.62555,-1.621715,-1.131263,-1.388138,...,-0.057156,-0.058208,-0.058208,-0.080963,-0.092932,0.043969,-0.036446,-0.015532,-0.019024,3
3,-0.591148,-0.534165,-0.560928,0.134416,-0.071351,-0.270703,0.302451,-1.210761,0.826318,-0.936553,...,-0.057156,-0.058208,-0.058208,-0.080963,-0.092932,0.043969,-0.036446,-0.015532,-0.019024,3
4,-0.591148,-0.534165,-0.560928,0.134416,-0.071351,-0.270703,-0.74155,1.304668,1.282646,-0.535579,...,-0.057156,-0.058208,-0.058208,-0.080963,-0.092932,0.043969,-0.036446,-0.015532,-0.019024,6


In [23]:
# Scatter of bedrooms against log price, coloured by predicted cluster.
# The frame holds StandardScaler output, so both axes are z-scores rather than
# raw counts or prices; the labels say so, and a title is added so the figure
# can be read on its own.
housing_df_predictions_df.hvplot.scatter(
    x='bedrooms',
    y='log_price',
    by="predicted_clusters",
    xlabel='bedrooms (standardized)',
    ylabel='log10 price (standardized)',
    title='K-Means clusters: bedrooms vs. log price',
)