In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv('house_prices.csv')

# Explore the dataset
print(df.head())
print(df.info())
print(df.describe())

# Preprocess the data
df = df.dropna()
df = pd.get_dummies(df, columns=['neighborhood'], drop_first=True)
X = df.drop('price', axis=1)
y = df['price']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the KNN model
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)

# Make predictions
y_pred = knn_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Parameter tuning using GridSearchCV
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}
grid_search = GridSearchCV(estimator=KNeighborsRegressor(), param_grid=param_grid, cv=5, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)

# Best parameters and estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print(f'Best Parameters: {best_params}')

# Evaluate the best model
y_pred_best = best_estimator.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)
print(f'Best Model Mean Squared Error: {mse_best}')
print(f'Best Model R-squared: {r2_best}')

print()

    price  bedrooms  bathrooms  sqft neighborhood
0  300000         3          2  1500     Suburban
1  450000         4          3  2000        Urban
2  200000         2          1   800        Rural
3  500000         5          4  2500        Urban
4  320000         3          2  1600     Suburban
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   price         10 non-null     int64 
 1   bedrooms      10 non-null     int64 
 2   bathrooms     10 non-null     int64 
 3   sqft          10 non-null     int64 
 4   neighborhood  10 non-null     object
dtypes: int64(4), object(1)
memory usage: 528.0+ bytes
None
               price   bedrooms  bathrooms         sqft
count      10.000000  10.000000  10.000000    10.000000
mean   366000.000000   3.300000   2.300000  1680.000000
std    126859.502338   0.948683   0.948683   534.997404
min    200000.000000 

  _data = np.array(data, dtype=dtype, copy=copy,
 nan nan]
