In [20]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer

<h1 style = 'color:orange'>KNN regressor</h1>

In [2]:
# without scaling
# Baseline: KNN regression on the dummy-encoded features with no feature scaling.

houses = pd.get_dummies(pd.read_csv('../Datasets/Housing.csv'), drop_first=True)
y = houses['price']
X = houses.drop(columns='price')

# Hold out 30% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25
)

# Default KNeighborsRegressor settings, fitted on the raw (unscaled) columns.
knr = KNeighborsRegressor().fit(X_train, y_train)
y_pred = knr.predict(X_test)

# First line: R^2, second line: mean squared error.
print(r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), sep='\n')

0.32983086797144234
501551542.20463413


In [3]:
# standard scaler
# Same split as before, but features are standardised first.

# Learn the scaling statistics from the training rows only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

knr = KNeighborsRegressor().fit(X_train_scaled, y_train)
y_pred = knr.predict(X_test_scaled)

# Prints R^2 first, then mean squared error.
for metric in (r2_score, mean_squared_error):
    print(metric(y_test, y_pred))

0.5722853971847756
320099670.9831707


In [4]:
# MinMaxScaler
# Same experiment with min-max scaling instead of standardisation.

scaler = MinMaxScaler()

# Fit on the training rows only; the test rows reuse the fitted scaler.
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)

knr = KNeighborsRegressor()
y_pred = knr.fit(X_train_scaled, y_train).predict(X_test_scaled)

r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
print(r2)
print(mse)

0.47212307910304907
395060696.04951215


In [5]:
# standard scaler & k neighbours
# Sweep n_neighbors from 1 to 10 on standardised features and rank the results by R^2.

# Refit a StandardScaler here because the previous cell left a MinMaxScaler in `scaler`.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# NOTE(review): every k is scored on the test split, so the best score below is an
# optimistic estimate; consider cross-validating on the training split instead.
scores = []
for k in range(1, 11):
    knr = KNeighborsRegressor(n_neighbors=k).fit(X_train_scaled, y_train)
    y_pred = knr.predict(X_test_scaled)

    # One record per candidate k.
    scores.append({
        'k': k,
        'r2 score': r2_score(y_test, y_pred),
        'mse': mean_squared_error(y_test, y_pred),
    })

# Best-scoring k first.
scores = pd.DataFrame(scores).sort_values('r2 score', ascending=False)
scores

Unnamed: 0,k,r2 score,mse
3,4,0.576697,316797900.0
4,5,0.572285,320099700.0
9,10,0.566364,324530900.0
5,6,0.560651,328807000.0
8,9,0.554606,333330500.0
7,8,0.553857,333891200.0
6,7,0.543244,341834100.0
2,3,0.490856,381040900.0
1,2,0.373122,469152400.0
0,1,0.186813,608585600.0


In [46]:
# Re-read the raw CSV so the feature frame holds the original, un-encoded columns.
houses = pd.read_csv('../Datasets/Housing.csv')

# Target is the `price` column; every other column is a feature.
y = houses['price']
X = houses.drop(columns='price')

In [57]:
# Split the feature columns by kind. Any non-numeric column is treated as
# categorical; selecting on "not numeric" (rather than dtype == 'object') also
# covers pandas versions that infer a dedicated string dtype for text columns,
# where the object-dtype comparison would find no categorical columns at all.
categories = X.select_dtypes(exclude='number').columns.tolist()
numerics = X.select_dtypes(include='number').columns.tolist()

# Dense output is required for pandas output; drop='first' removes one level
# per categorical column.
encoder = OneHotEncoder(sparse_output=False, drop='first')

# One-hot encode the categorical columns and pass the remaining columns through
# unchanged. set_output on the ColumnTransformer also configures the inner
# encoder, so the encoder does not need its own set_output call.
category_transformer = make_column_transformer(
    (encoder, categories),
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

X_transformed = category_transformer.fit_transform(X)
X_transformed.columns

Index(['driveway_yes', 'recroom_yes', 'fullbase_yes', 'gashw_yes', 'airco_yes',
       'prefarea_yes', 'lotsize', 'bedrooms', 'bathrms', 'stories',
       'garagepl'],
      dtype='object')

In [52]:
# Split the ColumnTransformer output with the same test size and seed used earlier.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.3, random_state=25
)

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# n_neighbors=4 is the top-ranked k in the sweep table shown earlier in this notebook.
knr = KNeighborsRegressor(n_neighbors=4).fit(X_train_scaled, y_train)
y_pred = knr.predict(X_test_scaled)

# First line: R^2, second line: mean squared error.
print(r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), sep='\n')

0.5766971722542311
316797918.5087652


In [63]:
# Score a separate file of houses with the already-fitted encoder, scaler and model.
test_houses = pd.read_csv('../Datasets/tst_Housing.csv')

# Apply the same preprocessing as training: one-hot encode, then standardise.
# transform (not fit_transform) is used so nothing is re-learned from the new rows.
test_transformed = scaler.transform(category_transformer.transform(test_houses))

y_pred = knr.predict(test_transformed)
y_pred


array([58625., 48475., 46125., 74250.])