In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Locate the cleaned dataset. The directory defaults to ~/Downloads (the
# location the file was originally read from) and can be redirected by setting
# the DATA_DIR environment variable, so the notebook is not tied to a single
# user's home directory.
DATA_DIR = Path(os.environ.get("DATA_DIR", Path.home() / "Downloads"))

# Kept as a plain string so `data` has the same type it had before.
data = str(DATA_DIR / "df_cleaned.csv")

df = pd.read_csv(data)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4876 entries, 0 to 4875
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   Unnamed: 0                            4876 non-null   int64 
 1   city                                  4876 non-null   object
 2   price                                 4876 non-null   int64 
 3   lot_size_(m2)                         4876 non-null   int64 
 4   living_space_size_(m2)                4876 non-null   int64 
 5   build_year                            4876 non-null   int64 
 6   build_type                            4876 non-null   object
 7   house_type                            4876 non-null   object
 8   house_type_detail                     4876 non-null   object
 9   roof                                  4876 non-null   object
 10  rooms                                 4876 non-null   int64 
 11  toilet                        

In [5]:
# Importing relevant libraries

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [7]:
# Target is the 'price' column; every other column stays in the feature frame.
# (The feature frame still carries 'Unnamed: 0' from the CSV; only columns
# explicitly selected later on are used by the model.)

y = df['price']
X = df.drop(columns=['price'])

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Column groups that are routed to the numeric and categorical preprocessing
# branches respectively

numerical_features = [
    'lot_size_(m2)',
    'living_space_size_(m2)',
    'build_year',
    'rooms',
    'toilet',
    'bathroom',
    'estimated_neighbourhood_price_per_m2',
]

categorical_features = [
    'city',
    'build_type',
    'house_type',
    'house_type_detail',
    'roof',
    'floors',
    'energy_label',
    'position',
    'garden',
]


In [9]:
# Numeric branch: rescale every numeric column with MinMaxScaler so that no
# single feature dominates the distance computation used downstream.
numerical_pipeline = Pipeline(steps=[('scaler', MinMaxScaler())])

# Categorical branch: one-hot encode. handle_unknown='ignore' makes categories
# that were not seen during fit encode as all zeros instead of raising when
# they show up at transform time.
categorical_pipeline = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Send each column group through its branch. Columns named in neither list are
# dropped, since ColumnTransformer's `remainder` is left at its default.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

In [10]:
# Full model: the preprocessing step followed by a 5-nearest-neighbours regressor

model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', KNeighborsRegressor(n_neighbors=5)),
    ]
)

# Fit on the training split, then predict the held-out rows
# (fit() returns the fitted pipeline itself, so the calls can be chained)

y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the predictions against the true test targets

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 44970808564.167755
